This file was created with JabRef 1.4. Encoding: ISO8859_1 @ARTICLE{Becker1988, author = {R. A. Becker and J. M. Chambers, J. M.}, title = {Auditing of data analyses}, journal = {SIAM Journal of Scientific and Statistical Computing}, year = {1988}, volume = {9}, number = {4}, pages = {747-760}, owner = {pgroth}, abstract = {The AUDIT utility allows the user to review what has happened during a set of S-PLUS sessions. When the AUDIT utility is invoked, it reads the default audit file (or auditfile if given) finding all top-level expressions, and which objects were read and written by each expression. It then allows the user (through an arcane syntax) to inquire about which expressions read or wrote a specific object, to backtrack from a specific expression, or to create a source file that will recreate an expression. }, } @ARTICLE{Lanter1991a, author = {D.P. Lanter}, title = {Design of a Lineage-Based Meta-Data Base for GIS}, journal = {Cartography and Geographic Information Systems}, year = {1991}, volume = {18}, number = {4}, pages = {255-261}, abstract = {This paper presents the conceptual design of a meta-database system for documenting data sources and GIS transformations applied to derive cartographic products. Artificial intelligence techniques of semantic networks are used to organize input-output relationships between map layers and frames to organize lineage attributes characterizing source, intermediate, and product layers. An illustrative example indicates that a lineage meta-database enables GIS users to engage in source assessment throughout their analysis of spatial data sets.}, owner = {pgroth}, comment = {This paper is found on page 16 of Lanter1991}, } @TECHREPORT{Lanter1991, author = {D.P. 
Lanter}, title = {Lineage in GIS: The Problem and a Solution}, institution = {National Center for Geographic Information and Analysis (NCGIA), UCSB}, year = {1991}, number = {90-6}, address = {Santa Barbara, CA}, volume = {18}, owner = {pgroth}, pages = {255-261}, abstract = {This paper focuses attention on a fundamental geographic structure: the GIS application. Lineage documentation specifies an application's source data, transformations, and input/output specifications. Such information is inherently causal, communicating the theory embodied in a GIS application and the meaning of its product. A number of techniques for automating lineage information are examined. None are found to be capable of documenting data lineage.}, journal = {Cartography and Geographic Information Systems}, } @TECHREPORT{Lanter1991b, author = {D.P. Lanter and R. Essinger}, title = {User-Centered Graphical User Interface Design for GIS}, institution = {National Center for Geographic Information and Analysis (NCGIA). UCSB}, year = {1991}, number = {91-6}, owner = {pgroth}, } @INPROCEEDINGS{Alonso1993, author = {G. Alonso and A. El Abbadi}, title = {GOOSE: Geographic Object Oriented Support Environment}, booktitle = {Proc. of the ACM workshop on Advances in Geographic Information Systems}, year = {1993}, pages = {38-49}, address = {Arlington, Virginia}, month = {November}, owner = {pgroth}, } @INPROCEEDINGS{Alonso1997, author = {G. Alonso and C. Hagen}, title = {Geo-Opera: Workflow Concepts for Spatial Processes}, booktitle = {Proc. 5th Intl. Symposium on Spatial Databases (SSD '97)}, year = {1997}, address = {Berlin, Germany}, month = {June}, abstract = {A Process Support System provides the tools and mechanisms necessary to define, implement and control processes, i.e., complex sequences of program invocations and data exchanges. 
Due to the generality of the notion of process and the high demand for the functionality they provide, process support systems are starting to be used in a variety of application areas, from business re-engineering to experiment management. In particular, recent results have shown the advantages of using such systems in scientific applications and the work reported in this paper is to be interpreted as one more step in that direction. The paper describes Geo-Opera, a process support system tailored to spatial modeling and GIS engineering. Geo-Opera facilitates the task of coordinating and managing the development and execution of large, computer-based geographic models. It provides a flexible environment for experiment management, incorporating many characteristics of workflow management systems as well as a simple but expressive process modeling language, exception handling, and data and metadata indexing and querying capabilities. }, owner = {pgroth}, } @INPROCEEDINGS{Woodruff1997, author = {A. Woodruff and M. Stonebraker}, title = {Supporting Fine-grained Data Lineage in a Database Visualization Environment}, booktitle = {Proc. of the 13th International Conference on Data Engineering}, year = {1997}, pages = {91-102}, address = {Birmingham, England}, month = {April}, owner = {pgroth}, citeseerurl = {citeseer.ist.psu.edu/article/woodruff97supporting.html}, } @INPROCEEDINGS{Vahdat1998, author = {A. Vahdat and T. Anderson}, title = {Transparent Result Caching}, booktitle = {Proc. 
of the 1998 USENIX Technical Conference}, year = {1998}, address = {New Orleans, Louisiana}, month = {June}, owner = {pgroth}, citeseerurl = {http://citeseer.ist.psu.edu/vahdat98transparent.html}, } @PHDTHESIS{Woodruff1998, author = {Allison Gyle Woodruff}, title = {Data Lineage and Information Density in Database Visualization}, school = {University of California at Berkeley}, year = {1998}, owner = {pgroth}, url = {http://db.cs.berkeley.edu/papers/UCB-PhD-woodruff.pdf}, } @INPROCEEDINGS{Buneman2000, author = {P. Buneman and S. Khanna and W.C. Tan}, title = {Data Provenance: Some Basic Issues}, booktitle = {Foundations of Software Technology and Theoretical Computer Science}, year = {2000}, abstract = {The ease with which one can copy and transform data on the Web, has made it increasingly dificult to determine the origins of a piece of data. We use the term data provenance to refer to the process of tracing and recording the origins of data and its movement between databases. Provenance is now an acute issue in scientific databases where it is central to the validation of data. In this paper we discuss some of the technical issues that have emerged in an initial exploration of the topic}, local-url = {/Users/pgroth/Desktop/phd/outside%20documents/organized/data_provenance_some_basic_issues.pdf}, } @INPROCEEDINGS{Cui2000a, author = {Y. Cui and J. Widom}, title = {Practical Lineage Tracing in Data Warehouses}, booktitle = {Proceedings of the 16th International Conference on Data Engineering (ICDE'00)}, year = {2000}, address = {San Diego, California}, month = {February}, owner = {pgroth}, abstract = {We consider the view data lineage problem in a warehousing environment: For a given data item in a materialized warehouse view, we want to identify the set of source data items that produced the view item. We formalize the problem, and we present a lineage tracing algorithm for relational views with aggregation. 
Based on our tracing algorithm, we propose a number of schemes for storing auxiliary views that enable consistent and efficient lineage tracing in a multi-source data warehouse. We report on a performance study of the various schemes, identifying which schemes perform best in which settings. Based on our results, we have implemented a lineage tracing package in the WHIPS data warehousing system prototype at Stanford. With this package, users can select view tuples of interest, then efficiently ``drill through'' to examine the exact source tuples that produced the view tuples of interest.}, keywords = {Data Warehousing}, url = {http://dbpubs.stanford.edu:8090/pub/1999-55}, } @ARTICLE{Cui2000, author = {Y. Cui and J. Widom and J. L. Wiener}, title = {Tracing the lineage of view data in a warehousing environment}, journal = {ACM Trans. Database Syst.}, year = {2000}, volume = {25}, number = {2}, pages = {179--227}, issn = {0362-5915}, doi = {http://doi.acm.org/10.1145/357775.357777}, publisher = {ACM Press}, } @INPROCEEDINGS{Buneman2001, author = {P. Buneman and S. Khanna and W.C. Tan}, title = {Why and Where: A Characterization of Data Provenance}, booktitle = {Int. Conf. on Databases Theory (ICDT)}, year = {2001}, abstract = {With the proliferation of database views and curated data- bases, the issue of data provenance (where a piece of data came from and the process by which it arrived in the database) is becoming increasingly important, especially in scientic databases where understanding provenance is crucial to the accuracy and currency of data. In this pa- per we describe an approach to computing provenance when the data of interest has been created by a database query. We adopt a syntactic approach and present results for a general data model that applies to re- lational databases as well as to hierarchical data such as XML. 
A novel aspect of our work is a distinction between "why" provenance (refers to the source data that had some influence on the existence of the data) and "where" provenance (refers to the location(s) in the source databases from which the data was extracted).}, local-url = {/Users/pgroth/Desktop/phd/outside%20documents/organized/why_and_where_a_characterization_of_data_provenance.pdf}, } @PHDTHESIS{Cui2001, author = {Y. Cui}, title = {Lineage Tracing in Data Warehouses}, school = {Stanford University}, year = {2001}, month = {December}, abstract = {Data warehousing systems collect data from multiple distributed data sources and store integrated and summarized information in local databases for efficient data analysis and mining. Sometimes, when analyzing data at a warehouse, it is useful to ?drill down? and investigate the source data from which certain warehouse data was derived. For a given warehouse data item, identifying the exact set of source data items that produced the warehouse data item is termed the data lineage problem. This thesis presents our research results on tracing data lineage in a warehousing environment: -Formal definitions of data lineage for data warehouses defined as relational materialized views over relational sources, and for warehouses defined using graphs of general data transformations. - Algorithms for lineage tracing, again considering both relational and transformational warehouses, along with a suite of optimization techniques. - Performance evaluations through simulations, and a lineage tracing prototype developed within the WHIPS (WareHousing Information Processing System) project at Stanford. - Applying data lineage techniques to obtain improved algorithms for the well-known database view update problem. }, owner = {pgroth}, } @INPROCEEDINGS{Foster2001, author = {I. Foster and E. Alpert and A. Chervenak and B. Drach and C. Kesselman and V. Nefedova and D. Middleton and A. Shoshani and A. Sim and D. Williams. 
}, title = {The Earth System Grid II: Turning Climate Datasets Into Community Resources.}, booktitle = {Proc. of the American Meterologcal Society Conference}, year = {2001}, owner = {pgroth}, } @INPROCEEDINGS{Foster2001a, author = {I. Foster and C. Kesselman and S. Tuecke}, title = {The Anatomy of the Grid: Enabling Scalable Virtual Organizations}, booktitle = {Int. J. Supercomputer Applications}, year = {2001}, pages = {15-18}, abstract = { computing has emerged as an important new field, distinguished from conventional distributed computing by its focus on large-scale resource sharing, innovative applications, and, in some cases, high-performance orientation. In this article, we define this new field. First, we review theGrid problem, which we define as flexible, secure, coordinated resource sharing among dynamic collections of individuals, institutions, and resources what we refer to as virtual organizations. In such settings, we encounter unique authentication, authorization, resource access, resource discovery, and other challenges. It is this class of problem that is addressed by Grid technologies. Next, we present an extensible and open Grid architecture, in which protocols, services, application programming interfaces, and software development kits are categorized according to their roles in enabling resource sharing. We describe requirements that we believe any such mechanisms must satisfy and we discuss the importance of defining a compact set of intergrid protocols to enable interoperability among different Grid systems. Finally, we discuss how Grid technologies relate to other contemporary technologies, including enterprise integration, application service provider, storage service provider, and peer-to-peer computing. 
We maintain that Grid concepts and technologies complement and have much to contribute to these other approaches.}, local-url = {/Users/pgroth/Desktop/phd/outside%20documents/organized/anatomy.pdf}, } @INPROCEEDINGS{Frew2001, author = {J. Frew and R. Bose}, title = {Earth System Science Workbench: A Data Management Infrastructure for Earth Science Products}, booktitle = {Proceedings of the 13th International Conference on Scientific and Statistical Database Management}, year = {2001}, pages = {180-189}, address = {Fairfax, VA}, month = {July}, abstract = {The Earth System Science Workbench (ESSW) is a nonintrusive data management infrastructure for researchers who must also be data publishers. An implementation of ESSW to track the processing of locally received satellite imagery is presented, demonstrating the Workbench?s transparent and robust support for archiving and publishing data products. ESSW features a Lab Notebook metadata service, a No Duplicate-Write Once Read Many (ND-WORM) storage service, and Web user interface tools. The Lab Notebook logs processes (experiments) and their relationships via a custom API to XML documents stored in a relational database. The NDWORM provides a managed storage archive for the Lab Notebook by keeping unique file digests and namespace metadata, also in a relational database. ESSW Notebook tools allow product searching and ordering, and file and metadata management.}, owner = {pgroth}, } @ARTICLE{Marathe2001, author = {A. P. Marathe}, title = {Tracing Lineage of Array Data}, journal = {J. Intell. Inf. Syst.}, year = {2001}, volume = {17}, number = {2-3}, pages = {193--214}, issn = {0925-9902}, abstract = {Arrays are a common and important class of data in many applications. Arrays can model data such as digital images, digital video, scientific and experimental data, matrices, and finite element grids. Although array manipulations are diverse and domain-specific, they often exhibit structural regularities. 
This paper describes an algorithm called sub-pushdown to trace data lineage in such array computations. Lineage tracing is a type of data-flow analysis that relates parts of a result array to those parts of the argument (base) arrays that have bearings on the result array parts. Sub-pushdown can be used to trace data lineage in array-manipulating computations expressed in the Array Manipulation Language (AML) that was introduced previously. Sub-pushdown has several useful features. First, the lineage computation is expressed as an AML query. Second, it is not necessary to evaluate the AML lineage query to compute the array data lineage. Third, sub-pushdown never gives false-negative answers. Sub-pushdown has been implemented as part of the ArrayDB prototype array database system that we have built. }, publisher = {Kluwer Academic Publishers}, } @INPROCEEDINGS{Bose2002, author = {R. Bose}, title = {A Conceptual Framework for Composing and Managing Scientific Data Lineage}, booktitle = {Proceedings of the 14th International Conference on Scientific and Statistical Database Management}, year = {2002}, pages = {15-19}, address = {Edinburgh, Scotland}, month = {July}, abstract = {Scientific research relies as much on the dissemination and exchange of data sets as on the publication of conclusions. Accurately tracking the lineage (origin and subsequent processing history) of scientific data sets is thus imperative for the complete documentation of scientific work. However, the lack of a definitive data model for lineage, and the poor fit between current data management tools and scientific software, effectively prevent researchers from determining, preserving, or providing the lineage of the data products they use and create. Based on a comprehensive review of lineagerelated research and previous prototype systems, a conceptual framework is presented to help identify and assess basic lineage system components. 
Within this framework, a direction is outlined for future work on general methods for composing and managing lineage for scientific data.}, owner = {pgroth}, } @INPROCEEDINGS{Buneman2002, author = {P. Buneman and S. Khanna and K.Tajima and W.C. Tan}, title = {Archiving scientific data}, booktitle = {Proc. of the 2002 ACM SIGMOD International Conference on Management of Data}, year = {2002}, pages = {1--12}, publisher = {ACM Press}, doi = {http://doi.acm.org/10.1145/564691.564693}, isbn = {1-58113-497-5}, location = {Madison, Wisconsin}, } @INPROCEEDINGS{Eder2002, author = {J. Eder and G. E. Olivotto and W. Gruber}, title = {A Data Warehouse for Workflow Logs}, booktitle = {Engineering and Deployment of Cooperative Information Systems: First Int. Conf., EDCIS 2002}, year = {2002}, editor = {Y.Han and S.Tai and D.Wikarski}, month = {September}, publisher = {Springer}, issn = {0302-9743}, abstract = {Workflow Logs provide a very valuable source of information about the actual execution of business processes in organizations. We propose to use data warehouse technology to exploit this information resources for organizational developments, monitoring and process im- provements. We introduce a general data warehouse design for workflow warehouses and discuss the results from an industrial case study showing the validity of this approach.}, local-url = {/Users/pgroth/Desktop/phd/outside%20documents/organized/datawarehouse_for_workflow_logs.pdf}, } @INPROCEEDINGS{Foster2002, author = {I. Foster and J. Voeckler and M. Wilde and Y.Zhao}, title = {Chimera: A Virtual Data System for Representing, Querying and Automating Data Derivation}, booktitle = {Proc. of the 14th Conf. on Scientific and Statistical Database Management}, year = {2002}, month = {July}, } @INPROCEEDINGS{Goble2002, author = {C. 
Goble}, title = {Position Statement: Musings on provenance, workflow and (semantic web) annotations for bioinformatics.}, booktitle = {Workshop on Data Provenance and Derivation }, year = {2002}, month = {October}, owner = {pgroth}, } @PROCEEDINGS{ProvWorkshop2002, title = {Data Provenance/Derivation Workshop}, year = {2002}, month = {October}, url = {http://people.cs.uchicago.edu/~yongzh/position_papers.html}, owner = {pgroth}, } @ARTICLE{Cui2003, author = {Y. Cui and J. Widom}, title = {Lineage tracing for general data warehouse transformations}, journal = {The VLDB Journal}, year = {2003}, volume = {12}, number = {1}, pages = {41--58}, issn = {1066-8888}, doi = {http://dx.doi.org/10.1007/s00778-002-0083-8}, owner = {pgroth}, publisher = {Springer-Verlag New York, Inc.}, abstract = {Data warehousing systems integrate information from operational data sources into a central repository to enable analysis and mining of the integrated information. During the integration process, source data typically undergoes a series of transformations, which may vary from simple algebraic operations or aggregations to complex ?data cleansing? procedures. In a warehousing environment, the data lineage problem is that of tracing warehouse data items back to the original source items from which they were derived. We formally define the lineage tracing problem in the presence of general data warehouse transformations, and we present algorithms for lineage tracing in this environment. Our tracing procedures take advantage of known structure or properties of transformations when present, but also work in the absence of such information. Our results can be used as the basis for a lineage tracing tool in a general warehousing setting, and also can guide the design of data warehouses that enable efficient lineage tracing.}, } @INCOLLECTION{Fan2003, author = {H. Fan and A. 
Poulovassilis}, title = {Tracing data lineage using schema transformation pathways}, booktitle = {Knowledge transformation for the Semantic Web}, publisher = {IOS Press}, year = {2003}, editor = {B. Omelayenko and M. Klein}, pages = {64-79}, owner = {pgroth}, __markedentry = {0}, } @INPROCEEDINGS{Foster2003, author = {I. Foster and J. Vockler and M. Wilde and Y. Zhao}, title = {The virtual data grid: A new model and architecture for data-intensive collaboration}, booktitle = {In Proc. of the CIDR 2003 First Biennial Conference on Innovative Data Systems Research}, year = {2003}, month = {January}, abstract = {It is now common to encounter communities engaged in the collaborative analysis and transformation of large quantities of data over extended time periods. We argue that these communities require a scalable system for managing, tracing, communicating, and exploring the derivation and analysis of diverse data objects. Such a system could bring significant productivity increases, facilitating discovery, understanding, assessment, and sharing of both data and transformation resources, as well as the productive use of distributed resources for computation, storage, and collaboration. We define a model and architecture for a virtual data grid to address this requirement. Using a broadly applicable ?typed dataset? as the unit of derivation tracking, we introduce simple constructs for describing how datasets are derived from transformations and from other datasets. We also define mechanisms for integrating with, and adapting to, existing data management systems and transformation and analysis tools, as well as Grid mechanisms for distributed resource management and computation planning. 
We report on successful application results obtained with a prototype system called Chimera that implements some of these concepts, involving challenging analyses of high-energy physics and astronomy data.}, owner = {pgroth}, citeseerurl = {http://citeseer.ist.psu.edu/foster03virtual.html}, } @INPROCEEDINGS{Greenwood2003, author = {M. Greenwood and C. Goble and R. Stevens and J. Zhao and M. Addis and D. Marvin and L. Moreau and T. Oinn}, title = {Provenance of e-Science Experiments - experience from Bioinformatics}, booktitle = {Proc. UK e-Science All Hands Meeting 2003}, year = {2003}, editor = {Simon J Cox}, pages = {223--226}, month = {September}, abstract = {Like experiments performed at a laboratory bench, the data associated with an e-Science experiment are of reduced value if other scientists are not able to identify the origin, or provenance, of those data. Provenance information is essential if experiments are to be validated and verified by others, or even by those who originally performed them. In this article, we give an overview of our initial work on the provenance of bioinformatics e-Science experiments within myGrid. We use two kinds of provenance: the derivation path of information and annotation. We show how this kind of provenance can be delivered within the myGrid demonstrator WorkBench and we explore how the resulting Webs of experimental data holdings can be mined for useful information and presentations for the e-Scientist. }, isbn = {1-904425-11-9}, local-url = {/Users/pgroth/Desktop/phd/outside%20documents/organized/prov_of_eScience_Experiments_Experience_Bioinformatics.pdf}, } @INPROCEEDINGS{Myers2003a, author = {J. D. Myers and C. Pancerella and C. Lansing and K. L. Schuchardt and B. 
Didier}, title = {Multi-scale science: supporting emerging practice with semantically derived provenance}, booktitle = {ISWC 2003 Workshop: Semantic Web Technologies for Searching and Retrieving Scientific Data}, year = {2003}, address = {Sanibel Island, Florida, USA}, month = {October }, owner = {pgroth}, } @ARTICLE{Myers2003, author = {J.D. Myers and A.R. Chappell and M. Elder and A. Geist and J. Schwidder}, title = {Re-integrating the research record}, journal = { IEEE Computing in Science \& Engineering}, year = {2003}, pages = {44-50}, owner = {pgroth}, } @ARTICLE{Silva2003, author = {P. P. da Silva and D. L. McGuinness and R. McCool}, title = {Knowledge Provenance Infrastructure}, journal = {Data Engineering Bulletin}, year = {2003}, volume = {26}, number = {4}, pages = {26-32}, month = {December}, abstract = {The web lacks support for explaining information provenance. When web applications return answers, many users do not know what information sources were used, when they were updated, how reliable the source was, or what information was looked up versus derived. Support for information provenance is expected to be a harder problem in the Semantic Web where more answers result from some maniputiaton of information (instead of simple retrieval of information). Manipulation includes, among other things, retrieving, matching, aggregating, filtering, and deriving information possibly from multiple sources. This article defines a broad notion of information provenance called knowledge provenance that includes proof-like information on how a question answering system arrived at its answer(s). The article also describes an approach for a knowledge provenance infrastructure supporting the extraction, maintenance, and usage of knowledge provenance related to answers of web applications and services. }, owner = {pgroth}, } @INPROCEEDINGS{Szomszor2003, author = {M. Szomszor and L. 
Moreau}, title = {Recording and Reasoning over Data Provenance in Web and Grid Services}, booktitle = {Int. Conf. on Ontologies, Databases and Applications of Semantics}, year = {2003}, volume = {2888}, series = {LNCS}, issn = {0302-9743}, provenance = {yes}, pind = {EZ~02~02~04}, isbn = {3-540-20498-9}, local-url = {/Users/pgroth/Desktop/phd/outside%20documents/organized/recording_reasoning_over_data_provenance.pdf}, export = {yes}, abstract = {Large-scale, dynamic and open environments such as the Grid and Web Services build upon existing computing infrastructures to supply dependable and consistent large-scale computational systems. This kind of architecture has been adopted by the business and scientic communities allowing them to exploit extensive and diverse computing resources to perform complex data processing tasks. In such systems, results are often derived by composing multiple, geographically distributed, heterogeneous services as specified by intricate workflow management. This leads to the undesirable situation where the results are known, but the means by which they were achieved is not. With both scientific experiments and business transactions, the notion of lineage and dataset derivation is of paramount importance since without it, information is potentially worthless. We address the issue of data provenance, the description of the origin of a piece of data, in these environments showing the requirements, uses and implementation difficulties. We propose an infrastructure level support for a provenance recording capability for service-oriented architectures such as the Grid and Web Services. We also developed services to view and retrieve provenance and we provide a mechanism by which provenance is used to determine whether previous computed results are still up to date.}, mygrid = {yes}, pagecount = {18}, } @INPROCEEDINGS{Zhao2003, author = {J. Zhao and C. Goble and M. Greenwood and C. Wroe and R. 
Stevens}, title = {Annotating, linking and browsing provenance logs for e-Science}, booktitle = {Proc. of the Workshop on Semantic Web Technologies for Searching and Retrieving Scientific Data}, year = {2003}, month = {October}, abstract = {Like experiments performed at a laboratory bench, the results of an e-science in silico experiment are of limited value if other scientists are not able to identify the origin, or provenance, of those results. For e-Science, we need more systematic provenance logs across a range of e- Science activities and disciplines as well as a more informed understanding of the information in these provenance data. Semantic Web technology, which enables data to be linked and defined in a way for more effective discovery, integration and cooperation across computers and people, provides an appropriate solution for our current requirement. In this paper we show how we used the COHSE conceptual open hypermedia system to build a dynamically generated hypertext of web of provenance documents arising from the myGrid project based on associated concepts and reasoning over the ontology.}, local-url = {/Users/pgroth/Desktop/phd/outside%20documents/organized/myGridOntProv.pdf}, } @PROCEEDINGS{ProvWorkshop2003, title = {Data Provenance and Annotation}, year = {2003}, month = {December}, owner = {pgroth}, url = {http://www.nesc.ac.uk/esi/events/304/}, } @INPROCEEDINGS{Bose2004, author = {R. Bose and J. Frew }, title = {Composing lineage metadata with XML for custom satellite-derived data products}, booktitle = {16th International Conference on Scientific and Statistical Database Management}, year = {2004}, pages = {275 - 284}, month = {June}, doi = {10.1109/SSDM.2004.1311219 }, abstract = {As peer-to-peer dissemination of custom data products evolves among Earth science research groups, investigators and data managers must consider how to compose appropriate metadata for their research computing activities. 
Because workflows may span multiple groups, it is critical that lineage (provenance) metadata also be assembled to document and preserve the origins and processing history of constituent data products and transformations for future data consumers. To demonstrate methods for composing lineage metadata for custom processing, we introduce our terminology for workflow and employ a case study for the creation of satellite-derived ocean color data products. Our example contributes to a general metadata model for workflow that incorporates lineage. We then discuss metadata requirements for remote sensing-related data products. We propose two techniques for composing lineage metadata, both based on accessory XML metadata documents that are paired with the data products and versioned data transformations they describe. The first technique, implemented as a prototype, features a dedicated lineage server that introduces the indirection and flexibility necessary for Web-based lineage navigation. The second, more promising technique involves defining a simple Resource Description Framework (RDF) vocabulary for lineage metadata, and using extant RDF/XML tools for query and navigation. These methods provide guidelines for composing lineage metadata that are applicable to other domains. }, owner = {pgroth}, } @INPROCEEDINGS{Groth2004, author = {P. Groth and M. Luck and L. Moreau}, title = {Formalising a protocol for recording provenance in Grids}, booktitle = {Proc. of the UK OST e-Science second All Hands Meeting 2004 (AHM'04)}, year = {2004}, address = {Nottingham, UK}, month = {September}, pind = {EZ~03~03~04}, export = {yes}, abstract = {Both the scientific and business communities are beginning to rely on Grids as problemsolving mechanisms. These communities also have requirements in terms of provenance. Provenance is the documentation of process and the necessity for it is apparent in fields ranging from medicine to aerospace. 
To support provenance capture in Grids, we have developed an implementation-independent protocol for the recording of provenance. We describe the protocol in the context of a service-oriented architecture and formalise the entities involved using an abstract state machine or a three-dimensional state transition diagram. Using these techniques we sketch a liveness property for the system.}, url = {http://www.ecs.soton.ac.uk/~lavm/papers/ahm04-groth.pdf}, pasoa = {yes}, pagecount = {8}, } @INPROCEEDINGS{Groth2004a, author = {Paul Groth and Michael Luck and Luc Moreau}, title = {A protocol for recording provenance in service-oriented Grids}, booktitle = {Proceedings of the 8th International Conference on Principles of Distributed Systems (OPODIS'04)}, year = {2004}, address = {Grenoble, France}, month = DEC, export = {yes}, abstract = {Both the scientific and business communities, which are beginning to rely on Grids as problem-solving mechanisms, have requirements in terms of provenance. The provenance of some data is the documentation of process that led to the data; its necessity is apparent in fields ranging from medicine to aerospace. To support provenance capture in Grids, we have developed an implementation-independent protocol for the recording of provenance. We describe the protocol in the context of a service-oriented architecture and formalise the entities involved using an abstract state machine or a three-dimensional state transition diagram. Using these techniques we sketch a liveness property for the system.}, pind = {EZ~03~03~04}, pasoa = {yes}, } @INPROCEEDINGS{Ruth2004, author = {P. Ruth and D. Xu and B. K. Bhargava and F. Regnier}, title = {E-notebook Middleware for Acccountability and Reputation Based Trust in Distributed Data Sharing Communities}, booktitle = {Proc. 2nd Int. Conf. 
on Trust Management, Oxford, UK}, year = {2004}, volume = {2995}, series = {LNCS}, publisher = {Springer}, issn = {0302-9743}, isbn = {3-540-21312-0}, local-url = {/Users/pgroth/Desktop/phd/outside%20documents/organized/enotebook.pdf}, abstract = {This paper presents the design of a new middleware which provides support for trust and accountability in distributed data sharing communities. One application is in the context of scientific collaborations. Multiple researchers share individually collected data, who in turn create new data sets by performing transformations on existing shared data sets. In data sharing communities building trust for the data obtained from others is crucial. However, the field of data provenance does not consider malicious or untrustworthy users. By adding accountability to the provenance of each data set, this middleware ensures data integrity insofar as any errors can be identified and corrected. The user is further protected from faulty data by a trust view created from past experiences and second-hand recommendations. A trust view is based on real world social interactions and reflects each user's own experiences within the community. By identifying the providers of faulty data and removing them from a trust view, the integrity of all data is enhanced.}, } @PHDTHESIS{Tan2004, author = {V. H. K. Tan}, title = {Interaction tracing for mobile agent security}, school = {University of Southampton}, year = {2004}, owner = {pgroth}, abstract = {This thesis develops a new technique, interaction tracing, to address the security issue of protecting mobile agents from potentially malicious hosts.
In this technique, a mobile agent is modeled as a black box whose behaviour can be captured through a trace of its inputs and outputs during the process of execution. Formalization of the activity of creating and verifying traces is detailed for a simple agent programming language using operational semantics. An interaction protocol is developed to enable secure exchange of traces between entities in the system that are responsible for verifying the validity of the traces. This protocol is formally modeled and verified for specific security properties using a finite-state model checker. The protocol is extended to allow for the activity of trace reconciliation, which protects inter-agent communication between mobile agents operating in a multi-agent context. Implementation of this secure protocol in conjunction with the interaction tracing activity is undertaken in a mobile agent framework and is quantitatively evaluated against a non-secure mobile agent system and standard client-server approach. 
A trust model is introduced in the context of the protocol that allows trust relationships to be formed between the various entities in the system, permitting a more flexible deployment of the protocol.}, } @INPROCEEDINGS{Zhao2004, author = {Yong Zhao and Michael Wilde and Ian Foster and Jens Voeckler and Thomas Jordan and Elizabeth Quigg and James Dobson}, title = {Grid middleware services for virtual data discovery, composition, and integration}, booktitle = {Proceedings of the 2nd workshop on Middleware for grid computing}, year = {2004}, pages = {57--62}, address = {New York, NY, USA}, publisher = {ACM Press}, doi = {http://doi.acm.org/10.1145/1028493.1028503}, isbn = {1-58113-950-0}, abstract = {We describe the services, architecture and application of the GriPhyN Virtual Data System, a suite of components and services that allow users to describe virtual data products in declarative terms, discover definitions and assemble workflows based on those definitions, and execute the resulting workflows on Grid resources. We show how these middleware-level services have been applied by specific communities to manage scientific data and workflows. In particular, we highlight and introduce Chiron, a portal facility that enables the interactive use of the virtual data system. Chiron has been used within the QuarkNet education project and as an online "educator" for virtual data applications. We also present applications from functional MRI-based neuroscience research.}, location = {Toronto, Ontario, Canada}, } @INPROCEEDINGS{Ledlie2005, author = {Jonathan Ledlie and Chaki Ng and David A. Holland and Kiran-Kumar Muniswamy-Reddy and Uri Braun and Margo Seltzer}, title = {Provenance-Aware Sensor Data Storage}, booktitle = {NetDB 2005}, year = {2005}, month = {April}, abstract = {Sensor network data has both historical and realtime value. Making historical sensor data useful, in particular, requires storage, naming, and indexing. 
Sensor data presents new challenges in these areas. Such data is location-specific but also distributed; it is collected in a particular physical location and may be most useful there, but it has additional value when combined with other sensor data collections in a larger distributed system. Thus, arranging location-sensitive peer-to-peer storage is one challenge. Sensor data sets do not have obvious names, so naming them in a globally useful fashion is another challenge. The last challenge arises from the need to index these sensor data sets to make them searchable. The key to sensor data identity is provenance, the full history or lineage of the data. We show how provenance addresses the naming and indexing issues and then present a research agenda for constructing distributed, indexed repositories of sensor data.}, } @INPROCEEDINGS{Townend2005, author = {Paul Townend and Paul Groth and Jie Xu}, title = {A Provenance-Aware Weighted Fault Tolerance Scheme for Service-Based Applications}, booktitle = {Proc. of the 8th IEEE International Symposium on Object-oriented Real-time distributed Computing (ISORC 2005)}, year = {2005}, month = {May}, abstract = {Service-orientation has been proposed as a way of facilitating the development and integration of increasingly complex and heterogeneous system components. However, there are many new challenges to the dependability community in this new paradigm, such as how individual channels within fault-tolerant systems may invoke common services as part of their workflow, thus increasing the potential for common-mode failure. We propose a scheme that - for the first time - links the technique of provenance with that of multi-version fault tolerance. We implement a large test system and perform experiments with a single-version system, a traditional MVD system, and a provenance-aware MVD system, and compare their results.
We show that for this experiment, our provenance-aware scheme results in a much more dependable system than either of the other systems tested, whilst imposing a negligible timing overhead.}, owner = {pgroth}, } @INPROCEEDINGS{Widom2005, author = {J. Widom}, title = {Trio: a system for integrated management of data, accuracy, and lineage}, booktitle = {Second Biennial Conference on Innovative Data Systems Research (CIDR 2005)}, year = {2005}, address = {Asilomar, Calif.}, month = {January}, abstract = {Trio is a new database system that manages not only data, but also the accuracy and lineage of the data. Inexact (uncertain, probabilistic, fuzzy, approximate, incomplete, and imprecise!) databases have been proposed in the past, and the lineage problem also has been studied. The goals of the Trio project are to combine and distill previous work into a simple and usable model, design a query language as an understandable extension to SQL, and most importantly build a working system?a system that augments conventional data management with both accuracy and lineage as an integral part of the data. This paper provides numerous motivating applications for Trio and lays out preliminary plans for the data model, query language, and prototype system.}, owner = {pgroth}, }