This file was created with JabRef 1.4. Encoding: ISO8859_1 @ARTICLE{Becker1988, author = {R. A. Becker and J. M. Chambers, J. M.}, title = {Auditing of data analyses}, journal = {SIAM Journal of Scientific and Statistical Computing}, year = {1988}, volume = {9}, number = {4}, pages = {747-760}, owner = {pgroth}, abstract = {The AUDIT utility allows the user to review what has happened during a set of S-PLUS sessions. When the AUDIT utility is invoked, it reads the default audit file (or auditfile if given) finding all top-level expressions, and which objects were read and written by each expression. It then allows the user (through an arcane syntax) to inquire about which expressions read or wrote a specific object, to backtrack from a specific expression, or to create a source file that will recreate an expression. }, } @ARTICLE{Lanter1991a, author = {D.P. Lanter}, title = {Design of a Lineage-Based Meta-Data Base for GIS}, journal = {Cartography and Geographic Information Systems}, year = {1991}, volume = {18}, number = {4}, pages = {255-261}, abstract = {This paper presents the conceptual design of a meta-database system for documenting data sources and GIS transformations applied to derive cartographic products. Artificial intelligence techniques of semantic networks are used to organize input-output relationships between map layers and frames to organize lineage attributes characterizing source, intermediate, and product layers. An illustrative example indicates that a lineage meta-database enables GIS users to engage in source assessment throughout their analysis of spatial data sets.}, owner = {pgroth}, comment = {This paper is found on page 16 of Lanter1991}, } @TECHREPORT{Lanter1991, author = {D.P. 
Lanter}, title = {Lineage in GIS: The Problem and a Solution}, institution = {National Center for Geographic Information and Analysis (NCGIA), UCSB}, year = {1991}, number = {90-6}, address = {Santa Barbara, CA}, volume = {18}, owner = {pgroth}, pages = {255-261}, abstract = {This paper focuses attention on a fundamental geographic structure: the GIS application. Lineage documentation specifies an application's source data, transformations, and input/output specifications. Such information is inherently causal, communicating the theory embodied in a GIS application and the meaning of its product. A number of techniques for automating lineage information are examined. None are found to be capable of documenting data lineage.}, journal = {Cartography and Geographic Information Systems}, } @TECHREPORT{Lanter1991b, author = {D.P. Lanter and R. Essinger}, title = {User-Centered Graphical User Interface Design for GIS}, institution = {National Center for Geographic Information and Analysis (NCGIA). UCSB}, year = {1991}, number = {91-6}, owner = {pgroth}, } @INPROCEEDINGS{Alonso1993, author = {G. Alonso and A. El Abbadi}, title = {GOOSE: Geographic Object Oriented Support Environment}, booktitle = {Proc. of the ACM workshop on Advances in Geographic Information Systems}, year = {1993}, pages = {38-49}, address = {Arlington, Virginia}, month = {November}, owner = {pgroth}, } @INPROCEEDINGS{Alonso1997, author = {G. Alonso and C. Hagen}, title = {Geo-Opera: Workflow Concepts for Spatial Processes}, booktitle = {Proc. 5th Intl. Symposium on Spatial Databases (SSD '97)}, year = {1997}, address = {Berlin, Germany}, month = {June}, abstract = {A Process Support System provides the tools and mechanisms necessary to define, implement and control processes, i.e., complex sequences of program invocations and data exchanges. 
Due to the generality of the notion of process and the high demand for the functionality they provide, process support systems are starting to be used in a variety of application areas, from business re-engineering to experiment management. In particular, recent results have shown the advantages of using such systems in scientific applications and the work reported in this paper is to be interpreted as one more step in that direction. The paper describes Geo-Opera, a process support system tailored to spatial modeling and GIS engineering. Geo-Opera facilitates the task of coordinating and managing the development and execution of large, computer-based geographic models. It provides a flexible environment for experiment management, incorporating many characteristics of workflow management systems as well as a simple but expressive process modeling language, exception handling, and data and metadata indexing and querying capabilities. }, owner = {pgroth}, } @INPROCEEDINGS{Woodruff1997, author = {A. Woodruff and M. Stonebraker}, title = {Supporting Fine-grained Data Lineage in a Database Visualization Environment}, booktitle = {Proc. of the 13th International Conference on Data Engineering}, year = {1997}, pages = {91-102}, address = {Birmingham, England}, month = {April}, owner = {pgroth}, citeseerurl = {citeseer.ist.psu.edu/article/woodruff97supporting.html}, } @INPROCEEDINGS{Vahdat1998, author = {A. Vahdat and T. Anderson}, title = {Transparent Result Caching}, booktitle = {Proc. 
of the 1998 USENIX Technical Conference}, year = {1998}, address = {New Orleans, Louisiana}, month = {June}, owner = {pgroth}, citeseerurl = {http://citeseer.ist.psu.edu/vahdat98transparent.html}, } @PHDTHESIS{Woodruff1998, author = {Allison Gyle Woodruff}, title = {Data Lineage and Information Density in Database Visualization}, school = {University of California at Berkeley}, year = {1998}, owner = {pgroth}, url = {http://db.cs.berkeley.edu/papers/UCB-PhD-woodruff.pdf}, } @INPROCEEDINGS{Buneman2000, author = {P. Buneman and S. Khanna and W.C. Tan}, title = {Data Provenance: Some Basic Issues}, booktitle = {Foundations of Software Technology and Theoretical Computer Science}, year = {2000}, abstract = {The ease with which one can copy and transform data on the Web, has made it increasingly dificult to determine the origins of a piece of data. We use the term data provenance to refer to the process of tracing and recording the origins of data and its movement between databases. Provenance is now an acute issue in scientific databases where it is central to the validation of data. In this paper we discuss some of the technical issues that have emerged in an initial exploration of the topic}, local-url = {/Users/pgroth/Desktop/phd/outside%20documents/organized/data_provenance_some_basic_issues.pdf}, } @INPROCEEDINGS{Cui2000a, author = {Y. Cui and J. Widom}, title = {Practical Lineage Tracing in Data Warehouses}, booktitle = {Proceedings of the 16th International Conference on Data Engineering (ICDE'00)}, year = {2000}, address = {San Diego, California}, month = {February}, owner = {pgroth}, abstract = {We consider the view data lineage problem in a warehousing environment: For a given data item in a materialized warehouse view, we want to identify the set of source data items that produced the view item. We formalize the problem, and we present a lineage tracing algorithm for relational views with aggregation. 
Based on our tracing algorithm, we propose a number of schemes for storing auxiliary views that enable consistent and efficient lineage tracing in a multi-source data warehouse. We report on a performance study of the various schemes, identifying which schemes perform best in which settings. Based on our results, we have implemented a lineage tracing package in the WHIPS data warehousing system prototype at Stanford. With this package, users can select view tuples of interest, then efficiently ``drill through'' to examine the exact source tuples that produced the view tuples of interest.}, keywords = {Data Warehousing}, url = {http://dbpubs.stanford.edu:8090/pub/1999-55}, } @ARTICLE{Cui2000, author = {Y. Cui and J. Widom and J. L. Wiener}, title = {Tracing the lineage of view data in a warehousing environment}, journal = {ACM Trans. Database Syst.}, year = {2000}, volume = {25}, number = {2}, pages = {179--227}, issn = {0362-5915}, doi = {http://doi.acm.org/10.1145/357775.357777}, publisher = {ACM Press}, } @INPROCEEDINGS{Buneman2001, author = {P. Buneman and S. Khanna and W.C. Tan}, title = {Why and Where: A Characterization of Data Provenance}, booktitle = {Int. Conf. on Databases Theory (ICDT)}, year = {2001}, abstract = {With the proliferation of database views and curated data- bases, the issue of data provenance (where a piece of data came from and the process by which it arrived in the database) is becoming increasingly important, especially in scientic databases where understanding provenance is crucial to the accuracy and currency of data. In this pa- per we describe an approach to computing provenance when the data of interest has been created by a database query. We adopt a syntactic approach and present results for a general data model that applies to re- lational databases as well as to hierarchical data such as XML. 
A novel aspect of our work is a distinction between "why" provenance (refers to the source data that had some influence on the existence of the data) and "where" provenance (refers to the location(s) in the source databases from which the data was extracted).}, local-url = {/Users/pgroth/Desktop/phd/outside%20documents/organized/why_and_where_a_characterization_of_data_provenance.pdf}, } @PHDTHESIS{Cui2001, author = {Y. Cui}, title = {Lineage Tracing in Data Warehouses}, school = {Stanford University}, year = {2001}, month = {December}, abstract = {Data warehousing systems collect data from multiple distributed data sources and store integrated and summarized information in local databases for efficient data analysis and mining. Sometimes, when analyzing data at a warehouse, it is useful to ?drill down? and investigate the source data from which certain warehouse data was derived. For a given warehouse data item, identifying the exact set of source data items that produced the warehouse data item is termed the data lineage problem. This thesis presents our research results on tracing data lineage in a warehousing environment: -Formal definitions of data lineage for data warehouses defined as relational materialized views over relational sources, and for warehouses defined using graphs of general data transformations. - Algorithms for lineage tracing, again considering both relational and transformational warehouses, along with a suite of optimization techniques. - Performance evaluations through simulations, and a lineage tracing prototype developed within the WHIPS (WareHousing Information Processing System) project at Stanford. - Applying data lineage techniques to obtain improved algorithms for the well-known database view update problem. }, owner = {pgroth}, } @INPROCEEDINGS{Foster2001, author = {I. Foster and E. Alpert and A. Chervenak and B. Drach and C. Kesselman and V. Nefedova and D. Middleton and A. Shoshani and A. Sim and D. Williams. 
}, title = {The Earth System Grid II: Turning Climate Datasets Into Community Resources.}, booktitle = {Proc. of the American Meterologcal Society Conference}, year = {2001}, owner = {pgroth}, } @INPROCEEDINGS{Foster2001a, author = {I. Foster and C. Kesselman and S. Tuecke}, title = {The Anatomy of the Grid: Enabling Scalable Virtual Organizations}, booktitle = {Int. J. Supercomputer Applications}, year = {2001}, pages = {15-18}, abstract = { computing has emerged as an important new field, distinguished from conventional distributed computing by its focus on large-scale resource sharing, innovative applications, and, in some cases, high-performance orientation. In this article, we define this new field. First, we review theGrid problem, which we define as flexible, secure, coordinated resource sharing among dynamic collections of individuals, institutions, and resources what we refer to as virtual organizations. In such settings, we encounter unique authentication, authorization, resource access, resource discovery, and other challenges. It is this class of problem that is addressed by Grid technologies. Next, we present an extensible and open Grid architecture, in which protocols, services, application programming interfaces, and software development kits are categorized according to their roles in enabling resource sharing. We describe requirements that we believe any such mechanisms must satisfy and we discuss the importance of defining a compact set of intergrid protocols to enable interoperability among different Grid systems. Finally, we discuss how Grid technologies relate to other contemporary technologies, including enterprise integration, application service provider, storage service provider, and peer-to-peer computing. 
We maintain that Grid concepts and technologies complement and have much to contribute to these other approaches.}, local-url = {/Users/pgroth/Desktop/phd/outside%20documents/organized/anatomy.pdf}, } @INPROCEEDINGS{Frew2001, author = {J. Frew and R. Bose}, title = {Earth System Science Workbench: A Data Management Infrastructure for Earth Science Products}, booktitle = {Proceedings of the 13th International Conference on Scientific and Statistical Database Management}, year = {2001}, pages = {180-189}, address = {Fairfax, VA}, month = {July}, abstract = {The Earth System Science Workbench (ESSW) is a nonintrusive data management infrastructure for researchers who must also be data publishers. An implementation of ESSW to track the processing of locally received satellite imagery is presented, demonstrating the Workbench?s transparent and robust support for archiving and publishing data products. ESSW features a Lab Notebook metadata service, a No Duplicate-Write Once Read Many (ND-WORM) storage service, and Web user interface tools. The Lab Notebook logs processes (experiments) and their relationships via a custom API to XML documents stored in a relational database. The NDWORM provides a managed storage archive for the Lab Notebook by keeping unique file digests and namespace metadata, also in a relational database. ESSW Notebook tools allow product searching and ordering, and file and metadata management.}, owner = {pgroth}, } @ARTICLE{Marathe2001, author = {A. P. Marathe}, title = {Tracing Lineage of Array Data}, journal = {J. Intell. Inf. Syst.}, year = {2001}, volume = {17}, number = {2-3}, pages = {193--214}, issn = {0925-9902}, abstract = {Arrays are a common and important class of data in many applications. Arrays can model data such as digital images, digital video, scientific and experimental data, matrices, and finite element grids. Although array manipulations are diverse and domain-specific, they often exhibit structural regularities. 
This paper describes an algorithm called sub-pushdown to trace data lineage in such array computations. Lineage tracing is a type of data-flow analysis that relates parts of a result array to those parts of the argument (base) arrays that have bearings on the result array parts. Sub-pushdown can be used to trace data lineage in array-manipulating computations expressed in the Array Manipulation Language (AML) that was introduced previously. Sub-pushdown has several useful features. First, the lineage computation is expressed as an AML query. Second, it is not necessary to evaluate the AML lineage query to compute the array data lineage. Third, sub-pushdown never gives false-negative answers. Sub-pushdown has been implemented as part of the ArrayDB prototype array database system that we have built. }, publisher = {Kluwer Academic Publishers}, } @INPROCEEDINGS{Bose2002, author = {R. Bose}, title = {A Conceptual Framework for Composing and Managing Scientific Data Lineage}, booktitle = {Proceedings of the 14th International Conference on Scientific and Statistical Database Management}, year = {2002}, pages = {15-19}, address = {Edinburgh, Scotland}, month = {July}, abstract = {Scientific research relies as much on the dissemination and exchange of data sets as on the publication of conclusions. Accurately tracking the lineage (origin and subsequent processing history) of scientific data sets is thus imperative for the complete documentation of scientific work. However, the lack of a definitive data model for lineage, and the poor fit between current data management tools and scientific software, effectively prevent researchers from determining, preserving, or providing the lineage of the data products they use and create. Based on a comprehensive review of lineagerelated research and previous prototype systems, a conceptual framework is presented to help identify and assess basic lineage system components. 
Within this framework, a direction is outlined for future work on general methods for composing and managing lineage for scientific data.}, owner = {pgroth}, } @INPROCEEDINGS{Buneman2002, author = {P. Buneman and S. Khanna and K.Tajima and W.C. Tan}, title = {Archiving scientific data}, booktitle = {Proc. of the 2002 ACM SIGMOD International Conference on Management of Data}, year = {2002}, pages = {1--12}, publisher = {ACM Press}, doi = {http://doi.acm.org/10.1145/564691.564693}, isbn = {1-58113-497-5}, location = {Madison, Wisconsin}, } @INPROCEEDINGS{Eder2002, author = {J. Eder and G. E. Olivotto and W. Gruber}, title = {A Data Warehouse for Workflow Logs}, booktitle = {Engineering and Deployment of Cooperative Information Systems: First Int. Conf., EDCIS 2002}, year = {2002}, editor = {Y.Han and S.Tai and D.Wikarski}, month = {September}, publisher = {Springer}, issn = {0302-9743}, abstract = {Workflow Logs provide a very valuable source of information about the actual execution of business processes in organizations. We propose to use data warehouse technology to exploit this information resources for organizational developments, monitoring and process im- provements. We introduce a general data warehouse design for workflow warehouses and discuss the results from an industrial case study showing the validity of this approach.}, local-url = {/Users/pgroth/Desktop/phd/outside%20documents/organized/datawarehouse_for_workflow_logs.pdf}, } @INPROCEEDINGS{Foster2002, author = {I. Foster and J. Voeckler and M. Wilde and Y.Zhao}, title = {Chimera: A Virtual Data System for Representing, Querying and Automating Data Derivation}, booktitle = {Proc. of the 14th Conf. on Scientific and Statistical Database Management}, year = {2002}, month = {July}, } @INPROCEEDINGS{Goble2002, author = {C. 
Goble}, title = {Position Statement: Musings on provenance, workflow and (semantic web) annotations for bioinformatics.}, booktitle = {Workshop on Data Provenance and Derivation }, year = {2002}, month = {October}, owner = {pgroth}, } @PROCEEDINGS{ProvWorkshop2002, title = {Data Provenance/Derivation Workshop}, year = {2002}, month = {October}, url = {http://people.cs.uchicago.edu/~yongzh/position_papers.html}, owner = {pgroth}, } @ARTICLE{Cui2003, author = {Y. Cui and J. Widom}, title = {Lineage tracing for general data warehouse transformations}, journal = {The VLDB Journal}, year = {2003}, volume = {12}, number = {1}, pages = {41--58}, issn = {1066-8888}, doi = {http://dx.doi.org/10.1007/s00778-002-0083-8}, owner = {pgroth}, publisher = {Springer-Verlag New York, Inc.}, abstract = {Data warehousing systems integrate information from operational data sources into a central repository to enable analysis and mining of the integrated information. During the integration process, source data typically undergoes a series of transformations, which may vary from simple algebraic operations or aggregations to complex ?data cleansing? procedures. In a warehousing environment, the data lineage problem is that of tracing warehouse data items back to the original source items from which they were derived. We formally define the lineage tracing problem in the presence of general data warehouse transformations, and we present algorithms for lineage tracing in this environment. Our tracing procedures take advantage of known structure or properties of transformations when present, but also work in the absence of such information. Our results can be used as the basis for a lineage tracing tool in a general warehousing setting, and also can guide the design of data warehouses that enable efficient lineage tracing.}, } @INCOLLECTION{Fan2003, author = {H. Fan and A. 
Poulovassilis}, title = {Tracing data lineage using schema transformation pathways}, booktitle = {Knowledge transformation for the Semantic Web}, publisher = {IOS Press}, year = {2003}, editor = {B. Omelayenko and M. Klein}, pages = {64-79}, owner = {pgroth}, __markedentry = {0}, } @INPROCEEDINGS{Foster2003, author = {I. Foster and J. Vockler and M. Wilde and Y. Zhao}, title = {The virtual data grid: A new model and architecture for data-intensive collaboration}, booktitle = {In Proc. of the CIDR 2003 First Biennial Conference on Innovative Data Systems Research}, year = {2003}, month = {January}, abstract = {It is now common to encounter communities engaged in the collaborative analysis and transformation of large quantities of data over extended time periods. We argue that these communities require a scalable system for managing, tracing, communicating, and exploring the derivation and analysis of diverse data objects. Such a system could bring significant productivity increases, facilitating discovery, understanding, assessment, and sharing of both data and transformation resources, as well as the productive use of distributed resources for computation, storage, and collaboration. We define a model and architecture for a virtual data grid to address this requirement. Using a broadly applicable ?typed dataset? as the unit of derivation tracking, we introduce simple constructs for describing how datasets are derived from transformations and from other datasets. We also define mechanisms for integrating with, and adapting to, existing data management systems and transformation and analysis tools, as well as Grid mechanisms for distributed resource management and computation planning. 
We report on successful application results obtained with a prototype system called Chimera that implements some of these concepts, involving challenging analyses of high-energy physics and astronomy data.}, owner = {pgroth}, citeseerurl = {http://citeseer.ist.psu.edu/foster03virtual.html}, } @INPROCEEDINGS{Greenwood2003, author = {M. Greenwood and C. Goble and R. Stevens and J. Zhao and M. Addis and D. Marvin and L. Moreau and T. Oinn}, title = {Provenance of e-Science Experiments - experience from Bioinformatics}, booktitle = {Proc. UK e-Science All Hands Meeting 2003}, year = {2003}, editor = {Simon J Cox}, pages = {223--226}, month = {September}, abstract = {Like experiments performed at a laboratory bench, the data associated with an e-Science experiment are of reduced value if other scientists are not able to identify the origin, or provenance, of those data. Provenance information is essential if experiments are to be validated and verified by others, or even by those who originally performed them. In this article, we give an overview of our initial work on the provenance of bioinformatics e-Science experiments within myGrid. We use two kinds of provenance: the derivation path of information and annotation. We show how this kind of provenance can be delivered within the myGrid demonstrator WorkBench and we explore how the resulting Webs of experimental data holdings can be mined for useful information and presentations for the e-Scientist. }, isbn = {1-904425-11-9}, local-url = {/Users/pgroth/Desktop/phd/outside%20documents/organized/prov_of_eScience_Experiments_Experience_Bioinformatics.pdf}, } @INPROCEEDINGS{Myers2003a, author = {J. D. Myers and C. Pancerella and C. Lansing and K. L. Schuchardt and B. 
Didier}, title = {Multi-scale science: supporting emerging practice with semantically derived provenance}, booktitle = {ISWC 2003 Workshop: Semantic Web Technologies for Searching and Retrieving Scientific Data}, year = {2003}, address = {Sanibel Island, Florida, USA}, month = {October }, owner = {pgroth}, } @ARTICLE{Myers2003, author = {J.D. Myers and A.R. Chappell and M. Elder and A. Geist and J. Schwidder}, title = {Re-integrating the research record}, journal = { IEEE Computing in Science \& Engineering}, year = {2003}, pages = {44-50}, owner = {pgroth}, } @ARTICLE{Silva2003, author = {P. P. da Silva and D. L. McGuinness and R. McCool}, title = {Knowledge Provenance Infrastructure}, journal = {Data Engineering Bulletin}, year = {2003}, volume = {26}, number = {4}, pages = {26-32}, month = {December}, abstract = {The web lacks support for explaining information provenance. When web applications return answers, many users do not know what information sources were used, when they were updated, how reliable the source was, or what information was looked up versus derived. Support for information provenance is expected to be a harder problem in the Semantic Web where more answers result from some maniputiaton of information (instead of simple retrieval of information). Manipulation includes, among other things, retrieving, matching, aggregating, filtering, and deriving information possibly from multiple sources. This article defines a broad notion of information provenance called knowledge provenance that includes proof-like information on how a question answering system arrived at its answer(s). The article also describes an approach for a knowledge provenance infrastructure supporting the extraction, maintenance, and usage of knowledge provenance related to answers of web applications and services. }, owner = {pgroth}, } @INPROCEEDINGS{Szomszor2003, author = {M. Szomszor and L. 
Moreau}, title = {Recording and Reasoning over Data Provenance in Web and Grid Services}, booktitle = {Int. Conf. on Ontologies, Databases and Applications of Semantics}, year = {2003}, volume = {2888}, series = {LNCS}, issn = {0302-9743}, provenance = {yes}, pind = {EZ~02~02~04}, isbn = {3-540-20498-9}, local-url = {/Users/pgroth/Desktop/phd/outside%20documents/organized/recording_reasoning_over_data_provenance.pdf}, export = {yes}, abstract = {Large-scale, dynamic and open environments such as the Grid and Web Services build upon existing computing infrastructures to supply dependable and consistent large-scale computational systems. This kind of architecture has been adopted by the business and scientic communities allowing them to exploit extensive and diverse computing resources to perform complex data processing tasks. In such systems, results are often derived by composing multiple, geographically distributed, heterogeneous services as specified by intricate workflow management. This leads to the undesirable situation where the results are known, but the means by which they were achieved is not. With both scientific experiments and business transactions, the notion of lineage and dataset derivation is of paramount importance since without it, information is potentially worthless. We address the issue of data provenance, the description of the origin of a piece of data, in these environments showing the requirements, uses and implementation difficulties. We propose an infrastructure level support for a provenance recording capability for service-oriented architectures such as the Grid and Web Services. We also developed services to view and retrieve provenance and we provide a mechanism by which provenance is used to determine whether previous computed results are still up to date.}, mygrid = {yes}, pagecount = {18}, } @INPROCEEDINGS{Zhao2003, author = {J. Zhao and C. Goble and M. Greenwood and C. Wroe and R. 
Stevens}, title = {Annotating, linking and browsing provenance logs for e-Science}, booktitle = {Proc. of the Workshop on Semantic Web Technologies for Searching and Retrieving Scientific Data}, year = {2003}, month = {October}, abstract = {Like experiments performed at a laboratory bench, the results of an e-science in silico experiment are of limited value if other scientists are not able to identify the origin, or provenance, of those results. For e-Science, we need more systematic provenance logs across a range of e- Science activities and disciplines as well as a more informed understanding of the information in these provenance data. Semantic Web technology, which enables data to be linked and defined in a way for more effective discovery, integration and cooperation across computers and people, provides an appropriate solution for our current requirement. In this paper we show how we used the COHSE conceptual open hypermedia system to build a dynamically generated hypertext of web of provenance documents arising from the myGrid project based on associated concepts and reasoning over the ontology.}, local-url = {/Users/pgroth/Desktop/phd/outside%20documents/organized/myGridOntProv.pdf}, } @PROCEEDINGS{ProvWorkshop2003, title = {Data Provenance and Annotation}, year = {2003}, month = {December}, owner = {pgroth}, url = {http://www.nesc.ac.uk/esi/events/304/}, } @INPROCEEDINGS{Bose2004, author = {R. Bose and J. Frew }, title = {Composing lineage metadata with XML for custom satellite-derived data products}, booktitle = {16th International Conference on Scientific and Statistical Database Management}, year = {2004}, pages = {275 - 284}, month = {June}, doi = {10.1109/SSDM.2004.1311219 }, abstract = {As peer-to-peer dissemination of custom data products evolves among Earth science research groups, investigators and data managers must consider how to compose appropriate metadata for their research computing activities. 
Because workflows may span multiple groups, it is critical that lineage (provenance) metadata also be assembled to document and preserve the origins and processing history of constituent data products and transformations for future data consumers. To demonstrate methods for composing lineage metadata for custom processing, we introduce our terminology for workflow and employ a case study for the creation of satellite-derived ocean color data products. Our example contributes to a general metadata model for workflow that incorporates lineage. We then discuss metadata requirements for remote sensing-related data products. We propose two techniques for composing lineage metadata, both based on accessory XML metadata documents that are paired with the data products and versioned data transformations they describe. The first technique, implemented as a prototype, features a dedicated lineage server that introduces the indirection and flexibility necessary for Web-based lineage navigation. The second, more promising technique involves defining a simple Resource Description Framework (RDF) vocabulary for lineage metadata, and using extant RDF/XML tools for query and navigation. These methods provide guidelines for composing lineage metadata that are applicable to other domains. }, owner = {pgroth}, } @INPROCEEDINGS{Groth2004, author = {P. Groth and M. Luck and L. Moreau}, title = {Formalising a protocol for recording provenance in Grids}, booktitle = {Proc. of the UK OST e-Science second All Hands Meeting 2004 (AHM'04)}, year = {2004}, address = {Nottingham, UK}, month = {September}, pind = {EZ~03~03~04}, export = {yes}, abstract = {Both the scientific and business communities are beginning to rely on Grids as problemsolving mechanisms. These communities also have requirements in terms of provenance. Provenance is the documentation of process and the necessity for it is apparent in fields ranging from medicine to aerospace. 
To support provenance capture in Grids, we have developed an implementation-independent protocol for the recording of provenance. We describe the protocol in the context of a service-oriented architecture and formalise the entities involved using an abstract state machine or a three-dimensional state transition diagram. Using these techniques we sketch a liveness property for the system.}, url = {http://www.ecs.soton.ac.uk/~lavm/papers/ahm04-groth.pdf}, pasoa = {yes}, pagecount = {8}, } @INPROCEEDINGS{Groth2004a, author = {Paul Groth and Michael Luck and Luc Moreau}, title = {A protocol for recording provenance in service-oriented Grids}, booktitle = {Proceedings of the 8th International Conference on Principles of Distributed Systems (OPODIS'04)}, year = {2004}, address = {Grenoble, France}, month = DEC, export = {yes}, abstract = {Both the scientific and business communities, which are beginning to rely on Grids as problem-solving mechanisms, have requirements in terms of provenance. The provenance of some data is the documentation of process that led to the data; its necessity is apparent in fields ranging from medicine to aerospace. To support provenance capture in Grids, we have developed an implementation-independent protocol for the recording of provenance. We describe the protocol in the context of a service-oriented architecture and formalise the entities involved using an abstract state machine or a three-dimensional state transition diagram. Using these techniques we sketch a liveness property for the system.}, pind = {EZ~03~03~04}, pasoa = {yes}, } @INPROCEEDINGS{Ruth2004, author = {P. Ruth and D. Xu and B. K. Bhargava and F. Regnier}, title = {E-notebook Middleware for Acccountability and Reputation Based Trust in Distributed Data Sharing Communities}, booktitle = {Proc. 2nd Int. Conf. 
on Trust Management, Oxford, UK}, year = {2004}, volume = {2995}, series = {LNCS}, publisher = {Springer}, issn = {0302-9743}, isbn = {3-540-21312-0}, local-url = {/Users/pgroth/Desktop/phd/outside%20documents/organized/enotebook.pdf}, abstract = {This paper presents the design of a new middleware which provides support for trust and accountability in distributed data sharing communities. One application is in the context of scientific collaborations. Multiple researchers share individually collected data, who in turn create new data sets by performing transformations on existing shared data sets. In data sharing communities building trust for the data obtained from others is crucial. However, the field of data provenance does not consider malicious or untrustworthy users. By adding accountability to the provenance of each data set, this middleware ensures data integrity insofar as any errors can be identified and corrected. The user is further protected from faulty data by a trust view created from past experiences and second-hand recommendations. A trust view is based on real world social interactions and reflects each user's own experiences within the community. By identifying the providers of faulty data and removing them from a trust view, the integrity of all data is enhanced.}, } @PHDTHESIS{Tan2004, author = {V. H. K. Tan}, title = {Interaction tracing for mobile agent security}, school = {University of Southampton}, year = {2004}, owner = {pgroth}, abstract = {This thesis develops a new technique, interaction tracing, to address the security issue of protecting mobile agents from potentially malicious hosts.
In this technique, a mobile agent is modeled as a black box whose behaviour can be captured through a trace of its inputs and outputs during the process of execution. Formalization of the activity of creating and verifying traces is detailed for a simple agent programming language using operational semantics. An interaction protocol is developed to enable secure exchange of traces between entities in the system that are responsible for verifying the validity of the traces. This protocol is formally modeled and verified for specific security properties using a finite-state model checker. The protocol is extended to allow for the activity of trace reconciliation, which protects inter-agent communication between mobile agents operating in a multi-agent context. Implementation of this secure protocol in conjunction with the interaction tracing activity is undertaken in a mobile agent framework and is quantitatively evaluated against a non-secure mobile agent system and standard client-server approach. 
A trust model is introduced in the context of the protocol that allows trust relationships to be formed between the various entities in the system, permitting a more flexible deployment of the protocol.}, } @INPROCEEDINGS{Zhao2004, author = {Yong Zhao and Michael Wilde and Ian Foster and Jens Voeckler and Thomas Jordan and Elizabeth Quigg and James Dobson}, title = {Grid middleware services for virtual data discovery, composition, and integration}, booktitle = {Proceedings of the 2nd workshop on Middleware for grid computing}, year = {2004}, pages = {57--62}, address = {New York, NY, USA}, publisher = {ACM Press}, doi = {http://doi.acm.org/10.1145/1028493.1028503}, isbn = {1-58113-950-0}, abstract = {We describe the services, architecture and application of the GriPhyN Virtual Data System, a suite of components and services that allow users to describe virtual data products in declarative terms, discover definitions and assemble workflows based on those definitions, and execute the resulting workflows on Grid resources. We show how these middleware-level services have been applied by specific communities to manage scientific data and workflows. In particular, we highlight and introduce Chiron, a portal facility that enables the interactive use of the virtual data system. Chiron has been used within the QuarkNet education project and as an online "educator" for virtual data applications. We also present applications from functional MRI-based neuroscience research.}, location = {Toronto, Ontario, Canada}, } @INPROCEEDINGS{Ledlie2005, author = {Jonathan Ledlie and Chaki Ng and David A. Holland and Kiran-Kumar Muniswamy-Reddy and Uri Braun and Margo Seltzer}, title = {Provenance-Aware Sensor Data Storage}, booktitle = {NetDB 2005}, year = {2005}, month = {April}, abstract = {Sensor network data has both historical and realtime value. Making historical sensor data useful, in particular, requires storage, naming, and indexing. 
Sensor data presents new challenges in these areas. Such data is location-specific but also distributed; it is collected in a particular physical location and may be most useful there, but it has additional value when combined with other sensor data collections in a larger distributed system. Thus, arranging location-sensitive peer-to-peer storage is one challenge. Sensor data sets do not have obvious names, so naming them in a globally useful fashion is another challenge. The last challenge arises from the need to index these sensor data sets to make them searchable. The key to sensor data identity is provenance, the full history or lineage of the data. We show how provenance addresses the naming and indexing issues and then present a research agenda for constructing distributed, indexed repositories of sensor data.}, } @INPROCEEDINGS{Townend2005, author = {Paul Townend and Paul Groth and Jie Xu}, title = {A Provenance-Aware Weighted Fault Tolerance Scheme for Service-Based Applications}, booktitle = {Proc. of the 8th IEEE International Symposium on Object-oriented Real-time distributed Computing (ISORC 2005)}, year = {2005}, month = {May}, abstract = {Service-orientation has been proposed as a way of facilitating the development and integration of increasingly complex and heterogeneous system components. However, there are many new challenges to the dependability community in this new paradigm, such as how individual channels within fault-tolerant systems may invoke common services as part of their workflow, thus increasing the potential for common-mode failure. We propose a scheme that - for the first time - links the technique of provenance with that of multi-version fault tolerance. We implement a large test system and perform experiments with a single-version system, a traditional MVD system, and a provenance-aware MVD system, and compare their results.
We show that for this experiment, our provenance-aware scheme results in a much more dependable system than either of the other systems tested, whilst imposing a negligible timing overhead.}, owner = {pgroth}, } @INPROCEEDINGS{Widom2005, author = {J. Widom}, title = {Trio: a system for integrated management of data, accuracy, and lineage}, booktitle = {Second Biennial Conference on Innovative Data Systems Research (CIDR 2005)}, year = {2005}, address = {Asilomar, Calif.}, month = {January}, abstract = {Trio is a new database system that manages not only data, but also the accuracy and lineage of the data. Inexact (uncertain, probabilistic, fuzzy, approximate, incomplete, and imprecise!) databases have been proposed in the past, and the lineage problem also has been studied. The goals of the Trio project are to combine and distill previous work into a simple and usable model, design a query language as an understandable extension to SQL, and most importantly build a working system?a system that augments conventional data management with both accuracy and lineage as an integral part of the data. This paper provides numerous motivating applications for Trio and lays out preliminary plans for the data model, query language, and prototype system.}, owner = {pgroth}, }