This file was created with JabRef 1.4.
Encoding: ISO8859_1
@ARTICLE{Becker1988,
  author = {Becker, R. A. and Chambers, J. M.},
  title = {Auditing of data analyses},
  journal = {SIAM Journal on Scientific and Statistical Computing},
  year = {1988},
  volume = {9},
  number = {4},
  pages = {747--760},
  owner = {pgroth},
  abstract = {The AUDIT utility allows the user to review what has happened during a set of S-PLUS
    sessions. When the AUDIT utility is invoked, it reads the default
    audit file (or auditfile if given) finding all top-level expressions,
    and which objects were read and written by each expression. It then
    allows the user (through an arcane syntax) to inquire about which
    expressions read or wrote a specific object, to backtrack from a specific
    expression, or to create a source file that will recreate an expression.},
}
@ARTICLE{Lanter1991a,
  author = {D.P. Lanter},
  title = {Design of a Lineage-Based Meta-Data Base for GIS},
  journal = {Cartography and Geographic Information Systems},
  year = {1991},
  volume = {18},
  number = {4},
  pages = {255--261},
  abstract = {This paper presents the conceptual design of a meta-database system for
    documenting data sources and GIS transformations applied to derive
    cartographic products. Artificial intelligence techniques of semantic
    networks are used to organize input-output relationships between
    map layers and frames to organize lineage attributes characterizing
    source, intermediate, and product layers. An illustrative example
    indicates that a lineage meta-database enables GIS users to engage in
    source assessment throughout their analysis of spatial data sets.},
  owner = {pgroth},
  comment = {This paper is found on page 16 of Lanter1991},
}
@TECHREPORT{Lanter1991,
  author = {D.P. Lanter},
  title = {Lineage in GIS: The Problem and a Solution},
  institution = {National Center for Geographic Information and Analysis (NCGIA), UCSB},
  year = {1991},
  number = {90-6},
  address = {Santa Barbara, CA},
  owner = {pgroth},
  abstract = {This paper focuses attention on
    a fundamental geographic structure: the GIS application. Lineage
    documentation specifies an application's source data, transformations, and
    input/output specifications. Such information is inherently causal,
    communicating the theory embodied in a GIS application and the meaning of its
    product. A number of techniques for automating lineage information are
    examined. None are found to be capable of documenting data lineage.},
}
@TECHREPORT{Lanter1991b,
  author = {D.P. Lanter and R. Essinger},
  title = {User-Centered Graphical User Interface Design for GIS},
  institution = {National Center for Geographic Information and Analysis (NCGIA), UCSB},
  year = {1991},
  number = {91-6},
  owner = {pgroth},
}
@INPROCEEDINGS{Alonso1993,
  author = {G. Alonso and A. El Abbadi},
  title = {GOOSE: Geographic Object Oriented Support Environment},
  booktitle = {Proc. of the ACM workshop on Advances in Geographic Information Systems},
  year = {1993},
  pages = {38--49},
  address = {Arlington, Virginia},
  month = nov,
  owner = {pgroth},
}
@INPROCEEDINGS{Alonso1997,
  author = {G. Alonso and C. Hagen},
  title = {Geo-Opera: Workflow Concepts for Spatial Processes},
  booktitle = {Proc. 5th Intl. Symposium on Spatial Databases (SSD '97)},
  year = {1997},
  address = {Berlin, Germany},
  month = jun,
  abstract = {A Process Support System
    provides the tools and mechanisms necessary to define, implement and
    control processes, i.e., complex sequences of program invocations and
    data exchanges. Due to the generality of the notion of process and
    the high demand for the functionality they provide, process support
    systems are starting to be used in a variety of application areas, from
    business re-engineering to experiment management. In particular,
    recent results have shown the advantages of using such systems in
    scientific applications and the work reported in this paper is to be
    interpreted as one more step in that direction. The paper describes
    Geo-Opera, a process support system tailored to spatial modeling and GIS
    engineering. Geo-Opera facilitates the task of coordinating and managing
    the development and execution of large, computer-based geographic
    models. It provides a flexible environment for experiment management,
    incorporating many characteristics of workflow management systems as
    well as a simple but expressive process modeling language, exception
    handling, and data and metadata indexing and querying capabilities.},
  owner = {pgroth},
}
@INPROCEEDINGS{Woodruff1997,
  author = {A. Woodruff and M. Stonebraker},
  title = {Supporting Fine-grained Data Lineage in a Database Visualization Environment},
  booktitle = {Proc. of the 13th International Conference on Data Engineering},
  year = {1997},
  pages = {91--102},
  address = {Birmingham, England},
  month = apr,
  owner = {pgroth},
  citeseerurl = {http://citeseer.ist.psu.edu/article/woodruff97supporting.html},
}
@INPROCEEDINGS{Vahdat1998,
  author = {A. Vahdat and T. Anderson},
  title = {Transparent Result Caching},
  booktitle = {Proc. of the 1998 USENIX Technical Conference},
  year = {1998},
  address = {New Orleans, Louisiana},
  month = jun,
  owner = {pgroth},
  citeseerurl = {http://citeseer.ist.psu.edu/vahdat98transparent.html},
}
@PHDTHESIS{Woodruff1998,
  author = {Woodruff, Allison Gyle},
  title = {Data Lineage and Information Density in Database Visualization},
  school = {University of California at Berkeley},
  year = {1998},
  url = {http://db.cs.berkeley.edu/papers/UCB-PhD-woodruff.pdf},
  owner = {pgroth},
}
@INPROCEEDINGS{Buneman2000,
  author = {P. Buneman and S. Khanna and W.C. Tan},
  title = {Data Provenance: Some Basic Issues},
  booktitle = {Foundations of Software Technology and Theoretical Computer Science},
  year = {2000},
  abstract = {The ease with which one can copy and transform data on the Web, has made it
    increasingly difficult to determine the origins of a piece of data. We
    use the term data provenance to refer to the process of tracing and
    recording the origins of data and its movement between databases.
    Provenance is now an acute issue in scientific databases where it is
    central to the validation of data. In this paper we discuss some of the
    technical issues that have emerged in an initial exploration of the topic},
  local-url = {/Users/pgroth/Desktop/phd/outside%20documents/organized/data_provenance_some_basic_issues.pdf},
}
@INPROCEEDINGS{Cui2000a,
  author = {Y. Cui and J. Widom},
  title = {Practical Lineage Tracing in Data Warehouses},
  booktitle = {Proceedings of the 16th International Conference on Data Engineering (ICDE'00)},
  year = {2000},
  address = {San Diego, California},
  month = feb,
  owner = {pgroth},
  abstract = {We consider the view data lineage
    problem in a warehousing environment: For a given data item in a
    materialized warehouse view, we want to identify the set of source data
    items that produced the view item. We formalize the problem, and
    we present a lineage tracing algorithm for relational views with
    aggregation. Based on our tracing algorithm, we propose a number
    of schemes for storing auxiliary views that enable consistent and
    efficient lineage tracing in a multi-source data warehouse. We report
    on a performance study of the various schemes, identifying which
    schemes perform best in which settings. Based on our results, we have
    implemented a lineage tracing package in the WHIPS data warehousing
    system prototype at Stanford. With this package, users can select view
    tuples of interest, then efficiently ``drill through'' to examine
    the exact source tuples that produced the view tuples of interest.},
  keywords = {Data Warehousing},
  url = {http://dbpubs.stanford.edu:8090/pub/1999-55},
}
@ARTICLE{Cui2000,
  author = {Y. Cui and J. Widom and J. L. Wiener},
  title = {Tracing the lineage of view data in a warehousing environment},
  journal = {ACM Trans. Database Syst.},
  year = {2000},
  volume = {25},
  number = {2},
  pages = {179--227},
  issn = {0362-5915},
  doi = {10.1145/357775.357777},
  publisher = {ACM Press},
}
@INPROCEEDINGS{Buneman2001,
  author = {P. Buneman and S. Khanna and W.C. Tan},
  title = {Why and Where: A Characterization of Data Provenance},
  booktitle = {Int. Conf. on Database Theory (ICDT)},
  year = {2001},
  abstract = {With the proliferation of database views and curated databases,
    the issue of data provenance (where a piece of data came from
    and the process by which it arrived in the database) is becoming
    increasingly important, especially in scientific databases where understanding
    provenance is crucial to the accuracy and currency of data. In this
    paper we describe an approach to computing provenance when the
    data of interest has been created by a database query. We adopt a
    syntactic approach and present results for a general data model that
    applies to relational databases as well as to hierarchical data such
    as XML. A novel aspect of our work is a distinction between "why"
    provenance (refers to the source data that had some influence on
    the existence of the data) and "where" provenance (refers to the
    location(s) in the source databases from which the data was extracted).},
  local-url = {/Users/pgroth/Desktop/phd/outside%20documents/organized/why_and_where_a_characterization_of_data_provenance.pdf},
}
@PHDTHESIS{Cui2001,
  author = {Y. Cui},
  title = {Lineage Tracing in Data Warehouses},
  school = {Stanford University},
  year = {2001},
  month = dec,
  abstract = {Data warehousing systems collect
    data from multiple distributed data sources and store integrated and
    summarized information in local databases for efficient data analysis and
    mining. Sometimes, when analyzing data at a warehouse, it is useful
    to ``drill down'' and investigate the source data from which certain
    warehouse data was derived. For a given warehouse data item, identifying
    the exact set of source data items that produced the warehouse data
    item is termed the data lineage problem. This thesis presents our
    research results on tracing data lineage in a warehousing environment:
    - Formal definitions of data lineage for data warehouses defined as
    relational materialized views over relational sources, and for warehouses
    defined using graphs of general data transformations. - Algorithms for
    lineage tracing, again considering both relational and transformational
    warehouses, along with a suite of optimization techniques. - Performance
    evaluations through simulations, and a lineage tracing prototype
    developed within the WHIPS (WareHousing Information Processing System)
    project at Stanford. - Applying data lineage techniques to obtain
    improved algorithms for the well-known database view update problem.},
  owner = {pgroth},
}
@INPROCEEDINGS{Foster2001,
  author = {I. Foster and E. Alpert and A. Chervenak and B. Drach and C. Kesselman and V. Nefedova and D. Middleton and A. Shoshani and A. Sim and D. Williams},
  title = {The Earth System Grid II: Turning Climate Datasets Into Community Resources},
  booktitle = {Proc. of the American Meteorological Society Conference},
  year = {2001},
  owner = {pgroth},
}
@ARTICLE{Foster2001a,
  author = {I. Foster and C. Kesselman and S. Tuecke},
  title = {The Anatomy of the Grid: Enabling Scalable Virtual Organizations},
  journal = {Int. J. Supercomputer Applications},
  year = {2001},
  pages = {15--18},
  abstract = {Grid computing has emerged as an important new
    field, distinguished from conventional distributed computing by its
    focus on large-scale resource sharing, innovative applications, and,
    in some cases, high-performance orientation. In this article, we
    define this new field. First, we review the Grid problem, which we
    define as flexible, secure, coordinated resource sharing among dynamic
    collections of individuals, institutions, and resources what we refer
    to as virtual organizations. In such settings, we encounter unique
    authentication, authorization, resource access, resource discovery,
    and other challenges. It is this class of problem that is addressed
    by Grid technologies. Next, we present an extensible and open Grid
    architecture, in which protocols, services, application programming
    interfaces, and software development kits are categorized according to
    their roles in enabling resource sharing. We describe requirements
    that we believe any such mechanisms must satisfy and we discuss the
    importance of defining a compact set of intergrid protocols to enable
    interoperability among different Grid systems. Finally, we discuss how Grid
    technologies relate to other contemporary technologies, including enterprise
    integration, application service provider, storage service provider, and
    peer-to-peer computing. We maintain that Grid concepts and technologies
    complement and have much to contribute to these other approaches.},
  local-url = {/Users/pgroth/Desktop/phd/outside%20documents/organized/anatomy.pdf},
}
@INPROCEEDINGS{Frew2001,
  author = {J. Frew and R. Bose},
  title = {Earth System Science Workbench: A Data Management Infrastructure for Earth Science Products},
  booktitle = {Proceedings of the 13th International Conference on Scientific and Statistical Database Management},
  year = {2001},
  pages = {180--189},
  address = {Fairfax, VA},
  month = jul,
  abstract = {The Earth System Science Workbench (ESSW) is a nonintrusive
    data management infrastructure for researchers who must also be data
    publishers. An implementation of ESSW to track the processing of
    locally received satellite imagery is presented, demonstrating the
    Workbench's transparent and robust support for archiving and publishing
    data products. ESSW features a Lab Notebook metadata service, a No
    Duplicate-Write Once Read Many (ND-WORM) storage service, and Web user
    interface tools. The Lab Notebook logs processes (experiments) and
    their relationships via a custom API to XML documents stored in a
    relational database. The NDWORM provides a managed storage archive
    for the Lab Notebook by keeping unique file digests and namespace
    metadata, also in a relational database. ESSW Notebook tools allow
    product searching and ordering, and file and metadata management.},
  owner = {pgroth},
}
@ARTICLE{Marathe2001,
  author = {Marathe, A. P.},
  title = {Tracing Lineage of Array Data},
  journal = {J. Intell. Inf. Syst.},
  year = {2001},
  volume = {17},
  number = {2-3},
  pages = {193--214},
  issn = {0925-9902},
  publisher = {Kluwer Academic Publishers},
  abstract = {Arrays are a common and important class of data in many applications. Arrays can
    model data such as digital images, digital video, scientific and
    experimental data, matrices, and finite element grids. Although array
    manipulations are diverse and domain-specific, they often exhibit structural
    regularities. This paper describes an algorithm called sub-pushdown to
    trace data lineage in such array computations. Lineage tracing is a
    type of data-flow analysis that relates parts of a result array to
    those parts of the argument (base) arrays that have bearings on the
    result array parts. Sub-pushdown can be used to trace data lineage in
    array-manipulating computations expressed in the Array Manipulation
    Language (AML) that was introduced previously. Sub-pushdown has several
    useful features. First, the lineage computation is expressed as an AML
    query. Second, it is not necessary to evaluate the AML lineage query
    to compute the array data lineage. Third, sub-pushdown never gives
    false-negative answers. Sub-pushdown has been implemented as part of
    the ArrayDB prototype array database system that we have built.},
}
@INPROCEEDINGS{Bose2002,
  author = {R. Bose},
  title = {A Conceptual Framework for Composing and Managing Scientific Data Lineage},
  booktitle = {Proceedings of the 14th International Conference on Scientific and Statistical Database Management},
  year = {2002},
  pages = {15--19},
  address = {Edinburgh, Scotland},
  month = jul,
  abstract = {Scientific research relies as much on the
    dissemination and exchange of data sets as on the publication of conclusions.
    Accurately tracking the lineage (origin and subsequent processing
    history) of scientific data sets is thus imperative for the complete
    documentation of scientific work. However, the lack of a definitive data
    model for lineage, and the poor fit between current data management
    tools and scientific software, effectively prevent researchers from
    determining, preserving, or providing the lineage of the data products they
    use and create. Based on a comprehensive review of lineage-related
    research and previous prototype systems, a conceptual framework is
    presented to help identify and assess basic lineage system components.
    Within this framework, a direction is outlined for future work on
    general methods for composing and managing lineage for scientific data.},
  owner = {pgroth},
}
@INPROCEEDINGS{Buneman2002,
  author = {P. Buneman and S. Khanna and K. Tajima and W.C. Tan},
  title = {Archiving scientific data},
  booktitle = {Proc. of the 2002 ACM SIGMOD International Conference on Management of Data},
  year = {2002},
  pages = {1--12},
  publisher = {ACM Press},
  doi = {10.1145/564691.564693},
  isbn = {1-58113-497-5},
  address = {Madison, Wisconsin},
}
@INPROCEEDINGS{Eder2002,
  author = {J. Eder and G. E. Olivotto and W. Gruber},
  title = {A Data Warehouse for Workflow Logs},
  booktitle = {Engineering and Deployment of Cooperative Information Systems: First Int. Conf., EDCIS 2002},
  year = {2002},
  editor = {Y. Han and S. Tai and D. Wikarski},
  month = sep,
  publisher = {Springer},
  issn = {0302-9743},
  abstract = {Workflow Logs provide a very valuable
    source of information about the actual execution of business processes
    in organizations. We propose to use data warehouse technology to
    exploit this information resources for organizational developments,
    monitoring and process improvements. We introduce a general data
    warehouse design for workflow warehouses and discuss the results
    from an industrial case study showing the validity of this approach.},
  local-url = {/Users/pgroth/Desktop/phd/outside%20documents/organized/datawarehouse_for_workflow_logs.pdf},
}
@INPROCEEDINGS{Foster2002,
  author = {I. Foster and J. Voeckler and M. Wilde and Y. Zhao},
  title = {Chimera: A Virtual Data System for Representing, Querying and Automating Data Derivation},
  booktitle = {Proc. of the 14th Conf. on Scientific and Statistical Database Management},
  year = {2002},
  month = jul,
}
@INPROCEEDINGS{Goble2002,
  author = {C. Goble},
  title = {Position Statement: Musings on provenance, workflow and (semantic web) annotations for bioinformatics},
  booktitle = {Workshop on Data Provenance and Derivation},
  year = {2002},
  month = oct,
  owner = {pgroth},
}
@PROCEEDINGS{ProvWorkshop2002,
  title = {Data Provenance/Derivation Workshop},
  year = {2002},
  month = oct,
  url = {http://people.cs.uchicago.edu/~yongzh/position_papers.html},
  owner = {pgroth},
}
@ARTICLE{Cui2003,
  author = {Y. Cui and J. Widom},
  title = {Lineage tracing for general data warehouse transformations},
  journal = {The VLDB Journal},
  year = {2003},
  volume = {12},
  number = {1},
  pages = {41--58},
  issn = {1066-8888},
  doi = {10.1007/s00778-002-0083-8},
  owner = {pgroth},
  publisher = {Springer-Verlag New York, Inc.},
  abstract = {Data warehousing systems integrate information from operational data sources into a
    central repository to enable analysis and mining of the integrated
    information. During the integration process, source data typically
    undergoes a series of transformations, which may vary from simple
    algebraic operations or aggregations to complex ``data cleansing''
    procedures. In a warehousing environment, the data lineage problem is that
    of tracing warehouse data items back to the original source items
    from which they were derived. We formally define the lineage tracing
    problem in the presence of general data warehouse transformations, and
    we present algorithms for lineage tracing in this environment. Our
    tracing procedures take advantage of known structure or properties
    of transformations when present, but also work in the absence of
    such information. Our results can be used as the basis for a lineage
    tracing tool in a general warehousing setting, and also can guide the
    design of data warehouses that enable efficient lineage tracing.},
}
@INCOLLECTION{Fan2003,
  author = {H. Fan and A. Poulovassilis},
  title = {Tracing data lineage using schema transformation pathways},
  booktitle = {Knowledge transformation for the Semantic Web},
  publisher = {IOS Press},
  year = {2003},
  editor = {B. Omelayenko and M. Klein},
  pages = {64--79},
  owner = {pgroth},
}
@INPROCEEDINGS{Foster2003,
  author = {I. Foster and J. Vockler and M. Wilde and Y. Zhao},
  title = {The virtual data grid: A new model and architecture for data-intensive collaboration},
  booktitle = {Proc. of the CIDR 2003 First Biennial Conference on Innovative Data Systems Research},
  year = {2003},
  month = jan,
  abstract = {It is now common to encounter communities engaged in the
    collaborative analysis and transformation of large quantities of data over
    extended time periods. We argue that these communities require a
    scalable system for managing, tracing, communicating, and exploring the
    derivation and analysis of diverse data objects. Such a system could
    bring significant productivity increases, facilitating discovery,
    understanding, assessment, and sharing of both data and transformation
    resources, as well as the productive use of distributed resources
    for computation, storage, and collaboration. We define a model and
    architecture for a virtual data grid to address this requirement.
    Using a broadly applicable ``typed dataset'' as the unit of derivation
    tracking, we introduce simple constructs for describing how datasets
    are derived from transformations and from other datasets. We also
    define mechanisms for integrating with, and adapting to, existing data
    management systems and transformation and analysis tools, as well as
    Grid mechanisms for distributed resource management and computation
    planning. We report on successful application results obtained with a
    prototype system called Chimera that implements some of these concepts,
    involving challenging analyses of high-energy physics and astronomy data.},
  owner = {pgroth},
  citeseerurl = {http://citeseer.ist.psu.edu/foster03virtual.html},
}
@INPROCEEDINGS{Greenwood2003,
  author = {M. Greenwood and C. Goble and R. Stevens and J. Zhao and M. Addis and D. Marvin and L. Moreau and T. Oinn},
  title = {Provenance of e-Science Experiments - experience from Bioinformatics},
  booktitle = {Proc. UK e-Science All Hands Meeting 2003},
  year = {2003},
  editor = {Cox, Simon J.},
  pages = {223--226},
  month = sep,
  abstract = {Like experiments performed at a laboratory bench, the
    data associated with an e-Science experiment are of reduced value if
    other scientists are not able to identify the origin, or provenance,
    of those data. Provenance information is essential if experiments
    are to be validated and verified by others, or even by those who
    originally performed them. In this article, we give an overview of our
    initial work on the provenance of bioinformatics e-Science experiments
    within myGrid. We use two kinds of provenance: the derivation path
    of information and annotation. We show how this kind of provenance
    can be delivered within the myGrid demonstrator WorkBench and we
    explore how the resulting Webs of experimental data holdings can be
    mined for useful information and presentations for the e-Scientist.},
  isbn = {1-904425-11-9},
  local-url = {/Users/pgroth/Desktop/phd/outside%20documents/organized/prov_of_eScience_Experiments_Experience_Bioinformatics.pdf},
}
@INPROCEEDINGS{Myers2003a,
  author = {J. D. Myers and C. Pancerella and C. Lansing and K. L. Schuchardt and B. Didier},
  title = {Multi-scale science: supporting emerging practice with semantically derived provenance},
  booktitle = {ISWC 2003 Workshop: Semantic Web Technologies for Searching and Retrieving Scientific Data},
  year = {2003},
  address = {Sanibel Island, Florida, USA},
  month = oct,
  owner = {pgroth},
}
@ARTICLE{Myers2003,
  author = {J.D. Myers and A.R. Chappell and M. Elder and A. Geist and J. Schwidder},
  title = {Re-integrating the research record},
  journal = {IEEE Computing in Science \& Engineering},
  year = {2003},
  pages = {44--50},
  owner = {pgroth},
}
@ARTICLE{Silva2003,
  author = {P. P. da Silva and D. L. McGuinness and R. McCool},
  title = {Knowledge Provenance Infrastructure},
  journal = {Data Engineering Bulletin},
  year = {2003},
  volume = {26},
  number = {4},
  pages = {26--32},
  month = dec,
  abstract = {The web lacks support
    for explaining information provenance. When web applications return
    answers, many users do not know what information sources were used, when
    they were updated, how reliable the source was, or what information
    was looked up versus derived. Support for information provenance
    is expected to be a harder problem in the Semantic Web where more
    answers result from some manipulation of information (instead of simple
    retrieval of information). Manipulation includes, among other things,
    retrieving, matching, aggregating, filtering, and deriving information
    possibly from multiple sources. This article defines a broad notion of
    information provenance called knowledge provenance that includes proof-like
    information on how a question answering system arrived at its answer(s).
    The article also describes an approach for a knowledge provenance
    infrastructure supporting the extraction, maintenance, and usage of knowledge
    provenance related to answers of web applications and services.},
  owner = {pgroth},
}
@INPROCEEDINGS{Szomszor2003,
  author = {M. Szomszor and L. Moreau},
  title = {Recording and Reasoning over Data Provenance in Web and Grid Services},
  booktitle = {Int. Conf. on Ontologies, Databases and Applications of Semantics},
  year = {2003},
  volume = {2888},
  series = {LNCS},
  issn = {0302-9743},
  provenance = {yes},
  pind = {EZ~02~02~04},
  isbn = {3-540-20498-9},
  local-url = {/Users/pgroth/Desktop/phd/outside%20documents/organized/recording_reasoning_over_data_provenance.pdf},
  export = {yes},
  abstract = {Large-scale, dynamic and open
    environments such as the Grid and Web Services build upon existing computing
    infrastructures to supply dependable and consistent large-scale computational
    systems. This kind of architecture has been adopted by the business and
    scientific communities allowing them to exploit extensive and diverse
    computing resources to perform complex data processing tasks. In such
    systems, results are often derived by composing multiple, geographically
    distributed, heterogeneous services as specified by intricate workflow
    management. This leads to the undesirable situation where the results are
    known, but the means by which they were achieved is not. With both
    scientific experiments and business transactions, the notion of lineage
    and dataset derivation is of paramount importance since without it,
    information is potentially worthless. We address the issue of data
    provenance, the description of the origin of a piece of data, in
    these environments showing the requirements, uses and implementation
    difficulties. We propose an infrastructure level support for a provenance
    recording capability for service-oriented architectures such as the Grid
    and Web Services. We also developed services to view and retrieve
    provenance and we provide a mechanism by which provenance is used to
    determine whether previous computed results are still up to date.},
  mygrid = {yes},
  pagecount = {18},
}
@INPROCEEDINGS{Zhao2003,
  author = {J. Zhao and C. Goble and M. Greenwood and C. Wroe and R. Stevens},
  title = {Annotating, linking and browsing provenance logs for e-Science},
  booktitle = {Proc. of the Workshop on Semantic Web Technologies for Searching and Retrieving Scientific Data},
  year = {2003},
  month = oct,
  abstract = {Like experiments performed at a laboratory bench, the results
    of an e-science in silico experiment are of limited value if other
    scientists are not able to identify the origin, or provenance, of those
    results. For e-Science, we need more systematic provenance logs across a
    range of e- Science activities and disciplines as well as a more
    informed understanding of the information in these provenance data.
    Semantic Web technology, which enables data to be linked and defined
    in a way for more effective discovery, integration and cooperation
    across computers and people, provides an appropriate solution for our
    current requirement. In this paper we show how we used the COHSE
    conceptual open hypermedia system to build a dynamically generated
    hypertext of web of provenance documents arising from the myGrid
    project based on associated concepts and reasoning over the ontology.},
  local-url = {/Users/pgroth/Desktop/phd/outside%20documents/organized/myGridOntProv.pdf},
}
@PROCEEDINGS{ProvWorkshop2003,
  title = {Data Provenance and Annotation},
  year = {2003},
  month = dec,
  url = {http://www.nesc.ac.uk/esi/events/304/},
  owner = {pgroth},
}
@INPROCEEDINGS{Bose2004,
  author = {R. Bose and J. Frew},
  title = {Composing lineage metadata with XML for custom satellite-derived data products},
  booktitle = {16th International Conference on Scientific and Statistical Database Management},
  year = {2004},
  pages = {275--284},
  month = jun,
  doi = {10.1109/SSDM.2004.1311219},
  abstract = {As peer-to-peer dissemination of custom data products
    evolves among Earth science research groups, investigators and data
    managers must consider how to compose appropriate metadata for their
    research computing activities. Because workflows may span multiple
    groups, it is critical that lineage (provenance) metadata also be
    assembled to document and preserve the origins and processing history of
    constituent data products and transformations for future data consumers. To
    demonstrate methods for composing lineage metadata for custom processing,
    we introduce our terminology for workflow and employ a case study
    for the creation of satellite-derived ocean color data products.
    Our example contributes to a general metadata model for workflow
    that incorporates lineage. We then discuss metadata requirements for
    remote sensing-related data products. We propose two techniques for
    composing lineage metadata, both based on accessory XML metadata
    documents that are paired with the data products and versioned data
    transformations they describe. The first technique, implemented as a prototype,
    features a dedicated lineage server that introduces the indirection and
    flexibility necessary for Web-based lineage navigation. The second, more
    promising technique involves defining a simple Resource Description
    Framework (RDF) vocabulary for lineage metadata, and using extant RDF/XML
    tools for query and navigation. These methods provide guidelines for
    composing lineage metadata that are applicable to other domains.},
  owner = {pgroth},
}
@INPROCEEDINGS{Groth2004,
  author = {P. Groth and M. Luck and L. Moreau},
  title = {Formalising a protocol for recording provenance in Grids},
  booktitle = {Proc. of the UK OST e-Science second All Hands Meeting 2004 (AHM'04)},
  year = {2004},
  address = {Nottingham, UK},
  month = sep,
  pind = {EZ~03~03~04},
  export = {yes},
  abstract = {Both the scientific and business communities are beginning to rely
    on Grids as problemsolving mechanisms. These communities also have
    requirements in terms of provenance. Provenance is the documentation
    of process and the necessity for it is apparent in fields ranging
    from medicine to aerospace. To support provenance capture in Grids,
    we have developed an implementation-independent protocol for the
    recording of provenance. We describe the protocol in the context of a
    service-oriented architecture and formalise the entities involved using an
    abstract state machine or a three-dimensional state transition diagram.
    Using these techniques we sketch a liveness property for the system.},
  url = {http://www.ecs.soton.ac.uk/~lavm/papers/ahm04-groth.pdf},
  pasoa = {yes},
  pagecount = {8},
}
@INPROCEEDINGS{Groth2004a,
  author = {Paul Groth and Michael Luck and Luc Moreau},
  title = {A protocol for recording provenance in service-oriented Grids},
  booktitle = {Proceedings of the 8th International Conference on Principles of Distributed Systems (OPODIS'04)},
  year = {2004},
  address = {Grenoble, France},
  month = dec,
  export = {yes},
  abstract = {Both the scientific and business communities, which are beginning to rely on
    Grids as problem-solving mechanisms, have requirements in terms of
    provenance. The provenance of some data is the documentation of process
    that led to the data; its necessity is apparent in fields ranging
    from medicine to aerospace. To support provenance capture in Grids,
    we have developed an implementation-independent protocol for the
    recording of provenance. We describe the protocol in the context of a
    service-oriented architecture and formalise the entities involved using an
    abstract state machine or a three-dimensional state transition diagram.
    Using these techniques we sketch a liveness property for the system.},
  pind = {EZ~03~03~04},
  pasoa = {yes},
}
@INPROCEEDINGS{Ruth2004,
  author = {P. Ruth and D. Xu and B. K. Bhargava and F. Regnier},
  title = {E-notebook Middleware for Accountability and Reputation Based Trust in Distributed Data Sharing Communities},
  booktitle = {Proc. 2nd Int. Conf. on Trust Management, Oxford, UK},
  year = {2004},
  volume = {2995},
  series = {LNCS},
  publisher = {Springer},
  issn = {0302-9743},
  isbn = {3-540-21312-0},
  local-url = {/Users/pgroth/Desktop/phd/outside%20documents/organized/enotebook.pdf},
  abstract = {This paper presents the design of a new
    middleware which provides support for trust and accountability in
    distributed data sharing communities. One application is in the context of
    scientific
    collaborations. Multiple researchers share individually collected
    data, who in turn create new data sets by performing transformations
    on existing shared data sets. In data sharing communities building
    trust for the data obtained from others is crucial. However, the
    field of data provenance does not consider malicious or untrustworthy
    users. By adding accountability to the provenance of each data set,
    this middleware ensures data integrity insofar as any errors can be
    identified and corrected. The user is further protected from faulty
    data by a trust view created from past experiences and second-hand
    recommendations. A trust view is based on real world social interactions and
    reflects each
    user's own experiences
    within the community. By identifying the providers of faulty data and
    removing them from a trust view, the integrity of all data is enhanced},
}
@PHDTHESIS{Tan2004,
  author = {V. H. K. Tan},
  title = {Interaction tracing for mobile agent security},
  school = {University of Southampton},
  year = {2004},
  abstract = {This thesis develops a new technique, interaction tracing, to address the security
    issue of protecting mobile agents from potentially malicious hosts.
    In this technique, a mobile agent is modeled as a black box whose
    behaviour can be captured through a trace of its inputs and outputs
    during the process of execution. Formalization of the activity of
    creating and verifying traces is detailed for a simple agent programming
    language using operational semantics. An interaction protocol is
    developed to enable secure exchange of traces between entities in
    the system that are responsible for verifying the validity of the
    traces. This protocol is formally modeled and verified for specific
    security properties using a finite-state model checker. The protocol is
    extended to allow for the activity of trace reconciliation, which
    protects inter-agent communication between mobile agents operating in
    a multi-agent context. Implementation of this secure protocol in
    conjunction with the interaction tracing activity is undertaken in a
    mobile agent framework and is quantitatively evaluated against a
    non-secure mobile agent system and standard client-server approach. A
    trust model is introduced in the context of the protocol that allows
    trust relationships to be formed between the various entities in
    the system, permitting a more flexible deployment of the protocol.},
  owner = {pgroth},
}
@INPROCEEDINGS{Zhao2004,
  author = {Yong Zhao and Michael Wilde and Ian Foster and Jens Voeckler and Thomas Jordan and Elizabeth Quigg and James Dobson},
  title = {Grid middleware services for virtual data discovery, composition, and integration},
  booktitle = {Proceedings of the 2nd workshop on Middleware for grid computing},
  year = {2004},
  pages = {57--62},
  address = {New York, NY, USA},
  publisher = {ACM Press},
  doi = {10.1145/1028493.1028503},
  isbn = {1-58113-950-0},
  abstract = {We describe the services,
    architecture and application of the GriPhyN Virtual Data System, a suite of
    components and services that allow users to describe virtual data products
    in declarative terms, discover definitions and assemble workflows
    based on those definitions, and execute the resulting workflows on
    Grid resources. We show how these middleware-level services have
    been applied by specific communities to manage scientific data and
    workflows. In particular, we highlight and introduce Chiron, a
    portal facility that enables the interactive use of the virtual data
    system. Chiron has been used within the QuarkNet education project
    and as an online "educator" for virtual data applications. We also
    present applications from functional MRI-based neuroscience research.},
  location = {Toronto, Ontario, Canada},
}
@INPROCEEDINGS{Ledlie2005,
  author = {Jonathan Ledlie and Chaki Ng and David A. Holland and Kiran-Kumar Muniswamy-Reddy and Uri Braun and Margo Seltzer},
  title = {Provenance-Aware Sensor Data Storage},
  booktitle = {NetDB 2005},
  year = {2005},
  month = apr,
  abstract = {Sensor network data has both historical and realtime value. Making historical sensor
    data useful, in particular, requires storage, naming, and indexing.
    Sensor data presents new challenges in these areas. Such data is
    location-specific but also distributed; it is collected in a particular physical
    location and may be most useful there, but it has additional value when
    combined with other sensor data collections in a larger distributed
    system. Thus, arranging location-sensitive peer-to-peer storage is one
    challenge. Sensor data sets do not have obvious names, so naming them in a
    globally useful fashion is another challenge. The last challenge
    arises from the need to index these sensor data sets to make them
    searchable. The key to sensor data identity is provenance, the full
    history or lineage of the data. We show how provenance addresses
    the naming and indexing issues and then present a research agenda
    for constructing distributed, indexed repositories of sensor data.},
}
@INPROCEEDINGS{Townend2005,
  author = {Paul Townend and Paul Groth and Jie Xu},
  title = {A Provenance-Aware Weighted Fault Tolerance Scheme for Service-Based Applications},
  booktitle = {Proc. of the 8th IEEE International Symposium on Object-oriented Real-time distributed Computing (ISORC 2005)},
  year = {2005},
  month = may,
  abstract = {Service-orientation has been proposed as a way of facilitating the development
    and integration of increasingly complex and heterogeneous system
    components. However, there are many new challenges to the dependability
    community in this new paradigm, such as how individual channels within
    fault-tolerant systems may invoke common services as part of their workflow,
    thus increasing the potential for common-mode failure. We propose a
    scheme that - for the first time - links the technique of provenance
    with that of multi-version fault tolerance. We implement a large
    test system and perform experiments with a single-version system, a
    traditional MVD system, and a provenance-aware MVD system, and compare
    their results. We show that for this experiment, our provenance-aware
    scheme results in a much more dependable system than either of the
    other systems tested, whilst imposing a negligible timing overhead.},
  owner = {pgroth},
}
@INPROCEEDINGS{Widom2005,
  author = {J. Widom},
  title = {Trio: a system for integrated management of data, accuracy, and lineage},
  booktitle = {Second Biennial Conference on Innovative Data Systems Research (CIDR 2005)},
  year = {2005},
  address = {Asilomar, Calif.},
  month = jan,
  abstract = {Trio is a new database system that manages not only data,
    but also the accuracy and lineage of the data. Inexact (uncertain,
    probabilistic, fuzzy, approximate, incomplete, and imprecise!) databases have
    been proposed in the past, and the lineage problem also has been
    studied. The goals of the Trio project are to combine and distill
    previous work into a simple and usable model, design a query language as
    an understandable extension to SQL, and most importantly build a
    working system---a system that augments conventional data management
    with both accuracy and lineage as an integral part of the data. This
    paper provides numerous motivating applications for Trio and lays out
    preliminary plans for the data model, query language, and prototype system.},
  owner = {pgroth},
}