Added a few references, and addressed a few comments.

This commit is contained in:
Sears Russell 2006-09-03 21:06:58 +00:00
parent 6cac8a73de
commit 2a69ba6bea
2 changed files with 145 additions and 60 deletions

View file

@ -43,6 +43,25 @@
OPTannote = {} OPTannote = {}
} }
% X/Open DTP reference model (corporate author, hence the double braces).
% NOTE(review): publisher is assumed to be the issuing body named in the
% author field -- TODO confirm against the printed specification.
@Book{dtp,
  author     = {{The Open Group}},
  ALTeditor  = {},
  title      = {Distributed Transaction Processing: Reference Model},
  publisher  = {{The Open Group}},
  year       = {1996},
  OPTkey     = {},
  OPTvolume  = {},
  OPTnumber  = {},
  OPTseries  = {},
  OPTaddress = {},
  OPTedition = {},
  OPTmonth   = {},
  OPTnote    = {},
  OPTannote  = {}
}
@inproceedings{ejbCritique, @inproceedings{ejbCritique,
author = {Raul Silaghi and Alfred Strohmeier}, author = {Raul Silaghi and Alfred Strohmeier},
title = {Critical Evaluation of the {EJB} Transaction Model}, title = {Critical Evaluation of the {EJB} Transaction Model},
@ -494,6 +513,60 @@
OPTannote = {} OPTannote = {}
} }
% System R overview paper. The active author field is the shortened form
% ("and others" is rendered as "et al." by the bibliography style); the
% complete author list is retained in OPTauthor, which BibTeX ignores.
@article{systemR,
  author    = {Astrahan, Morton M. and others},
  OPTauthor = {Morton M. Astrahan and
               Mike W. Blasgen and
               Donald D. Chamberlin and
               Kapali P. Eswaran and
               Jim Gray and
               Patricia P. Griffiths and
               W. Frank King III and
               Raymond A. Lorie and
               Paul R. McJones and
               James W. Mehl and
               Gianfranco R. Putzolu and
               Irving L. Traiger and
               Bradford W. Wade and
               Vera Watson},
  title     = {{System R}: Relational Approach to Database Management},
  journal   = {ACM Transactions on Database Systems},
  volume    = {1},
  number    = {2},
  year      = {1976},
  pages     = {97--137},
  doi       = {10.1145/320455.320457},
  ee        = {http://doi.acm.org/10.1145/320455.320457, db/journals/tods/AstrahanBCEGGKLMMPTWW76.html},
  bibsource = {DBLP, http://dblp.uni-trier.de}
}
% Distributed data structures paper (OSDI 2000); entry imported from DBLP.
@inproceedings{DDS,
  author    = {Steven D. Gribble and
               Eric A. Brewer and
               Joseph M. Hellerstein and
               David E. Culler},
  title     = {Scalable, Distributed Data Structures for {Internet} Service
               Construction},
  booktitle = {OSDI},
  year      = {2000},
  pages     = {319--332},
  bibsource = {DBLP, http://dblp.uni-trier.de}
}
% Boxwood storage infrastructure paper (OSDI 2004); entry imported from DBLP.
@inproceedings{boxwood,
  author    = {John MacCormick and
               Nick Murphy and
               Marc Najork and
               Chandramohan A. Thekkath and
               Lidong Zhou},
  title     = {Boxwood: Abstractions as the Foundation for Storage Infrastructure},
  booktitle = {OSDI},
  year      = {2004},
  pages     = {105--120},
  ee        = {http://www.usenix.org/events/osdi04/tech/maccormick.html},
  bibsource = {DBLP, http://dblp.uni-trier.de}
}
@InProceedings{riscDB, @InProceedings{riscDB,
@ -699,6 +772,22 @@
OPTannote = {} OPTannote = {}
} }
% ObjectStore object-oriented database paper (CACM 1991); entry imported from DBLP.
@article{objectstore,
  author    = {Charles Lamb and
               Gordon Landis and
               Jack A. Orenstein and
               Daniel Weinreb},
  title     = {The {ObjectStore} Database System},
  journal   = {Communications of the ACM},
  volume    = {34},
  number    = {10},
  year      = {1991},
  pages     = {50--63},
  ee        = {db/journals/cacm/LambLOW91.html},
  bibsource = {DBLP, http://dblp.uni-trier.de}
}
@inproceedings{cricket, @inproceedings{cricket,
author = {Eugene J. Shekita and author = {Eugene J. Shekita and

View file

@ -221,7 +221,7 @@ database and systems researchers for at least 25 years.
\subsection{The Database View} \subsection{The Database View}
The database community approaches the limited range of DBMSs by either The database community approaches the limited range of DBMSs by either
creating new top-down models, such as object-oriented, XML or streaming databases~\cite{OOdb, XMLdb, streaming}, creating new top-down models, such as object-oriented, XML or streaming databases~\cite{objectstore, streaming}, \rcs{which xml database should we cite?}
or by extending the relational model~\cite{codd} along some axis, such or by extending the relational model~\cite{codd} along some axis, such
as new data types~\cite{newDBtypes}. We cover these attempts in more detail in as new data types~\cite{newDBtypes}. We cover these attempts in more detail in
Section~\ref{sec:related-work}. Section~\ref{sec:related-work}.
@ -442,16 +442,10 @@ intend to keep even when transactions abort.
The primary difference between \yad and ARIES for basic transactions The primary difference between \yad and ARIES for basic transactions
is that \yad allows user-defined operations, while ARIES defines a set is that \yad allows user-defined operations, while ARIES defines a set
of operations that support relational database systems. \rcs{merge with 3.4->}An {\em of operations that support relational database systems. An {\em
operation} consists of both a redo and an undo function, both of which Operation} consists of an undo and a redo function. Each time an
take one argument. An update is always the redo function applied to a operation is invoked, a corresponding log entry is generated. We
page; there is no ``do'' function. This ensures that updates behave describe operations in more detail in Section~\ref{sec:operations}.
the same on recovery. The redo log entry consists of the LSN and the
argument. The undo entry is analogous.\endnote{For efficiency, undo
and redo operations are packed into a single log entry. Both must take
the same parameters.} \yad ensures the correct ordering and timing
of all log entries and page writes.\rcs{<--} We describe operations in more
detail in Section~\ref{sec:operations}
%\subsection{Multi-page Transactions} %\subsection{Multi-page Transactions}
@ -583,17 +577,21 @@ constraint in Section~\ref{sec:lsn-free}.
Operations are invoked by registering a callback (the ``operation Operations are invoked by registering a callback (the ``operation
implementation'' in Figure~\ref{fig:structure}) with \yad at startup, implementation'' in Figure~\ref{fig:structure}) with \yad at startup,
and then calling {\tt Tupdate()} to invoke the operation at runtime. and then calling {\tt Tupdate()} to invoke the operation at runtime.
\yad ensures that operations follow the write-ahead logging rules
required for steal/no-force transactions by controlling the timing and
ordering of log and page writes.
\yad ensures that operations follow the The redo log entry consists of the
write-ahead logging rules required for steal/no-force transactions by LSN and an argument that will be passed to redo. The undo entry is
controlling the timing and ordering of log and page writes. \rcs{3.2 stuff goes here} Each analogous.\endnote{For efficiency, undo and redo operations are packed
into a single log entry. Both must take the same parameters.} Each
operation should be deterministic, provide an inverse, and acquire all operation should be deterministic, provide an inverse, and acquire all
of its arguments from a struct that is passed via {\tt Tupdate()}, from of its arguments from the argument passed via {\tt Tupdate()},
the page it updates, or both. The callbacks used from the page it updates, or both. The callbacks used during forward
during forward operation are also used during recovery. Therefore operation are also used during recovery. Therefore operations provide
operations provide a single redo function and a single undo function. a single redo function and a single undo function. There is no ``do''
There is no ``do'' function, which reduces the amount of function, which reduces the amount of recovery-specific code in the
recovery-specific code in the system. system.
%{\tt Tupdate()} writes the struct %{\tt Tupdate()} writes the struct
%that is passed to it to the log before invoking the operation's %that is passed to it to the log before invoking the operation's
@ -629,8 +627,11 @@ implementation must obey a few more invariants:
via {\tt Tupdate()}. Recovery does not support logical redo, via {\tt Tupdate()}. Recovery does not support logical redo,
and physical operation implementations may not invoke {\tt and physical operation implementations may not invoke {\tt
Tupdate()}. Tupdate()}.
\item Page updates atomically update the page's LSN by pinning the page. \item The page's LSN should be updated to reflect the changes (this is
\eab{``pinning'' is not quite right here; we could use latch, but we haven't devined it yet; could swict sections 3.4 and 3.5} generally handled by passing the LSN to the page implementation).
\eab{``pinning'' is not quite right here; we could use latch, but we
haven't defined it yet; could switch sections 3.4 and 3.5} \rcs{We can
ignore atomicity here. \yad pins the page for the operation. The new description is more accurate.}
%\item If the data seen by a wrapper function must match data seen %\item If the data seen by a wrapper function must match data seen
% during redo, then the wrapper should use a latch to protect against % during redo, then the wrapper should use a latch to protect against
@ -910,8 +911,8 @@ could use {\tt mmap()} to map portions of the page file into application
memory~\cite{lrvm}. However, without support for logical log entries memory~\cite{lrvm}. However, without support for logical log entries
and nested top actions, it is difficult to implement a and nested top actions, it is difficult to implement a
concurrent, durable data structure using RVM or Camelot. (The description of concurrent, durable data structure using RVM or Camelot. (The description of
Argus in Section~\ref{sec:transactionalProgramming} sketches the Argus in Section~\ref{sec:argus} sketches the
general approach.)\eab{check this last sentence} general approach.)
In contrast, LSN-free pages allow logical In contrast, LSN-free pages allow logical
undo and therefore nested top actions and concurrent undo and therefore nested top actions and concurrent
@ -972,7 +973,7 @@ logically consistent.
\begin{figure} \begin{figure}
\includegraphics[% \includegraphics[%
bb=0bp 0bp 445bp 275bp, viewport=0bp 0bp 445bp 275bp,
clip, clip,
width=1\columnwidth]{figs/torn-page.pdf} width=1\columnwidth]{figs/torn-page.pdf}
\caption{\sf\label{fig:torn}Torn pages and LSN-free recovery. \caption{\sf\label{fig:torn}Torn pages and LSN-free recovery.
@ -1129,8 +1130,11 @@ structure atomically. It uses a {\em linear} hash
function~\cite{lht}, allowing it to increase capacity incrementally. function~\cite{lht}, allowing it to increase capacity incrementally.
It is based on a number of modular subcomponents. Notably, the It is based on a number of modular subcomponents. Notably, the
physical location of each bucket is stored in a growable array of physical location of each bucket is stored in a growable array of
fixed-length entries. The bucket lists are provided by the user's fixed-length entries. The bucket lists can be provided by either of
choice of two different linked-list implementations.\rcs{Expand on this} \yads linked list implementations. One provides fixed length entries,
yielding a hash table with fixed length keys and values. The list
(and therefore hash table) used in our experiments provides variable
length entries.
The hand-tuned hash table is also built on \yad and also uses a linear hash The hand-tuned hash table is also built on \yad and also uses a linear hash
function. However, it is monolithic and uses carefully ordered writes to function. However, it is monolithic and uses carefully ordered writes to
@ -1163,11 +1167,11 @@ optimize important primitives.
%the transactional data structure implementation. %the transactional data structure implementation.
Figure~\ref{fig:TPS} describes the performance of the two systems under Figure~\ref{fig:TPS} describes the performance of the two systems under
highly concurrent workloads using the ext3 filesystem.\endnote{The multi-threaded benchmarks highly concurrent workloads using the ext3 filesystem.\endnote{Multi-threaded benchmarks
presented here were performed using an ext3 file system, as high were performed using an ext3 file system.
concurrency caused both Berkeley DB and \yad to behave unpredictably Concurrency caused both Berkeley DB and \yad to behave unpredictably
when ReiserFS was used. However, \yads multi-threaded throughput when ReiserFS was used. \yads multi-threaded throughput
was significantly better that Berkeley DB's under both file systems.} was significantly better than Berkeley DB's with both file systems.}
For this test, we used the modular For this test, we used the modular
hash table, since we are interested in the performance of a hash table, since we are interested in the performance of a
simple, clean data structure implementation that a typical system implementor might simple, clean data structure implementation that a typical system implementor might
@ -1321,17 +1325,12 @@ to disk.
To determine the effect of the optimization in memory bound systems, To determine the effect of the optimization in memory bound systems,
we decreased \yads page cache size, and used O\_DIRECT to bypass the we decreased \yads page cache size, and used O\_DIRECT to bypass the
operating system's disk cache. We partitioned the set of objects operating system's disk cache. We partitioned the set of objects
so that 10\% fit in a {\em hot set} \rcs{This doesn't make sense: that is small enough to fit into so that 10\% fit in a {\em hot set}.
memory}. Figure~\ref{fig:OASYS} also presents \yads performance as we varied the Figure~\ref{fig:OASYS} also presents \yads performance as we varied the
percentage of object updates that manipulate the hot set. In the percentage of object updates that manipulate the hot set. In the
memory bound test, we see that update/flush indeed improves memory memory bound test, we see that update/flush indeed improves memory
utilization. utilization.
\subsection{Request reordering} \subsection{Request reordering}
\eab{this section unclear, including title} \eab{this section unclear, including title}
@ -1436,7 +1435,7 @@ Genesis. It uses abstract data type definitions, access methods and
cost models to generate query optimizers and execution cost models to generate query optimizers and execution
engines automatically. engines automatically.
Object-oriented database systems (\rcs{cite something?}) and Object-oriented database systems~\cite{objectstore} and
relational databases with support for user-definable abstract data relational databases with support for user-definable abstract data
types (such as in Postgres~\cite{postgres}) provide functionality types (such as in Postgres~\cite{postgres}) provide functionality
similar to extensible database toolkits. In contrast to database similar to extensible database toolkits. In contrast to database
@ -1511,7 +1510,7 @@ the option to compensate for nested top action. We expect that nested
transactions could be implemented with \yad. transactions could be implemented with \yad.
\subsubsection{Distributed Programming Models} \subsubsection{Distributed Programming Models}
\label{sec:argus}
%System R was one of the first relational database implementations, and %System R was one of the first relational database implementations, and
%defined a clean separation between its query processor and its storage %defined a clean separation between its query processor and its storage
%subsystem. In fact, it supported a simple navigational interface to %subsystem. In fact, it supported a simple navigational interface to
@ -1520,24 +1519,27 @@ transactions could be implemented with \yad.
Nested transactions simplify distributed systems; they isolate Nested transactions simplify distributed systems; they isolate
failures, manage concurrency, and provide durability. In fact, they failures, manage concurrency, and provide durability. In fact, they
were developed as part of Argus, a language for reliable distributed applications. \rcs{This text confuses argus and bill's follow on work.} An Argus were developed as part of Argus, a language for reliable distributed
program consists of guardians, which are essentially objects that applications. An Argus program consists of guardians, which are essentially
encapsulate persistent and atomic data. Although accesses to {\em atomic} data are objects that encapsulate persistent and atomic data. Accesses to {\em
serializable, {\em persistent} data is not protected by the lock manager, atomic} data are serializable, while {\em persistent} data is atomic
and is used to implement concurrent data structures~\cite{argus}. data that is stored on disk~\cite{argus}.
Typically, the data structure is stored in persistent storage, but is augmented with
Originally, Argus only supported limited concurrency via total
isolation, but was extended to support high concurrency data
structures. Concurrent data structures are stored in non-atomic storage, but are augmented with
information in atomic storage. This extra data tracks the information in atomic storage. This extra data tracks the
status of each item stored in the structure. Conceptually, atomic status of each item stored in the structure. Conceptually, atomic
storage used by a hashtable would contain the values ``Not present'', storage used by a hashtable would contain the values ``Not present'',
``Committed'' or ``Aborted; Old Value = x'' for each key in (or ``Committed'' or ``Aborted; Old Value = x'' for each key in (or
missing from) the hash. Before accessing the hash, the operation missing from) the hash. Before accessing the hash, the operation
implementation would consult the appropriate piece of atomic data, and implementation would consult the appropriate piece of atomic data, and
update the persistent storage if necessary. Because the atomic data is update the non-atomic data if necessary. Because the atomic data is
protected by a lock manager, attempts to update the hashtable are serializable. protected by a lock manager, attempts to update the hashtable are serializable.
Therefore, clever use of atomic storage can be used to provide logical locking. Therefore, clever use of atomic storage can be used to provide logical locking.
\rcs{More confusion...} Efficiently Efficiently
tracking such state is not straightforward. For example, the Argus tracking such state is not straightforward. For example, their
hashtable implementation uses a log structure to hashtable implementation uses a log structure to
track the status of keys that have been touched by track the status of keys that have been touched by
active transactions. Also, the hashtable is responsible for setting disk write back active transactions. Also, the hashtable is responsible for setting disk write back
@ -1546,11 +1548,6 @@ complexity by providing logical undos, and by leaving lock management
to higher-level code. This separates write-back and concurrency to higher-level code. This separates write-back and concurrency
control policies from data structure implementations. control policies from data structure implementations.
%The Argus designers assumed that only a few core concurrent
%transactional data structures would be implemented, and that higher
%level code would make use of these structures. Also, Argus assumed
%that transactions should be serializable.
Camelot made a number of important Camelot made a number of important
contributions, both in system design, and in algorithms for contributions, both in system design, and in algorithms for
distributed transactions~\cite{camelot}. It leaves locking to application level code, distributed transactions~\cite{camelot}. It leaves locking to application level code,
@ -1567,7 +1564,7 @@ provides mechanisms for distributed transactions and transactional
RPC. Although Camelot does allow applications to provide their own lock RPC. Although Camelot does allow applications to provide their own lock
managers, implementation strategies for concurrent operations managers, implementation strategies for concurrent operations
in Camelot are similar to those in Camelot are similar to those
in Argus since Camelot does not provide logical undo. Camelot focuses built using Argus since Camelot does not provide logical undo. Camelot focuses
on distributed transactions, and hardcodes on distributed transactions, and hardcodes
assumptions regarding the structure of nested transactions, consensus assumptions regarding the structure of nested transactions, consensus
algorithms, communication mechanisms, and so on. algorithms, communication mechanisms, and so on.
@ -1576,10 +1573,10 @@ More recent transactional programming schemes allow for multiple
transaction implementations to cooperate as part of the same transaction implementations to cooperate as part of the same
distributed transaction. For example, X/Open DTP provides a standard distributed transaction. For example, X/Open DTP provides a standard
networking protocol that allows multiple transactional systems to be networking protocol that allows multiple transactional systems to be
controlled by a single transaction manager~\cite{something}. controlled by a single transaction manager~\cite{dtp}.
Enterprise Java Beans is a standard for developing transactional Enterprise Java Beans is a standard for developing transactional
middleware on top of heterogeneous storage. Its middleware on top of heterogeneous storage. Its
transactions may not be nested~\cite{something}. This simplifies its transactions may not be nested. This simplifies its
semantics, and leads to many, short transactions, semantics, and leads to many, short transactions,
improving concurrency. However, flat transactions are somewhat rigid, and lead to improving concurrency. However, flat transactions are somewhat rigid, and lead to
situations where committed transactions have to be manually rolled situations where committed transactions have to be manually rolled
@ -1636,7 +1633,7 @@ layout that we believe \yad could eventually support.
Some large object storage systems allow arbitrary insertion and deletion of bytes~\cite{esm} Some large object storage systems allow arbitrary insertion and deletion of bytes~\cite{esm}
within the object, while typical file systems within the object, while typical file systems
provide append-only allocation~\cite{ffs}. provide append-only allocation~\cite{ffs}.
Record-oriented allocation, including Multics' segments~\cite{multics} and GFS~\cite{GFS}, is an alternative. Record-oriented allocation, including Multics' segments~\cite{multics} and GFS~\cite{gfs}, is an alternative.
Write-optimized file systems lay files out in the order they Write-optimized file systems lay files out in the order they
were written rather than in logically sequential order~\cite{lfs}. were written rather than in logically sequential order~\cite{lfs}.
@ -1726,7 +1723,6 @@ optimization is from Mike Demmer; he and Bowei Du implemented \oasys.
Gilad Arnold and Amir Kamil implemented Gilad Arnold and Amir Kamil implemented
pobj. Jim Blomo, Jason Bayer, and Jimmy pobj. Jim Blomo, Jason Bayer, and Jimmy
Kittiyachavalit worked on an early version of \yad. Kittiyachavalit worked on an early version of \yad.
\rcs{colleen}
Thanks to C. Mohan for pointing out that per-object LSNs may be Thanks to C. Mohan for pointing out that per-object LSNs may be
inadvertently overwritten during recovery. Jim Gray suggested we use inadvertently overwritten during recovery. Jim Gray suggested we use