cleanup
This commit is contained in:
parent
775e9dda49
commit
742fc3bf5d
2 changed files with 85 additions and 40 deletions
|
@ -109,6 +109,21 @@
|
||||||
OPTannote = {}
|
OPTannote = {}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@inproceedings{SNS,
|
||||||
|
author = "Armando Fox and Steven D. Gribble and Yatin Chawathe and Eric A. Brewer and Paul Gauthier",
|
||||||
|
title = "Cluster-Based Scalable Network Services",
|
||||||
|
booktitle = "Symposium on Operating Systems Principles",
|
||||||
|
pages = "78--91",
|
||||||
|
year = "1997"}
|
||||||
|
|
||||||
|
@misc{ bent04explicit,
|
||||||
|
author = "J. Bent and D. Thain and A. Arpaci-Dusseau and R. Arpaci-Dusseau",
|
||||||
|
title = "Explicit control in a batch-aware distributed file system",
|
||||||
|
text = "J. Bent, D. Thain, A. Arpaci-Dusseau, and R. Arpaci-Dusseau. Explicit control
|
||||||
|
in a batch-aware distributed file system. In Proceedings of the First USENIX/ACM
|
||||||
|
Conference on Networked Systems Design and Implementation, March 2004.",
|
||||||
|
year = "2004",
|
||||||
|
url = "citeseer.ist.psu.edu/article/bent04explicit.html" }
|
||||||
|
|
||||||
|
|
||||||
@InProceedings{mapReduce,
|
@InProceedings{mapReduce,
|
||||||
|
@ -208,7 +223,12 @@
|
||||||
OPTannote = {}
|
OPTannote = {}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@InProceedings{streaming,
|
||||||
|
author = "S. Chandrasekaran and M. Franklin",
|
||||||
|
title = "Streaming Queries over Streaming Data",
|
||||||
|
booktitle = {Proc. of VLDB},
|
||||||
|
year = "2002"
|
||||||
|
}
|
||||||
|
|
||||||
@InProceedings{molap,
|
@InProceedings{molap,
|
||||||
author = {Yihong Zhao and Prasad M. Deshpande and Jeffrey F. Naughton},
|
author = {Yihong Zhao and Prasad M. Deshpande and Jeffrey F. Naughton},
|
||||||
|
@ -358,6 +378,18 @@
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@inproceedings{newDBtypes,
|
||||||
|
author = {Michael Stonebraker},
|
||||||
|
title = {Inclusion of New Types in Relational Data Base Systems},
|
||||||
|
booktitle = {Proceedings of the Second International Conference on Data Engineering,
|
||||||
|
February 5-7, 1986, Los Angeles, California, USA},
|
||||||
|
publisher = {IEEE Computer Society},
|
||||||
|
year = {1986},
|
||||||
|
isbn = {0-8186-0655-X},
|
||||||
|
pages = {262--269}
|
||||||
|
}
|
||||||
|
|
||||||
@Article{postgres,
|
@Article{postgres,
|
||||||
author = {M. Stonebraker and Greg Kemnitz},
|
author = {M. Stonebraker and Greg Kemnitz},
|
||||||
title = {The {POSTGRES} Next-Generation Database Management System},
|
title = {The {POSTGRES} Next-Generation Database Management System},
|
||||||
|
@ -366,12 +398,23 @@
|
||||||
OPTkey = {},
|
OPTkey = {},
|
||||||
volume = {34},
|
volume = {34},
|
||||||
number = {10},
|
number = {10},
|
||||||
pages = {79-92},
|
pages = {79--92},
|
||||||
month = {October},
|
month = {October},
|
||||||
OPTnote = {},
|
OPTnote = {},
|
||||||
OPTannote = {}
|
OPTannote = {}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@article{OLAP,
|
||||||
|
author = {Surajit Chaudhuri and
|
||||||
|
Umeshwar Dayal},
|
||||||
|
title = {An Overview of Data Warehousing and OLAP Technology},
|
||||||
|
journal = {SIGMOD Record},
|
||||||
|
volume = {26},
|
||||||
|
number = {1},
|
||||||
|
year = {1997},
|
||||||
|
pages = {65--74}
|
||||||
|
}
|
||||||
|
|
||||||
@InProceedings{aries,
|
@InProceedings{aries,
|
||||||
author = { C. Mohan and D. Haderle and B. Lindsay and H. Pirahesh and P Schwarz },
|
author = { C. Mohan and D. Haderle and B. Lindsay and H. Pirahesh and P Schwarz },
|
||||||
title = {{ARIES}, A Transaction Recovery Method Supporting Fine-Granularity Locking and Partial Rollbacks Using Write-Ahead Logging},
|
title = {{ARIES}, A Transaction Recovery Method Supporting Fine-Granularity Locking and Partial Rollbacks Using Write-Ahead Logging},
|
||||||
|
|
|
@ -119,7 +119,7 @@ scientific computing. These applications have complex transactional
|
||||||
storage requirements, but do not fit well onto SQL or the monolithic
|
storage requirements, but do not fit well onto SQL or the monolithic
|
||||||
approach of current databases. In fact, when performance matters
|
approach of current databases. In fact, when performance matters
|
||||||
these applications often avoid DBMSs and instead implement ad-hoc data
|
these applications often avoid DBMSs and instead implement ad-hoc data
|
||||||
management solutions~\cite{SNS}.
|
management solutions~\cite{mapReduce,SNS}.
|
||||||
|
|
||||||
An example of this mismatch occurs with DBMS support for persistent objects.
|
An example of this mismatch occurs with DBMS support for persistent objects.
|
||||||
In a typical usage, an array of objects is made persistent by mapping
|
In a typical usage, an array of objects is made persistent by mapping
|
||||||
|
@ -221,9 +221,9 @@ database and systems researchers for at least 25 years.
|
||||||
\subsection{The Database View}
|
\subsection{The Database View}
|
||||||
|
|
||||||
The database community approaches the limited range of DBMSs by either
|
The database community approaches the limited range of DBMSs by either
|
||||||
creating new top-down models, such as object oriented or XML databases~\cite{OOdb, XMLdb},
|
creating new top-down models, such as object-oriented, XML or streaming databases~\cite{OOdb, XMLdb, streaming},
|
||||||
or by extending the relational model~\cite{codd} along some axis, such
|
or by extending the relational model~\cite{codd} along some axis, such
|
||||||
as new data types. We cover these attempts in more detail in
|
as new data types~\cite{newDBtypes}. We cover these attempts in more detail in
|
||||||
Section~\ref{sec:related-work}.
|
Section~\ref{sec:related-work}.
|
||||||
|
|
||||||
%Database systems are often thought of in terms of the high-level
|
%Database systems are often thought of in terms of the high-level
|
||||||
|
@ -255,12 +255,11 @@ relation into a set of keyed tuples. If the database were going to be
|
||||||
used for short, write-intensive and high-concurrency transactions
|
used for short, write-intensive and high-concurrency transactions
|
||||||
(OLTP), the physical model would probably translate sets of tuples
|
(OLTP), the physical model would probably translate sets of tuples
|
||||||
into an on-disk B-tree. In contrast, if the database needed to
|
into an on-disk B-tree. In contrast, if the database needed to
|
||||||
support long-running, read-only aggregation queries (OLAP) over high
|
support long-running, read-only aggregation queries (OLAP) over high-dimensional data, a physical model that stores the data in a sparse
|
||||||
dimensional data, a physical model that stores the data in a sparse
|
array format would be more appropriate~\cite{OLAP,molap}. Although both
|
||||||
array format would be more appropriate~\cite{molap}. Although both
|
|
||||||
OLTP and OLAP databases are based upon the relational model, they make
|
OLTP and OLAP databases are based upon the relational model, they make
|
||||||
use of different physical models in order to efficiently serve
|
use of different physical models in order to serve
|
||||||
different classes of applications.
|
different classes of applications efficiently.
|
||||||
|
|
||||||
A basic claim of
|
A basic claim of
|
||||||
this paper is that no known physical data model can efficiently
|
this paper is that no known physical data model can efficiently
|
||||||
|
@ -277,6 +276,9 @@ implement most of the data models that the underlying hardware can
|
||||||
support, or to abandon the database approach entirely, and forgo
|
support, or to abandon the database approach entirely, and forgo
|
||||||
structured physical models and abstract conceptual mappings.
|
structured physical models and abstract conceptual mappings.
|
||||||
|
|
||||||
|
\eab{add OneSizeFitsAll paragraph}
|
||||||
|
|
||||||
|
|
||||||
\subsection{The Systems View}
|
\subsection{The Systems View}
|
||||||
\label{sec:systems}
|
\label{sec:systems}
|
||||||
The systems community has also worked on this mismatch,
|
The systems community has also worked on this mismatch,
|
||||||
|
@ -350,8 +352,8 @@ atomically updating portions of durable storage. These small atomic
|
||||||
updates bootstrap transactions that are too large to be
|
updates bootstrap transactions that are too large to be
|
||||||
applied atomically. In particular, write-ahead logging (and therefore
|
applied atomically. In particular, write-ahead logging (and therefore
|
||||||
\yad) relies on the ability to write entries to the log
|
\yad) relies on the ability to write entries to the log
|
||||||
file atomically. Transaction systems that store LSNs on pages to
|
file atomically. Transaction systems that store sequence numbers on pages to
|
||||||
track version information rely on atomic page writes as well.
|
track version information also rely on atomic page writes.
|
||||||
|
|
||||||
In practice, a write to a disk page is not atomic (in modern drives). Two common failure
|
In practice, a write to a disk page is not atomic (in modern drives). Two common failure
|
||||||
modes exist. The first occurs when the disk writes a partial sector
|
modes exist. The first occurs when the disk writes a partial sector
|
||||||
|
@ -432,12 +434,11 @@ On recovery, the redo phase applies all updates (even those from
|
||||||
aborted transactions). Then, an undo phase corrects stolen pages for
|
aborted transactions). Then, an undo phase corrects stolen pages for
|
||||||
aborted transactions. Each operation that undo performs is recorded
|
aborted transactions. Each operation that undo performs is recorded
|
||||||
in the log, and the per-page LSN is updated accordingly. In order to
|
in the log, and the per-page LSN is updated accordingly. In order to
|
||||||
prevent repeated crashes during recovery from causing the log to grow
|
ensure progress even with crashes during recovery, special log records
|
||||||
excessively, the entries written during the undo phase tell future
|
mark which actions have been undone, so they may be skipped during
|
||||||
undo phases to skip portions of the transaction that have already been
|
recovery in the future. We also use these records, called {\em
|
||||||
undone. These log entries are usually called {\em Compensation Log
|
Compensation Log Records (CLRs)}, to avoid undoing actions that we
|
||||||
Records (CLRs)}.
|
intend to keep even when transactions abort.
|
||||||
|
|
||||||
|
|
||||||
The primary difference between \yad and ARIES for basic transactions
|
The primary difference between \yad and ARIES for basic transactions
|
||||||
is that \yad allows user-defined operations, while ARIES defines a set
|
is that \yad allows user-defined operations, while ARIES defines a set
|
||||||
|
@ -539,9 +540,9 @@ operations:
|
||||||
hash table: the undo for {\em insert} is {\em remove}. This logical
|
hash table: the undo for {\em insert} is {\em remove}. This logical
|
||||||
undo function should arrange to acquire the mutex when invoked by
|
undo function should arrange to acquire the mutex when invoked by
|
||||||
abort or recovery.
|
abort or recovery.
|
||||||
\item Add a ``begin nested top action'' right after the mutex
|
\item Add a ``begin nested top action'' right after mutex
|
||||||
acquisition, and an ``end nested top action'' right before the mutex
|
acquisition, and an ``end nested top action'' right before mutex
|
||||||
is released. \yad includes operations that provide nested top
|
release. \yad includes operations that provide nested top
|
||||||
actions.
|
actions.
|
||||||
\end{enumerate}
|
\end{enumerate}
|
||||||
|
|
||||||
|
@ -608,7 +609,7 @@ recovery-specific code in the system.
|
||||||
|
|
||||||
|
|
||||||
The first step in implementing a new operation is to decide upon an
|
The first step in implementing a new operation is to decide upon an
|
||||||
external interface, which is typically cleaner than directly calling {\tt Tupdate()} to invoke the redo/undo operations.
|
external interface, which is typically cleaner than directly calling {\tt Tupdate()} to invoke the operation(s).
|
||||||
The externally visible interface is implemented
|
The externally visible interface is implemented
|
||||||
by wrapper functions and read-only access methods. The wrapper
|
by wrapper functions and read-only access methods. The wrapper
|
||||||
function modifies the state of the page file by packaging the
|
function modifies the state of the page file by packaging the
|
||||||
|
@ -629,6 +630,8 @@ implementation must obey a few more invariants:
|
||||||
and physical operation implementations may not invoke {\tt
|
and physical operation implementations may not invoke {\tt
|
||||||
Tupdate()}.
|
Tupdate()}.
|
||||||
\item Page updates atomically update the page's LSN by pinning the page.
|
\item Page updates atomically update the page's LSN by pinning the page.
|
||||||
|
\eab{``pinning'' is not quite right here; we could use latch, but we haven't defined it yet; could switch sections 3.4 and 3.5}
|
||||||
|
|
||||||
%\item If the data seen by a wrapper function must match data seen
|
%\item If the data seen by a wrapper function must match data seen
|
||||||
% during redo, then the wrapper should use a latch to protect against
|
% during redo, then the wrapper should use a latch to protect against
|
||||||
% concurrent attempts to update the sensitive data (and against
|
% concurrent attempts to update the sensitive data (and against
|
||||||
|
@ -758,7 +761,7 @@ of the transaction that created a region of freespace, and does not
|
||||||
coalesce or reuse any storage associated with an active transaction.
|
coalesce or reuse any storage associated with an active transaction.
|
||||||
In contrast, the record allocator is called frequently and must enable locality. It associates a set of pages with
|
In contrast, the record allocator is called frequently and must enable locality. It associates a set of pages with
|
||||||
each transaction, and keeps track of deallocation events, making sure
|
each transaction, and keeps track of deallocation events, making sure
|
||||||
that space on a page is never over reserved. Providing each
|
that space on a page is never overbooked. Providing each
|
||||||
transaction with a separate pool of freespace increases
|
transaction with a separate pool of freespace increases
|
||||||
concurrency and locality. This is
|
concurrency and locality. This is
|
||||||
similar to Hoard~\cite{hoard} and
|
similar to Hoard~\cite{hoard} and
|
||||||
|
@ -772,8 +775,10 @@ special-purpose lock managers are a useful abstraction.\rcs{This would
|
||||||
be a good place to cite Bill and others on higher-level locking
|
be a good place to cite Bill and others on higher-level locking
|
||||||
protocols}
|
protocols}
|
||||||
|
|
||||||
Locking is largely orthogonal to the concepts described in this paper.
|
Although custom locking is important for flexibility, it is largely
|
||||||
We make no assumptions regarding lock managers being used by higher-level code in the remainder of this discussion.
|
orthogonal to the concepts described in this paper. We make no
|
||||||
|
assumptions regarding lock managers being used by higher-level code in
|
||||||
|
the remainder of this discussion.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -830,8 +835,7 @@ deterministic, idempotent redo entries that do not examine page state.
|
||||||
We call such operations ``blind updates.'' Note that we still allow
|
We call such operations ``blind updates.'' Note that we still allow
|
||||||
code that invokes operations to examine the page file, just not during the redo phase of recovery.
|
code that invokes operations to examine the page file, just not during the redo phase of recovery.
|
||||||
For example, these operations could be invoked by log
|
For example, these operations could be invoked by log
|
||||||
entries that contain a set of byte ranges, and the new value
|
entries that contain a set of byte ranges with their new values.
|
||||||
of each byte in the range.
|
|
||||||
|
|
||||||
Recovery works the same way as before, except that it now computes
|
Recovery works the same way as before, except that it now computes
|
||||||
a lower bound for the LSN of each page, rather than reading it from the page.
|
a lower bound for the LSN of each page, rather than reading it from the page.
|
||||||
|
@ -886,8 +890,7 @@ optimizations in a straightforward fashion. Zero-copy writes are
|
||||||
a portion of the log file. However, doing this does not address the problem of updating the page
|
a portion of the log file. However, doing this does not address the problem of updating the page
|
||||||
file. We suspect that contributions from log-based file
|
file. We suspect that contributions from log-based file
|
||||||
systems~\cite{lfs} can address these problems. In
|
systems~\cite{lfs} can address these problems. In
|
||||||
particular, we imagine storing portions of the log (the portion that
|
particular, we imagine writing large blobs to a distinct log segment and just entering metadata in the primary log.
|
||||||
stores the blob) in the page file, or other addressable storage.
|
|
||||||
|
|
||||||
%In
|
%In
|
||||||
%the worst case, the blob would have to be relocated in order to
|
%the worst case, the blob would have to be relocated in order to
|
||||||
|
@ -908,7 +911,7 @@ memory~\cite{lrvm}. However, without support for logical log entries
|
||||||
and nested top actions, it is difficult to implement a
|
and nested top actions, it is difficult to implement a
|
||||||
concurrent, durable data structure using RVM or Camelot. (The description of
|
concurrent, durable data structure using RVM or Camelot. (The description of
|
||||||
Argus in Section~\ref{sec:transactionalProgramming} sketches the
|
Argus in Section~\ref{sec:transactionalProgramming} sketches the
|
||||||
general approach.)
|
general approach.)\eab{check this last sentence}
|
||||||
|
|
||||||
In contrast, LSN-free pages allow logical
|
In contrast, LSN-free pages allow logical
|
||||||
undo and therefore nested top actions and concurrent
|
undo and therefore nested top actions and concurrent
|
||||||
|
@ -935,7 +938,7 @@ updates do not require atomic page writes
|
||||||
and thus impose no meaningful boundaries on atomic updates. We still
|
and thus impose no meaningful boundaries on atomic updates. We still
|
||||||
use pages to simplify integration into the rest of the system, but
|
use pages to simplify integration into the rest of the system, but
|
||||||
need not worry about torn pages. In fact, the redo phase of the
|
need not worry about torn pages. In fact, the redo phase of the
|
||||||
LSN-free recovery algorithm actually creates a torn page each time it
|
LSN-free recovery algorithm effectively creates a torn page each time it
|
||||||
applies an old log entry to a new page. However, it guarantees that
|
applies an old log entry to a new page. However, it guarantees that
|
||||||
all such torn pages will be repaired by the time redo completes. In
|
all such torn pages will be repaired by the time redo completes. In
|
||||||
the process, it also repairs any pages that were torn by a crash.
|
the process, it also repairs any pages that were torn by a crash.
|
||||||
|
@ -999,7 +1002,7 @@ disk. At this point, the page is internally consistent.
|
||||||
|
|
||||||
Since LSN-free recovery only relies upon atomic updates at the bit
|
Since LSN-free recovery only relies upon atomic updates at the bit
|
||||||
level, it decouples page boundaries from atomicity and recovery. This
|
level, it decouples page boundaries from atomicity and recovery. This
|
||||||
allows operations to atomically manipulate (potentially
|
allows operations to manipulate atomically (potentially
|
||||||
non-contiguous) regions of arbitrary size by producing a single log
|
non-contiguous) regions of arbitrary size by producing a single log
|
||||||
entry. If this log entry includes a logical undo function (rather
|
entry. If this log entry includes a logical undo function (rather
|
||||||
than a physical undo), then it can serve the purpose of a nested top
|
than a physical undo), then it can serve the purpose of a nested top
|
||||||
|
@ -1010,7 +1013,7 @@ and reason about when applied to LSN-free pages.
|
||||||
|
|
||||||
\subsection{Summary}
|
\subsection{Summary}
|
||||||
|
|
||||||
In this section, we explored some of the flexibility of \yad. This
|
In these last two sections, we explored some of the flexibility of \yad. This
|
||||||
includes user-defined operations, combinations of steal and force on
|
includes user-defined operations, combinations of steal and force on
|
||||||
a per-operation basis, flexible locking options, and a new class of
|
a per-operation basis, flexible locking options, and a new class of
|
||||||
transactions based on blind updates that enables better support for
|
transactions based on blind updates that enables better support for
|
||||||
|
@ -1071,7 +1074,7 @@ With the lock manager enabled, Berkeley
|
||||||
DB's performance in the multithreaded benchmark (Section~\ref{sec:lht}) strictly decreased with
|
DB's performance in the multithreaded benchmark (Section~\ref{sec:lht}) strictly decreased with
|
||||||
increased concurrency.
|
increased concurrency.
|
||||||
|
|
||||||
We expended a considerable effort tuning Berkeley DB, and our efforts
|
We expended a considerable effort tuning Berkeley DB and our efforts
|
||||||
significantly improved Berkeley DB's performance on these tests.
|
significantly improved Berkeley DB's performance on these tests.
|
||||||
Although further tuning by Berkeley DB experts would probably improve
|
Although further tuning by Berkeley DB experts would probably improve
|
||||||
Berkeley DB's numbers, we think our comparison shows that the systems'
|
Berkeley DB's numbers, we think our comparison shows that the systems'
|
||||||
|
@ -1213,7 +1216,7 @@ persistence library, \oasys. \oasys makes use of pluggable storage
|
||||||
modules that implement persistent storage, and includes plugins
|
modules that implement persistent storage, and includes plugins
|
||||||
for Berkeley DB and MySQL.
|
for Berkeley DB and MySQL.
|
||||||
|
|
||||||
This section describes how the \yads plugin supports optimizations that reduce the
|
This section describes how the \yad plugin supports optimizations that reduce the
|
||||||
amount of data written to log and halve the amount of RAM required.
|
amount of data written to log and halve the amount of RAM required.
|
||||||
We present three variants of the \yad plugin. The basic one treats
|
We present three variants of the \yad plugin. The basic one treats
|
||||||
\yad like Berkeley DB. The ``update/flush'' variant
|
\yad like Berkeley DB. The ``update/flush'' variant
|
||||||
|
@ -1224,9 +1227,9 @@ between versions.
|
||||||
The update/flush variant allows the buffer manager's view of live
|
The update/flush variant allows the buffer manager's view of live
|
||||||
application objects to become stale. This is safe since the system is
|
application objects to become stale. This is safe since the system is
|
||||||
always able to reconstruct the appropriate page entry from the live
|
always able to reconstruct the appropriate page entry from the live
|
||||||
copy of the object. This reduces the number of times the \oasys
|
copy of the object. This reduces the number of times the
|
||||||
plugin must update serialized objects in the buffer manager, and
|
plugin must update serialized objects in the buffer manager, and
|
||||||
allows us to drastically decrease the amount of memory used by the
|
allows us to decrease drastically the amount of memory used by the
|
||||||
buffer manager.
|
buffer manager.
|
||||||
|
|
||||||
We implemented the \yad buffer pool optimization by adding two new
|
We implemented the \yad buffer pool optimization by adding two new
|
||||||
|
@ -1236,8 +1239,7 @@ updates the page when an object is evicted from the application's cache.
|
||||||
The reason it would be difficult to do this with Berkeley DB is that
|
The reason it would be difficult to do this with Berkeley DB is that
|
||||||
we still need to generate log entries as the object is being updated.
|
we still need to generate log entries as the object is being updated.
|
||||||
This would cause Berkeley DB to write data to pages,
|
This would cause Berkeley DB to write data to pages,
|
||||||
increasing the working set of the program, and increasing disk
|
increasing the working set of the program and the amount of disk activity.
|
||||||
activity.
|
|
||||||
|
|
||||||
Furthermore, \yads copy of the objects is updated in the order objects
|
Furthermore, \yads copy of the objects is updated in the order objects
|
||||||
are evicted from cache, not the order in which they are updated.
|
are evicted from cache, not the order in which they are updated.
|
||||||
|
@ -1265,7 +1267,7 @@ during recovery.
|
||||||
%\eab{we should at least implement this callback if we have not already}
|
%\eab{we should at least implement this callback if we have not already}
|
||||||
%
|
%
|
||||||
Alternatively, we could arrange for the object pool
|
Alternatively, we could arrange for the object pool
|
||||||
to atomically update the buffer
|
to update atomically the buffer
|
||||||
manager's copy of all objects that share a given page.
|
manager's copy of all objects that share a given page.
|
||||||
|
|
||||||
The third plugin variant, ``delta'', incorporates the update/flush
|
The third plugin variant, ``delta'', incorporates the update/flush
|
||||||
|
|
Loading…
Reference in a new issue