diff --git a/doc/paper3/LLADD.bib b/doc/paper3/LLADD.bib index d61bf4b..4062f60 100644 --- a/doc/paper3/LLADD.bib +++ b/doc/paper3/LLADD.bib @@ -28,7 +28,7 @@ @Article{excel, author = {B Zeeberg and J Riss and D Kane D and K Bussey and E Uchio and W Linehan and J Barret and J Weinstein}, - title = {Mistaken identifiers: gene name errors can be introduced inadvertently when using {E}xcel in bioinformatics}, + title = {Mistaken identifiers: Gene name errors can be introduced inadvertently when using {E}xcel in bioinformatics}, journal = {BMC Bioinformatics}, year = {2004}, OPTkey = {}, @@ -40,3 +40,177 @@ OPTannote = {} } + +@Article{batoryPhysical, + author = {D. S. Batory and C. C. Gotlieb}, + title = {A Unifying Model of Physical Databases}, + journal = {ACM Transactions on Database Systems}, + year = {1982}, + OPTkey = {}, + volume = {7}, + number = {4}, + pages = {509-539}, + OPTmonth = {}, + OPTnote = {}, + OPTannote = {} +} + +@InProceedings{batoryConceptual, + author = {D. S. Batory}, + title = {Conceptual-to-internal mappings in commercial database systems}, + OPTcrossref = {}, + OPTkey = {}, + booktitle = {Proceedings of the 3rd SIGACT-SIGMOD symposium on Principles of database systems}, + pages = {70-78}, + year = {1984}, + OPTeditor = {}, + OPTvolume = {}, + OPTnumber = {}, + OPTseries = {}, + OPTaddress = {}, + OPTmonth = {}, + OPTorganization = {}, + OPTpublisher = {}, + OPTnote = {}, + OPTannote = {} +} + +@Misc{hibernate, + OPTkey = {}, + OPTauthor = {}, + title = {Hibernate: Relational Persistence for {J}ava and {.NET}}, + OPThowpublished = {}, + OPTmonth = {}, + OPTyear = {}, + note = {http://www.hibernate.org/}, + OPTannote = {} +} + + +@Article{lrvm, + author = {M. Satyanarayanan and Henry H. Mashburn and Puneet Kumar and David C. Steere and James J. 
Kistler}, + title = {Lightweight recoverable virtual memory}, + journal = {ACM Transactions on Computer Systems}, + year = {1994}, + OPTkey = {}, + volume = {12}, + number = {1}, + pages = {33-57}, + month = {February}, + OPTnote = {}, + OPTannote = {} +} + +@Article{genesis, + author = {D. S. Batory and J. R. Barnett and J. F. Garza and K. P. Smith and K. Tsukuda and B. C. Twichell and T. E. Wise}, + title = {{GENESIS}: An Extensible Database Management System}, + journal = {IEEE Transactions on Software Engineering}, + year = {1988}, + OPTkey = {}, + volume = {14}, + number = {11}, + pages = {1711-1729}, + month = {November}, + OPTnote = {}, + OPTannote = {} +} + +@InProceedings{exodus, + author = {Michael J Carey and David J. DeWitt and Daniel Frank and Goetz Graefe and M. Muralikrishna and Joel Richardson and Eugene J. Shekita}, + title = {The Architecture of the {EXODUS} Extensible {DBMS}}, + OPTcrossref = {}, + OPTkey = {}, + booktitle = {Proceedings on the 1986 international workshop on Object-oriented database systems}, + pages = {52-65}, + year = {1986}, + OPTeditor = {}, + OPTvolume = {}, + OPTnumber = {}, + OPTseries = {}, + OPTaddress = {}, + OPTmonth = {}, + OPTorganization = {}, + OPTpublisher = {}, + OPTnote = {}, + OPTannote = {} +} + +@Article{codd, + author = {E. F. Codd}, + title = {A relational model of data for large shared data banks}, + journal = {Communications of the ACM}, + year = {1970}, + OPTkey = {}, + volume = {13}, + number = {6}, + pages = {377-387}, + month = {June}, + OPTnote = {}, + OPTannote = {} +} + +@Article{starburst, + author = {Guy M. Lohman and Bruce Lindsay and Hamid Pirahesh and K. Bernhard Schiefer}, + title = {Extensions to {S}tarburst: Objects, types, functions, and rules}, + journal = {Communications of the ACM}, + year = {1991}, + OPTkey = {}, + volume = {34}, + number = {10}, + pages = {95-109}, + month = {October}, + OPTnote = {}, + OPTannote = {} +} + +@Article{postgres, + author = {M. 
Stonebraker and Greg Kemnitz}, + title = {The {POSTGRES} Next-Generation Database Management System}, + journal = {Communications of the ACM}, + year = {1991}, + OPTkey = {}, + volume = {34}, + number = {10}, + pages = {79-92}, + month = {October}, + OPTnote = {}, + OPTannote = {} +} + +@Article{aries, + author = { C. Mohan and D. Haderle and B. Lindsay and H. Pirahesh and P Schwarz }, + title = {{ARIES}, A Transaction Recovery Method Supporting Fine-Granularity Locking and Partial Rollbacks Using Write-Ahead Logging}, + OPTcrossref = {}, + OPTkey = {}, + journal = {ACM Transactions on Database Systems}, + pages = {94-162}, + year = {1992}, + OPTeditor = {}, + volume = {17}, + number = {1}, + OPTseries = {}, + OPTaddress = {}, + OPTmonth = {}, + OPTorganization = {}, + OPTpublisher = {}, + OPTnote = {}, + OPTannote = {} +} + +@Book{ariesIM, + author = {C Mohan and F Levine}, + ALTeditor = {}, + title = {{ARIES/IM}: an efficient and high concurrency index management method using write-ahead logging}, + publisher = {ACM Press}, + year = {1992}, + OPTkey = {}, + OPTvolume = {}, + OPTnumber = {}, + OPTseries = {}, + OPTaddress = {}, + OPTedition = {}, + OPTmonth = {}, + OPTnote = {}, + OPTannote = {} +} + diff --git a/doc/paper3/LLADD.tex b/doc/paper3/LLADD.tex index f62a642..9986d06 100644 --- a/doc/paper3/LLADD.tex +++ b/doc/paper3/LLADD.tex @@ -63,15 +63,14 @@ UC Berkeley %\subsection*{Abstract} -{\em There is an increasing need to manage data well in a wide variety of -systems, including robust support for atomic durable concurrent +{\em An increasing range of applications require robust support for atomic, durable and concurrent transactions. Databases provide the default solution, but force applications to interact via SQL and to forfeit control over data layout and access mechanisms. We argue there is a gap between DBMSs and file systems that limits designers of data-oriented applications. 
\yad is a storage framework that incorporates ideas from traditional -write-ahead-logging storage algorithms and file systems, -while providing applications with flexible control over data structures, layout, and performance vs. robustness tradeoffs. +write-ahead-logging storage algorithms and file systems. +It provides applications with flexible control over data structures and layout, and transactional performance and robustness properties. \yad enables the development of unforeseen variants on transactional storage by generalizing write-ahead-logging algorithms. Our partial implementation of these @@ -81,9 +80,9 @@ We evaluate the performance of a traditional transactional storage system based on \yad, and show that it performs favorably relative to existing systems. We present examples that make use of custom access methods, modifed buffer manager semantics, direct log file manipulation, and LSN-free -pages that facilitate zero-copy optimizations, and discuss the -composability of these extensions. Many of these optimizations are -easy to implement and more than double performance. +pages. These examples facilitate sophisticated performance +optimizations such as zero-copy I/O. These extensions are composable, +easy to implement and frequently more than double performance. } %We argue that our ability to support such a diverse range of @@ -109,7 +108,7 @@ easy to implement and more than double performance. \section{Introduction} -As our reliance on computing infrastructure has increased, a wider range of +As our reliance on computing infrastructure increases, a wider range of applications require robust data management. Traditionally, data management has been the province of database management systems (DBMSs), which are well-suited to enterprise applications, but lead to poor support for @@ -185,7 +184,7 @@ storage at a level of abstraction as close to the hardware as possible. 
The library can support special purpose, transactional storage interfaces in addition to ACID database-style interfaces to abstract data models. \yad incorporates techniques from databases -(e.g. write-ahead logging) and systems (e.g. zero-copy techniques). +(e.g. write-ahead-logging) and systems (e.g. zero-copy techniques). Our goal is to combine the flexibility and layering of low-level abstractions typical for systems work, with the complete semantics that exemplify the database field. @@ -209,8 +208,9 @@ delivers these properties as reusable building blocks for systems that implement complete transactions. Through examples and their good performance, we show how \yad{} -supports a wide range of uses that in the database gap, including -persistent objects, graph or XML apps, and recoverable +supports a wide range of uses that fall in the gap between +database and filesystem technologies, including +persistent objects, graph or XML based applications, and recoverable virtual memory~\cite{lrvm}. For example, on an object serialization workload, we provide up to @@ -222,21 +222,20 @@ We implemented this extension in 150 lines of C, including comments and boilerpl in mind when we wrote \yad. In fact, the idea came from a potential user that is not familiar with \yad. -An (early) open-source implementation of -the ideas presented here is available. - \eab{others? CVS, windows registry, berk DB, Grid FS?} \rcs{maybe in related work?} This paper begins by contrasting \yad's approach with that of conventional database and transactional storage systems. It proceeds -to discuss write ahead logging, and describe ways in which \yad can be -customized to implement many existing (and some new) write ahead -logging variants. Implementations of some of these variants are +to discuss write-ahead-logging, and describe ways in which \yad can be +customized to implement many existing (and some new) write-ahead-logging variants. 
Implementations of some of these variants are presented, and benchmarked against popular real-world systems. We conclude with a survey of the technologies the \yad implementation is based upon. +An (early) open-source implementation of +the ideas presented here is available. + \section{\yad is not a Database} \label{sec:notDB} Database research has a long history, including the development of @@ -354,7 +353,7 @@ applications presented in Section~\ref{sec:extensions} are efficiently supported by Berkeley DB. This is a result of Berkeley DB's assumptions regarding workloads and decisions regarding low level data representation. Thus, although Berkeley DB could be built on top of \yad, -Berkeley DB's data model, and write ahead logging system are both too specialized to support \yad. +Berkeley DB's data model, and write-ahead-logging system are too specialized to support \yad. \eab{for BDB, should we say that it still has a data model?} \rcs{ Does the last sentence above fix it?} @@ -429,7 +428,7 @@ to build a system that enables a wider range of data management options. Section~\ref{sec:notDB} described the ways in which a top-down data model limits the generality and flexibility of databases. In this section, we cover the basic bottom-up approach of \yad: {\em transactional -pages}. Although similar to the underlying write-ahead logging +pages}. Although similar to the underlying write-ahead-logging approaches of databases, particularly ARIES~\cite{aries}, \yads bottom-up approach yields unexpected flexibility. @@ -475,7 +474,7 @@ property. However, \yad takes customization of transactional semantics one step further, allowing applications to add support for transactional semantics that we have not anticipated. We do not believe that -we can anticipate every possible variation of write ahead logging. +we can anticipate every possible variation of write-ahead-logging. 
However, we have observed that most changes that we are interested in making involve a few common underlying primitives. @@ -484,7 +483,7 @@ As we have implemented new extensions, we have located portions of the system that are prone to change, and have extended the API accordingly. Our goal is to allow applications to implement their own modules to -replace our implementations of each of the major write ahead logging +replace our implementations of each of the major write-ahead-logging components. } @@ -492,10 +491,13 @@ components. \subsection{Single-page Transactions} In this section we show how to implement single-page transactions. -This is not at all novel, and is in fact based on ARIES~\cite{aries}, but it forms -important background. We also gloss over many important and -well-known optimizations that \yad exploits, such as group -commit~\cite{group-commit}. +This is not at all novel, and is in fact based on ARIES~\cite{aries}, +but it forms important background. We also gloss over many important +and well-known optimizations that \yad exploits, such as group +commit~\cite{group-commit}. These aspects of recovery algorithms are +described in the literature, and in any good textbook that describes +database implementations. They are not particularly important to the +discussion here, so we do not cover them. The trivial way to acheive single-page transactions is simply to apply all the updates to the page and then write it out on commit. The page @@ -511,8 +513,8 @@ transactions: we write (sequential) ``redo'' information to the log on commit, a then can write the pages later. If we crash, we can use the log to redo the lost updates during recovery. -For this to work, we need to be able to tell which updates to -re-apply, which is solved by using a per-page sequence number called a +For this to work, recovery must be able to decide which updates to +re-apply. This is solved by using a per-page sequence number called a {\em log sequence number}. 
Each log entry contains the sequence number, and each page contains the sequence number of the last applied update. Thus on recovery, we load a page, look at its sequence @@ -524,7 +526,7 @@ We also need to make sure that only the results of committed transactions still exist after recovery. This is best done by writing a commit record to the log during the commit. If the system pins uncommitted dirty pages in memory, recovery does not need to worry about undoing -any updates, and simply plays back the redo records from +any updates. Therefore recovery simply plays back unapplied redo records from transactions that have commit records. However, pinning the pages of active transactions in memory is problematic. @@ -550,7 +552,7 @@ redo log entry (with its LSN and argument) reaches the disk before commit. Similarly, an undo log entry, with its LSN and argument, always reaches the disk before a page is stolen. ARIES works essentially the same way, but hard-codes recommended page -formats and index structures.~\cite{ariesIM} +formats and index structures~\cite{ariesIM}. To manually abort a transaction, \yad could either reload the page from disk and roll it forward to reflect committed transactions (this would imply ``no steal''), or it @@ -559,7 +561,7 @@ order. (It currently does the latter.) \eat{ -Write ahead logging algorithms are quite simple if each operation +Write-ahead-logging algorithms are quite simple if each operation applied to the page file can be applied atomically. This section will describe a write ahead logging scheme that can transactionally update a single page of storage that is guaranteed to be written to disk @@ -580,7 +582,7 @@ each other. Normally, only calls to abort and recovery will invoke undo, so we will assume that transactions consist of repeated applications of the redo function. 
-Following the lead of ARIES (the write ahead logging system \yad +Following the lead of ARIES (the write-ahead-logging system \yad originally set out to implement), assume that the function is also passed a distinct, monotonically increasing number each time it is invoked, and that it records that number in an LSN (log sequence number) @@ -608,9 +610,7 @@ is also written to the log. This section very briefly described how a simplified write-ahead-logging algorithm might work, and glossed over many details. Like ARIES, \yad actually implements recovery in three -phases: Analysis, Redo and Undo. Because recovery algorithms are -desribed in the literature, and in an good database textbook, we -will not desribe them in further detail. +phases: Analysis, Redo and Undo. %Recovery is handled by playing the log forward, and only applying log %entries that are newer than the version of the page on disk. Once the @@ -638,8 +638,8 @@ is relatively easy. First, we need to ensure that all log entries have a transaction ID (XID) so that we can tell that updates to different pages are part of -the same transaction (we need this for multiple updates within a -single page too). Given single-page recovery, we can just apply it to +the same transaction (we need this in the single page case as well). + Given single-page recovery, we can just apply it to all of the pages touched by a transaction to recover a multi-page transaction. This works because steal and no-force already imply that pages can be written back early or late (respectively), so there @@ -648,12 +648,12 @@ need only ensure that redo entries for all pages reach the disk before the commit record (and before commit returns). \eat{ -\subsection{Write ahead logging invariants} +\subsection{Write-ahead-logging invariants} In order to support recovery, a write-ahead-logging algorithm must identify pages that {\em may} be written back to disk, and those that {\em must} be written back to disk. 
\yad provides full support for -Steal/no-Force write ahead logging, due to its generally favorable +Steal/no-Force write-ahead-logging, due to its generally favorable performance properties. ``Steal'' refers to the fact that pages may be written back to disk before a transaction completes. ``No-Force'' means that a transaction may commit before the pages it modified are @@ -694,8 +694,8 @@ structure, and then A aborted. When A rolls back, its UNDO entries will undo the rearrangment that it made to the data structure, without regard to B's modifications. This is likely to cause corruption. -Two common solutions to this problem are ``total isolation'' and -``nested top actions.'' Total isolation simply prevents any +Two common solutions to this problem are {\em total isolation} and +{\em nested top actions}. Total isolation simply prevents any transaction from accessing a data structure that has been modified by another in-progress transaction. An application can achieve this using its own concurrency control mechanisms, or by holding a lock on @@ -715,7 +715,7 @@ aborts. The key idea is to distinguish between the logical operations of a data structure, such as inserting a key, and the physical operations such as splitting tree nodes or or rebalancing a tree. The physical -operations do not need to undone if the containing logical operation +operations do not need to be undone if the containing logical operation (insert) aborts. Because nested top actions are easy to use and do not lead to @@ -751,7 +751,7 @@ As described above, and in all database implementations of which we are aware, transactional pages use LSNs on each page. This makes it difficult to map large objects onto multiple pages, as the LSNs break up the object. It is tempting to try to move the LSNs elsewhere, but -then they will not be written atomically with their page, which +then they would not be written atomically with their page, which defeats their purpose. 
LSNs were introduced to prevent recovery from applying updates more @@ -760,6 +760,7 @@ entries,\endnote{Idempotency does not guarantee that $f(g(x)) = f(g(f(g(x))))$. Therefore, idempotency does not guarantee that it is safe to assume that a page is older than it is.} \yad can eliminate the LSN on each page. + Consider purely physical logging operations that overwrite a fixed byte range on the page regardless of the page's initial state. We say that such operations perform ``blind writes.'' @@ -784,7 +785,7 @@ update some subset of the bits on the page. If the log entries do not update a bit, then its value was correct before recovery began, so it must be correct after recovery. Otherwise, we know that recovery will update the bit. Furthermore, after all redos, the bit's value will be the -value it contained at crash, so we know that undo will behave +last value it contained before the crash, so we know that undo will behave properly. We call such pages ``LSN-free'' pages. Although this technique is @@ -792,9 +793,9 @@ novel for databases, it resembles the mechanism used by RVM~\cite{rvm}; \yad generalizes the concept and allows it to co-exist with traditional pages. Furthermore, efficient recovery and log truncation require only minor modifications to our recovery -algorithm. In practice, this is implemented by providing a callback -for LSN free pages that allows the buffer manager to compute a -conservative estimate of the page's LSN whenever it is read from disk. +algorithm. In practice, this is implemented by providing a buffer manager callback +for LSN-free pages. The callback computes a +conservative estimate of the page's LSN whenever the page is read from disk. For a less conservative estimate, it suffices to write a page's LSN to the log shortly after the page itself is written out; on recovery the log entry is thus a conservative but close estimate. @@ -880,7 +881,7 @@ These issues are beyond the scope of this discussion. 
Section~\ref{logReorderin \subsection{Summary of Transactional Pages} This section provided an extremely brief overview of transactional -pages and write-ahead logging. Transactional pages are a valuable +pages and write-ahead-logging. Transactional pages are a valuable building block for a wide variety of data management systems, as we show in the next section. Nested top actions and LSN-free pages enable important optimizations. In particular, \yad allows general @@ -940,7 +941,7 @@ Optimizations to Berkeley DB that we performed included disabling the lock manager, though we still use ``Free Threaded'' handles for all tests. This yielded a significant increase in performance because it removed the possibility of transaction deadlock, abort, and -repetition. However, disabling the lock manager, caused highly +repetition. However, disabling the lock manager caused highly concurrent Berkeley DB benchmarks to become unstable, suggesting either a bug or misuse of the feature. @@ -1034,7 +1035,7 @@ straightforward. We then compare our simple, straightforward implementation to our hand-tuned version and Berkeley DB's implementation. The simple hash table uses nested top actions to atomically update its -internal structure. It is based on a {\em linear} hash function~\cite{lht}, allowing +internal structure. It uses a {\em linear} hash function~\cite{lht}, allowing it to incrementally grow its buffer list. It is based on a number of modular subcomponents. Notably, its bucket list is a growable array of fixed length entries (a linkset, in the terms of the physical @@ -1048,8 +1049,8 @@ hashtable is a popular, commonly deployed implementation, and serves as a baseline for our experiments. Both of our hashtables outperform Berkeley DB on a workload that -bulk loads the tables by repeatedly inserting (key, value) pairs, -although we do not wish to imply this is always the case. +bulk loads the tables by repeatedly inserting (key, value) pairs. 
+However, we do not wish to imply this is always the case. %We do not claim that our partial implementation of \yad %generally outperforms, or is a robust alternative %to Berkeley DB. Instead, this test shows that \yad is comparable to @@ -1071,7 +1072,7 @@ optimize key primitives. %forced to redesign and application to avoid sub-optimal properties of %the transactional data structure implementation. -Figure~\ref{fig:TPS} describes performance of the two systems under +Figure~\ref{fig:TPS} describes the performance of the two systems under highly concurrent workloads. For this test, we used the simple (unoptimized) hash table, since we are interested in the performance a clean, modular data structure that a typical system implementor would @@ -1108,7 +1109,7 @@ The effect of \yad object serialization optimizations under low and high memory \subsection{Object persistance} \label{sec:oasys} Numerous schemes are used for object serialization. Support for two -different styles of object serialization have been eimplemented in +different styles of object serialization have been implemented in \yad. We could have just as easily implemented a persistance mechanism for a statically typed functional programming language, a dynamically typed scripting language, or a particular application, @@ -1157,7 +1158,7 @@ entries, and wrote them all before committing. page file, increasing the working set of the program, and increasing disk activity. -Furthermore, because objects may be written to disk in an +Furthermore, objects may be written to disk in an order that differs from the order in which they were updated, violating one of the write-ahead-logging invariants. One way to deal with this is to maintain multiple LSN's per page. This means we would need to register a @@ -1166,7 +1167,6 @@ callback will be needed in Section~\ref{sec:zeroCopy}), and extend \yads page format to contain per-record LSN's. 
Also, we must prevent \yads storage allocation routine from overwriting the per-object LSN's of deleted objects that may still be addressed during abort or recovery. -\yad can support this approach. Alternatively, we could arrange for the object pool to cooperate further with the buffer pool by atomically updating the buffer @@ -1174,7 +1174,7 @@ manager's copy of all objects that share a given page, removing the need for multiple LSN's per page, and simplifying storage allocation. However, the simplest solution, and the one we take here, is based on the observation that -updates (not allocations or deletions) to fixed length objects are blind writes. +updates (not allocations or deletions) of fixed length objects are blind writes. This allows us to do away with per-object LSN's entirely. Allocation and deletion can then be handled as updates to normal LSN containing pages. At recovery time, object updates are executed based on the existence of the object on the page @@ -1486,7 +1486,7 @@ is a common pattern in system software design, and manages dependencies and ordering constraints between sets of components. Over time, we hope to shrink \yads core to the point where it is simply a resource manager and a set of implementations of a few unavoidable -algorithms related to write-ahead logging. For instance, +algorithms related to write-ahead-logging. For instance, we suspect that support for appropriaite callbacks will allow us to hardcode a generic recovery agorithm into the system. 
Similarly, and code that manages book-keeping information, such as diff --git a/doc/paper3/figs/bulk-load.pdf b/doc/paper3/figs/bulk-load.pdf index 23c42c4..315cef0 100644 Binary files a/doc/paper3/figs/bulk-load.pdf and b/doc/paper3/figs/bulk-load.pdf differ diff --git a/doc/paper3/figs/mem-pressure.pdf b/doc/paper3/figs/mem-pressure.pdf index 46603bb..9ace61f 100644 Binary files a/doc/paper3/figs/mem-pressure.pdf and b/doc/paper3/figs/mem-pressure.pdf differ diff --git a/doc/paper3/figs/object-diff.pdf b/doc/paper3/figs/object-diff.pdf index eaca04c..e441a7d 100644 Binary files a/doc/paper3/figs/object-diff.pdf and b/doc/paper3/figs/object-diff.pdf differ diff --git a/doc/paper3/figs/oo7.pdf b/doc/paper3/figs/oo7.pdf index 65d421f..d52786f 100644 Binary files a/doc/paper3/figs/oo7.pdf and b/doc/paper3/figs/oo7.pdf differ diff --git a/doc/paper3/figs/tps-extended.pdf b/doc/paper3/figs/tps-extended.pdf index a303c6a..cfad2de 100644 Binary files a/doc/paper3/figs/tps-extended.pdf and b/doc/paper3/figs/tps-extended.pdf differ