*** empty log message ***

This commit is contained in:
Sears Russell 2004-10-22 04:57:25 +00:00
parent 3d2a36c417
commit bca199968b
12 changed files with 2748 additions and 0 deletions

View file

@ -0,0 +1,121 @@
#FIG 3.2 Produced by xfig version 3.2.5-alpha5
Landscape
Center
Metric
A4
100.00
Single
-2
1200 2
0 32 #c6b797
0 33 #eff8ff
0 34 #dccba6
0 35 #404040
0 36 #808080
0 37 #c0c0c0
0 38 #e0e0e0
0 39 #8e8f8e
0 40 #aaaaaa
0 41 #555555
0 42 #868286
0 43 #c7c3c7
0 44 #e7e3e7
0 45 #8e8e8e
0 46 #444444
0 47 #868686
0 48 #565151
0 49 #effbff
0 50 #666666
0 51 #e2e2ee
0 52 #94949a
0 53 #dbdbdb
0 54 #a1a1b7
0 55 #9c0000
0 56 #ededed
0 57 #86acff
0 58 #7070ff
0 59 #bebebe
0 60 #515151
0 61 #000049
0 62 #797979
0 63 #303430
0 64 #c7b696
0 65 #d7d7d7
0 66 #aeaeae
0 67 #85807d
0 68 #d2d2d2
0 69 #3a3a3a
0 70 #4573aa
0 71 #000000
0 72 #c7c7c7
0 73 #e7e7e7
0 74 #f7f7f7
0 75 #d6d7d6
0 76 #7b79a5
0 77 #9e9e9e
0 78 #717571
0 79 #73758c
0 80 #414141
0 81 #414541
0 82 #635dce
2 2 0 1 0 7 50 -1 20 0.000 0 0 7 0 0 5
3690 5085 5985 5085 5985 5985 3690 5985 3690 5085
2 1 0 2 7 7 50 -1 20 0.000 0 0 7 0 0 2
3195 6255 4320 5985
2 1 0 2 7 7 50 -1 20 0.000 0 0 7 0 0 2
4320 6255 4725 5985
2 1 0 2 7 7 50 -1 20 0.000 0 0 7 0 0 2
4995 5985 5445 6255
2 1 0 2 7 7 50 -1 20 0.000 0 0 7 0 0 2
5445 5985 6570 6255
2 2 0 1 0 0 90 -1 20 0.000 0 0 7 0 0 5
2385 5760 7290 5760 7290 6840 2385 6840 2385 5760
2 1 0 1 0 7 50 -1 20 0.000 0 0 7 0 0 2
5985 5400 6480 5400
2 1 0 1 0 7 50 -1 20 0.000 0 0 7 0 0 2
3195 5400 3690 5400
2 2 0 1 0 7 50 -1 20 0.000 0 0 7 0 0 5
3690 5085 5985 5085 5985 5985 3690 5985 3690 5085
2 1 0 1 0 7 90 -1 20 0.000 0 0 -1 0 0 2
3375 4680 4365 5085
2 1 0 1 0 7 90 -1 20 0.000 0 0 -1 0 0 2
6345 4680 5400 5085
2 2 0 1 0 7 90 -1 -1 0.000 0 0 -1 0 0 5
1980 3735 7875 3735 7875 7020 1980 7020 1980 3735
2 1 0 3 0 7 90 -1 -1 0.000 0 0 7 0 0 2
4860 3015 4860 3735
2 2 0 1 0 7 90 -1 -1 0.000 0 0 -1 0 0 5
3915 2205 5895 2205 5895 3015 3915 3015 3915 2205
2 1 0 1 0 7 90 -1 -1 0.000 0 0 -1 0 0 2
4860 5130 4860 4410
2 2 0 1 0 7 50 -1 20 0.000 0 0 7 0 0 5
3960 3915 5760 3915 5760 4410 3960 4410 3960 3915
2 2 0 1 0 7 50 -1 20 0.000 0 0 7 0 0 5
2610 4230 3825 4230 3825 4680 2610 4680 2610 4230
2 2 0 1 0 7 50 -1 20 0.000 0 0 7 0 0 5
2250 5130 3195 5130 3195 5580 2250 5580 2250 5130
2 2 0 1 0 7 50 -1 20 0.000 0 0 7 0 0 5
5895 4230 7290 4230 7290 4680 5895 4680 5895 4230
2 2 0 1 0 7 50 -1 20 0.000 0 0 7 0 0 5
6480 5130 7425 5130 7425 5580 6480 5580 6480 5130
2 2 0 1 0 7 50 -1 20 0.000 0 0 7 0 0 5
6075 6255 7020 6255 7020 6705 6075 6705 6075 6255
2 2 0 1 0 7 50 -1 20 0.000 0 0 7 0 0 5
4950 6255 5895 6255 5895 6705 4950 6705 4950 6255
2 2 0 1 0 7 50 -1 20 0.000 0 0 7 0 0 5
3870 6255 4815 6255 4815 6705 3870 6705 3870 6255
2 2 0 1 0 7 50 -1 20 0.000 0 0 7 0 0 5
2745 6255 3690 6255 3690 6705 2745 6705 2745 6255
4 0 0 90 -1 0 14 0.0000 0 255 1905 4140 2700 Application code\001
4 0 0 50 -1 0 14 0.0000 0 255 2040 4095 4230 Abstraction Layer\001
4 0 0 50 -1 0 14 0.0000 0 255 1275 2745 4545 Replication\001
4 0 0 50 -1 0 14 0.0000 0 195 855 2385 5445 Indexes\001
4 0 0 50 -1 0 14 0.0000 0 255 900 6615 5445 Locking\001
4 0 0 50 -1 0 14 0.0000 0 255 420 6390 6570 Log\001
4 0 0 50 -1 0 14 0.0000 0 255 1035 4995 6570 Recovery\001
# Abort()
4 0 0 50 -1 0 14 0.0000 0 195 975 3960 6570 Rollback\001
# Buffers
4 0 0 50 -1 0 14 0.0000 0 195 825 2880 6570 Buffers\001
4 0 0 50 -1 0 14 0.0000 0 195 2610 3825 5580 Transactional Interface\001
4 0 0 50 -1 0 14 0.0000 0 195 1365 6030 4545 Distribution\001

Binary file not shown.

1235
doc/paper/INSERT.eps Normal file

File diff suppressed because it is too large Load diff

123
doc/paper/LLADD-Arch2.fig Normal file
View file

@ -0,0 +1,123 @@
#FIG 3.2 Produced by xfig version 3.2.5-alpha5
Landscape
Center
Metric
Letter
100.00
Single
-2
1200 2
0 32 #ffffff
0 33 #f2f2f2
0 34 #d9d9d9
6 1011 1244 1581 1379
6 1011 1244 1581 1379
4 0 0 50 -1 0 8 0.0000 0 135 570 1011 1349 Recovery\001
-6
-6
6 1378 810 1918 990
6 1378 810 1918 990
4 0 0 50 -1 0 8 0.0000 0 135 525 1378 945 Page I/O\001
-6
-6
6 1396 2794 2198 3067
6 1463 2868 2198 2973
4 0 0 50 -1 0 8 0.0000 0 105 735 1463 2973 Linear Hash\001
-6
2 2 0 1 0 7 52 -1 20 0.000 0 0 -1 0 0 5
1396 2794 2101 2794 2101 3067 1396 3067 1396 2794
-6
6 477 2806 1017 3076
4 0 0 50 -1 0 8 0.0000 0 135 465 536 2984 Prepare\001
-6
6 442 2794 1027 3067
2 2 0 1 0 7 52 -1 20 0.000 0 0 -1 0 0 5
442 2794 1027 2794 1027 3067 442 3067 442 2794
-6
6 1617 2467 2631 2740
6 1617 2467 2631 2740
6 1617 2467 2631 2740
2 2 0 1 0 7 52 -1 20 0.000 0 0 -1 0 0 5
1617 2467 2631 2467 2631 2740 1617 2740 1617 2467
-6
-6
-6
6 1141 2150 1958 2423
6 1141 2150 1875 2423
6 1141 2150 1875 2423
2 2 0 1 0 7 52 -1 20 0.000 0 0 -1 0 0 5
1141 2150 1875 2150 1875 2423 1141 2423 1141 2150
-6
-6
4 0 0 50 -1 0 8 0.0000 0 135 750 1208 2329 Page Layout\001
-6
6 2003 2155 2492 2426
6 2003 2155 2458 2426
6 2003 2155 2458 2426
6 2003 2155 2458 2426
2 2 0 1 0 7 52 -1 20 0.000 0 0 -1 0 0 5
2003 2155 2458 2155 2458 2426 2003 2426 2003 2155
-6
-6
-6
4 0 0 50 -1 0 8 0.0000 0 105 435 2057 2340 Record\001
-6
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
810 990 810 1620
2 1 0 1 0 7 70 -1 -1 0.000 0 0 -1 0 0 2
2430 990 2430 1260
2 1 0 1 0 7 92 -1 -1 0.000 0 0 -1 0 0 2
1260 1440 1260 1620
2 1 0 0 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
810 990 810 1620
2 2 0 1 0 7 61 -1 20 0.000 0 0 -1 0 0 5
944 1170 1529 1170 1529 1443 944 1443 944 1170
2 2 0 1 0 7 50 -1 20 0.000 0 0 -1 0 0 5
560 762 1037 762 1037 1035 560 1035 560 762
2 2 0 1 0 7 50 -1 20 0.000 0 0 -1 0 0 5
2252 767 2972 767 2972 1040 2252 1040 2252 767
2 2 0 1 0 7 51 -1 20 0.000 0 0 -1 0 0 5
1710 1167 2675 1167 2675 1440 1710 1440 1710 1167
2 2 0 1 0 7 51 -1 20 0.000 0 0 -1 0 0 5
1288 762 2008 762 2008 1035 1288 1035 1288 762
2 1 0 1 0 7 57 -1 -1 0.000 0 0 -1 0 0 2
1890 990 1890 1170
2 1 0 0 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
810 990 810 1620
2 1 0 1 0 7 200 -1 -1 0.000 0 0 -1 0 0 3
986 1313 536 1313 536 2840
2 1 0 1 0 7 75 -1 -1 0.000 0 0 -1 0 0 3
855 1845 855 2855 855 2745
2 1 0 1 0 7 75 -1 -1 0.000 0 0 -1 0 0 2
1260 1845 1260 2160
2 1 0 1 0 7 75 -1 -1 0.000 0 0 -1 0 0 2
1750 2383 1750 2531
2 1 0 1 0 7 75 -1 -1 0.000 0 0 -1 0 0 2
1504 2383 1504 2835
2 1 0 1 0 7 75 -1 -1 0.000 0 0 -1 0 0 2
2213 2363 2222 2506
2 1 0 1 0 7 75 -1 -1 0.000 0 0 -1 0 0 2
2006 2265 1873 2265
2 2 0 1 0 7 74 -1 -1 0.000 0 0 -1 0 0 5
180 1893 3375 1893 3375 2912 180 2912 180 1893
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
180 464 3375 464 3375 1699 180 1699 180 464
2 2 0 1 0 7 44 -1 20 0.000 0 0 -1 0 0 5
709 1578 1378 1578 1378 1851 709 1851 709 1578
2 1 0 1 0 7 75 -1 -1 0.000 0 0 -1 0 0 3
1377 1641 1882 1641 1882 1417
2 1 0 1 0 7 75 -1 -1 0.000 0 0 -1 0 0 2
1508 1291 1774 1291
2 1 0 1 0 7 75 -1 -1 0.000 0 0 -1 0 0 2
2209 2207 2209 1365
2 1 0 1 0 7 75 -1 -1 0.000 0 0 -1 0 0 2
1905 2721 1905 2851
2 1 0 1 0 7 75 -1 -1 0.000 0 0 -1 0 0 2
975 1001 975 1197
4 0 0 50 -1 0 8 0.0000 0 135 975 1800 1350 Buffer Manager\001
4 0 0 50 -1 0 8 0.0000 0 135 435 627 941 Logger\001
4 0 0 50 -1 0 8 0.0000 0 135 705 2342 950 LRU-2S(?)\001
4 0 0 50 -1 0 8 0.0000 0 135 1125 1684 2646 Expandable Array\001
4 0 0 50 -1 0 10 0.0000 0 180 840 1366 2058 Operations\001
4 0 0 50 -1 0 10 0.0000 0 135 1110 1305 645 LLADD `Core'\001
4 0 0 38 -1 0 8 0.0000 0 135 675 780 1759 Operations\001

BIN
doc/paper/LLADD-Arch2.pdf Normal file

Binary file not shown.

BIN
doc/paper/LLADD-Freenix.pdf Normal file

Binary file not shown.

924
doc/paper/LLADD-Freenix.tex Normal file
View file

@ -0,0 +1,924 @@
%% LyX 1.3 created this file. For more info, see http://www.lyx.org/.
%% Do not edit unless you really know what you are doing.
%\documentclass[letterpaper,twocolumn,english]{article}
%\usepackage[T1]{fontenc}
%\usepackage[latin1]{inputenc}
%\usepackage{graphicx}
%\makeatletter
\documentclass[letterpaper,twocolumn,english]{article}
\usepackage[latin1]{inputenc}
\usepackage{graphicx}
\usepackage{usenix,epsfig}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% LyX specific LaTeX commands.
%% Bold symbol macro for standard LaTeX users
%\newcommand{\boldsymbol}[1]{\mbox{\boldmath $#1$}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% User specified LaTeX commands.
\usepackage[T1]{fontenc}
\usepackage{ae,aecompl}
%\usepackage{babel}
%\makeatother
\begin{document}
\date{}
\title{\Large \bf LLADD: Extensible Transactional Storage FIXME}
\author{
Your N.\ Here \\
%{\em Your Department} \\
{\em Your Institution}\\
%{\em Your City, State, ZIP}\\
% is there a standard format for email/URLs??
% remember that ~ doesn't do what you expect, use \~{}.
{\normalsize yourname@host.site.dom, http://host.site.dom/yoururl} \\
%
% copy the following lines to add more authors
\smallskip
Name Two Here \\
{\em Two's Institution}\\
%% is there a standard format for email/URLs??
{\normalsize two@host.site.dom, http://host.site.dom/twourl}
%
} % end author
\maketitle
\thispagestyle{empty}
\subsection*{Abstract}
While many systems provide transactionally consistent data management,
existing implementations are generally monolithic and tied to a higher
level system, limiting the scope of their usefulness to a single application,
or a specific type of problem. As a result, many systems are forced
to 'work-around' the data models provided by a transactional storage
layer. Manifestations of this problem include 'impedance mismatch'
in the database world and the limited number of data models provided
by existing libraries such as BerkeleyDB. In this paper, we describe
a lightweight, easily extensible library, LLADD, that allows application
developers to develop scalable and transactional application-specific
data structures. We demonstrate that LLADD is simpler than prior systems
and is extremely flexible while performing favorably in a number of
micro-benchmarks. We also describe, in simple and concrete terms,
the issues inherent in the design and implementation of robust, scalable
transactional data structures. In addition to the source code, we
have also made a comprehensive suite of unit-tests, API documentation,
and debugging mechanisms publicly available.%
\footnote{http://lladd.sourceforge.net/%
}
\section{Introduction}
Changes in data models, consistency requirements, system scalability,
communication models and fault models require changes to the storage
and recovery subsystems of modern applications. Such changes require
increased flexibility at the data durability and isolation layer.
We refer to the functionality provided by this layer as \emph{transactional
pages,} and in this paper deal with crash recovery, application level
support for transaction abort and commit, and basic latching for multithreaded
applications. We leave transaction-level consistency to a higher level
library.
Many implementations of transactional pages exist in industry and
in the literature. Unfortunately, these algorithms tend either to
be straightforward and unsuitable for real-world deployment, or are
robust and scalable, but achieve these properties by relying upon
intricate sets of internal (and often implicit) interactions. The
ARIES algorithm falls into the second category, has been extremely
successful, and is used by many real-world applications. It provides
performance and reliability that is comparable to that of current
commercial and open-source products. Unfortunately, while the algorithm
is conceptually simple, many subtleties arise in its implementation.
We chose ARIES as the basis of LLADD, and have made a significant
effort to document these interactions. While a complete discussion
of the ARIES algorithm is beyond the scope of this paper, we will
provide a brief overview, and explain the details that are relevant
to developers that wish to extend LLADD.
By documenting the interface between ARIES and higher-level primitives
such as data structures, and by structuring LLADD to make this interface
explicit in both the library and its extensions, we hope to make it
easy to produce correct and efficient durable data structures. In
existing systems (and indeed, in earlier versions of LLADD), the implementation
of such structures is extremely complicated, and subject to the introduction
of incredibly subtle errors that would only be evident during crash
recovery or at other inconvenient times.
Finally, by approaching this problem by implementing a number of simple
modules that {}``do one thing and do it well'', we believe that
LLADD can provide superior performance while making future improvements
to its core implementation significantly easier. In order to achieve
this goal, LLADD has been split into a number of modules forming a
'core library', and a number of extensions called 'operations' that
build upon the core library. Since each of these modules exports a
stable interface, they can be independently improved.
\subsection{Prior Work\label{sub:Prior-Work}}
An extensive amount of prior work covers the algorithms presented in
this paper. Most fundamentally, systems that provide transactional
consistency to their users generally include a number of common
modules. A high-level overview of a typical system is given in Figure
\ref{cap:DB-Architecture}.
\begin{figure}
\includegraphics[%
width=1.0\columnwidth]{DB-Architecture.pdf}
\caption{\label{cap:DB-Architecture}Conceptual view of a modern
transactional application. Current systems include high level
functionality, such as indices and locking, but are not designed to
allow developers to replace this functionality with application
specific modules.}
\end{figure}
Many applications make use of transactional storage, and each is
designed for a specific application, or set of applications. LLADD
provides a flexible substrate that allows such applications to be
developed. The complexity of existing systems varies widely, as do
the applications for which these systems are designed.
On the database side of things, relational databases excel in areas
where performance is important, but where the consistency and
durability of the data is crucial. Often, databases significantly
outlive the software that uses them, and must be able to cope with
changes in business practices, system architectures, etc.
Object-oriented databases are more focused on facilitating the
development of complex applications that require reliable storage, but
may take advantage of less-flexible, but more efficient data models,
as they often only interact with a single application, or a handful of
variants of that application.
Databases are designed for circumstances where development time may
dominate cost, many users must share access to the same data, and
where security, scalability, and a host of other concerns are
important. In many, if not most circumstances, these issues are less
important, or even irrelevant. Therefore, applying a database in
these situations is likely overkill, which may partially explain the
popularity of MySQL, which allows some of these constraints to be
relaxed at the discretion of a developer or end user.
Still, there are many applications where MySQL is still too
inflexible. In order to serve these applications, a host of software
solutions have been devised. Some are extremely complex, such as
semantic file systems, where the file system understands the contents
of the files that it contains, and is able to provide services such as
rapid search, or file-type specific operations such as thumbnailing,
automatic content updates, and so on. Others are simpler, such as
BerkeleyDB, which provides transactional storage of data in unindexed
form, in indexed form using a hash table, or a tree. LRVM is a version
of malloc() that provides transactional memory; it is similar to an
object oriented database, but is much lighter weight, and more
flexible.
Finally, some applications require incredibly simple, but extremely
scalable storage mechanisms. Cluster Hash Tables are a good example
of the type of system that serves these applications well, due to
their relative simplicity, and extremely good scalability
characteristics. Depending on the fault model a cluster hash table is
implemented on top of, it is also quite plausible that key portions of
the transactional mechanism, such as forcing log entries to disk, will
be replaced with other durability schemes, such as in-memory
replication across many nodes, or multiplexing log entries across
multiple systems. This level of flexibility would be difficult to
retrofit into existing transactional applications, but is appropriate
in many environments.
We have only provided a small sampling of the many applications that
make use of transactional storage. Unfortunately, it is extremely
difficult to implement a correct, efficient and scalable transactional
data store, and we know of no library that provides low level access
to the primitives of such a durability algorithm. These algorithms
have a reputation of being complex, with many intricate interactions,
which prevent them from being implemented in a modular, easily
understandable, and extensible way. In addition to describing such an
implementation of ARIES, a popular and well-tested
'industrial-strength' algorithm for transactional storage, this paper
will outline the most important interactions that we discovered (that
is, the ones that could not be encapsulated within our
implementation), and give the reader a sense of how to use the
primitives the library provides.
%Many plausible lock managers, can do any one you want.
%too much implemented part of DB; need more 'flexible' substrate.
\section{ARIES from an Operation's Perspective}
Instead of providing a comprehensive discussion of ARIES, we will
focus upon those features of the algorithm that are most relevant
to a developer attempting to add a new set of operations. Correctly
implementing such extensions is complicated by concerns regarding
concurrency, recovery, and the possibility that any operation may
be rolled back at runtime.
We first sketch the constraints placed upon operation implementations,
and then describe the properties of our implementation of ARIES that
make these constraints necessary. Because comprehensive discussions
of write ahead logging protocols and ARIES are available elsewhere,
(Section \ref{sub:Prior-Work}) we only discuss those details relevant
to the implementation of new operations in LLADD.
\subsection{Properties of an Operation\label{sub:OperationProperties}}
A LLADD operation consists of some code that performs some action
on the developer's behalf. These operations implement the actions
that are composed into transactions. Since transactions may be aborted,
the effects of an operation must be reversible. Furthermore, aborting
and committing transactions may be interleaved, and LLADD does not
allow cascading abort,%
\footnote{That is, by aborting, one transaction may not cause other transactions
to abort. To understand why operation implementors must worry about
this, imagine that transaction A split a node in a tree, transaction
B added some data to the node that A just created, and then A aborted.
When A was undone, what would become of the data that B inserted?%
} so in order to implement an operation, we must implement some sort
of locking, or other concurrency mechanism that protects transactions
from each other. LLADD only provides physical consistency; we leave
it to the application to decide what sort of transaction isolation is appropriate.
Therefore, data dependencies between transactions are allowed, but
we still must ensure the physical consistency of our data structures.
Also, all actions performed by a transaction that committed must be
restored in the case of a crash, and all actions performed by aborting
transactions must be undone. In order for LLADD to arrange for this
to happen at recovery, operations must produce log entries that contain
all information necessary for undo and redo.
Finally, each page contains some metadata needed for recovery. This
must be updated appropriately.
\subsection{Normal Processing}
\subsubsection{The buffer manager}
LLADD manages memory on behalf of the application and prevents pages
from being stolen prematurely. While LLADD uses the STEAL policy and
may write buffer pages to disk before transaction commit, it still
must make sure that the redo and undo log entries have been forced
to disk before the page is written to disk. Therefore, operations
must inform the buffer manager when they write to a page, and update
the log sequence number of the page. This is handled automatically
by many of the write methods provided to operation implementors (such
as writeRecord()), but the low-level page manipulation calls (which
allow byte level page manipulation) leave it to their callers to update
the page metadata appropriately.
\subsubsection{Log entries and forward operation (the Tupdate() function)\label{sub:Tupdate}}
In order to handle crashes correctly, and in order to the undo the
effects of aborted transactions, LLADD provides operation implementors
with a mechanism to log undo and redo information for their actions.
This takes the form of the log entry interface, which works as follows.
Operations consist of a wrapper function that performs some pre-calculations
and perhaps acquires latches. The wrapper function then passes a log
entry to LLADD. LLADD passes this entry to the logger, and then processes
it as though it were redoing the action during recovery, calling a function
that the operation implementor registered with
LLADD. When the function returns, control is passed back to the wrapper
function, which performs any post processing (such as generating return
values), and releases any latches that it acquired. %
\begin{figure}
~~~~~~~~\includegraphics[%
width=0.70\columnwidth]{TSetCall.pdf}
\caption{Runtime behavior of a simple operation. Tset() and do\_set() are
implemented as extensions, while Tupdate() is built in. New operations
need not be aware of the complexities of LLADD.}
\end{figure}
This way, the operation's behavior during recovery's redo phase (an
uncommon case) will be identical to the behavior during normal processing,
making it easier to spot bugs. Similarly, undo and redo operations take
an identical set of parameters, and undo during recovery is the same
as undo during normal processing. This makes recovery bugs more obvious and allows redo
functions to be reused to implement undo.
Although any latches acquired by the wrapper function will not be
reacquired during recovery, the redo phase of the recovery process
is single threaded. Since latches acquired by the wrapper function
are held while the log entry and page are updated, the ordering of
the log entries and page updates associated with a particular latch
must be consistent. However, some care must be taken to ensure proper
undo behavior.
\subsubsection{Concurrency and Aborted Transactions}
Section \ref{sub:OperationProperties} states that LLADD does not
allow cascading aborts, implying that operation implementors must
protect transactions from any structural changes made to data structures
by uncommitted transactions, but LLADD does not provide any mechanisms
designed for long term locking. However, one of LLADD's goals is to
make it easy to implement custom data structures for use within safe,
multi-threaded transactions. Clearly, an additional mechanism is needed.
The solution is to allow portions of an operation to 'commit' before
the operation returns.%
\footnote{We considered the use of nested top actions, which LLADD could easily
support. However, we currently use the slightly simpler (and lighter-weight)
mechanism described here. If the need arises, we will add support
for nested top actions.%
} An operation's wrapper is just a normal function, and therefore may
generate multiple log entries. First, it writes an undo-only entry
to the log. This entry will cause the \emph{logical} inverse of the
current operation to be performed at recovery or abort, must be idempotent,
and must fail gracefully if applied to a version of the database that
does not contain the results of the current operation. Also, it must
behave correctly even if an arbitrary number of intervening operations
are performed on the data structure.
The remaining log entries are redo-only, and may perform structural
modifications to the data structure. They should not make any assumptions
about the consistency of the current version of the database. Finally,
any prefix of the sequence of the redo-only operations performed by
this operation must leave the database in a consistent state. The
$B^{LINK}$ tree {[}...{]} is an example of a B-Tree implementation
that behaves in this way, as is the linear hash table implementation
discussed in Section \ref{sub:Linear-Hash-Table}.
Some of the logging constraints introduced in this section may seem
strange at this point, but are motivated by the recovery process.
\subsection{Recovery}
\subsubsection{ANALYSIS / REDO / UNDO}
Recovery in ARIES consists of three stages: analysis, redo, and undo.
The first, analysis, is
partially implemented by LLADD, but will not be discussed in this
paper. The second, redo, ensures that each redo entry in the log
will have been applied to each page in the page file exactly once.
The third phase, undo, rolls back any transactions that were active
when the crash occurred, as though the application manually aborted
them with the {}``abort()'' call.
After the analysis phase, the on-disk version of the page file
is in the same state it was in when LLADD crashed. This means that
some subset of the page updates performed during normal operation
have made it to disk, and that the log contains full redo and undo
information for the version of each page present in the page file.%
\footnote{Although this discussion assumes that the entire log is present, the
ARIES algorithm supports log truncation, which allows us to discard
old portions of the log, bounding its size on disk.%
} However, we make no further assumptions regarding the order in which
pages were propagated to disk. Therefore, redo must assume that any
data structures, lookup tables, etc. that span more than a single
page are in an inconsistent state. Therefore, as the redo phase re-applies
the information in the log to the page file, it must address all pages directly.
Therefore, the redo information for each operation in the log
must contain the physical address (page number) of the information
that it modifies, and the portion of the operation executed by a single
log entry must only rely upon the contents of the page that the log
entry refers to. Since we assume that pages are propagated to disk
atomically, the REDO phase may rely upon information contained within
a single page.
Once redo completes, some prefix of the runtime log that contains
complete entries for all committed transactions has been applied
to the database. Therefore, we know that the page file is in
a physically consistent state (although it contains portions of the
results of uncommitted transactions). The final stage of recovery is
the undo phase, which simply aborts all uncommitted transactions. Since
the page file is physically consistent, the transactions are aborted
exactly as they would be during normal operation.
\subsubsection{Physical, Logical and Physiological Logging.}
The above discussion avoided the use of some terminology that is common
in the database literature and which should be presented here. {}``Physical
logging'' is the practice of logging physical (byte level) updates
and the physical (page number) addresses that they are applied to.
It is subtly different than {}``physiological logging,'' which is
what LLADD recommends for its redo records. In physiological logging,
the physical (page number) address is stored, but the byte offset
and the actual difference are stored implicitly in the parameters
of some function. When the parameters are applied to the function,
it will update the page in a way that preserves application semantics.
This allows for some convenient optimizations. For example, data within
a single page can be re-arranged at runtime to produce contiguous
regions of free space, or the parameters passed to the function may
be significantly smaller than the physical change made to the page.
{}``Logical logging'' can only be used for undo entries in LLADD,
and is identical to physiological logging, except that it stores a
logical address (the key of a hash table, for instance) instead of
a physical address. This allows the location of data in the page file
to change, even if outstanding transactions may have to roll back
changes made to that data. Clearly, for LLADD to be able to apply
logical log entries, the page file must be physically consistent,
ruling out use of logical logging for redo operations.
LLADD supports all three types of logging, and allows developers to
register new operations, which is the key to its extensibility. After
discussing LLADD's architecture, we will revisit this topic with a
concrete example.
\subsection{Summary}
This section presented a relatively simple set of rules and patterns
that a developer must follow in order to implement a durable, transactional
and highly-concurrent data structure using LLADD:
\begin{itemize}
\item Pages should only be updated inside of a redo or undo function.
\item An update to a page should update the LSN.
\item If the data read by the wrapper function must match the state of
the page that the redo function sees, then the wrapper should latch
the relevant data.
\item Redo operations should address pages by their physical offset,
while Undo operations should use a more permanent address (such as
index key) if the data may move between pages over time.
\item An undo operation must correctly update a data structure if any
prefix of its corresponding redo operations are applied to the
structure, and if any number of intervening operations are applied to
the structure.
\end{itemize}
Because undo and redo operations during normal operation and recovery
are similar, most bugs will be found with conventional testing
strategies. It is difficult to verify the final property, although a
number of tools could be written to simulate various crash scenarios,
and check the behavior of operations under these scenarios.
Note that the ARIES algorithm is extremely complex, and we have left
out most of the details needed to implement it correctly.\footnote{The original ARIES paper was around 70 pages, and the ARIES/IM paper, which covered index implementation is roughly the same length}
Yet, we believe we have covered everything that a programmer needs to know in order to implement new data structures using the basic functionality that ARIES provides. This was possible due to the encapsulation
of the ARIES algorithm inside of LLADD, which is the feature that
most strongly differentiates LLADD from other, similar libraries.
We hope that this will increase the availability of transactional
data primitives to application developers.
\section{LLADD Architecture}
%
\begin{figure}
\includegraphics[%
width=1.0\columnwidth]{LLADD-Arch2.pdf}
\caption{\label{cap:LLADD-Architecture}Simplified LLADD Architecture: The
core of the library places as few restrictions on the application's
data layout as possible. Custom {}``operations'' implement the client's
desired data layout. The separation of these two sets of modules makes
it easy to improve and customize LLADD.}
\end{figure}
LLADD is a toolkit for building ARIES style transaction managers.
It provides user defined redo and undo behavior, and has an extendible
logging system with ... types of log entries so far. Most of these
extensions deal with data layout or modification, but some deal with
other aspects of LLADD, such as extensions to recovery semantics (Section
\ref{sub:Two-Phase-Commit}). LLADD comes with some default page layout
schemes, but allows its users to redefine this layout as is appropriate.
Currently LLADD imposes two requirements on page layouts. The first
32 bits must contain a log sequence number for recovery purposes,
and the second 32 bits must contain the page type.
While it ships with basic operations that support variable length
records, hash tables and other common data types, our goal is to
decouple all decisions regarding data format from the implementation
of the logging and recovery systems. Therefore, the preceding section
is essentially documentation for potential users of the library, while
the purpose of the performance numbers in our evaluation section are
not to validate our hash table, but to show that the underlying architecture
is able to efficiently support interesting data structures.
Despite the complexity of the interactions between its modules, the
ARIES algorithm itself is quite simple. Therefore, in order to keep
LLADD simple, we started with a set of modules, and iteratively refined
the boundaries between these modules. A summary of the result is presented
in Figure \ref{cap:LLADD-Architecture}. The core of the LLADD library
is quite small at ... lines of code, and has been documented extensively.
We hope that we have exposed most of the subtle interactions between
internal modules in the online documentation. {[}... doxygen ...{]}
As LLADD has evolved, many of its sub-systems have been incrementally
improved, and we believe that the current set of modules is amenable
to the addition of new functionality. For instance, the logging module
interface encapsulates all of the details regarding its on disk format,
which would make it straightforward to implement more exotic logging
techniques such as using log shipping to maintain a 'warm replica'
for failover purposes, or the use of log replication to avoid physical
disk access at commit time. Similarly, the interface encodes the dependencies
between the logger and other subsystems, so, for instance, the requirements
that the buffer manager places on the logger would be obvious to someone
that attempted to alter the logging functionality.%
\footnote{The buffer manager must ensure that the logger has forced the appropriate
log entries to disk before writing a dirty page to disk. Otherwise,
it would be impossible to undo the changes that had been made to the
page.%
}
The buffer manager itself is another potential area for extension.
Because the interface between the buffer manager and LLADD is simple,
we would like to support transactional access to resources beyond
simple page files. Some examples include transactional updates of
multiple files on disk, transactional groups of program executions
or network requests, or even leveraging some of the advances being
made in the Linux and other modern operating system kernels. For example,
ReiserFS recently added support for atomic file system operations.
It is possible that this could be used to provide variable sized pages
to LLADD.
From the testing point of view, the advantage of LLADD's division
into subsystems with simple interfaces is obvious. We are able to
use standard unit-testing techniques to test each of LLADD's subsystems
independently, and have documented both external and internal interfaces,
making it easy to add new tests and debug old ones. Furthermore, by
adding a 'simulate crash' operation to a few of the key components,
we can simulate application level crashes by clearing LLADD's internal
state, re-initializing the library and verifying that recovery was
successful. These tests currently cover approximately 90\% of the
code. We have not yet developed a mechanism that will allow us to
accurately model hardware failures, which is an area where further
work is needed.
LLADD's performance requirements vary wildly depending on the workload
with which it is presented. Its performance on a large number of small,
sequential transactions will always be limited by the amount of time
required to flush a page to disk. To some extent, compact logical
and physiological log entries improve this situation. On the other
hand, long running transactions only rarely force-write to disk and
become CPU bound. Standard profiling techniques of the overall library's
performance, and microbenchmarks of crucial modules handle such situations
nicely.
A more interesting set of performance requirements are imposed by
multithreaded workloads. Each module of LLADD is reentrant, and a
C preprocessor directive allows the entire library to be instrumented
in order to profile latching behavior, which is useful both for performance
tuning and for debugging purposes. A thread that is not involved in
an I/O request never needs to wait for a latch held by a thread that
is waiting for I/O.%
\footnote{Strictly speaking, this statement is only true for the LLADD's core.
However, there are variants of most popular data structures that allow
us to preserve these invariants. LLADD can correctly support operations
whether or not they have these properties.%
}
There are a number of performance optimizations that are specific
to multithreaded operations that we do not perform. The most glaring
omission is log bundling; if multiple transactions commit at once,
LLADD must force the log to disk one time per transaction. This problem
is not fundamental, but simply has not made it into the current code
base. Similarly, since page eviction requires a force-write if the
full ARIES recovery algorithm is in use, we could implement a thread
that asynchronously maintained a set of free buffer pages. Such optimizations
will be implemented before LLADD's final release, but are not reflected
in this paper's performance figures.
\section{Sample Operations}
In order to validate LLADD's architecture, and to show that it simplifies
the creation of efficient data structures, we have implemented
a number of simple extensions. In this section, we describe their
design, and provide some concrete examples of our experiences extending
LLADD.
\subsection{Linear Hash Table\label{sub:Linear-Hash-Table}}
Linear hash tables are hash tables that are able to extend their bucket
list incrementally at runtime. They work as follows. Imagine that
we want to double the size of a hash table of size $2^{n}$, and that
the hash table has been constructed with some hash function $h_{n}(x)=h(x)\, mod\,2^{n}$.
Choose $h_{n+1}(x)=h(x)\, mod\,2^{n+1}$ as the hash function for
the new table. Conceptually we are simply prepending a random bit
to the old value of the hash function, so all lower order bits remain
the same. At this point, we could simply block all concurrent access
and iterate over the entire hash table, reinserting values according
to the new hash function.
However, because of the way we chose $h_{n+1}(x),$ we know that the
contents of each bucket, $m$, will be split between bucket $m$ and
bucket $m+2^{n}$. Therefore, if we keep track of the last bucket
that was split, we can split a few buckets at a time, resizing the
hash table without introducing long pauses while we reorganize the
hash table.{[}...{]} We can handle overflow using standard techniques.
LLADD's linear hash table uses linked lists of overflow buckets.
For this scheme to work, we must be able to address a portion of the
page file as though it were an expandable array. We have implemented
this functionality as a separate module, but will not discuss it here.
For the purposes of comparison, we provide two linear hash implementations.
The first is straightforward, and is layered on top of LLADD's standard
record setting operation, Tset(), and therefore performs physical
undo. This implementation provided a stepping stone to the more sophisticated
version which employs logical undo, and uses an identical on-disk
layout. As we discussed earlier, logical undo provides more opportunities
for concurrency, while decreasing the size of log entries. In fact,
the physical-redo implementation of the linear hash table cannot support
concurrent transactions!%
\begin{figure}
~~~~~~~~\includegraphics[%
width=0.80\columnwidth]{LinkedList.pdf}
\caption{\label{cap:Linear-Hash-Table}Linear Hash Table Bucket operations.}
\end{figure}
Because another module provides the resizable arrays needed for the
bucket list, the complexity of the linear hash algorithm is in two
areas. The first, linked list management, is straightforward in the
physical case, but must be performed in a specific order in the logical
case. See Figure \ref{cap:Linear-Hash-Table} for a sequence of steps
that safely implement the necessary linked list operations. Note that
in the first two cases, the portion of the linked list that is visible
from LLADD's point of view is always consistent. This is important
for crash recovery; it is possible that LLADD will crash before the
entire sequence of operations has been completed. The logging protocol
guarantees that some prefix of the log will be available. Therefore,
as long as the run-time version of the hash table is always consistent,
we do not have to consider the impact of skipped updates, but we must
be certain that the logical consistency of the linked list is maintained
at all steps. Here, the challenge comes from the fact that the buffer
manager only provides atomic updates of single pages; in practice,
a linked list may span pages.
The last case, where buckets are split as the bucket list is expanded
is a bit more complicated. We must maintain consistency between two
linked lists, and a page at the beginning of the hash table that contains
the last bucket that we successfully split. Here, we misuse the undo
entry to ensure proper crash recovery. Our bucket split algorithm
is idempotent, so it may be applied an arbitrary number of times to
a given bucket with no ill-effects. Also note that (for our purposes),
there is never a good reason to undo a bucket split, so we can safely
apply the split whether or not the current transaction commits.
First, an 'undo' record that checks the hash table's meta data and
redoes the split if necessary is written. Second, we write a series
of redo-only records to log. These encode the bucket split, and follow
the linked list protocols listed above. Finally, we write a redo-only
entry that updates the hash table's metadata.%
\footnote{Had we been using nested top actions, we would not need the special
undo entry, but we would need to store physical undo information for
each of the modifications made to the bucket. This method does have
the disadvantage of producing a few redo-only entries during recovery,
but recovery is an uncommon case.%
}
We allow pointer aliasing at this step so that a given key can be
present for a short period of time in both buckets. If we crash before
the undo entry is written, no harm is done. If we crash after the
entire update makes it to log, the redo stage will set the hash's
metadata appropriately, and the 'undo' record becomes a no-op. If
we crash in the middle of the bucket split, we know that the current
transaction did not commit, and that recovery will execute the 'undo'
record. It will see that the bucket split is still pending and finish
splitting the bucket appropriately. Since the bucket split is idempotent,
and we've arranged for it to behave correctly regardless of the point
at which it was interrupted, the hash table is correctly restored.
Note that there is a point during the undo phase where the bucket
is in an inconsistent physical state, although normally the redo phase
is able to bring the database to a fully consistent physical state.
We handle this by obtaining a runtime lock on the bucket during normal
operation. This runtime lock blocks any attempt to write log entries
that affect a bucket that is being split, so we know that no other
logical operations will attempt to access an inconsistent bucket.
Since the second implementation of the linear hash table uses logical
redo, we are able to allow concurrent updates to different portions
of the table. This is not true in the case of the implementation that
uses pure physical logging, as physical undo cannot generally tolerate
concurrent structural modifications to data structures.
\subsection{Two Phase Commit\label{sub:Two-Phase-Commit}}
The two phase commit protocol is used in clustering applications where
multiple, well maintained, well connected computers must agree upon
a set of successful transactions. Some of the systems could crash,
or the network could fail during operation, but we assume that such
failures are temporary. Two phase commit designates a single computer
as the coordinator of a given transaction. This computer contacts
the other systems participating in the transaction, and asks them
to prepare to commit the transaction. If a subordinate system sees
that an error has occurred, or the transaction should be aborted for
some other reason, then it informs the coordinator. Otherwise, it
enters the \emph{prepared} state, and tells the coordinator that it
is ready to commit. At some point in the future, the coordinator will
reply telling the subordinate to commit or abort. From LLADD's point
of view, the interesting portion of this algorithm is the \emph{prepared}
state, since it must be able to commit a prepared transaction if it
crashes before the coordinator responds, but cannot commit before
hearing the response, since it may be asked to abort the transaction.
Implementing the prepare state on top of the ARIES algorithm consists
of writing a special log entry that informs the undo portion of the
recovery phase that it should stop rolling back the current transaction
and instead add it to the list of active transactions.%
\footnote{Also, any locks that the transaction obtained should be restored,
which is outside of the scope of LLADD, although this functionality
could be added relatively easily if a lock manager were implemented
on top of LLADD.%
} Due to LLADD's extendible logging system, and the simplicity
of its recovery code, it took an afternoon to add a prepare operation
to LLADD.
\section{Evaluation}
We hope that the preceding sections have given the reader an idea
of the usefulness and extensibility of the LLADD library. In this
section we focus on performance evaluation.
In order to evaluate the physical and logical hashtable implementations,
we first ran a test that inserts some tuples into the database. For
this test, we chose fixed length key, values pairs of integers. For
simplicity, our hashtable implementations currently only support fixed-length
keys and values, so this test puts us at a significant advantage.
It also provides an example of the type of workload that LLADD handles
well, since LLADD is specifically designed to support application
specific transactional data structures.
%
\begin{figure*}
%\includegraphics[%
% width=1.0\textwidth]{INSERT.pdf}
\caption{\label{cap:INSERTS}The final data points for LLADD's and Berkeley
DB's record number based storage are 7.4 and 9.5 seconds, respectively.
LLADD's hash table is significantly faster than Berkeley DB in this
test, but provides less functionality than the Berkeley DB hash. Finally,
the logical logging version of LLADD's hash table is faster than the
physical version, and handles the multi-threaded test well. The threaded
test split its workload into 200 separate transactions.}
\end{figure*}
The times included in Figure \ref{cap:INSERTS} include page file
and log creation, insertion of the tuples as a single transaction,
and a clean program shutdown. We used the 'transapp.cs' program from
the Berkeley DB 4.2 tutorial to run the Berkeley DB tests, and hardcoded
it to use integers instead of strings. We used the Berkeley DB {}``DB\_HASH''
index type for the hashtable implementation, and {}``DB\_RECNO''
in order to run the {}``Record Number'' test.
Since LLADD addresses records as \{Page, Slot, Size\} triples, which
is a lower level interface than Berkeley DB exports, we used the expandable
array that supports the hashtable implementation to run the {}``LLADD
Record Number'' test.
One should not look at Figure \ref{cap:INSERTS}, and conclude {}``LLADD
is almost five times faster than Berkeley DB,'' since we chose a
hash table implementation that is tuned for fixed-length data. Instead,
the conclusions we draw from this test are that, first, LLADD's primitive
operations are on par, performance wise, with Berkeley DB's, which
we find very encouraging. Second, even a highly tuned implementation
of a 'simple,' general purpose data structure is not without overhead,
and for applications where performance is important a special purpose
structure may be appropriate.
As a final note on our performance graph, we would like to address
the fact that LLADD's hashtable curve is non-linear. LLADD currently
uses a fixed-size in-memory hashtable implementation in many areas,
and it is possible that we exceed the fixed-size of this hashtable
on the larger test sets. Also, LLADD's buffer manager is currently
fixed size. Regardless of the cause of this non-linearity, we do not
believe that it is fundamental to our implementation.
{[}
Still need to run the multi-threaded tests. The physical one's performance
should strictly degrade as the number of threads increases, while
the logical one's performance should increase for a while, and then
begin to degrade. Hopefully the logical one's peak performance will
be better than the physical implementation's peak performance.
I still haven't decided how to run the 2PC performance numbers. Maybe
I could run a speed-up or scale-up test on it.
I expect this section to be two pages, including graphs.
{]}
\section{Future Work}
LLADD is an extendible implementation of the ARIES algorithm. This
allows application developers to incorporate transactional recovery
into a wide range of systems. We have a few ideas along these lines,
and also have some ideas for extensions to LLADD itself.
LLADD currently relies upon its buffer manager for page oriented storage.
Although we did not have space to discuss it in this paper, we have
a blob implementation that stores large data outside of the page file.
This concept could be extended to arbitrary primitives, such as transactional
updates to file system directory trees, integration of networking,
or other operations directly into LLADD transactions. Doing this would
allow LLADD to act as a sort of 'glue-code' between various systems,
ensuring data integrity and adding database-style functionality, such
as continuous backup to systems that currently do not provide such
mechanisms. We believe that there is quite a bit of room for the development
of new software systems in the space between the high-level, but sometimes
inappropriate interfaces exported by database servers, and the low-level,
general purpose primitives supported by current file systems.
Currently, while we have implemented a two phase commit algorithm,
LLADD really is not very network aware. If we provided a clean abstraction
that allowed LLADD extensions and operations to cross network boundaries,
then we could provide a wider range of network consistency algorithms,
and cleanly support the implementation of operations that perform
well in networked and in local environments.
While LLADD is re-entrant, its latching mechanisms only provide physical
consistency. Traditionally, lock-managers, which provide higher levels
of consistency have been tightly coupled with transactional page implementations.
Generally, the semantics of undo and redo operations provided by the
transactional page layer and its associated data structures determine
the level of concurrency that is possible. Since prior systems provide
a set of primitives to their users, these systems encapsulated the
interactions between the lock manager, on-disk formats and the transactional
page layer. Finally, at recovery time it is often desirable to reacquire
locks on behalf of a transaction. Without extensible logging and without
modifying the recovery code, it is impossible to ensure that such
locks are correctly restored. By providing extensible logging, data-structures,
and undo/redo semantics, LLADD removes these reasons for coupling
the lock manager and the rest of the storage mechanisms. The flexibility
offered by splitting the lock manager and the ARIES algorithm into
independent sub-systems, and allowing users to independently extend
either module seems to outweigh the extra complexity that will be
added to LLADD's interface. In particular, most difficulties related
to locking seem to be data-structure dependent, suggesting that, like
page layout or the semantics of various types of log entries, they
are largely orthogonal to the atomicity and durability algorithms
implemented by LLADD.
By releasing LLADD to the community, we hope that we will be able
to provide a toolkit that aids in the development of real-world applications,
and is flexible enough for use as a research platform.
\section{Conclusion}
\section{Acknowledgements}
\section{Availability}
LLADD is free software, available at:
\begin{center}
{\tt http://www.sourceforge.net/projects/lladd}\\
\end{center}
\begin{thebibliography}{99}
\bibitem[Beazley]{beazley} D.~M.~Beazley and P.~S.~Lomdahl,
{\em Message-Passing Multi-Cell Molecular Dynamics on the Connection
Machine 5}, Parall.~Comp.~ 20 (1994) p. 173-195.
\bibitem[RealName]{CitePetName} A.~N.~Author and A.~N.~Other,
{\em Title of Riveting Article}, JournalName VolNum (Year) p. Start-End
\bibitem[ET]{embed} Embedded Tk, \\
{\tt ftp://ftp.vnet.net/pub/users/drh/ET.html}
\bibitem[Expect]{expect} Don Libes, {\em Exploring Expect}, O'Reilly \& Associates, Inc. (1995).
\bibitem[Heidrich]{heidrich} Wolfgang Heidrich and Philipp Slusallek, {\em
Automatic Generation of Tcl Bindings for C and C++ Libraries.},
USENIX 3rd Annual Tcl/Tk Workshop (1995).
\bibitem[Ousterhout]{ousterhout} John K. Ousterhout, {\em Tcl and the Tk Toolkit}, Addison-Wesley Publishers (1994).
\bibitem[Perl5]{perl5} Perl5 Programmers reference,\\
{\tt http://www.metronet.com/perlinfo/doc}, (1996).
\bibitem[Wetherall]{otcl} D. Wetherall, C. J. Lindblad, ``Extending Tcl for
Dynamic Object-Oriented Programming'', Proceedings of the USENIX 3rd Annual Tcl/Tk Workshop (1995).
\end{thebibliography}
\end{document}

216
doc/paper/LinkedList.fig Normal file
View file

@ -0,0 +1,216 @@
#FIG 3.2 Produced by xfig version 3.2.5-alpha5
Landscape
Center
Metric
A4
100.00
Single
-2
1200 2
0 32 #c6b797
0 33 #eff8ff
0 34 #dccba6
0 35 #404040
0 36 #808080
0 37 #c0c0c0
0 38 #e0e0e0
0 39 #8e8f8e
0 40 #aaaaaa
0 41 #555555
0 42 #868286
0 43 #c7c3c7
0 44 #e7e3e7
0 45 #8e8e8e
0 46 #444444
0 47 #868686
0 48 #565151
0 49 #effbff
0 50 #666666
0 51 #e2e2ee
0 52 #94949a
0 53 #dbdbdb
0 54 #a1a1b7
0 55 #9c0000
0 56 #ededed
0 57 #86acff
0 58 #7070ff
0 59 #bebebe
0 60 #515151
0 61 #000049
0 62 #797979
0 63 #303430
0 64 #c7b696
0 65 #d7d7d7
0 66 #aeaeae
0 67 #85807d
0 68 #d2d2d2
0 69 #3a3a3a
0 70 #4573aa
0 71 #000000
0 72 #c7c7c7
0 73 #e7e7e7
0 74 #f7f7f7
0 75 #d6d7d6
0 76 #7b79a5
0 77 #9e9e9e
0 78 #717571
0 79 #73758c
0 80 #414141
0 81 #414541
0 82 #635dce
5 1 0 1 0 0 50 -1 -1 0.000 0 0 1 0 3982.500 7335.000 3105 6705 4005 6255 4860 6705
1 1 1.00 60.00 120.00
6 1890 1080 2183 1485
5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 2036.500 1226.500 1927 1227 2037 1117 2146 1227
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
1927 1227 1927 1302
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
2146 1227 2146 1302
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
1890 1302 2183 1302 2183 1485 1890 1485 1890 1302
-6
6 1260 3645 5445 5580
5 1 0 1 0 7 50 -1 -1 0.000 0 0 1 0 3982.500 5130.000 3105 4500 4005 4050 4860 4500
1 1 1.00 60.00 120.00
6 1890 3915 2183 4320
5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 2036.500 4061.500 1927 4062 2037 3952 2146 4062
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
1927 4062 1927 4137
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
2146 4062 2146 4137
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
1890 4137 2183 4137 2183 4320 1890 4320 1890 4137
-6
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
1260 3645 1845 3645 1845 4185 1260 4185 1260 3645
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
1260 4185 1845 4185 1845 4725 1260 4725 1260 4185
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
1260 4725 1845 4725 1845 5265 1260 5265 1260 4725
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
2520 4185 3105 4185 3105 4725 2520 4725 2520 4185
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
3690 4185 4275 4185 4275 4725 3690 4725 3690 4185
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
1 1 1.00 60.00 120.00
1845 4500 2520 4500
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
4860 4185 5445 4185 5445 4725 4860 4725 4860 4185
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
1 1 1.00 60.00 120.00
3105 4500 3690 4500
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
1 1 1.00 60.00 120.00
4275 4500 4860 4500
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
1 1 1.00 60.00 120.00
3105 4500 3690 4500
4 0 0 50 -1 1 12 0.0000 0 195 1755 2070 5535 Delete Bucket Entry\001
4 0 0 50 -1 1 12 0.0000 0 195 1410 2115 3915 (1) Lock Bucket\001
4 0 0 50 -1 1 12 0.0000 0 195 1545 3285 5085 (3) Free old Block\001
4 0 0 50 -1 1 12 0.0000 0 195 255 3870 3960 (2)\001
4 0 0 50 -1 1 12 0.0000 0 150 300 3780 4545 Old\001
-6
6 1890 7785 2183 8190
5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 2036.500 7931.500 1927 7932 2037 7822 2146 7932
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
1927 7932 1927 8007
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
2146 7932 2146 8007
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
1890 8007 2183 8007 2183 8190 1890 8190 1890 8007
-6
6 1890 5940 3555 6525
6 1890 6120 2183 6525
5 1 0 1 0 7 50 -1 -1 0.000 0 0 0 0 2036.500 6266.500 1927 6267 2037 6157 2146 6267
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
1927 6267 1927 6342
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
2146 6267 2146 6342
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
1890 6342 2183 6342 2183 6525 1890 6525 1890 6342
-6
4 0 0 50 -1 1 12 0.0000 0 195 1410 2115 6120 (1) Lock Bucket\001
-6
6 1485 6885 1575 7245
1 3 0 1 0 0 50 -1 20 0.000 1 0.0000 1530 6930 45 45 1530 6930 1575 6930
1 3 0 1 0 0 50 -1 20 0.000 1 0.0000 1530 7065 45 45 1530 7065 1575 7065
1 3 0 1 0 0 50 -1 20 0.000 1 0.0000 1530 7200 45 45 1530 7200 1575 7200
-6
6 1845 6390 3105 6930
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
2520 6390 3105 6390 3105 6930 2520 6930 2520 6390
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
1 1 1.00 60.00 120.00
1845 6705 2520 6705
-6
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
1260 810 1845 810 1845 1350 1260 1350 1260 810
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
1260 1350 1845 1350 1845 1890 1260 1890 1260 1350
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
1260 1890 1845 1890 1845 2430 1260 2430 1260 1890
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
1260 2430 1845 2430 1845 2970 1260 2970 1260 2430
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
2520 1350 3105 1350 3105 1890 2520 1890 2520 1350
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
3690 1350 4275 1350 4275 1890 3690 1890 3690 1350
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
2385 2340 2970 2340 2970 2880 2385 2880 2385 2340
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
1 1 1.00 60.00 120.00
1845 1665 2520 1665
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
1 1 1.00 60.00 120.00
3105 1665 3690 1665
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
1 1 1.00 60.00 120.00
2700 2340 2790 1890
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
1 1 1.00 60.00 120.00
1845 1665 2385 2610
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
1260 6255 1845 6255 1845 6795 1260 6795 1260 6255
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
1 1 1.00 60.00 120.00
3105 6705 3690 6705
2 2 0 1 0 50 50 -1 20 0.000 0 0 -1 0 0 5
3690 6390 4275 6390 4275 6930 3690 6930 3690 6390
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
4860 6390 5445 6390 5445 6930 4860 6930 4860 6390
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
1 1 1.00 60.00 120.00
3105 6705 3690 6705
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
1 1 1.00 60.00 120.00
4275 6705 4860 6705
2 1 0 1 0 0 50 -1 20 0.000 0 0 -1 1 0 2
1 1 1.00 60.00 120.00
3105 7470 3690 6705
2 1 0 3 0 0 50 -1 20 0.000 0 0 -1 0 0 2
4635 6570 4410 6840
2 1 0 3 0 0 50 -1 20 0.000 0 0 -1 0 0 2
4410 6570 4635 6840
2 2 0 1 0 50 50 -1 20 0.000 0 0 -1 0 0 5
1260 7335 1845 7335 1845 7875 1260 7875 1260 7335
2 2 0 1 0 50 50 -1 20 0.000 0 0 -1 0 0 5
2520 7155 3105 7155 3105 7695 2520 7695 2520 7155
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
1 1 1.00 60.00 120.00
1845 7470 2520 7470
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
1530 5760 5310 5760
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
1485 3420 5265 3420
4 0 0 50 -1 1 12 0.0000 0 195 2055 3240 2835 (1) Allocate New Block\001
4 0 0 50 -1 1 12 0.0000 0 195 1410 2115 1080 (2) Lock Bucket\001
4 0 0 50 -1 1 12 0.0000 0 195 255 2880 2205 (3)\001
4 0 0 50 -1 1 12 0.0000 0 195 255 1935 2430 (4)\001
4 0 0 50 -1 1 12 0.0000 0 150 435 2475 2700 New\001
4 0 0 50 -1 1 12 0.0000 0 195 1695 2070 3240 Insert Bucket Entry\001
4 0 0 50 -1 1 12 0.0000 0 195 255 3870 6165 (4)\001
4 0 0 50 -1 1 12 0.0000 0 195 255 3375 7290 (3)\001
4 0 0 50 -1 1 12 0.0000 0 195 1410 2250 8145 (2) Lock Bucket\001
4 0 0 50 -1 1 12 0.0000 0 195 1515 3870 7200 (5) Delete pointer\001
4 0 0 50 -1 1 12 0.0000 0 195 2385 2070 8550 Move Entry to New Bucket\001

BIN
doc/paper/LinkedList.pdf Normal file

Binary file not shown.

28
doc/paper/TSetCall.fig Normal file
View file

@ -0,0 +1,28 @@
#FIG 3.2 Produced by xfig version 3.2.5-alpha5
Landscape
Center
Metric
A4
100.00
Single
-2
1200 2
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
3555 1125 6390 1125 6390 4680 3555 4680 3555 1125
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
3825 2835 6120 2835 6120 4185 3825 4185 3825 2835
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
3285 450 6660 450 6660 4905 3285 4905 3285 450
4 0 0 50 -1 1 12 0.0000 0 195 1380 3825 1980 Read pre-image\001
4 0 0 50 -1 1 12 0.0000 0 195 1530 3825 2235 Allocate log entry\001
4 0 0 50 -1 1 12 0.0000 0 195 2325 3825 1710 Load page (pin in memory)\001
4 0 0 50 -1 1 12 0.0000 0 195 1275 3825 2475 Write log entry\001
4 0 0 50 -1 1 12 0.0000 0 195 2280 3555 1035 Serialize arguments for log\001
4 0 0 50 -1 1 12 0.0000 0 195 1155 3825 4500 Release page\001
4 0 0 50 -1 1 12 0.0000 0 195 915 4095 4005 Update lsn\001
4 0 0 50 -1 1 12 0.0000 0 195 1575 4095 3735 Write data to page\001
4 0 0 50 -1 1 12 0.0000 0 195 1455 4095 3465 Parse arguments\001
4 0 0 50 -1 1 14 0.0000 0 255 1980 3960 3105 do_set(LogEntry)\001
4 0 0 50 -1 1 14 0.0000 0 255 2655 3690 1395 TUpdate(Record, Args)\001
4 0 0 50 -1 1 14 0.0000 0 255 2310 3420 765 TSet(Record, Value)\001
4 0 0 50 -1 1 12 0.0000 0 195 1890 3825 2745 Invoke redo operation\001

BIN
doc/paper/TSetCall.pdf Normal file

Binary file not shown.

101
doc/paper/usenix.sty Normal file
View file

@ -0,0 +1,101 @@
% usenix-2e.sty - to be used with latex2e (the new one) for USENIX.
% To use this style file, do this:
%
% \documentclass[twocolumn]{article}
% \usepackage{usenix-2e}
% and put {\rm ....} around the author names.
%
% $Id$
%
% The following definitions are modifications of standard article.sty
% definitions, arranged to do a better job of matching the USENIX
% guidelines.
% It will automatically select two-column mode and the Times-Roman
% font.
%
% USENIX papers are two-column.
% Times-Roman font is nice if you can get it (requires NFSS,
% which is in latex2e.
% Force two-column output (a USENIX requirement) if the documentclass
% did not already select it, and switch the body font to Times-Roman.
\if@twocolumn\else\input twocolumn.sty\fi
\usepackage{times}
%
% USENIX wants margins of: 1" sides, 1" bottom, and 1" top.
% 0.25" gutter between columns.
% Gives active areas of 6.5" x 9"
%
\setlength{\textheight}{9.0in}
\setlength{\columnsep}{0.25in}
\setlength{\textwidth}{6.50in}
%\setlength{\footheight}{0.0in}
\setlength{\topmargin}{0.625in}
% you should fiddle with topmargin, depending on your paper size, options to
% dvips, etc.
% No running head: collapse the header area to zero height.
\setlength{\headheight}{0.0in}
\setlength{\headsep}{0.0in}
% \setlength{\evensidemargin}{-0.125in}
% \setlength{\oddsidemargin}{-0.125in}
%
% Usenix wants no page numbers for camera-ready papers, so that they can
% number them themselves. But submitted papers should have page numbers
% for the reviewers' convenience.
%
%
% \pagestyle{empty}
%
% Usenix titles are in 14-point bold type, with no date, and with no
% change in the empty page headers. The whol author section is 12 point
% italic--- you must use {\rm } around the actual author names to get
% them in roman.
%
% \maketitle — typeset the USENIX title block.
% Differences from standard article.sty: \thanks footnotes are marked
% with symbols (\fnsymbol) so they do not collide with numbered body
% footnotes, and in two-column mode the title spans both columns via
% \twocolumn[...].  Afterwards the footnote counter is reset and the
% title macros are cleared so a repeated \maketitle is a no-op.
\def\maketitle{\par
 \begingroup
% Use footnote symbols (*, †, …) for title footnotes.
   \renewcommand\thefootnote{\fnsymbol{footnote}}%
   \def\@makefnmark{\hbox to\z@{$\m@th^{\@thefnmark}$\hss}}%
   \long\def\@makefntext##1{\parindent 1em\noindent
           \hbox to1.8em{\hss$\m@th^{\@thefnmark}$}##1}%
   \if@twocolumn
% Two-column: the bracketed argument is set full-width above both columns.
     \twocolumn[\@maketitle]%
     \else \newpage
     \global\@topnum\z@
     \@maketitle \fi\@thanks
 \endgroup
 \setcounter{footnote}{0}%
% Disarm the title machinery now that the title has been set.
 \let\maketitle\relax
 \let\@maketitle\relax
 \gdef\@thanks{}\gdef\@author{}\gdef\@title{}\let\thanks\relax}
% \@maketitle — internal layout of the title material: a fixed-height
% (2.5in) vertically centered box holding the bold title over the
% italic author table (authors wrap names in {\rm } for roman type).
\def\@maketitle{\newpage
 \vbox to 2.5in{
 \vspace*{\fill}
 \vskip 2em
 \begin{center}%
  {\Large\bf \@title \par}%
% Shrinkable gap so a long title can borrow space from the author block.
  \vskip 0.375in minus 0.300in
  {\large\it
   \lineskip .5em
% Authors are stacked in a single centered column.
   \begin{tabular}[t]{c}\@author
   \end{tabular}\par}%
 \end{center}%
 \par
 \vspace*{\fill}
% \vskip 1.5em
 }
}
%
% The abstract is preceded by a 12-pt bold centered heading
% \abstract / \endabstract — the abstract body is set as ordinary text;
% only a large bold centered "\abstractname" heading is produced, with
% negative vertical space pulling the body up toward the heading.
\def\abstract{\begin{center}%
{\large\bf \abstractname\vspace{-.5em}\vspace{\z@}}%
\end{center}}
\def\endabstract{}
%
% Main section titles are 12-pt bold. Others can be same or smaller.
%
% \section — main section heads in \large bold (12pt at the 10pt base
% size), using the standard article-class \@startsection spacing:
% ~3.5ex of stretchable space above, 2.3ex below.
\def\section{\@startsection {section}{1}{\z@}{-3.5ex plus-1ex minus
-.2ex}{2.3ex plus.2ex}{\reset@font\large\bf}}