commit d016498f8d621fa897e0f1442d6f49981d902854
Author: sears <sears@8dad8b1f-cf64-0410-95b6-bcf113ffbcfe>
Date:   Sat Jan 23 02:13:59 2010 +0000

    initial import; removed cruft from mert's tarball, tweaked make's clean targets
    
    git-svn-id: svn+ssh://svn.corp.yahoo.com/yahoo/yrl/labs/pnuts/code/logstore@520 8dad8b1f-cf64-0410-95b6-bcf113ffbcfe

diff --git a/FwCode.h b/FwCode.h
new file mode 100644
index 0000000..5af3d06
--- /dev/null
+++ b/FwCode.h
@@ -0,0 +1,165 @@
+/* Copyright (C) 2008 Yahoo! Inc. All Rights Reserved. */
+
+#ifndef __FW_CODE__H
+#define __FW_CODE__H
+
+#include <string>
+
+/**
+ * Global framework response codes.
+ */
+class FwCode {
+ public:
+
+    typedef int ResponseCode; //!< Integral type shared by every code below
+
+    static const std::string unknownCodeStr; //!< Presumably the text toString() gives for unrecognized codes -- TODO confirm
+
+    /**
+     * The convention here is to keep related codes grouped together, so
+     * that it is easier to find all existing codes for a particular
+     * module.  Each section is given a range of 50 codes, so that adding
+     * a new code to an existing section won't renumber the codes of the
+     * sections that follow it (which would cause binary incompatibility).
+     */
+
+    //----------- Generic section -------------
+    static const ResponseCode FwOk = 0;    //!< All successes
+    static const ResponseCode FwError = 1; //!< General error code
+
+    static const ResponseCode FwCrit = 2;  //!< General critical error; may originate from a low-level library to indicate that some nasty error has occurred.
+
+    static const ResponseCode MdbmOpenFailed = 3; //!< Any kind of mdbm open failure
+    static const ResponseCode MdbmOperationFailed = 4; //!< Any store/fetch/lock from mdbm failed
+    static const ResponseCode NoMem = 5; //!< Out Of Memory
+    static const ResponseCode InvalidParam = 6; //!< Invalid parameter
+    static const ResponseCode NotFound = 7; //!< Failed to find the specified info; usually returned by access methods
+    static const ResponseCode InvalidState = 8; //!< Invalid state
+    static const ResponseCode ConnReset = 9; //!< Connection reset
+    static const ResponseCode Timeout = 10; //!< Operation timed out
+    static const ResponseCode InvalidData = 11; //!< Buffer data is invalid
+    static const ResponseCode BufTooSmall = 12;  //!< Buffer size is smaller than required
+    static const ResponseCode MalformedRequest = 13; //!< Request data (like the URI) is malformed
+    static const ResponseCode RequestTooLarge = 14; //!< Request data (like the body) is too big
+    static const ResponseCode ConvertToDhtDataFailed = 15; //!< Failed to convert json string to DHT::Data
+    static const ResponseCode ConvertFromDhtDataFailed = 16; //!< Failed to convert DHT::Data to json string
+    static const ResponseCode BadHexString = 17; //!< Failed to parse a hex string
+    static const ResponseCode ShmemCorrupted = 18;  //!< A shared mem corruption has been detected.
+    static const ResponseCode ParseError = 19; //!< Generic parsing problem
+    /// If mdbm unlock fails, most of the time we want to shut off the
+    /// system automatically, without letting the caller know that we did
+    /// so. On specific instances where the caller is the FaultHandler, or
+    /// Oversight Fault counter (there may be other examples), we don't want
+    /// to do this because we want to avoid cross-dependency.
+    static const ResponseCode MdbmUnlockFailed = 20;
+
+    //----------- Config section -------------
+    // Config
+    static const ResponseCode ConfigFailure = 50;  //!< Failure to find or parse a config entry
+
+    //----------- UChar section -------------
+    // UCharUtils
+    static const ResponseCode UcnvOpenFailed = 100; //!< Failed to open ucnv converter for utf-8
+    static const ResponseCode DataNotUtf8 = 101;    //!< Data is not in utf-8 format
+    static const ResponseCode ConvertToUCharFailed = 102; //!< Failed to convert utf-8 string to UChar string
+    static const ResponseCode CompileRegExFailed = 103; //!< Failed to compile the regular expression
+
+    //----------- Yca section -------------
+    // YcaClient
+    static const ResponseCode YcaOpenFailed = 150; //!< Failed to open the yca database
+    static const ResponseCode YcaCertInvalid = 151; //!< Validation of presented cert failed
+    static const ResponseCode YcaCertNotFound = 152;        //!< Certificate for the requested appID was not found
+
+    //----------- Broker section -------------
+    static const ResponseCode BrokerClientOpenFailed = 200;  //!< Failed to connect to broker
+    static const ResponseCode UncertainPublish = 201; //!< Publish was uncertain - unknown if it happened
+    static const ResponseCode PublishFailed = 202;   //!< Publish failed (for certain :))
+    static const ResponseCode SubscribeFailed = 203; //!< Failed to subscribe to a topic
+    static const ResponseCode NoSubscriptionFound = 204; //!< Operation on a sub failed because we (locally)
+    //!< don't know about it
+    static const ResponseCode RegisterFailed = 205; //!< Failed to register handler for subscription
+    static const ResponseCode UnsubscribeFailed = 206; //!< Failed to unsubscribe from sub
+    static const ResponseCode ListTopicsFailed = 207; //!< Failed to list subscribed topics
+    static const ResponseCode ConsumeFailed = 208; //!< Failed to consume messages for a topic
+    static const ResponseCode TopicInvalid = 209;  //!< Topic is invalid (was usurped or ymb 'lost' it)
+    static const ResponseCode NoMessageDelivered = 210;  //!< Call to deliver() found no messages ready
+    static const ResponseCode ConsumeFailedBadTopic = 211; //!< The topic is bad - our handle is bad,
+    //!< or it got usurped
+    static const ResponseCode ConsumeFailedBadHandle = 212; //!< Our ymb handle is bad - not usable anymore
+    static const ResponseCode ConsumeFailedConnectionError = 213; //!< A recoverable connection error
+    static const ResponseCode ConsumeFailedServerBusy = 214; //!< ymb server is having a temporary issue,
+    //!< not a failure per se
+    // second argument to messageProcessed()
+    static const ResponseCode ConsumeMessage = 215; //!< Consume this message
+    static const ResponseCode ConsumeAndUnsubscribe = 216; //!< End this channel
+    // Internal to ymb implementation
+    static const ResponseCode YmbSubscribeTempFailure = 217;  //!< A failure that might be resolved on a retry
+    static const ResponseCode YmbSubscribeTimedout = 218; //!< A timeout failure
+    static const ResponseCode YmbSubscriptionExists = 219; //!< Attempt to create a sub that already exists
+    static const ResponseCode NoSuchSubscription = 220; //!< Attempt to attach to a sub that does not exist
+    static const ResponseCode AttachNoSuchSubscription = 221; //!< Specific to attach, no subscription to attach to (not necessarily an error)
+    static const ResponseCode BrokerInitFailed = 222; //!< Config or allocation failed
+    static const ResponseCode BrokerConnectionLost = 223; //!< Lost connection to broker
+    static const ResponseCode BrokerFatalError = 224; //!< Generally shared mem corruption
+
+
+    //----------- Daemon section -------------
+    // Daemon
+    static const ResponseCode NoImpl = 250;    //!< No op
+    static const ResponseCode Restart = 251; //!< Exit the daemon so that it is restarted right away.
+    // request that the daemon do a soft restart
+    static const ResponseCode Exit = 252; //!< Exit the daemon so that it is NOT restarted right away. A monitoring process may restart the entire system later.
+    static const ResponseCode StopDelivery = 253; //!< Stop delivery on the topic, returned by Broker handlers only.
+    static const ResponseCode RetryDelivery = 254; //!< Stop delivery on the topic but retry after some time, returned by Broker handlers only.
+
+    //----------- Lock section -------------
+    // LockManager (note: codes start at 301; 300 is unused)
+    // ALL these lock errors are handled in SuFaultHandler.cc.
+    // Any addition to these error codes requires an update to SuFaultHandler.
+    static const ResponseCode LockSyserr = 301;        //!< System error during lock/unlock op
+    static const ResponseCode LockInconsis = 302;        //!< Inconsistency detected in LockManager.
+    static const ResponseCode LockNested = 303;         //!< Nested locking of same key not allowed.
+    static const ResponseCode LockNosuchpid = 304;      //!< This pid does not hold the lock.
+    static const ResponseCode LockUnavail = 305;        //!< Out of locks
+    static const ResponseCode LockInitfail = 306;        //!< Initialization failure of the lock subsystem
+    static const ResponseCode LockInvalidarg = 307;      //!< Invalid arguments to lock subsystem.
+
+    //----------- Message section -------------
+    // Message and Message serialization
+    static const ResponseCode SerializeFailed = 350;     //!< Message Serialization Failed
+    static const ResponseCode DeserializeFailed = 351;   //!< Message Deserialization failed
+    static const ResponseCode NoResponseCodeInMessage = 352;
+
+    //----------- Transport Errors -------------
+    static const ResponseCode TransportSendError = 400;    //!< Curl error in communicating with other server
+    static const ResponseCode TransportSetHeaderFailed = 401; //!< Error in setting header in curl request
+    static const ResponseCode TransportCurlInitError = 402;  //!< Error initializing curl handle -- should be curl specific
+    static const ResponseCode TransportUncertain = 403;  //!< Send came back uncertain (timeout, usually)
+    static const ResponseCode TransportInvalidResponseBody = 404;  //!< Send came back unparsable body
+
+    //----------- Apache/Web section -------------
+    static const ResponseCode EndOfBody = 450;    //!< Normal end of incoming request body
+    static const ResponseCode BodyReadFailed = 451;    //!< Failed reading incoming request body
+    static const ResponseCode BodyWriteFailed = 452;    //!< Failed writing outgoing request body
+    static const ResponseCode EncryptionFailed = 453;    //!< Failed to encrypt body or header
+    static const ResponseCode DecryptionFailed = 454;    //!< Failed to decrypt body or header
+        
+    /**
+     * Give back a basic, generic string description of the response code.
+     *
+     * @param rc The response code to convert.
+     * @return The string describing it.
+     */
+    static std::string toString(ResponseCode rc);
+
+};
+
+/* For customized vim control
+ * Local variables:
+ * tab-width: 4
+ * c-basic-offset: 4
+ * End:
+ * vim600: sw=4:ts=4:et
+ * vim<600: sw=4:ts=4:et
+ */
+#endif
diff --git a/LogUtils.cc b/LogUtils.cc
new file mode 100644
index 0000000..3dea981
--- /dev/null
+++ b/LogUtils.cc
@@ -0,0 +1,77 @@
+/*! \file LogUtils.cc
+ *  \brief This file has the helper functions for log4cpp;
+ *
+ *  Copyright (c) 2008 Yahoo, Inc.
+ *  All rights reserved.
+ */
+#include <iostream>
+#include <log4cpp/PropertyConfigurator.hh>
+
+#include "LogUtils.h"
+
+using namespace log4cpp;
+using namespace std;
+
+// Definition of the string declared extern in LogUtils.h (hacked link to ActionContext).
+std::string s_trackPathLog;
+
+LogMethod::
+LogMethod(log4cpp::Category& log, log4cpp::Priority::Value priority,
+          const char *function) :
+    log_(log), priority_(priority), function_(function)
+{
+    // Emit the entry marker only when the requested priority is active.
+    if (!log_.isPriorityEnabled(priority_)) { return; }
+    log_.getStream(priority_) << "Entering: " << function_;
+}
+
+
+LogMethod::
+~LogMethod()
+{
+    // Mirror of the constructor: exit marker at the same priority.
+    if (!log_.isPriorityEnabled(priority_)) { return; }
+    log_.getStream(priority_) << "Exiting: " << function_;
+}
+
+// Cached outcome of the first initLog4cpp() call: -1 = not attempted yet,
+// 0 = configured successfully, 1 = configuration failed.
+static int log4cppInitResult = -1;
+
+bool
+initLog4cpp(const string &confFile)
+{
+    // A previous call already decided the answer; never re-configure.
+    if (log4cppInitResult != -1) {
+        return log4cppInitResult == 0;
+    }
+
+    log4cppInitResult = 0; // Optimistic; each handler below flips it to 1.
+    try {
+        PropertyConfigurator::configure(confFile);
+    } catch (const log4cpp::ConfigureFailure &cfgError) {
+        cerr << "log4cpp configuration failure while loading '"
+             << confFile << "' : " << cfgError.what() << endl;
+        log4cppInitResult = 1;
+    } catch (const std::exception &stdError) {
+        cerr << "exception caught while configuring log4cpp via '"
+             << confFile << "': " << stdError.what() << endl;
+        log4cppInitResult = 1;
+    } catch (...) {
+        cerr << "unknown exception while configuring log4cpp via '"
+             << confFile << "'." << endl;
+        log4cppInitResult = 1;
+    }
+
+    return log4cppInitResult == 0;
+}
+
+/*
+ * For customized vim control
+ * Local variables:
+ * tab-width: 4
+ * c-basic-offset: 4
+ * End:
+ * vim600: sw=4:ts=4:et
+ * vim<600: sw=4:ts=4:et
+ */
diff --git a/LogUtils.h b/LogUtils.h
new file mode 100644
index 0000000..73c0af6
--- /dev/null
+++ b/LogUtils.h
@@ -0,0 +1,130 @@
+/* Copyright (C) 2007 Yahoo! Inc. All Rights Reserved. */
+
+#ifndef LOG_UTIL_H
+#define LOG_UTIL_H
+
+#include <stdexcept>
+#include <log4cpp/Category.hh>
+#include "StringUtils.h"
+/**
+ * Quick and dirty link between LogUtils and ActionContext that avoids
+ * cross-inclusion issues and forcing every component to include ActionContext.
+ * Defined in LogUtils.cc; the WARN, ERROR, CRIT and INFO_WITH_STACK macros append it.
+ */
+extern std::string s_trackPathLog;
+
+// These macros cannot be protected by braces because of the trailing stream
+// arguments that get appended.  Care must taken not to use them inside if/else 
+// blocks that do not use curly braces.
+// I.e., the following will give unexpected results:
+// if(foo)
+//   DHT_DEBUG_STREAM() << "heyheyhey";
+// else
+//   blah();
+// The 'else' will end up applying to the 'if' within the debug macro.
+// Regardless of this, our standards say to always use curly brackets
+// on every block anyway, no matter what.
+
+#define DHT_DEBUG_STREAM() if(log.isDebugEnabled()) log.debugStream() << __FUNCTION__ << "():" <<  __LINE__ << ":"
+#define DHT_INFO_STREAM() if(log.isInfoEnabled()) log.infoStream() <<  __FUNCTION__ << "():" << __LINE__ << ":"
+#define DHT_INFO_WITH_STACK_STREAM() if(log.isInfoEnabled()) log.infoStream() <<  __FUNCTION__ << "():" << __LINE__ << ":" << s_trackPathLog
+#define DHT_WARN_STREAM() if(log.isWarnEnabled()) log.warnStream() << __FUNCTION__ << "():" << __LINE__ << ":" << s_trackPathLog
+#define DHT_ERROR_STREAM() if(log.isErrorEnabled()) log.errorStream() << __FUNCTION__ << "():" << __LINE__ << ":" << s_trackPathLog
+#define DHT_CRIT_STREAM() if(log.isCritEnabled()) log.critStream() << __FUNCTION__ << "():" << __LINE__ << ":" << s_trackPathLog
+#define DHT_TRACE_PRIORITY log4cpp::Priority::DEBUG + 50
+#define DHT_TRACE_STREAM() if (log.isPriorityEnabled(DHT_TRACE_PRIORITY)) log.getStream(DHT_TRACE_PRIORITY) <<  __FUNCTION__ << "():" << __LINE__ << ":"
+
+// Sadly, sometimes 'log' is reserved by someone else so the code needs to
+// use a different name for log.  In that case, it can be passed in to these.
+#define DHT_DEBUG_STREAML(x_log_hdl_x) if((x_log_hdl_x).isDebugEnabled()) (x_log_hdl_x).debugStream() <<  __FUNCTION__ << "():" << __LINE__ << ":"
+#define DHT_INFO_STREAML(x_log_hdl_x) if((x_log_hdl_x).isInfoEnabled()) (x_log_hdl_x).infoStream() <<  __FUNCTION__ << "():" << __LINE__ << ":"
+#define DHT_INFO_WITH_STACK_STREAML(x_log_hdl_x) if((x_log_hdl_x).isInfoEnabled()) (x_log_hdl_x).infoStream() <<  __FUNCTION__ << "():" << __LINE__ << ":" << s_trackPathLog
+#define DHT_WARN_STREAML(x_log_hdl_x) if((x_log_hdl_x).isWarnEnabled()) (x_log_hdl_x).warnStream() << __FUNCTION__ << "():" << __LINE__ << ":" << s_trackPathLog
+#define DHT_ERROR_STREAML(x_log_hdl_x) if((x_log_hdl_x).isErrorEnabled()) (x_log_hdl_x).errorStream() << __FUNCTION__ << "():" << __LINE__ << ":" << s_trackPathLog
+#define DHT_CRIT_STREAML(x_log_hdl_x) if((x_log_hdl_x).isCritEnabled()) (x_log_hdl_x).critStream() << __FUNCTION__ << "():" << __LINE__ << ":" << s_trackPathLog
+#define DHT_TRACE_STREAML(x_log_hdl_x) if ((x_log_hdl_x).isPriorityEnabled(DHT_TRACE_PRIORITY)) (x_log_hdl_x).getStream(DHT_TRACE_PRIORITY) <<  __FUNCTION__ << "():" << __LINE__ << ":"
+
+// Macros to use when a function returning FwCode::ResponseCode should return
+// immediately on error, without writing any log message or translating the code.
+#define RETURN_IF_NOT_OK(x_call_x) \
+{ \
+  FwCode::ResponseCode rcx___ = (x_call_x); \
+  if(rcx___ != FwCode::FwOk) {   \
+    return rcx___; \
+  } \
+}
+
+#define RETURN_THIS_IF_NOT_OK(x_othercode_x, x_call_x)   \
+{ \
+  FwCode::ResponseCode rcx___ = (x_call_x); \
+  if(rcx___ != FwCode::FwOk) {   \
+      return (x_othercode_x);    \
+  } \
+}
+
+/// Caution!  Only use in checks for 'impossible' code conditions.  Regular errors
+/// should be handled regularly.  Throws std::runtime_error ("Bad code at <file>:<line>").
+#define BAD_CODE_ABORT() \
+    { \
+        std::string x_msg_x("Bad code at " __FILE__ ":"); \
+        x_msg_x.append(StringUtils::toString(__LINE__)); \
+        throw std::runtime_error(x_msg_x); \
+    }
+
+#define BAD_CODE_IF_NOT_OK(x_call_x) \
+    do {\
+    if((x_call_x) != FwCode::FwOk) { \
+        BAD_CODE_ABORT(); \
+    } \
+    } while(0)
+
+/*
+ * Above macros are meant to be used by all components.
+ */
+
+/**
+ * Class that allows for method entry/exit logging with a single declaration.
+ * Logs at the priority given to the constructor (LOG_METHOD() passes DEBUG).
+ */
+class LogMethod
+{
+ public:
+    LogMethod(log4cpp::Category& log, log4cpp::Priority::Value priority,
+              const char *function); //!< Logs "Entering: <function>" if priority is enabled
+    virtual ~LogMethod();            //!< Logs "Exiting: <function>" if priority is enabled
+
+ private:
+    log4cpp::Category& log_;             //!< Category written to; held by reference
+    log4cpp::Priority::Value priority_;  //!< Priority used for both messages
+    const char *function_;               //!< Name printed in the messages; the pointer is stored, not copied
+};
+
+// convenience macros for the above class; both use a variable named 'log' from the calling scope and declare a local named log_method_entry_exit
+#define LOG_METHOD() LogMethod log_method_entry_exit(log, log4cpp::Priority::DEBUG, __FUNCTION__)
+#define TRACE_METHOD() LogMethod log_method_entry_exit(log, DHT_TRACE_PRIORITY, __FUNCTION__)
+
+/** Initialize log4cpp from a config file.
+ * This function needs to be called once for each executable. Only the first
+ * call configures log4cpp; later calls just return the result of that first
+ * call (IOW, an executable can be initialized with exactly one config file).
+ * Errors encountered by this function are printed onto cerr. See log4cpp
+ * documentation for what happens when PropertyConfigurator::configure()
+ * fails. The cached result is a plain static without locking, so make the
+ * first call before other threads can call this function.
+ * \param confFile is the path name of the log4cpp config file; the path
+ * will differ depending on the machine the executable is running on.
+ * \return true if the initialization succeeds, false if it fails.
+ */
+bool initLog4cpp(const std::string & confFile);
+
+#endif
+
+/*
+ * For customized vim control
+ * Local variables:
+ * tab-width: 4
+ * c-basic-offset: 4
+ * End:
+ * vim600: sw=4:ts=4:et
+ * vim<600: sw=4:ts=4:et
+ */
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..7fcb172
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,74 @@
+STASIS_DIR=../stasis
+# LIB is expanded as -L$(LIB): the recipe supplies the first -L, later entries carry their own.
+LIB=$(STASIS_DIR)/build/src/stasis \
+	-L/home/y/lib
+INCLUDE=-I$(STASIS_DIR)/src/ -I$(STASIS_DIR) -I./ \
+	-I/home/y/include
+
+LIBLIST=-lpthread \
+	-lstasis \
+	-lm
+#	-licui18n \
+#	-licuuc \
+#	-licudata \
+#	-licuio \
+#	-llog4cpp_y \
+#	-lthoth
+
+FLAGS=-pg -g -O1
+#FLAGS=-O3
+
+HFILES=logserver.h logstore.h logiterators.h datapage.h merger.h tuplemerger.h datatuple.h
+CFILES=logserver.cpp logstore.cpp logiterators.cpp datapage.cpp merger.cpp tuplemerger.cpp
+
+# Shell snippets for running a built binary against the stasis build tree:
+# STASIS_DIR=../stasis
+# LD_LIBRARY_PATH=$STASIS_DIR/build/src/stasis
+# LD_LIBRARY_PATH=$STASIS_DIR/build/src/stasis ./hello
+# Headers are prerequisites only (so that editing one triggers a rebuild);
+# $(filter %.cpp,$^) keeps them off the g++ command line.
+logstore: check_gen.cpp $(HFILES) $(CFILES)
+	g++ -o $@ $(filter %.cpp,$^) -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS)
+
+test:	dp_check lt_check ltable_check merger_check rb_check  \
+	lmerger_check tmerger_check server_check tcpclient_check
+
+lt_check: check_logtree.cpp $(HFILES) $(CFILES)
+	g++ -o $@ $(filter %.cpp,$^) -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS)
+
+dp_check: check_datapage.cpp $(HFILES) $(CFILES)
+	g++ -o $@ $(filter %.cpp,$^) -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS)
+
+ltable_check: check_logtable.cpp $(HFILES) $(CFILES)
+	g++ -o $@ $(filter %.cpp,$^) -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS)
+
+merger_check: check_merge.cpp $(HFILES) $(CFILES)
+	g++ -o $@ $(filter %.cpp,$^) -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS)
+
+lmerger_check: check_mergelarge.cpp $(HFILES) $(CFILES)
+	g++ -o $@ $(filter %.cpp,$^) -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS)
+
+tmerger_check: check_mergetuple.cpp $(HFILES) $(CFILES)
+	g++ -o $@ $(filter %.cpp,$^) -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS)
+
+rb_check: check_rbtree.cpp $(HFILES) $(CFILES)
+	g++ -o $@ $(filter %.cpp,$^) -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS)
+
+server_check:  check_server.cpp $(HFILES) $(CFILES)
+	g++ -o $@ $(filter %.cpp,$^) -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS)
+
+tcpclient_check:  check_tcpclient.cpp $(HFILES) $(CFILES)
+	g++ -o $@ $(filter %.cpp,$^) -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS)
+
+# hello has no header prerequisites, so $^ is passed through unchanged.
+hello : hello.cpp UCharUtils.cc LogUtils.cc
+	g++ -o $@ $^ -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS)
+
+.PHONY: test clean veryclean
+clean:
+	rm -f logstore server_check hello lt_check merger_check lmerger_check rb_check \
+	dp_check ltable_check tmerger_check rose tcpclient_check
+veryclean: clean
+	rm -f *~ gmon.out prof.res
+
+
diff --git a/NOTES b/NOTES
new file mode 100644
index 0000000..cc6008a
--- /dev/null
+++ b/NOTES
@@ -0,0 +1,152 @@
+######################################################################################
+constants.h
+######################################################################################
+
+#define PAGE_SIZE 4096 
+#define BLOB_THRESHOLD_SIZE (PAGE_SIZE-30)
+
+SLOT TYPES
+
+#define INVALID_SLOT  (-1)
+/** This constant is used as a placeholder to mark slot locations that contain blobs.
+    @see slotted.c, indirect.c,  blobManager.c  */
+#define BLOB_SLOT     (-2)
+#define NORMAL_SLOT  (-3)
+#define SLOT_TYPE_END (-4)
+
+######################################################################################
+allocationPolicy.h
+######################################################################################
+
+struct allocationPolicy { 
+  struct LH_ENTRY(table) * xidAlloced;
+  struct LH_ENTRY(table) * xidDealloced;
+  struct RB_ENTRY(tree)  * availablePages;
+  struct LH_ENTRY(table) * pageOwners;
+  struct LH_ENTRY(table) * allPages;
+};
+
+typedef struct allocationPolicy stasis_allocation_policy_t;
+
+typedef struct availablePage { 
+  int freespace;
+  pageid_t pageid;
+  int lockCount;  // Number of active transactions that have alloced or dealloced from this page.
+} availablePage;
+
+availablePage * stasis_allocation_policy_pick_suitable_page(stasis_allocation_policy_t * ap, int xid, int freespace);
+
+////////////////////////////////////////////////////////////////////////////////////
+
+==15277== Thread 4:
+==15277== Invalid free() / delete / delete[]
+==15277==    at 0x401BEFA: free (vg_replace_malloc.c:235)
+==15277==    by 0x4FD60FB: free_mem (in /lib/tls/libc-2.3.4.so)
+==15277==    by 0x4FD5B21: __libc_freeres (in /lib/tls/libc-2.3.4.so)
+==15277==    by 0x4017336: _vgw_freeres (vg_preloaded.c:62)
+==15277==    by 0x4030B25: pthread_cond_wait@@GLIBC_2.3.2 (in /lib/tls/libpthread-2.3.4.so)
+==15277==    by 0x402E370: start_thread (in /lib/tls/libpthread-2.3.4.so)
+==15277==    by 0x4F96FFD: clone (in /lib/tls/libc-2.3.4.so)
+==15277==  Address 0x4EC66B8 is not stack'd, malloc'd or (recently) free'd
+==15277==
+==15277== ERROR SUMMARY: 1 errors from 1 contexts (suppressed: 40 from 1)
+==15277== malloc/free: in use at exit: 8,540,389 bytes in 912 blocks.
+==15277== malloc/free: 1,815,016 allocs, 1,814,105 frees, 1,121,769,405 bytes allocated.
+==15277== For counts of detected errors, rerun with: -v
+==15277== searching for pointers to 912 not-freed blocks.
+==15277== checked 43,383,184 bytes.
+==15277==
+==15277== Thread 1:
+==15277==
+==15277== 4,883,561 (32 direct, 4,883,529 indirect) bytes in 1 blocks are definitely lost in loss record 16 of 46
+==15277==    at 0x401B790: operator new(unsigned) (vg_replace_malloc.c:164)
+==15277==    by 0x8052C01: __gnu_cxx::new_allocator<std::_Rb_tree_node<datatuple> >::allocate(unsigned, void const*) (new_allocator.h:81)
+==15277==    by 0x8052B79: std::_Rb_tree<datatuple, datatuple, std::_Identity<datatuple>, datatuple, std::allocator<datatuple> >::_M_get_node() (stl_tree.h:356)
+==15277==    by 0x8052ACC: std::_Rb_tree<datatuple, datatuple, std::_Identity<datatuple>, datatuple, std::allocator<datatuple> >::_M_create_node(datatuple const&) (stl_tree.h:365)
+==15277==    by 0x8052978: std::_Rb_tree<datatuple, datatuple, std::_Identity<datatuple>, datatuple, std::allocator<datatuple> >::_M_insert(std::_Rb_tree_node_base*, std::_Rb_tree_node_base*, datatuple const&) (stl_tree.h:783)
+==15277==    by 0x805270C: std::_Rb_tree<datatuple, datatuple, std::_Identity<datatuple>, datatuple, std::allocator<datatuple> >::insert_unique(datatuple const&) (stl_tree.h:881)
+==15277==    by 0x8052332: std::set<datatuple, datatuple, std::allocator<datatuple> >::insert(datatuple const&) (stl_set.h:314)
+==15277==    by 0x8050077: logtable::insertTuple(datatuple&) (logstore.cpp:1030)
+==15277==    by 0x804A641: insertProbeIter(int) (check_merge.cpp:160)
+==15277==    by 0x804AB9B: main (check_merge.cpp:235)
+==15277==
+==15277==
+==15277== 336 (28 direct, 308 indirect) bytes in 1 blocks are definitely lost in loss record 17 of 46
+==15277==    at 0x401B405: malloc (vg_replace_malloc.c:149)
+==15277==    by 0x404D906: stasis_dirty_page_table_init (dirtyPageTable.c:133)
+==15277==    by 0x404BFA5: Tinit (transactional2.c:66)
+==15277==    by 0x804A2AE: insertProbeIter(int) (check_merge.cpp:97)
+==15277==    by 0x804AB9B: main (check_merge.cpp:235)
+==15277==
+==15277==
+==15277== 40 bytes in 1 blocks are definitely lost in loss record 20 of 46
+==15277==    at 0x401B790: operator new(unsigned) (vg_replace_malloc.c:164)
+==15277==    by 0x8053025: merge_scheduler::addlogtable(logtable*) (merger.cpp:20)
+==15277==    by 0x804A33E: insertProbeIter(int) (check_merge.cpp:113)
+==15277==    by 0x804AB9B: main (check_merge.cpp:235)
+==15277==
+==15277==
+==15277== 80 bytes in 10 blocks are definitely lost in loss record 32 of 46
+==15277==    at 0x401B405: malloc (vg_replace_malloc.c:149)
+==15277==    by 0x804D75E: logtree::create(int) (logstore.cpp:169)
+==15277==    by 0x8053BD5: memMergeThread(void*) (merger.cpp:236)
+==15277==    by 0x402E370: start_thread (in /lib/tls/libpthread-2.3.4.so)
+==15277==    by 0x4F96FFD: clone (in /lib/tls/libc-2.3.4.so)
+==15277==
+==15277==
+==15277== 4,792 (432 direct, 4,360 indirect) bytes in 18 blocks are definitely lost in loss record 40 of 46
+==15277==    at 0x401B790: operator new(unsigned) (vg_replace_malloc.c:164)
+==15277==    by 0x80501C5: logtable::insertTuple(int, datatuple&, recordid&, logtree*) (logstore.cpp:1064)
+==15277==    by 0x8054FA7: insertTuple(int, DataPage<datatuple>*, datatuple&, logtable*, logtree*, recordid&, int&, int&) (merger.cpp:643)
+==15277==    by 0x8054AFF: merge_iterators(int, treeIterator<datatuple>*, memTreeIterator<std::set<datatuple, datatuple, std::allocator<datatuple> >, datatuple>*, logtable*, logtree*, int&) (merger.cpp:534)
+==15277==    by 0x8053C8F: memMergeThread(void*) (merger.cpp:251)
+==15277==    by 0x402E370: start_thread (in /lib/tls/libpthread-2.3.4.so)
+==15277==    by 0x4F96FFD: clone (in /lib/tls/libc-2.3.4.so)
+==15277==
+==15277==
+==15277== 576 bytes in 4 blocks are possibly lost in loss record 41 of 46
+==15277==    at 0x401C6BF: calloc (vg_replace_malloc.c:279)
+==15277==    by 0x400E71A: _dl_allocate_tls (in /lib/ld-2.3.4.so)
+==15277==    by 0x402E91E: pthread_create@@GLIBC_2.1 (in /lib/tls/libpthread-2.3.4.so)
+==15277==    by 0x80538FF: merge_scheduler::startlogtable(int) (merger.cpp:184)
+==15277==    by 0x804A37E: insertProbeIter(int) (check_merge.cpp:116)
+==15277==    by 0x804AB9B: main (check_merge.cpp:235)
+==15277==
+==15277==
+==15277== 3,175 bytes in 1 blocks are possibly lost in loss record 42 of 46
+==15277==    at 0x401B405: malloc (vg_replace_malloc.c:149)
+==15277==    by 0x8051BC7: DataPage<datatuple>::readbytes(int, int, int, unsigned char**) (datapage.cpp:235)
+==15277==    by 0x8051F7F: DataPage<datatuple>::RecordIterator::getnext(int) (datapage.cpp:442)
+==15277==    by 0x80512E0: DataPage<datatuple>::recordRead(int, unsigned char*, unsigned, datatuple**) (datapage.cpp:206)
+==15277==    by 0x8050449: logtable::findTuple(int, unsigned char*, unsigned, logtree*) (logstore.cpp:1104)
+==15277==    by 0x804FF48: logtable::findTuple(int, unsigned char*, unsigned) (logstore.cpp:979)
+==15277==    by 0x804A8D3: insertProbeIter(int) (check_merge.cpp:198)
+==15277==    by 0x804AB9B: main (check_merge.cpp:235)
+==15277==
+==15277==
+==15277== 173,599 bytes in 2 blocks are possibly lost in loss record 43 of 46
+==15277==    at 0x401B405: malloc (vg_replace_malloc.c:149)
+==15277==    by 0x804FFD0: logtable::insertTuple(datatuple&) (logstore.cpp:1014)
+==15277==    by 0x804A641: insertProbeIter(int) (check_merge.cpp:160)
+==15277==    by 0x804AB9B: main (check_merge.cpp:235)
+==15277==
+==15277==
+==15277== 2,281,057 bytes in 681 blocks are definitely lost in loss record 45 of 46
+==15277==    at 0x401B405: malloc (vg_replace_malloc.c:149)
+==15277==    by 0x8051BC7: DataPage<datatuple>::readbytes(int, int, int, unsigned char**) (datapage.cpp:235)
+==15277==    by 0x8051F7F: DataPage<datatuple>::RecordIterator::getnext(int) (datapage.cpp:442)
+==15277==    by 0x80512E0: DataPage<datatuple>::recordRead(int, unsigned char*, unsigned, datatuple**) (datapage.cpp:206)
+==15277==    by 0x8050449: logtable::findTuple(int, unsigned char*, unsigned, logtree*) (logstore.cpp:1104)
+==15277==    by 0x804FF81: logtable::findTuple(int, unsigned char*, unsigned) (logstore.cpp:990)
+==15277==    by 0x804A8D3: insertProbeIter(int) (check_merge.cpp:198)
+==15277==    by 0x804AB9B: main (check_merge.cpp:235)
+==15277==
+==15277== LEAK SUMMARY:
+==15277==    definitely lost: 2,281,669 bytes in 712 blocks.
+==15277==    indirectly lost: 4,888,197 bytes in 150 blocks.
+==15277==      possibly lost: 177,350 bytes in 7 blocks.
+==15277==    still reachable: 1,193,173 bytes in 43 blocks.
+==15277==         suppressed: 0 bytes in 0 blocks.
+==15277== Reachable blocks (those to which a pointer was found) are not shown.
+==15277== To see them, rerun with: --show-reachable=yes
+Killed
diff --git a/StringUtils.h b/StringUtils.h
new file mode 100644
index 0000000..d098b76
--- /dev/null
+++ b/StringUtils.h
@@ -0,0 +1,345 @@
+/* $Id: StringUtils.h,v 1.17 2009/03/25 20:32:51 dlomax Exp $ */
+/* Copyright (C) 2008 Yahoo! Inc. All Rights Reserved. */
+
+#ifndef __STRING_UTIL_H
+#define __STRING_UTIL_H
+#include <iostream>
+#include <iomanip>
+#include <sstream>
+#include "FwCode.h"
+
+/**
+ * Container for static string manipulation utilities.
+ *
+ * Every member is static.  The templated helpers declared at the end of the
+ * class are defined further down in this header.
+ */
+class StringUtils
+{
+ public:
+
+    /**
+     * Our replacement for yax_getroot().  Allows our code to have a different
+     * root than components we use or link with, which is convenient for unit
+     * testing.
+     * @return Copy of the value in a std::string
+     */
+    static std::string getDhtRoot();
+
+    /**
+     * Parse a tablet name into left and right limits.
+     * @param tablet the tablet name to parse
+     * @param leftLimit (output) the left limit
+     * @param rightLimit (output) the right limit
+     * @return true if parsing successful, false if incorrect format
+     */
+    static bool parseTabletName(const std::string& tablet, std::string& leftLimit,
+                                std::string& rightLimit);
+
+    /**
+     * Construct a tablet name from left and right limits.
+     * @param leftLimit the left limit
+     * @param rightLimit the right limit
+     * @param tablet (output) the assembled tablet name
+     */
+    static void buildTabletName(const std::string& leftLimit,
+                                const std::string& rightLimit,
+                                std::string& tablet);
+ 
+    /**
+     * General purpose method to assemble a full path name, using
+     * getDhtRoot() so that the root will be configurable.
+     * DO NOT supply "/home/y" in path1.
+     */
+    static std::string makePath(const std::string& path1 = "",
+                                const std::string& path2 = "",
+                                const std::string& path3 = "",
+                                const std::string& path4 = "",
+                                const std::string& path5 = "",
+                                const std::string& path6 = "");
+
+    /**
+     * Append additional paths to an existing one - does not prepend ROOT.
+     * @param base_path (in/out) the path that is extended
+     */
+    static void appendPath(std::string& base_path, const std::string& path2 = "",
+                            const std::string& path3 = "",
+                            const std::string& path4 = "");
+
+    /**
+     * Construct a topic name from a table/tablet.
+     * 
+     * @return the topic name
+     */
+    static std::string buildTopicName(const std::string& table,
+                                      const std::string& tablet);
+
+    /**
+     * Construct a topic name from a table/tablet.
+     * @param topic  (output) Is filled with the topic name.
+     */
+    static void buildTopicName(const std::string& table,
+                               const std::string& tablet,
+                               std::string &topic);
+
+    /**
+     * Parses <code>topic</code> into table and tablet portions.
+     *
+     * @param topic The topic name to parse.
+     * @param table (output) Filled with the table name.
+     * @param tablet (output) Filled with the tablet name.
+     * @return true if the parsing succeeded, false if not.
+     */
+    static bool parseTopicName(const std::string& topic,
+                               std::string& table,
+                               std::string &tablet);
+
+    /**
+     * Only for use in log statements - this is slow.  Produce a printable
+     * string where binary (<32) characters are hex encoded, but all others
+     * are left alone.
+     *
+     * @param str string to encode
+     * @param len length of string
+     * @return encoded string.
+     */
+    static std::string toPrintable(const char *str, size_t len);
+
+    /**
+     * Convert a formatted hex string back into its original
+     * 64-bit value
+     *
+     * @param value the hex-encoded string
+     * @param out (output) the value
+     * @return FwCode::FwOk on success, FwCode::BadHexString on parse failure
+     */
+    static FwCode::ResponseCode
+        convertHexStringToUI64(const std::string& value, uint64_t& out);
+
+    /**
+     * Convert a formatted hex string back into its original
+     * 32-bit value
+     *
+     * @param value the hex-encoded string
+     * @param out (output) the value
+     * @return FwCode::FwOk on success, FwCode::BadHexString on parse failure
+     */
+    static FwCode::ResponseCode 
+        convertHexStringToUI32(const std::string& value, uint32_t& out);
+
+    /**
+     * Standard means for formatting a 0x prefixed hex string from a
+     * 64-bit unsigned value.  Will produce upper-case letters.  Will
+     * pad with zeros at the beginning to fill out 16 hex chars.
+     *
+     * @param val the value to format
+     * @return the formatted value, like "0xDEADBEEF00000000"
+     */
+    static std::string convertUI64ToHexString( uint64_t val );
+
+    /**
+     * Standard means for formatting a 0x prefixed hex string from a
+     * 32-bit unsigned value.  Will produce upper-case letters.  Will
+     * pad with zeros at the beginning to fill out 8 hex chars.
+     *
+     * @param val the value to format
+     * @return the formatted value, like "0xDEADBEEF"
+     */
+    static std::string convertUI32ToHexString( unsigned int val );
+
+    /**
+     * Standard means for formatting a small hex string from a
+     * 32-bit unsigned value.  The "0x" will NOT be included.
+     * Will produce upper-case letters.  Will NOT pad with zeros
+     * at the beginning.
+     *
+     * @param val the value to format
+     * @return the formatted value, like "DEADBEEF"
+     */
+    static std::string convertUI32ToMinimalHexString( unsigned int val );
+
+    /**
+     * Assemble the fields of ENCRYPTED_BODY_HEADER and encrypt it for
+     * sending to the remote side.
+     * @param result is the out parameter having the resulting string.
+     * @param encKeyName is the name of the key in keydb whose value will be
+     * used as the encryption key
+     * @param bodyEncVersion is the version of the encryption scheme used to
+     * encrypt the body (not the encryption scheme of this header itself).
+     * @param expireTime is the time (in usecs) after which the request
+     * should not be processed by the receiver of this header.
+     */
+    static FwCode::ResponseCode makeEncryptedBodyHdr(std::string & result,
+            const char *encKeyName, uint32_t bodyEncVersion, uint64_t expireTime);
+
+    /**
+     * Parse the incoming ENCRYPTED_BODY_HEADER, decrypting it, and
+     * separating the fields in it.
+     * @param inval is the incoming encrypted string.
+     * @param encKeyName is the name of the key in keydb whose value will be
+     * used as the decryption key
+     * @param bodyEncVersion (output) is the version of the encryption scheme
+     * to be used to decrypt the body (not for the decryption of this header
+     * itself).
+     * @param expireTime (output) is the time (in usecs) after which the
+     * response should not be processed by the receiver of this header.
+     */
+    static FwCode::ResponseCode parseEncryptedBodyHdr(const std::string & inval,
+            const char *encKeyName, uint32_t & bodyEncVersion, uint64_t & expireTime);
+
+    /**
+     * Get the hash for an un-normalized record name.
+     *
+     * @param unnormalizedRecordName a raw record name from user input
+     * @param hashResult (output) the hex string of the hash value.
+     * @return FwCode::FwOk on success, else an error relating to normalization
+     */
+    static FwCode::ResponseCode normalizeAndHashRecordName
+        ( const std::string& unnormalizedRecordName,
+          std::string & hashResult /* out */ );
+
+    /**
+     * Get the hash for a normalized record name.
+     *
+     * @param recordName the record name.  MUST be previously normalized.
+     * @return the uint32_t hash value.
+     */
+    static uint32_t hashRecordName(const std::string& recordName);
+
+    /**
+     * Get the hash for a normalized record name.
+     *
+     * @param recordName the record name.  MUST be previously normalized.
+     * @param hashResult (output) the hex string of the hash value.
+     */
+    static void hashRecordName( const std::string& recordName,
+                                std::string & hashResult /* out */ );
+    /**
+     * Get the hash for a normalized record name in string and int form
+     *
+     * @param recordName the record name.  MUST be previously normalized.
+     * @param hashResult (output) the hex string of the hash value.
+     * @param hexNum (output) numerical value of hash
+     */
+    static void hashRecordName( const std::string& recordName,
+                                std::string & hashResult /* out */,
+                                uint32_t& hexNum);
+
+    /**
+     * Method to hash a string using crc32.
+     *
+     * @param buf data to hash
+     * @param len length of buf
+     * @return hash value
+     */
+    static uint32_t crcHash(const char * buf, uint32_t len);
+
+    /**
+     * Convert any type to a string by streaming it into a
+     * std::ostringstream with operator<<.
+     *
+     * @param item the value to render
+     * @return the streamed text
+     */
+    template<typename T> static inline std::string toString(T item);
+    
+    /** 
+     * Convert a string to any type of value using operator>> on a
+     * std::istringstream.  The whole of strValue must be consumed by the
+     * extraction for the conversion to count as a success.
+     *
+     * @param strValue string value to parse
+     * @param value (output) value to read from strValue
+     * @return FwCode::FwOk on success
+     *         FwCode::FwError on failure (error is *not* logged)
+     */
+    template<typename T> static inline 
+    FwCode::ResponseCode  fromString(const std::string& strValue,
+                                     T& value);
+
+    /** 
+     * Format a number as an upper-case hexadecimal string with a "0x"
+     * prefix, zero-padded to a fixed width of 2 * sizeof(T) digits.
+     *
+     * @param value number to convert to string
+     * @return string representation of value
+     */
+    template<typename T> static inline
+    std::string numberToHexString(T value);
+
+    /** 
+     * Format a number as an upper-case hexadecimal string with no prefix
+     * and no padding.
+     *
+     * @param value number to convert to string
+     * @return string representation of value
+     */
+    template<typename T> static inline
+    std::string numberToMinimalHexString(T value);
+    
+    /**
+     * Convert a hexadecimal string to a number.
+     *
+     * @param strvalue input string to read from
+     * @param value (output) output number
+     * @return FwCode::FwOk on successful conversion
+     *         FwCode::FwError on failure to convert strvalue
+     *         to number
+     */
+    template<typename T> static inline 
+    FwCode::ResponseCode hexStringToNumber(const std::string& strvalue,
+                                           T& value);
+
+    
+    static const std::string EMPTY_STRING;
+};
+
+/**
+ * Render any streamable value as text by inserting it into a
+ * std::ostringstream with operator<<.
+ */
+template<typename T>
+std::string StringUtils::toString(T item)
+{
+    std::ostringstream formatted;
+    formatted << item;
+
+    std::string text(formatted.str());
+    return text;
+}
+
+/**
+ * Parse strValue into value using operator>> on a std::istringstream.
+ *
+ * The conversion succeeds only when the extraction itself succeeds and
+ * nothing is left in the stream afterwards, so trailing characters
+ * (including trailing whitespace) are rejected.
+ *
+ * The "everything consumed" test deliberately avoids tellg(): once the
+ * extraction has run into end-of-input, eofbit is set and tellg() reports
+ * pos_type(-1) on current standard libraries, which made every fully
+ * consumed input (e.g. "123" for an int) look like a failure.
+ *
+ * @param strValue string value to parse
+ * @param value (output) receives the parsed value
+ * @return FwCode::FwOk on success, FwCode::FwError otherwise
+ *         (the error is not logged)
+ */
+template<typename T>
+FwCode::ResponseCode  StringUtils::
+fromString(const std::string& strValue,
+           T& value)
+{
+    std::istringstream buf(strValue);
+    buf >> value;
+    if (buf.fail())
+    {
+        return FwCode::FwError;
+    }
+
+    // peek() yields EOF both when eofbit is already set and when the next
+    // read would hit the end; anything else is unconsumed input.
+    if (buf.peek() != std::istringstream::traits_type::eof())
+    {
+        return FwCode::FwError;
+    }
+    return FwCode::FwOk;
+}
+
+/**
+ * Format value as "0x" followed by upper-case hex digits, zero-padded to
+ * 2 * sizeof(T) characters.
+ */
+template<typename T>
+std::string StringUtils::numberToHexString(T value)
+{
+    std::ostringstream out;
+
+    out << "0x";
+    // Sticky formatting state first ...
+    out << std::hex << std::setfill('0') << std::uppercase;
+    // ... then the one-shot field width, immediately before the value.
+    out << std::setw(sizeof(T) * 2) << value;
+
+    return out.str();
+}
+
+/**
+ * Format value as upper-case hex digits with no prefix and no padding.
+ */
+template<typename T>
+std::string StringUtils::numberToMinimalHexString(T value)
+{
+    std::ostringstream out;
+    out << std::hex;
+    out << std::uppercase;
+    out << value;
+    return out.str();
+}
+
+/**
+ * Parse a hexadecimal string into a number using operator>> with std::hex.
+ *
+ * The whole of strvalue must be consumed for the conversion to succeed.
+ * The check uses peek() rather than tellg(): after the extraction reaches
+ * end-of-input, eofbit is set and tellg() reports pos_type(-1) on current
+ * standard libraries, which rejected every fully consumed input.
+ *
+ * @param strvalue input string to read from
+ * @param value (output) output number
+ * @return FwCode::FwOk on successful conversion, FwCode::FwError otherwise
+ */
+template<typename T>
+FwCode::ResponseCode StringUtils::
+hexStringToNumber(const std::string& strvalue,
+                  T& value)
+{
+    std::istringstream buf(strvalue);
+    buf >> std::hex >> value;
+    if (buf.fail())
+    {
+        return FwCode::FwError;
+    }
+
+    // Anything other than EOF here is trailing, unparsed input.
+    if (buf.peek() != std::istringstream::traits_type::eof())
+    {
+        return FwCode::FwError;
+    }
+    return FwCode::FwOk;
+}
+
+/*
+ * For customized vim control
+ * Local variables:
+ * tab-width: 4
+ * c-basic-offset: 4
+ * End:
+ * vim600: sw=4:ts=4:et
+ * vim<600: sw=4:ts=4:et
+ */
+#endif
diff --git a/UCharUtils.cc b/UCharUtils.cc
new file mode 100644
index 0000000..2133034
--- /dev/null
+++ b/UCharUtils.cc
@@ -0,0 +1,326 @@
+/* $Id: UCharUtils.cc,v 1.16 2009/03/03 20:19:18 dlomax Exp $ */
+/* Copyright (C) 2008 Yahoo! Inc. All Rights Reserved. */
+
+//#include <dht/UCharUtils.h>
+#include "UCharUtils.h"
+#include <log4cpp/Category.hh>
+#include "LogUtils.h"
+//#include "ActionContext.h"
+#include <unicode/ucnv.h>
+#include <unicode/unorm.h>
+#include <thoth/validate.h> // To make sure we have UTF-8
+
+// File-scope log4cpp category; its name is "dht.framework." followed by
+// this source file's __FILE__ string.
+static log4cpp::Category &log = 
+                    log4cpp::Category::getInstance("dht.framework." __FILE__);
+
+
+// The shared implementation object behind the static UCharUtils API.
+// NULL until UCharUtils::init() creates it; UCharUtils::close() deletes it
+// and sets it back to NULL.
+UCharUtilsImpl *UCharUtils::instance_ = NULL;
+
+/**
+ * Start with no converter and with all three scratch buffers empty.
+ * The converter itself is opened by init().
+ */
+UCharUtilsImpl::UCharUtilsImpl()
+    : uconv_(NULL),
+      ucBuffLen(0), ucBuff(NULL),
+      ucNormBuffLen(0), ucNormBuff(NULL),
+      charBuffLen(0), charBuff(NULL)
+{
+    LOG_METHOD();
+}
+
+/**
+ * Open the ICU "utf-8" converter used by convert() and normalize().
+ *
+ * Calling init() while a converter is already open is a no-op.  Previously
+ * a second call overwrote uconv_ and leaked the converter opened first.
+ *
+ * @return FwCode::FwOk on success (or if already initialized),
+ *         FwCode::UcnvOpenFailed if ICU could not open the converter.
+ */
+FwCode::ResponseCode UCharUtilsImpl::
+init()
+{
+    if (uconv_ != NULL) {
+        return FwCode::FwOk;  // already initialized
+    }
+
+    UErrorCode erc = U_ZERO_ERROR;
+
+    uconv_ = ucnv_open("utf-8", &erc);
+    if (uconv_ == NULL) {
+        DHT_ERROR_STREAM() << "EC:UNICODE:Problem geting utf-8 converter, erc:" << erc
+                           << ", " << u_errorName(erc);
+        return FwCode::UcnvOpenFailed;
+    }
+    return FwCode::FwOk;
+}
+
+/**
+ * Release the scratch buffers, then close the converter if one was opened.
+ */
+UCharUtilsImpl::~UCharUtilsImpl()
+{
+    reset();
+
+    if (uconv_ == NULL) {
+        return;
+    }
+    ucnv_close(uconv_);
+    uconv_ = NULL;
+}
+
+// Free one scratch buffer, if allocated, and zero its bookkeeping.
+// An unallocated buffer is left completely untouched.
+template <typename T>
+static void releaseScratchBuffer(T *&buffer, int32_t &length)
+{
+    if (buffer == NULL) {
+        return;
+    }
+    delete[] buffer;
+    length = 0;
+    buffer = NULL;
+}
+
+/**
+ * Release all three scratch buffers.  The converter is not touched.
+ */
+void UCharUtilsImpl::reset()
+{
+    LOG_METHOD();
+
+    releaseScratchBuffer(ucBuff, ucBuffLen);
+    releaseScratchBuffer(ucNormBuff, ucNormBuffLen);
+    releaseScratchBuffer(charBuff, charBuffLen);
+}
+
+/**
+ * Single-call wrapper around thoth_validate_utf8() for a std::string.
+ * On invalid input a diagnostic is written to std::cerr (the original
+ * RESPONSE_DEBUG_STREAM(FwCode::DataNotUtf8) call is disabled here).
+ *
+ * @return true if value is valid UTF-8, false otherwise
+ */
+bool UCharUtils::isUTF8(const std::string& value)
+{
+    size_t badOffset = 0;
+    const thoth_result verdict =
+        thoth_validate_utf8(value.c_str(), value.length(), &badOffset);
+
+    if (verdict == UTF8_VALID) {
+        return true;
+    }
+
+    std::cerr << "value (" << value
+              << ") is not UTF-8. thoth_result:" << verdict
+              << ", position=" << badOffset;
+    return false;
+}
+
+/**
+ * Single-call wrapper around thoth_validate_utf8() for a raw buffer.
+ * On invalid input a diagnostic is written to std::cerr (the original
+ * RESPONSE_DEBUG_STREAM(FwCode::DataNotUtf8) call is disabled here).
+ *
+ * @param value     bytes to validate
+ * @param value_len number of bytes in value
+ * @return true if the bytes are valid UTF-8, false otherwise
+ */
+bool UCharUtils::isUTF8(const char * value, size_t value_len)
+{
+    size_t badOffset = 0;
+    const thoth_result verdict =
+        thoth_validate_utf8(value, value_len, &badOffset);
+
+    if (verdict == UTF8_VALID) {
+        return true;
+    }
+
+    std::cerr << "value (" << std::string(value, value_len)
+              << ") is not UTF-8. thoth_result:" << verdict
+              << ", position=" << badOffset;
+    return false;
+}
+
+/**
+ * Convert an input string (expected to be UTF-8) into unicode UChars.
+ * The result of the conversion is left in the ucBuff scratch buffer, which
+ * is grown on demand and reused across calls.
+ *
+ * @param input the UTF-8 text to convert
+ * @param len   (output) number of UChars produced; 0 on every failure path
+ * @return FwCode::FwOk on success,
+ *         FwCode::DataNotUtf8 if input fails UTF-8 validation,
+ *         FwCode::ConvertToUCharFailed if ICU reports a conversion error.
+ */
+FwCode::ResponseCode UCharUtilsImpl::
+convert(const std::string &input, int32_t &len)
+{
+    LOG_METHOD();
+
+    // Give callers a defined length on every exit path.
+    len = 0;
+
+    //UTF-8 validation
+    if(!UCharUtils::isUTF8(input)) {
+        return FwCode::DataNotUtf8;
+    }
+
+    // Two UChars per input byte is more than UTF-8 -> UTF-16 can need.
+    int size = input.length() * 2;
+
+    // Check if we already have a big enough buffer
+    if (ucBuffLen < size) {
+        // Release the old buffer and clear the bookkeeping *before*
+        // allocating, so a throwing new[] cannot leave ucBuff dangling with
+        // a stale length for reset() to delete a second time.
+        delete[] ucBuff;
+        ucBuff = NULL;
+        ucBuffLen = 0;
+
+        ucBuff = new UChar[size];
+        ucBuffLen = size;
+    }
+
+    UErrorCode erc = U_ZERO_ERROR;
+    len = ucnv_toUChars(uconv_, 
+                        ucBuff, 
+                        ucBuffLen,
+                        input.data(), 
+                        input.length(), &erc);
+
+    if (U_FAILURE(erc)) {
+        //RESPONSE_ERROR_STREAM(FwCode::ConvertToUCharFailed)
+        std::cerr
+            << "EC:UNICODE:error:" << erc
+            << ", " << u_errorName(erc)
+            << " from converting input:'" << input << "'";
+        len = 0;
+        return FwCode::ConvertToUCharFailed;
+    }
+    return FwCode::FwOk;
+}
+
+// Replace a scratch buffer with a fresh one holding `needed` elements.
+// The old buffer is released and the bookkeeping cleared *before*
+// allocating, so a throwing new[] cannot leave a dangling pointer behind.
+template <typename T>
+static void regrowScratchBuffer(T *&buffer, int32_t &length, int32_t needed)
+{
+    delete[] buffer;
+    buffer = NULL;
+    length = 0;
+
+    buffer = new T[needed];
+    length = needed;
+}
+
+/**
+ * NFC-normalize a UTF-8 string.  All three internal scratch buffers are
+ * used by this operation, but their contents are no longer needed once it
+ * returns.
+ *
+ * Changes from the previous version:
+ *  - empty input returns an empty result at once instead of handing
+ *    possibly-NULL, zero-length buffers to ICU;
+ *  - if the initial guess for the normalized size is too small, the buffer
+ *    is grown to the size ICU asks for and the call is retried, instead of
+ *    failing with U_BUFFER_OVERFLOW_ERROR;
+ *  - result is cleared on every failure path, not only the first;
+ *  - status is reset before converting back to UTF-8.
+ *
+ * @param input  the UTF-8 string to normalize
+ * @param result (output) the normalized UTF-8 string; empty on failure
+ * @return FwCode::FwOk on success, the code returned by convert() if the
+ *         input cannot be converted, FwCode::FwError if ICU fails later.
+ */
+FwCode::ResponseCode UCharUtilsImpl::
+normalize(const std::string &input, std::string &result /* out */)
+{
+    LOG_METHOD();
+
+    // Nothing to normalize.
+    if (input.empty()) {
+        result.erase();
+        return FwCode::FwOk;
+    }
+
+    // convert our UTF-8 into UChar
+    int32_t inLen = 0;
+    FwCode::ResponseCode rc = convert(input, inLen);
+
+    if (rc != FwCode::FwOk) {
+        result.erase();
+        return rc;
+    }
+
+    // Do a quick check if the input is already normalized so that
+    // we can duck out early
+    UErrorCode status = U_ZERO_ERROR;
+    if (unorm_quickCheck(ucBuff, inLen,
+                         UNORM_NFC, &status) == UNORM_YES) {
+        DHT_DEBUG_STREAM() << "already normalized input:" << input;
+        result = input;
+        return FwCode::FwOk;
+    }
+
+    // First guess for the output space: twice the input.  The normalized
+    // result is usually shorter because NFC combines characters, and the
+    // retry below covers the cases where it is not.
+    int32_t newSize = inLen * 2;
+    if (newSize > ucNormBuffLen) {
+        DHT_DEBUG_STREAM() << "newSize:" << newSize
+                           << " ucNormBuffLen:" << ucNormBuffLen;
+        regrowScratchBuffer(ucNormBuff, ucNormBuffLen, newSize);
+    }
+
+    // Do the actual normalization
+    status = U_ZERO_ERROR;
+    int32_t normLen = unorm_normalize(ucBuff, inLen,
+                                      UNORM_NFC, 0,
+                                      ucNormBuff,
+                                      ucNormBuffLen,
+                                      &status);
+    if (status == U_BUFFER_OVERFLOW_ERROR && normLen > ucNormBuffLen) {
+        // On overflow ICU returns the length it needs: grow and retry once.
+        DHT_DEBUG_STREAM() << "newSize:" << normLen
+                           << " ucNormBuffLen:" << ucNormBuffLen;
+        regrowScratchBuffer(ucNormBuff, ucNormBuffLen, normLen);
+        status = U_ZERO_ERROR;
+        normLen = unorm_normalize(ucBuff, inLen,
+                                  UNORM_NFC, 0,
+                                  ucNormBuff,
+                                  ucNormBuffLen,
+                                  &status);
+    }
+    if(U_FAILURE(status)) {
+        //RESPONSE_ERROR_STREAM(FwCode::FwError)
+        std::cerr
+            << "EC:UNICODE:error:" << status << ", " << u_errorName(status)
+            <<" in unorm_normalize, inLen:" << inLen
+            << " ucNormBuffLen:" << ucNormBuffLen;
+        result.erase();
+        return FwCode::FwError;
+    }
+
+    // Make sure we have some space to convert back to UTF-8
+    int32_t resultLen = normLen * 4;
+    if (resultLen > charBuffLen) {
+        DHT_DEBUG_STREAM() << "resultLen:" << resultLen
+                           << " charBuffLen:" << charBuffLen;
+        regrowScratchBuffer(charBuff, charBuffLen, resultLen);
+    }
+
+    DHT_DEBUG_STREAM() <<"calling ucnv_fromUChars, normLen:" << normLen;
+
+    // Go from UChar array to UTF-8.  Start from a clean status so a warning
+    // left over from normalization is not carried into this call.
+    status = U_ZERO_ERROR;
+    int32_t actualLen = ucnv_fromUChars(uconv_,
+                                        charBuff, charBuffLen,
+                                        ucNormBuff, normLen,
+                                        &status);
+    if(U_FAILURE(status)) {
+        //RESPONSE_ERROR_STREAM(FwCode::FwError)
+        std::cerr
+            << "EC:UNICODE:error:" << status << ", " << u_errorName(status)
+            << " in ucnv_fromUChars charBuffLen:" << charBuffLen
+            << " normLen:" << normLen;
+        result.erase();
+        return FwCode::FwError;
+    }
+
+    // Smack our UTF-8 characters into the result string
+    if (actualLen > 0) {
+        result.assign(charBuff, actualLen);
+    } else {
+        result.erase();
+    }
+    DHT_DEBUG_STREAM() << "leaving actualLen:" << actualLen
+                       << " result:" << result;
+    return FwCode::FwOk;
+}
+
+
+// True once the shared implementation has opened its converter.
+static bool s_converterOpened = false;
+
+/**
+ * Create the shared implementation object and open its converter.
+ *
+ * A failed attempt is no longer reported as "already initialized" on the
+ * next call: the converter is only considered ready after an init() that
+ * returned FwCode::FwOk, and a later call retries the open.  The instance
+ * is kept even after a failure, so the other entry points still see a
+ * non-NULL object.
+ *
+ * @return FwCode::FwOk on success or if already initialized, otherwise the
+ *         code returned by UCharUtilsImpl::init().
+ */
+FwCode::ResponseCode UCharUtils::
+init()
+{
+    if (instance_ != NULL && s_converterOpened) {
+        return FwCode::FwOk;  // already initialized
+    }
+
+    if (instance_ == NULL) {
+        // Fresh start, or close() discarded the previous object.
+        instance_ = new UCharUtilsImpl();
+    }
+
+    const FwCode::ResponseCode rc = instance_->init();
+    s_converterOpened = (rc == FwCode::FwOk);
+    return rc;
+}
+
+/**
+ * Destroy the shared implementation object, if there is one.
+ */
+void UCharUtils::close()
+{
+    if (instance_ == NULL) {
+        return;
+    }
+
+    delete instance_;
+    instance_ = NULL;
+}
+
+/**
+ * Given an input string, return a unicode UChar array.  The return value
+ * points into the shared instance's internal ucBuff buffer, so the caller
+ * must not free it.
+ *
+ * Returns NULL (with len set to 0) instead of dereferencing a NULL
+ * instance when init() has not been called or close() has already run.
+ *
+ * @param input UTF-8 string to convert
+ * @param len   (output) number of UChars produced, 0 on failure
+ * @return the internal buffer, or NULL on failure
+ */
+UChar * UCharUtils::
+getUChar(const std::string &input, int32_t& len) {
+    LOG_METHOD();
+
+    if (instance_ == NULL) {
+        len = 0;
+        return NULL;
+    }
+
+    // do the conversion (convert() sizes the buffer at 2x the input length)
+    if(instance_->convert(input, len) != FwCode::FwOk) {
+        len = 0;
+        return NULL;
+    }
+
+    return instance_->ucBuff;
+}
+
+/**
+ * NFC-normalize input through the shared implementation object.
+ *
+ * Fails with FwCode::FwError (and an empty result) instead of
+ * dereferencing a NULL instance when init() has not been called or
+ * close() has already run.
+ *
+ * @param input  UTF-8 string to normalize
+ * @param result (output) the normalized string
+ * @return the code returned by UCharUtilsImpl::normalize(), or
+ *         FwCode::FwError if the utilities are not initialized.
+ */
+FwCode::ResponseCode UCharUtils::
+normalize(const std::string &input, std::string &result) {
+    LOG_METHOD();
+
+    if (instance_ == NULL) {
+        result.erase();
+        return FwCode::FwError;
+    }
+    return instance_->normalize(input, result);
+}
+
+
+/**
+ * Compile a UTF-8 regular expression pattern with ICU.
+ *
+ * Success is decided with U_FAILURE(), so an ICU warning no longer turns a
+ * usable compiled expression into a reported failure, and a non-NULL
+ * expression returned alongside an error is closed rather than leaked.
+ *
+ * @param pattern the regexp pattern to compile, assumed to be UTF-8
+ * @param result  (output) set to the compiled expression on success only;
+ *                the caller releases it with uregex_close()
+ * @return FwCode::FwOk on success,
+ *         FwCode::ConvertToUCharFailed if the pattern cannot be converted,
+ *         FwCode::CompileRegExFailed if ICU cannot compile it.
+ */
+FwCode::ResponseCode UCharUtils::
+parseRegExpPattern(const std::string &pattern,
+                   URegularExpression * & result /* out */)
+{
+    // Pre-set the fields we print, in case ICU fails without filling them.
+    UParseError perr;
+    perr.line = -1;
+    perr.offset = -1;
+    UErrorCode erc = U_ZERO_ERROR;
+    int32_t ureglen = 0;
+
+    // Do not delete uregexp, it's a static reusable buffer inside UCharUtils
+    UChar *uregexp = UCharUtils::getUChar(pattern, ureglen);
+    if (uregexp == NULL) {
+        //RESPONSE_ERROR_STREAM(FwCode::ConvertToUCharFailed)
+        std::cerr
+            << "EC:UNICODE|IMPOSSIBLE:Unable to convert pattern to unicode: " << pattern;
+        return FwCode::ConvertToUCharFailed;
+    }
+
+    URegularExpression *regexp= uregex_open(uregexp, ureglen, 0, 
+                                            &perr, 
+                                            &erc);
+    if(U_FAILURE(erc) || regexp == NULL) {
+        //RESPONSE_DEBUG_STREAM(FwCode::CompileRegExFailed)
+        std::cerr
+            << "Compiling regex failed at: " << perr.offset
+            << "; re=" << pattern;
+        if (regexp != NULL) {
+            uregex_close(regexp);
+        }
+        return FwCode::CompileRegExFailed;
+    }
+    
+    result = regexp;
+    return FwCode::FwOk;
+}
diff --git a/UCharUtils.h b/UCharUtils.h
new file mode 100644
index 0000000..4f751be
--- /dev/null
+++ b/UCharUtils.h
@@ -0,0 +1,139 @@
+/* Copyright (C) 2008 Yahoo! Inc. All Rights Reserved. */
+
+#ifndef UCHAR_UTILS_H
+#define UCHAR_UTILS_H
+
+#include <unicode/ucnv.h>
+#include <string>
+#include "FwCode.h"
+#include <unicode/uregex.h>
+
+// Forward declaration
+class UCharUtilsImpl;
+
+/**
+ * Some handy utilities for working with unicode characters.  Yes, these
+ * could have just been some regular routines instead of static methods
+ * in a class, but doing it this way gives us some containment of what
+ * other static tidbits might be necessary (like reusable buffer space),
+ * which are all hidden within the UCharUtilsImpl class.
+ *
+ * This is a singleton - do not use in a threaded program.
+ */
+class UCharUtils {
+    private:
+
+        /**
+         * The shared UCharUtilsImpl that holds the converter and the
+         * reusable buffers.
+         */
+        static UCharUtilsImpl *instance_;
+    public:
+
+        /**
+         * Initialize the utilities.  Primarily opens the utf-8 converter.
+         * Calling this is required prior to using the converter.
+         * 
+         * @return FwCode::FwOk on success, FwCode::UcnvOpenFailed on
+         *         failure.
+         */
+        static FwCode::ResponseCode init();
+
+        /**
+         * Release all resources.  <code>init()</code> must be called again
+         * in order to use again.
+         */
+        static void close();
+
+        /**
+         * Small wrapper to hide multi-line thoth api inside single-line call.
+         *
+         * @param value string to be tested for utf-8-ness
+         * @return true if it is utf-8, false if not
+         */
+        static bool isUTF8(const std::string& value);
+
+        /**
+         * Small wrapper to hide multi-line thoth api inside single-line call.
+         *
+         * @param value char string to be tested for utf-8-ness
+         * @param value_len length of <code>value</code>
+         * @return true if it is utf-8, false if not
+         */
+        static bool isUTF8(const char * value, size_t value_len);
+
+        /**
+         * Convert utf-8 strings into UChar strings. Note that the
+         * result is an internal reusable buffer so the caller should 
+         * *not* release it.
+         * @param input utf-8 string to convert
+         * @param len (output) set to length of output string, 0 on failure
+         * @return NULL if anything bad happens, otherwise a pointer to the
+         *         internal UChar buffer; the caller must *NEVER* free
+         *         this pointer.
+         */
+        static UChar * getUChar(const std::string &input, int32_t& len);
+
+        /**
+         * Do a NFC normalization so that different yet equivalent strings
+         * will have a single representation. See 
+         * http://www.unicode.org/unicode/reports/tr15/
+         * for more information.
+         * @param input A UTF-8 string that we want to normalize
+         * @param result (output) the normalized UTF-8 string
+         * @return FwCode::FwOk on success,
+         *         FwCode::FwError on conversion failure,
+         *         FwCode::DataNotUtf8 if input was not utf-8
+         */
+        static FwCode::ResponseCode normalize(const std::string &input, 
+                                              std::string &result);
+
+        /**
+         * Compile a regular expression in a unicode-friendly way.
+         *
+         * @param pattern the regexp pattern to compile.  Assumed to 
+         *        be utf-8.
+         * @param result (output) Set to point to the compiled regexp.
+         *        Must be released by the caller via uregex_close() when
+         *        finished with it.
+         * @return FwCode::FwOk if compilation succeeded,
+         *         FwCode::CompileRegExFailed or FwCode::ConvertToUCharFailed
+         *         on failure.
+         */
+        static FwCode::ResponseCode parseRegExpPattern
+            (const std::string &pattern,
+             URegularExpression * & result /* out */);
+
+};
+
+/**
+ * Bug 2574599 - Impl exposed for use by multiple threads; singleton not
+ * appropriate for multi-threaded program.
+ *
+ * Holds one ICU converter plus three scratch buffers that are grown on
+ * demand and reused between calls.  The object owns all of them through
+ * raw pointers and frees them in its destructor, so copying is disabled:
+ * a copy would free the same buffers and converter a second time.
+ */
+class UCharUtilsImpl
+{
+private:
+    UConverter *uconv_;
+
+public:
+    UCharUtilsImpl();
+    ~UCharUtilsImpl();
+
+    /** Open the utf-8 converter; required before convert()/normalize(). */
+    FwCode::ResponseCode init();
+
+    /** Free the three scratch buffers. */
+    void reset();
+
+    /** Convert UTF-8 input into ucBuff; len receives the UChar count. */
+    FwCode::ResponseCode convert(const std::string &input, int32_t &len);
+
+    /** NFC-normalize UTF-8 input into result. */
+    FwCode::ResponseCode normalize(const std::string &input, std::string &result);
+
+    // Buffer used to convert from UTF-8 into UChar
+    int32_t ucBuffLen;
+    UChar *ucBuff;
+
+    // Buffer used for UChar normalization output
+    int32_t ucNormBuffLen;
+    UChar *ucNormBuff;
+
+    // Buffer used to convert UChars back to UTF-8
+    int32_t charBuffLen;
+    char   *charBuff;
+
+private:
+    // Not copyable: declared but intentionally never defined.
+    UCharUtilsImpl(const UCharUtilsImpl &);
+    UCharUtilsImpl &operator=(const UCharUtilsImpl &);
+};
+
+#endif // _DHT_UCHAR_UTILS_
diff --git a/adriana-lima.awk b/adriana-lima.awk
new file mode 100755
index 0000000..4454496
--- /dev/null
+++ b/adriana-lima.awk
@@ -0,0 +1,130 @@
+#! /usr/bin/awk -f
+
+# Set the SLA thresholds, zero every counter and histogram bucket, and
+# print the report header.
+BEGIN{
+
+    # Latency thresholds above which a request counts as an SLA failure.
+    READ_SLA = 500;
+    WRITE_SLA = 750;
+
+    # Request counters.
+    readcnt = writecnt = 0;
+
+    # Write-latency accumulators: sum, maximum, sum of squares, SLA misses.
+    wlat_tot = wlat_max = wlat_sqtot = wlat_slafail = 0;
+
+    # Histogram layout: DIST_BUCKET_COUNT buckets of DIST_BUCKET_LENGTH each;
+    # the last bucket also collects everything beyond the range.
+    DIST_BUCKET_LENGTH = 100;
+    DIST_BUCKET_COUNT = 20;
+    i = 1;
+    while (i <= DIST_BUCKET_COUNT)
+    {
+        rlat_dist[i] = 0;
+        wlat_dist[i] = 0;
+        i++;
+    }
+
+    # Read-latency accumulators, same layout as the write ones.
+    rlat_tot = rlat_max = rlat_sqtot = rlat_slafail = 0;
+
+    printf("READ SLA:\t%d\n", READ_SLA);
+    printf("WRITE SLA:\t%d\n", WRITE_SLA);
+    printf("\n");
+
+}
+
+# One read request: take the number that follows "latency:" and fold it
+# into the read counters and the read histogram.
+/INFO - doRead()/ {
+    readcnt++;
+
+    # Text that starts one character past the "latency:" tag.
+    lat_text = substr($0, match($0, "latency:") + length("latency:") + 1);
+    split(lat_text, tmp_arr, " ");
+
+    lat_val = strtonum(tmp_arr[1]);
+
+    # 1-based bucket index, clamped into the last (overflow) bucket.
+    dist_index = int(lat_val / DIST_BUCKET_LENGTH) + 1;
+    if (dist_index > DIST_BUCKET_COUNT)
+        dist_index = DIST_BUCKET_COUNT;
+    rlat_dist[dist_index]++;
+
+    rlat_tot += lat_val;
+    rlat_sqtot += lat_val*lat_val;
+
+    if (lat_val > rlat_max)
+        rlat_max = lat_val;
+
+    if (lat_val > READ_SLA)
+        rlat_slafail++;
+}
+
+
+# One insert request: take the number that follows "latency:" (dropping
+# anything from the first comma on) and fold it into the write counters
+# and the write histogram.
+/INFO - doInsert()/ {
+    writecnt++;
+
+    # Text that starts one character past the "latency:" tag.
+    lat_text = substr($0, match($0, "latency:") + length("latency:") + 1);
+    split(lat_text, tmp_arr, " ");
+
+    # Keep only what precedes a comma, if the token contains one.
+    comma_at = index(tmp_arr[1], ",");
+    if (comma_at != 0)
+        lat_val = substr(tmp_arr[1], 1, comma_at - 1);
+    else
+        lat_val = tmp_arr[1];
+
+    lat_val = strtonum(lat_val);
+
+    # 1-based bucket index, clamped into the last (overflow) bucket.
+    dist_index = int(lat_val / DIST_BUCKET_LENGTH) + 1;
+    if (dist_index > DIST_BUCKET_COUNT)
+        dist_index = DIST_BUCKET_COUNT;
+    wlat_dist[dist_index]++;
+
+    wlat_tot += lat_val;
+    wlat_sqtot += lat_val*lat_val;
+
+    if (lat_val > wlat_max)
+        wlat_max = lat_val;
+
+    if (lat_val > WRITE_SLA)
+        wlat_slafail++;
+}
+
+
+# Print the summary: read/write ratio, then count, mean, variance, maximum,
+# SLA failures and the latency histogram for reads and for writes.
+END{
+
+    # The ratio is undefined when the log has no inserts.  Dividing anyway
+    # aborts awk with a division-by-zero error before any statistics print.
+    if(strtonum(writecnt) != 0)
+        printf("R/W ratio:\t%.2f\n", strtonum(readcnt) / strtonum(writecnt));
+    else
+        printf("R/W ratio:\tn/a (no writes)\n");
+
+    printf("\n");
+
+    printf("#reads:\t%d\n",readcnt);
+    if(strtonum(readcnt) != 0)
+    {
+        printf("avg read latency:\t%.2f\n", (rlat_tot / readcnt));
+        # variance = E[x^2] - (E[x])^2
+        printf("var read latency:\t%.2f\n", (rlat_sqtot/readcnt) - (rlat_tot/readcnt)*(rlat_tot/readcnt));
+        printf("max read latency:\t%.2f\n", rlat_max);
+        printf("read SLA fail:\t%d\n", rlat_slafail);
+
+        printf("\nREAD LATENCY DISTRIBUTION\n");
+        for(i=1; i<DIST_BUCKET_COUNT; i++)
+            printf("\t%d - %d:\t%d\n", (i-1)*DIST_BUCKET_LENGTH, i*DIST_BUCKET_LENGTH-1, rlat_dist[i]);
+        # i == DIST_BUCKET_COUNT here: the open-ended overflow bucket.
+        printf("\t%d - Inf:\t%d\n", (i-1)*DIST_BUCKET_LENGTH, rlat_dist[i]);
+    }
+    
+    printf("\n");
+    
+    printf("#writes:\t%d\n",writecnt);
+    if(strtonum(writecnt) != 0)
+    {
+        printf("avg write latency:\t%.2f\n", (wlat_tot / writecnt));
+        # variance = E[x^2] - (E[x])^2
+        printf("var write latency:\t%.2f\n", (wlat_sqtot/writecnt) - (wlat_tot/writecnt)*(wlat_tot/writecnt));
+        printf("max write latency:\t%.2f\n", wlat_max);
+        printf("write SLA fail:\t%d\n", wlat_slafail);
+
+        printf("\nWRITE LATENCY DISTRIBUTION\n");
+        for(i=1; i<DIST_BUCKET_COUNT; i++)
+            printf("\t%d - %d:\t%d\n", (i-1)*DIST_BUCKET_LENGTH, i*DIST_BUCKET_LENGTH-1, wlat_dist[i]);
+        # i == DIST_BUCKET_COUNT here: the open-ended overflow bucket.
+        printf("\t%d - Inf:\t%d\n", (i-1)*DIST_BUCKET_LENGTH, wlat_dist[i]);
+    }
+
+    
+}
+
diff --git a/check_datapage.cpp b/check_datapage.cpp
new file mode 100644
index 0000000..71eb08d
--- /dev/null
+++ b/check_datapage.cpp
@@ -0,0 +1,321 @@
+#include <string>
+#include <vector>
+#include <iostream>
+#include <sstream>
+#include "logstore.h"
+#include "datapage.cpp"
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/time.h>
+#include <time.h>
+
+#undef begin
+#undef end
+
+template class DataPage<datatuple>;
+
+// Strict "less than" for keys, comparing them as NUL-terminated C strings.
+bool mycmp(const std::string & k1,const std::string & k2)
+{
+    const char *lhs = k1.c_str();
+    const char *rhs = k2.c_str();
+    // strcmp() is negative exactly when lhs orders before rhs
+    const int order = strcmp(lhs, rhs);
+    return order < 0;
+}
+
+// Collapses each run of equal keys to its first element in one pass.
+// Must be given a sorted array (equal keys have to be adjacent).
+void removeduplicates(std::vector<std::string> &arr)
+{
+    if(arr.empty()) return;
+    size_t kept = 1; // arr[0..kept) holds the distinct keys found so far
+    for(size_t i=1; i<arr.size(); i++)
+        // two keys are equal when neither orders before the other
+        if(mycmp(arr[kept-1], arr[i]) || mycmp(arr[i], arr[kept-1]))
+            arr[kept++] = arr[i];
+    arr.resize(kept);
+}
+
+// Appends `count` random strings of decimal digits to arr. Each string has
+// between 2 and 2*avg_len+1 characters, so avg_len must be positive.
+// Unless duplicates_allowed is set, a candidate equal to one of the first j
+// strings in arr is discarded and regenerated; that check assumes arr was
+// empty on entry.
+void preprandstr(int count, std::vector<std::string> &arr, int avg_len=50, bool duplicates_allowed=false)
+{
+
+    for ( int j=0; j<count; j++)
+    {
+        // the key itself has str_len-1 characters
+        int str_len = (rand()%(avg_len*2)) + 3;
+
+        // Build the key directly in a std::string so that no temporary
+        // buffer can leak when a duplicate is rejected below.
+        std::string str(str_len-1, '0');
+        for(int i=0; i<str_len-1; i++)
+            str[i] = rand()%10+48; // ASCII '0'..'9'
+
+        //make sure there is no duplicate key
+        if(!duplicates_allowed)
+        {
+            bool dup = false;
+            for(int i=0; i<j; i++)
+                if(! (mycmp(arr[i], str) || mycmp(str, arr[i])))
+                {
+                    dup=true;
+                    break;
+                }
+            if(dup)
+            {
+                // retry this slot: undo the loop increment
+                j--;
+                continue;
+            }
+        }
+
+        arr.push_back(str);
+
+    }
+}
+
+/**
+ * REGION ALLOCATION: returns the page id held in a->nextPage and advances it,
+ * first allocating a new region when the current one is used up.
+ **/
+pageid_t alloc_region(int xid, void *conf)
+{
+    RegionAllocConf_t* a = (RegionAllocConf_t*)conf;
+    // nextPage == endOfRegion: the current region has no unused page left.
+  if(a->nextPage == a->endOfRegion) {
+    if(a->regionList.size == -1) {
+        //DEBUG("nextPage: %lld\n", a->nextPage);
+        a->regionList = TarrayListAlloc(xid, 1, 4, sizeof(pageid_t));
+        DEBUG("regionList.page: %lld\n", a->regionList.page);
+        DEBUG("regionList.slot: %d\n", a->regionList.slot);
+        DEBUG("regionList.size: %lld\n", a->regionList.size);
+        // NOTE(review): size == -1 presumably marks a list that was never allocated
+        a->regionCount = 0;
+    }
+    DEBUG("{%lld <- alloc region arraylist}\n", a->regionList.page);
+    TarrayListExtend(xid,a->regionList,1);
+    a->regionList.slot = a->regionCount;
+    DEBUG("region lst slot %d\n",a->regionList.slot);
+    a->regionCount++;
+    DEBUG("region count %lld\n",a->regionCount);
+    a->nextPage = TregionAlloc(xid, a->regionSize,12);
+    DEBUG("next page %lld\n",a->nextPage);
+    a->endOfRegion = a->nextPage + a->regionSize;
+    Tset(xid,a->regionList,&a->nextPage);
+    DEBUG("next page %lld\n",a->nextPage);
+  }
+
+  DEBUG("%lld ?= %lld\n", a->nextPage,a->endOfRegion);
+  pageid_t ret = a->nextPage;
+  // Ensure the page is in buffer cache without accessing disk (this
+  // sets it to clean and all zeros if the page is not in cache).
+  // Hopefully, future reads will get a cache hit, and avoid going to
+  // disk.
+
+  Page * p = loadUninitializedPage(xid, ret);
+  releasePage(p);
+  DEBUG("ret %lld\n",ret);
+  (a->nextPage)++;
+  return ret;
+}
+
+
+// Record-backed wrapper around alloc_region(): ridp points at the recordid holding the RegionAllocConf_t.
+pageid_t alloc_region_rid(int xid, void * ridp) {
+  recordid state_rid = *(recordid*)ridp;
+  RegionAllocConf_t state;
+  Tread(xid, state_rid, &state);
+  pageid_t page = alloc_region(xid, &state);
+  DEBUG("{%lld <- alloc region extend}\n", state.regionList.page);
+  Tset(xid, state_rid, &state); // write the updated state back to the record
+  return page;
+}
+
+
+/**
+ * Round-trip test for DataPage<datatuple>: appends random key/value tuples
+ * to data pages, then reads every page back and checks the key and data
+ * lengths in insertion order.
+ * @param NUM_ENTRIES requested tuple count; lowered to the number of distinct
+ *        keys left after de-duplication.
+ */
+void insertProbeIter(int  NUM_ENTRIES)
+{
+    srand(1000);
+    unlink("storefile.txt");
+    unlink("logfile.txt");
+
+    sync();
+
+    bufferManagerNonBlockingSlowHandleType = IO_HANDLE_PFILE;
+
+    Tinit();
+
+    int xid = Tbegin();
+
+    std::vector<std::string> data_arr;
+    std::vector<std::string> key_arr;
+    preprandstr(NUM_ENTRIES, data_arr, 5*4096, true);
+    preprandstr(NUM_ENTRIES+200, key_arr, 50, true);//well i can handle upto 200
+
+    std::sort(key_arr.begin(), key_arr.end(), &mycmp);
+
+    removeduplicates(key_arr);
+    if(key_arr.size() > NUM_ENTRIES)
+        key_arr.erase(key_arr.begin()+NUM_ENTRIES, key_arr.end());
+
+    NUM_ENTRIES=key_arr.size();
+
+    if(data_arr.size() > NUM_ENTRIES)
+        data_arr.erase(data_arr.begin()+NUM_ENTRIES, data_arr.end());
+
+    recordid alloc_state = Talloc(xid,sizeof(RegionAllocConf_t));
+
+    Tset(xid,alloc_state, &logtree::REGION_ALLOC_STATIC_INITIALIZER);
+
+    printf("Stage 1: Writing %d keys\n", NUM_ENTRIES);
+
+    int pcount = 10;
+    int dpages = 0;
+    DataPage<datatuple> *dp=0;
+    int64_t datasize = 0;
+    std::vector<pageid_t> dsp;
+    for(int i = 0; i < NUM_ENTRIES; i++)
+    {
+        //prepare the key
+        datatuple newtuple;
+        uint32_t keylen = key_arr[i].length()+1;
+        newtuple.keylen = &keylen;
+
+        newtuple.key = (datatuple::key_t) malloc(keylen);
+        for(int j=0; j<keylen-1; j++)
+            newtuple.key[j] = key_arr[i][j];
+        newtuple.key[keylen-1]='\0';
+
+        //prepare the data
+        uint32_t datalen = data_arr[i].length()+1;
+        newtuple.datalen = &datalen;
+
+        newtuple.data = (datatuple::data_t) malloc(datalen);
+        for(int j=0; j<datalen-1; j++)
+            newtuple.data[j] = data_arr[i][j];
+        newtuple.data[datalen-1]='\0';
+
+        /*
+        printf("key: \t, keylen: %u\ndata:  datalen: %u\n",
+               //newtuple.key,
+               *newtuple.keylen,
+               //newtuple.data,
+               *newtuple.datalen);
+               */
+        datasize += newtuple.byte_length();
+        if(dp==NULL || !dp->append(xid, newtuple))
+        {
+            // no data page yet, or the current one is full: start a new one
+            dpages++;
+            if(dp)
+                delete dp;
+
+            dp = new DataPage<datatuple>(xid, pcount, &DataPage<datatuple>::dp_alloc_region_rid, &alloc_state );
+
+            if(!dp->append(xid, newtuple))
+            {
+                delete dp;
+                dp = new DataPage<datatuple>(xid, pcount, &DataPage<datatuple>::dp_alloc_region_rid, &alloc_state );
+                // keep the append outside assert() so it still runs under NDEBUG
+                bool appended = dp->append(xid, newtuple);
+                assert(appended);
+                (void)appended; // unused when NDEBUG disables the assert
+            }
+
+            dsp.push_back(dp->get_start_pid());
+        }
+        // release the staging buffers, as check_logtable.cpp does after append()
+        free(newtuple.key);
+        free(newtuple.data);
+    }
+
+    // stage 2 reopens the pages by start pid, so this handle is done
+    delete dp;
+
+    printf("Total data set length: %lld\n", (long long)datasize);
+    printf("Storage utilization: %.2f\n", (datasize+.0) / (PAGE_SIZE * pcount * dpages));
+    printf("Number of datapages: %d\n", dpages);
+    printf("Writes complete.\n");
+
+    Tcommit(xid);
+    xid = Tbegin();
+
+    printf("Stage 2: Reading %d tuples\n", NUM_ENTRIES);
+
+
+    int tuplenum = 0;
+    for(int i = 0; i < dpages ; i++)
+    {
+        DataPage<datatuple> dp(xid, dsp[i]);
+        DataPage<datatuple>::RecordIterator itr = dp.begin();
+        datatuple *dt=0;
+        while( (dt=itr.getnext(xid)) != NULL)
+            {
+                assert(*(dt->keylen) == key_arr[tuplenum].length()+1);
+                assert(*(dt->datalen) == data_arr[tuplenum].length()+1);
+                tuplenum++;
+                free(dt->keylen);
+                free(dt);
+                dt = 0;
+            }
+
+    }
+
+    printf("Reads completed.\n");
+/*
+    
+    int64_t count = 0;
+    lladdIterator_t * it = logtreeIterator::open(xid, tree);
+
+    while(logtreeIterator::next(xid, it)) {
+        byte * key;
+        byte **key_ptr = &key;
+        int keysize = logtreeIterator::key(xid, it, (byte**)key_ptr);
+        
+        pageid_t *value;
+        pageid_t **value_ptr = &value;
+        int valsize = lsmTreeIterator_value(xid, it, (byte**)value_ptr);
+        //printf("keylen %d key %s\n", keysize, (char*)(key)) ;
+        assert(valsize == sizeof(pageid_t));
+        assert(!mycmp(std::string((char*)key), arr[count]) && !mycmp(arr[count],std::string((char*)key)));
+        assert(keysize == arr[count].length()+1);
+        count++;
+    }
+    assert(count == NUM_ENTRIES);
+
+    logtreeIterator::close(xid, it);
+
+    
+    */
+
+
+        Tcommit(xid);
+        Tdeinit();
+}
+
+
+/** @test
+ * Entry point: runs the data page round trip once with 10000 requested tuples.
+ */
+int main()
+{
+    const int requested_entries = 10000;
+
+    insertProbeIter(requested_entries);
+    return 0;
+}
+
diff --git a/check_gen.cpp b/check_gen.cpp
new file mode 100644
index 0000000..100d9d0
--- /dev/null
+++ b/check_gen.cpp
@@ -0,0 +1,39 @@
+
+
+#include "logstore.h"
+
+/**
+ * Smoke test: allocates a log table in a fresh store, then opens and closes
+ * an iterator over its c2 tree without inserting anything.
+ */
+int main(int argc, char **argv)
+{
+    unlink("storefile.txt");
+    unlink("logfile.txt");
+
+    sync();
+
+    //    PAGELAYOUT::initPageLayout();
+
+    bufferManagerNonBlockingSlowHandleType = IO_HANDLE_PFILE;
+
+    Tinit();
+
+    int xid = Tbegin();
+
+    logtable ltable;
+
+    ltable.allocTable(xid); // the returned root record is not needed here
+
+    Tcommit(xid);
+
+    //ltable.startTable();
+    xid = Tbegin();
+    // close the iterator again so it is not leaked before shutdown
+    lladdIterator_t * it = logtreeIterator::open(xid,ltable.get_tree_c2()->get_root_rec() );
+    logtreeIterator::close(xid, it);
+    Tcommit(xid);
+
+    Tdeinit();
+    return 0;
+}
diff --git a/check_logtable.cpp b/check_logtable.cpp
new file mode 100644
index 0000000..5d01500
--- /dev/null
+++ b/check_logtable.cpp
@@ -0,0 +1,276 @@
+
+#include <string>
+#include <vector>
+#include <iostream>
+#include <sstream>
+#include "logstore.h"
+#include "datapage.cpp"
+#include "logiterators.cpp"
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/time.h>
+#include <time.h>
+
+#undef begin
+#undef end
+
+
+
+//template class DataPage<datatuple>;
+template class treeIterator<datatuple>;
+
+// Strict "less than" for keys, comparing them as NUL-terminated C strings.
+bool mycmp(const std::string & k1,const std::string & k2)
+{
+    const char *lhs = k1.c_str();
+    const char *rhs = k2.c_str();
+    // strcmp() is negative exactly when lhs orders before rhs
+    const int order = strcmp(lhs, rhs);
+    return order < 0;
+}
+
+// Collapses each run of equal keys to its first element in one pass.
+// Must be given a sorted array (equal keys have to be adjacent).
+void removeduplicates(std::vector<std::string> &arr)
+{
+    if(arr.empty()) return;
+    size_t kept = 1; // arr[0..kept) holds the distinct keys found so far
+    for(size_t i=1; i<arr.size(); i++)
+        // two keys are equal when neither orders before the other
+        if(mycmp(arr[kept-1], arr[i]) || mycmp(arr[i], arr[kept-1]))
+            arr[kept++] = arr[i];
+    arr.resize(kept);
+}
+
+// Appends `count` random strings of decimal digits to arr. Each string has
+// between 2 and 2*avg_len+1 characters, so avg_len must be positive.
+// Unless duplicates_allowed is set, a candidate equal to one of the first j
+// strings in arr is discarded and regenerated; that check assumes arr was
+// empty on entry.
+void preprandstr(int count, std::vector<std::string> &arr, int avg_len=50, bool duplicates_allowed=false)
+{
+
+    for ( int j=0; j<count; j++)
+    {
+        // the key itself has str_len-1 characters
+        int str_len = (rand()%(avg_len*2)) + 3;
+
+        // Build the key directly in a std::string so that no temporary
+        // buffer can leak when a duplicate is rejected below.
+        std::string str(str_len-1, '0');
+        for(int i=0; i<str_len-1; i++)
+            str[i] = rand()%10+48; // ASCII '0'..'9'
+
+        //make sure there is no duplicate key
+        if(!duplicates_allowed)
+        {
+            bool dup = false;
+            for(int i=0; i<j; i++)
+                if(! (mycmp(arr[i], str) || mycmp(str, arr[i])))
+                {
+                    dup=true;
+                    break;
+                }
+            if(dup)
+            {
+                // retry this slot: undo the loop increment
+                j--;
+                continue;
+            }
+        }
+
+        arr.push_back(str);
+
+    }
+}
+
+/**
+ * End-to-end test of logtable's c1 tree: inserts random key/value tuples,
+ * scans them back in order with treeIterator, then performs size/3 lookups
+ * of randomly chosen keys with findTuple().
+ * @param NUM_ENTRIES requested tuple count; lowered to the number of distinct
+ *        keys left after de-duplication.
+ */
+void insertProbeIter(int  NUM_ENTRIES)
+{
+    srand(1000);
+    unlink("storefile.txt");
+    unlink("logfile.txt");
+
+    sync();
+
+    bufferManagerNonBlockingSlowHandleType = IO_HANDLE_PFILE;
+
+    Tinit();
+
+    int xid = Tbegin();
+
+    logtable ltable;
+
+    int pcount = 5;
+    ltable.set_fixed_page_count(pcount);
+    recordid table_root = ltable.allocTable(xid);
+
+    Tcommit(xid);
+
+    xid = Tbegin();
+    logtree *lt = ltable.get_tree_c1();
+
+    recordid tree_root = lt->get_root_rec();
+
+    std::vector<std::string> data_arr;
+    std::vector<std::string> key_arr;
+    preprandstr(NUM_ENTRIES, data_arr, 5*4096, true);
+    preprandstr(NUM_ENTRIES+200, key_arr, 50, true);//well i can handle upto 200
+
+    std::sort(key_arr.begin(), key_arr.end(), &mycmp);
+
+    removeduplicates(key_arr);
+    if(key_arr.size() > NUM_ENTRIES)
+        key_arr.erase(key_arr.begin()+NUM_ENTRIES, key_arr.end());
+
+    NUM_ENTRIES=key_arr.size();
+
+    if(data_arr.size() > NUM_ENTRIES)
+        data_arr.erase(data_arr.begin()+NUM_ENTRIES, data_arr.end());
+
+    printf("Stage 1: Writing %d keys\n", NUM_ENTRIES);
+
+    int dpages = 0;
+    int npages = 0;
+    DataPage<datatuple> *dp=0;
+    int64_t datasize = 0;
+    std::vector<pageid_t> dsp;
+    for(int i = 0; i < NUM_ENTRIES; i++)
+    {
+        //prepare the key
+        datatuple newtuple;
+        uint32_t keylen = key_arr[i].length()+1;
+        newtuple.keylen = &keylen;
+
+        newtuple.key = (datatuple::key_t) malloc(keylen);
+        for(int j=0; j<keylen-1; j++)
+            newtuple.key[j] = key_arr[i][j];
+        newtuple.key[keylen-1]='\0';
+
+        //prepare the data
+        uint32_t datalen = data_arr[i].length()+1;
+        newtuple.datalen = &datalen;
+
+        newtuple.data = (datatuple::data_t) malloc(datalen);
+        for(int j=0; j<datalen-1; j++)
+            newtuple.data[j] = data_arr[i][j];
+        newtuple.data[datalen-1]='\0';
+
+//        printf("key: \t, keylen: %u\ndata:  datalen: %u\n",
+               //newtuple.key,
+//               *newtuple.keylen,
+               //newtuple.data,
+//               *newtuple.datalen);
+
+        datasize += newtuple.byte_length();
+
+        if(dp == NULL)
+        {
+            dp = ltable.insertTuple(xid, newtuple, ltable.get_dpstate1(), lt);
+            dpages++;
+            dsp.push_back(dp->get_start_pid());
+        }
+        else
+        {
+            if(!dp->append(xid, newtuple))
+            {
+                npages += dp->get_page_count();
+                delete dp;
+                dp = ltable.insertTuple(xid, newtuple, ltable.get_dpstate1(), lt);
+                dpages++;
+                dsp.push_back(dp->get_start_pid());
+            }
+        }
+
+        free(newtuple.key);
+        free(newtuple.data);
+    }
+
+    // the data page still open here has not been added to npages yet
+    if(dp)
+    {
+        npages += dp->get_page_count();
+        delete dp;
+    }
+    printf("\nTREE STRUCTURE\n");
+    lt->print_tree(xid);
+
+    printf("Total data set length: %lld\n", (long long)datasize);
+    printf("Storage utilization: %.2f\n", (datasize+.0) / (PAGE_SIZE * npages));
+    printf("Number of datapages: %d\n", dpages);
+    printf("Writes complete.\n");
+
+    Tcommit(xid);
+    xid = Tbegin();
+
+    printf("Stage 2: Sequentially reading %d tuples\n", NUM_ENTRIES);
+
+    int tuplenum = 0;
+    treeIterator<datatuple> tree_itr(tree_root);
+
+    datatuple *dt=0;
+    while( (dt=tree_itr.getnext()) != NULL)
+    {
+        assert(*(dt->keylen) == key_arr[tuplenum].length()+1);
+        assert(*(dt->datalen) == data_arr[tuplenum].length()+1);
+        tuplenum++;
+        free(dt->keylen);
+        free(dt);
+        dt = 0;
+    }
+
+    assert(tuplenum == key_arr.size());
+
+    printf("Sequential Reads completed.\n");
+
+    int rrsize=key_arr.size() / 3;
+    printf("Stage 3: Randomly reading %d tuples by key\n", rrsize);
+
+    for(int i=0; i<rrsize; i++)
+    {
+        //randomly pick a key
+        int ri = rand()%key_arr.size();
+
+        //get the key
+        uint32_t keylen = key_arr[ri].length()+1;
+        datatuple::key_t rkey = (datatuple::key_t) malloc(keylen);
+        for(int j=0; j<keylen-1; j++)
+            rkey[j] = key_arr[ri][j];
+        rkey[keylen-1]='\0';
+
+        //find the key with the given tuple
+        datatuple *dt = ltable.findTuple(xid, rkey, keylen, lt);
+
+        assert(dt!=0);
+        assert(*(dt->keylen) == key_arr[ri].length()+1);
+        assert(*(dt->datalen) == data_arr[ri].length()+1);
+        free(dt->keylen);
+        free(dt);
+        dt = 0;
+        free(rkey); // the lookup key buffer is no longer needed
+    }
+
+    printf("Random Reads completed.\n");
+    Tcommit(xid);
+    Tdeinit();
+}
+
+/** @test
+ * Entry point: runs the log table scenario once with 15000 requested tuples.
+ */
+int main()
+{
+    const int requested_entries = 15000;
+
+    insertProbeIter(requested_entries);
+    return 0;
+}
+
diff --git a/check_logtree.cpp b/check_logtree.cpp
new file mode 100644
index 0000000..6e4a3c1
--- /dev/null
+++ b/check_logtree.cpp
@@ -0,0 +1,331 @@
+
+#include <string>
+#include <vector>
+#include <iostream>
+#include <sstream>
+#include "logstore.h"
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/time.h>
+#include <time.h>
+
+#define LOG_NAME   "check_logTree.log"
+#define NUM_ENTRIES_A 10000
+#define NUM_ENTRIES_B 10
+#define NUM_ENTRIES_C 0
+
+#define OFFSET      (NUM_ENTRIES * 10)
+
+#undef begin
+#undef end
+
+// Strict "less than" for keys, comparing them as NUL-terminated C strings.
+bool mycmp(const std::string & k1,const std::string & k2)
+{
+    const char *lhs = k1.c_str();
+    const char *rhs = k2.c_str();
+    // strcmp() is negative exactly when lhs orders before rhs
+    const int order = strcmp(lhs, rhs);
+    return order < 0;
+}
+
+// Appends `count` distinct random strings of decimal digits to arr.
+// Each string has between 2 and 101 characters. A candidate equal to one of
+// the first j strings in arr is discarded and regenerated, which assumes arr
+// was empty on entry.
+void preprandstr(int count, std::vector<std::string> &arr)
+{
+
+    for ( int j=0; j<count; j++)
+    {
+        // the key itself has str_len-1 characters
+        int str_len = rand()%100 + 3;
+
+        // Build the key directly in a std::string so that no temporary
+        // buffer can leak when a duplicate is rejected below.
+        std::string str(str_len-1, '0');
+        for(int i=0; i<str_len-1; i++)
+            str[i] = rand()%10+48; // ASCII '0'..'9'
+
+        //make sure there is no duplicate key
+        bool dup = false;
+        for(int i=0; i<j; i++)
+            if(! (mycmp(arr[i], str) || mycmp(str, arr[i])))
+            {
+                dup=true;
+                break;
+            }
+        if(dup)
+        {
+            // retry this slot: undo the loop increment
+            j--;
+            continue;
+        }
+
+        arr.push_back(str);
+
+    }
+
+}
+// Builds the c1 logtree from NUM_ENTRIES sorted random string keys, then checks
+// findPage() for every key and the order in which logtreeIterator returns them.
+void insertProbeIter_str(int  NUM_ENTRIES)
+{
+    srand(1000);
+    unlink("storefile.txt");
+    unlink("logfile.txt");
+
+    sync();
+
+    bufferManagerNonBlockingSlowHandleType = IO_HANDLE_PFILE;
+
+    Tinit();
+
+    int xid = Tbegin();
+
+    logtable ltable;
+
+    recordid table_root = ltable.allocTable(xid);
+
+    Tcommit(xid);
+    
+    xid = Tbegin();
+    logtree *lt = ltable.get_tree_c1();
+    
+    recordid tree = lt->get_root_rec();
+  
+    long oldpagenum = -1;
+
+    std::vector<std::string> arr;
+    preprandstr(NUM_ENTRIES, arr);
+    std::sort(arr.begin(), arr.end(), &mycmp);
+    
+    //for(int i = 0; i < NUM_ENTRIES; i++)
+    //{
+    //   printf("%s\t", arr[i].c_str());
+    //   int keylen = arr[i].length()+1;
+    //  printf("%d\n", keylen);      
+    //}
+
+
+    printf("Stage 1: Writing %d keys\n", NUM_ENTRIES);
+      
+    // Keys are appended in sorted order; key i is stored with page value i + OFFSET.
+    for(int i = 0; i < NUM_ENTRIES; i++)
+    {
+        int keylen = arr[i].length()+1;
+        byte *currkey = (byte*)malloc(keylen);
+        for(int j=0; j<keylen-1; j++)
+            currkey[j] = arr[i][j];
+        currkey[keylen-1]='\0';      
+        // Before the append, the assert below accepts -1 or the page of the previously appended key.
+        //printf("\n#########\ni=%d\nkey:\t%s\nkeylen:%d\n",i,((char*)currkey),keylen);
+        long pagenum = logtree::findPage(xid, tree, currkey, keylen);
+        //printf("pagenum:%d\n", pagenum);
+        assert(pagenum == -1 || pagenum == oldpagenum || oldpagenum == -1);
+        //printf("TlsmAppendPage %d\n",i);
+        // appendPage() receives a copy of the tree state record; it is written back with Tset below.
+        recordid rid = lt->get_tree_state();
+        RegionAllocConf_t alloc_conf;
+        Tread(xid,rid,&alloc_conf);
+      
+        logtree::appendPage(xid, tree, lt->lastLeaf, currkey, keylen, lt->alloc_region, &alloc_conf, i + OFFSET);
+
+        //DEBUG("{%lld <- alloc region extend}\n", conf.regionList.page);
+        // XXX get rid of Tset by storing next page in memory, and losing it
+        //     on crash.
+        Tset(xid,rid,&alloc_conf);
+      
+        // The key just appended must now resolve to its own page value.
+        pagenum = logtree::findPage(xid, tree, currkey,keylen);
+        oldpagenum = pagenum;
+        //printf("pagenum:%d\n", pagenum);      
+        assert(pagenum == i + OFFSET);
+        free(currkey);
+
+
+    }
+
+    printf("Writes complete.");
+    
+    tree = lt->get_root_rec();
+    Tcommit(xid);
+    xid = Tbegin();
+
+    printf("\nTREE STRUCTURE\n");
+    lt->print_tree(xid);
+
+    printf("Stage 2: Looking up %d keys\n", NUM_ENTRIES);
+  
+    for(int i = 0; i < NUM_ENTRIES; i++) {
+        int keylen = arr[i].length()+1;
+        byte *currkey = (byte*)malloc(keylen);
+        for(int j=0; j<keylen-1; j++)
+            currkey[j] = arr[i][j];
+        currkey[keylen-1]='\0';
+
+        //printf("\n#########\ni=%d\nkey:\t%s\nkeylen:%d\n",i,((char*)currkey),keylen);
+        long pagenum = logtree::findPage(xid, tree, currkey, keylen);
+        //printf("pagenum:%d\n", pagenum);      
+        assert(pagenum == i + OFFSET);
+        free(currkey);
+    }
+
+
+    printf("Stage 3: Iterating over %d keys\n", NUM_ENTRIES);
+
+    // The iterator must yield exactly the keys of arr, in the same (sorted) order.
+    int64_t count = 0;
+    lladdIterator_t * it = logtreeIterator::open(xid, tree);
+
+    while(logtreeIterator::next(xid, it)) {
+        byte * key;
+        byte **key_ptr = &key;
+        int keysize = logtreeIterator::key(xid, it, (byte**)key_ptr);
+        
+        pageid_t *value;
+        pageid_t **value_ptr = &value;
+        int valsize = lsmTreeIterator_value(xid, it, (byte**)value_ptr);
+        //printf("keylen %d key %s\n", keysize, (char*)(key)) ;
+        assert(valsize == sizeof(pageid_t));
+        assert(!mycmp(std::string((char*)key), arr[count]) && !mycmp(arr[count],std::string((char*)key)));
+        assert(keysize == arr[count].length()+1);
+        count++;
+    }
+    assert(count == NUM_ENTRIES);
+
+    logtreeIterator::close(xid, it);
+
+    
+
+
+  
+        Tcommit(xid);
+        Tdeinit();
+}
+
+
+
+
+/**
+ * int32 variant of the logtree test: appends the keys 0..NUM_ENTRIES-1 (raw
+ * 4-byte values) with page value i + OFFSET and checks findPage() after
+ * every append and again after the commit.
+ */
+void insertProbeIter_int(int  NUM_ENTRIES)
+{
+    unlink("storefile.txt");
+    unlink("logfile.txt");
+
+    sync();
+
+    bufferManagerNonBlockingSlowHandleType = IO_HANDLE_PFILE;
+
+    Tinit();
+
+    int xid = Tbegin();
+
+    logtable ltable;
+
+    recordid table_root = ltable.allocTable(xid);
+
+    Tcommit(xid);
+
+    xid = Tbegin();
+    logtree *lt = ltable.get_tree_c1();
+
+    recordid tree = lt->get_root_rec();
+
+    long oldpagenum = -1;
+
+    for(int32_t i = 0; i < NUM_ENTRIES; i++) {
+        int keylen = sizeof(int32_t);
+        byte *currkey = (byte*)malloc(keylen);
+        memcpy(currkey, (byte*)(&i), keylen);
+        printf("\n#########\ni=%d\nkey:\t%d\nkeylen:%d\n",i,*((int32_t*)currkey),keylen);
+        long pagenum = logtree::findPage(xid, tree, currkey, keylen);
+        printf("pagenum:%ld\n", pagenum); // pagenum is a long, so %ld rather than %d
+        assert(pagenum == -1 || pagenum == oldpagenum || oldpagenum == -1);
+        printf("TlsmAppendPage %d\n",i);
+
+        recordid rid = lt->get_tree_state();
+        RegionAllocConf_t alloc_conf;
+        Tread(xid,rid,&alloc_conf);
+        logtree::appendPage(xid, tree, lt->lastLeaf, currkey, keylen, lt->alloc_region, &alloc_conf, i + OFFSET);
+
+        //DEBUG("{%lld <- alloc region extend}\n", conf.regionList.page);
+        // XXX get rid of Tset by storing next page in memory, and losing it
+        //     on crash.
+        Tset(xid,rid,&alloc_conf);
+        pagenum = logtree::findPage(xid, tree, currkey,keylen);
+        oldpagenum = pagenum;
+        printf("pagenum:%ld\n", pagenum);
+        assert(pagenum == i + OFFSET);
+        free(currkey);
+    }
+
+    printf("Writes complete.");
+
+    tree = lt->get_root_rec();
+    Tcommit(xid);
+    xid = Tbegin();
+
+    printf("\nTREE STRUCTURE\n");
+    lt->print_tree(xid);
+
+    // NOTE(review): this pass starts at i = 1, so key 0 is never re-checked -- confirm that is intended.
+    for(int32_t i = 1; i < NUM_ENTRIES; i++) {
+        int keylen = sizeof(int32_t);
+        byte *currkey = (byte*)malloc(keylen);
+        memcpy(currkey, (byte*)(&i), keylen);
+
+        printf("\n#########\ni=%d\nkey:\t%d\nkeylen:%d\n",i,*((int32_t*)currkey),keylen);
+        long pagenum = logtree::findPage(xid, tree, currkey, keylen);
+        printf("pagenum:%ld\n", pagenum);
+        assert(pagenum == i + OFFSET);
+        free(currkey);
+    }
+
+    /*
+      int64_t count = 0;
+
+      lladdIterator_t * it = lsmTreeIterator_open(xid, tree);
+
+      while(lsmTreeIterator_next(xid, it)) {
+      lsmkey_t * key;
+      lsmkey_t **key_ptr = &key;
+      int size = lsmTreeIterator_key(xid, it, (byte**)key_ptr);
+      assert(size == sizeof(lsmkey_t));
+      long *value;
+      long **value_ptr = &value;
+      size = lsmTreeIterator_value(xid, it, (byte**)value_ptr);
+      assert(size == sizeof(pageid_t));
+      assert(*key + OFFSET == *value);
+      assert(*key == count);
+      count++;
+      }
+      assert(count == NUM_ENTRIES);
+
+      lsmTreeIterator_close(xid, it);
+
+    */
+    Tcommit(xid);
+    Tdeinit();
+}
+
+/** @test
+ * Runs the string-key scenario; the int32 key scenario is currently disabled.
+ */
+int main()
+{
+    const int entries = NUM_ENTRIES_A;
+
+    insertProbeIter_str(entries);
+    //insertProbeIter_int(entries);
+    return 0;
+}
+
+
diff --git a/check_merge.cpp b/check_merge.cpp
new file mode 100644
index 0000000..79a6bee
--- /dev/null
+++ b/check_merge.cpp
@@ -0,0 +1,246 @@
+#include <string>
+#include <vector>
+#include <iostream>
+#include <sstream>
+#include "logstore.h"
+#include "datapage.cpp"
+#include "logiterators.cpp"
+#include "merger.h"
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/time.h>
+#include <time.h>
+
+#undef begin
+#undef end
+
+
+
+
+// Strict "less than" for keys, comparing them as NUL-terminated C strings.
+bool mycmp(const std::string & k1,const std::string & k2)
+{
+    const char *lhs = k1.c_str();
+    const char *rhs = k2.c_str();
+    // strcmp() is negative exactly when lhs orders before rhs
+    const int order = strcmp(lhs, rhs);
+    return order < 0;
+}
+
+// Collapses each run of equal keys to its first element in one pass.
+// Must be given a sorted array (equal keys have to be adjacent).
+void removeduplicates(std::vector<std::string> *arr)
+{
+    std::vector<std::string> &keys = *arr;
+    if(keys.empty()) return;
+    size_t kept = 1; // keys[0..kept) holds the distinct keys found so far
+    for(size_t i=1; i<keys.size(); i++)
+        if(mycmp(keys[kept-1], keys[i]) || mycmp(keys[i], keys[kept-1])) // keys differ
+            keys[kept++] = keys[i];
+    keys.resize(kept);
+}
+
+// Appends `count` random digit strings to *arr; every string is between 2 and
+// 2*avg_len+1 characters long (avg_len must be positive).
+void preprandstr(int count, std::vector<std::string> *arr, int avg_len=50)
+{
+    int produced = 0;
+
+    while(produced < count)
+    {
+        // draw the length first, then one rand() per digit
+        const int digits = (rand()%(avg_len*2)) + 2;
+
+        std::string key;
+        key.reserve(digits);
+
+        for(int pos=0; pos<digits; pos++)
+            key.push_back(rand()%10+48); // ASCII '0'..'9'
+
+        arr->push_back(key);
+        produced++;
+    }
+
+}
+
+/**
+ * Merge test: inserts random key/value tuples into a logtable registered
+ * with a started merge_scheduler, then looks every key up again (last key first).
+ * @param NUM_ENTRIES requested tuple count; lowered to the number of distinct
+ *        keys left after de-duplication.
+ */
+void insertProbeIter(int  NUM_ENTRIES)
+{
+    srand(1000);
+    unlink("storefile.txt");
+    unlink("logfile.txt");
+
+    sync();
+
+    //data generation
+    std::vector<std::string> * data_arr = new std::vector<std::string>;
+    std::vector<std::string> * key_arr = new std::vector<std::string>;
+
+    preprandstr(NUM_ENTRIES, data_arr, 10*8192);
+    preprandstr(NUM_ENTRIES+200, key_arr, 100);
+
+    std::sort(key_arr->begin(), key_arr->end(), &mycmp);
+
+    removeduplicates(key_arr);
+    if(key_arr->size() > NUM_ENTRIES)
+        key_arr->erase(key_arr->begin()+NUM_ENTRIES, key_arr->end());
+
+    NUM_ENTRIES=key_arr->size();
+
+    if(data_arr->size() > NUM_ENTRIES)
+        data_arr->erase(data_arr->begin()+NUM_ENTRIES, data_arr->end());
+
+    bufferManagerNonBlockingSlowHandleType = IO_HANDLE_PFILE;
+
+    Tinit();
+
+    int xid = Tbegin();
+
+    merge_scheduler mscheduler;
+    logtable ltable;
+
+    int pcount = 5;
+    ltable.set_fixed_page_count(pcount);
+
+    recordid table_root = ltable.allocTable(xid);
+
+    Tcommit(xid);
+
+    xid = Tbegin();
+
+    int lindex = mscheduler.addlogtable(&ltable);
+    ltable.setMergeData(mscheduler.getMergeData(lindex));
+
+    mscheduler.startlogtable(lindex);
+
+    printf("Stage 1: Writing %d keys\n", NUM_ENTRIES);
+
+    struct timeval start_tv, stop_tv, ti_st, ti_end;
+    double insert_time = 0;
+    int64_t datasize = 0;
+    gettimeofday(&start_tv,0);
+    for(int i = 0; i < NUM_ENTRIES; i++)
+    {
+        //prepare the key
+        datatuple newtuple;
+        uint32_t keylen = (*key_arr)[i].length()+1;
+        newtuple.keylen = &keylen;
+
+        newtuple.key = (datatuple::key_t) malloc(keylen);
+        memcpy((byte*)newtuple.key, (*key_arr)[i].c_str(), keylen);
+        //for(int j=0; j<keylen-1; j++)
+        //    newtuple.key[j] = (*key_arr)[i][j];
+        //newtuple.key[keylen-1]='\0';
+
+        //prepare the data
+        uint32_t datalen = (*data_arr)[i].length()+1;
+        newtuple.datalen = &datalen;
+        newtuple.data = (datatuple::data_t) malloc(datalen);
+        memcpy((byte*)newtuple.data, (*data_arr)[i].c_str(), datalen);
+//        for(int j=0; j<datalen-1; j++)
+//            newtuple.data[j] = (*data_arr)[i][j];
+//        newtuple.data[datalen-1]='\0';
+
+        /*
+        printf("key: \t, keylen: %u\ndata:  datalen: %u\n",
+               //newtuple.key,
+               *newtuple.keylen,
+               //newtuple.data,
+               *newtuple.datalen);
+               */
+
+        datasize += newtuple.byte_length();
+
+        gettimeofday(&ti_st,0);
+        ltable.insertTuple(newtuple);
+        gettimeofday(&ti_end,0);
+        insert_time += tv_to_double(ti_end) - tv_to_double(ti_st);
+
+        free(newtuple.key);
+        free(newtuple.data);
+
+    }
+    gettimeofday(&stop_tv,0);
+    // first line: time spent inside insertTuple(); second line: time for the whole loop
+    printf("insert time: %6.1f\n", insert_time);
+    printf("insert time: %6.1f\n", (tv_to_double(stop_tv) - tv_to_double(start_tv)));
+
+    printf("\nTREE STRUCTURE\n");
+    //ltable.get_tree_c1()->print_tree(xid);
+    printf("datasize: %lld\n", (long long)datasize);
+    //sleep(20);
+
+    Tcommit(xid);
+    xid = Tbegin();
+
+    printf("Stage 2: Looking up %d keys:\n", NUM_ENTRIES);
+
+    int found_tuples=0;
+    for(int i=NUM_ENTRIES-1; i>=0; i--)
+    {
+        int ri = i;
+        //printf("key index%d\n", i);
+        fflush(stdout);
+
+        //get the key
+        uint32_t keylen = (*key_arr)[ri].length()+1;
+        datatuple::key_t rkey = (datatuple::key_t) malloc(keylen);
+        memcpy((byte*)rkey, (*key_arr)[ri].c_str(), keylen);
+        //for(int j=0; j<keylen-1; j++)
+        //rkey[j] = (*key_arr)[ri][j];
+        //rkey[keylen-1]='\0';
+
+        //find the key with the given tuple
+        datatuple *dt = ltable.findTuple(xid, rkey, keylen);
+
+        assert(dt!=0);
+        //if(dt!=0)
+        {
+        found_tuples++;
+        assert(*(dt->keylen) == (*key_arr)[ri].length()+1);
+        assert(*(dt->datalen) == (*data_arr)[ri].length()+1);
+        free(dt->keylen);
+        free(dt);
+        }
+        dt = 0;
+        free(rkey);
+    }
+    printf("found %d\n", found_tuples);
+
+    key_arr->clear();
+    data_arr->clear();
+    delete key_arr;
+    delete data_arr;
+
+    mscheduler.shutdown();
+    printf("merge threads finished.\n");
+    gettimeofday(&stop_tv,0);
+    printf("run time: %6.1f\n", (tv_to_double(stop_tv) - tv_to_double(start_tv)));
+
+
+
+    Tcommit(xid);
+    Tdeinit();
+
+}
+
+
+
+/** @test
+ * Entry point: runs the merge scenario once with 5000 requested tuples.
+ */
+int main()
+{
+    const int requested_entries = 5000;
+
+    insertProbeIter(requested_entries);
+    return 0;
+}
+
diff --git a/check_mergelarge.cpp b/check_mergelarge.cpp
new file mode 100644
index 0000000..692b360
--- /dev/null
+++ b/check_mergelarge.cpp
@@ -0,0 +1,264 @@
+#include <string>
+#include <vector>
+#include <iostream>
+#include <sstream>
+#include "logstore.h"
+#include "datapage.cpp"
+#include "logiterators.cpp"
+#include "merger.h"
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/time.h>
+#include <time.h>
+
+#undef begin
+#undef end
+
+
+
+
+// Strict "less than" on two keys, compared as NUL-terminated C strings
+// (strcmp stops at the first '\0', so any bytes after it are ignored).
+bool mycmp(const std::string & k1,const std::string & k2)
+{
+    const char *lhs = k1.c_str();
+    const char *rhs = k2.c_str();
+    const int order = strcmp(lhs, rhs);
+    return order < 0;
+}
+
+// Must be given a sorted array: removes every entry equal to its predecessor.
+void removeduplicates(std::vector<std::string> *arr)
+{
+    // back-to-front, so an erase never shifts a slot that is still unvisited
+    int idx = arr->size() - 1;
+    while (idx > 0)
+    {
+        const bool differ = mycmp((*arr)[idx], (*arr)[idx-1]) || mycmp((*arr)[idx-1], (*arr)[idx]);
+        if (!differ) arr->erase(arr->begin() + idx);
+        --idx;
+    }
+}
+
+// Fills `data` with a pseudo-random payload: one decimal digit character
+// repeated (rand() % (2*avg_len)) + 3 times.  Two rand() draws are made,
+// first for the length and then for the digit.
+//
+// data    - output string, overwritten
+// avg_len - scales the length range; must be non-zero since it feeds the
+//           modulus
+void getnextdata(std::string &data, int avg_len)
+{
+    const int span = avg_len * 2;
+    const int length = (rand() % span) + 3;
+
+    // 48 is ASCII '0', so the fill character is one of '0'..'9'
+    const char fill = rand() % 10 + 48;
+
+    std::string payload(length, fill);
+    data = payload;
+}
+
+// Appends `count` pseudo-random strings of decimal digits to *arr.
+//
+// count   - number of strings to append
+// arr     - destination vector; existing contents are kept
+// avg_len - each string gets (rand() % (2*avg_len)) + 2 digits; must be
+//           non-zero because it feeds the modulus
+//
+// Built directly as a std::string rather than through an unchecked malloc()
+// scratch buffer; the rand() call order (length first, then one draw per
+// digit) is what determines the strings produced for a given seed.
+void preprandstr(int count, std::vector<std::string> *arr, int avg_len=50)
+{
+    for (int j = 0; j < count; j++)
+    {
+        const int ndigits = (rand() % (avg_len*2)) + 2;
+        std::string str;
+        str.reserve(ndigits);
+        for (int i = 0; i < ndigits; i++)
+            str.push_back(rand() % 10 + 48);  // 48 is ASCII '0'
+        arr->push_back(str);
+    }
+}
+
+// Load test: builds a sorted set of unique random keys, inserts each one
+// with a random payload into a logtable while its merge threads run, then
+// shuts the merge scheduler down.  Progress and timings go to stdout.
+//
+// NUM_ENTRIES - requested number of keys; it is overwritten with the number
+//               of unique keys actually generated, which may be smaller.
+//
+// Removes storefile.txt and logfile.txt from the working directory first.
+void insertProbeIter(int  NUM_ENTRIES)
+{
+    srand(1000);
+    unlink("storefile.txt");
+    unlink("logfile.txt");
+
+    sync();
+
+    //data generation
+//    std::vector<std::string> * data_arr = new std::vector<std::string>;
+    std::vector<std::string> * key_arr = new std::vector<std::string>;
+    
+//    preprandstr(NUM_ENTRIES, data_arr, 10*8192);
+    preprandstr(NUM_ENTRIES+200, key_arr, 100);
+    
+    std::sort(key_arr->begin(), key_arr->end(), &mycmp);
+
+    removeduplicates(key_arr);
+    if(key_arr->size() > NUM_ENTRIES)
+        key_arr->erase(key_arr->begin()+NUM_ENTRIES, key_arr->end());
+    
+    NUM_ENTRIES=key_arr->size();
+    
+    bufferManagerNonBlockingSlowHandleType = IO_HANDLE_PFILE;
+
+    Tinit();
+
+    int xid = Tbegin();
+
+    merge_scheduler mscheduler;    
+    logtable ltable;
+
+    int pcount = 100;
+    ltable.set_fixed_page_count(pcount);
+
+    recordid table_root = ltable.allocTable(xid);
+
+    Tcommit(xid);
+    
+    //xid = Tbegin();
+
+    int lindex = mscheduler.addlogtable(&ltable);
+    ltable.setMergeData(mscheduler.getMergeData(lindex));
+    
+    mscheduler.startlogtable(lindex);
+
+    printf("Stage 1: Writing %d keys\n", NUM_ENTRIES);
+    
+    struct timeval start_tv, stop_tv, ti_st, ti_end;
+    double insert_time = 0;
+    int64_t datasize = 0;
+    gettimeofday(&start_tv,0);
+    for(int i = 0; i < NUM_ENTRIES; i++)
+    {
+        //prepare the key
+        datatuple newtuple;        
+        uint32_t keylen = (*key_arr)[i].length()+1;
+        newtuple.keylen = &keylen;
+        
+        newtuple.key = (datatuple::key_t) malloc(keylen);
+        memcpy((byte*)newtuple.key, (*key_arr)[i].c_str(), keylen);
+        //for(int j=0; j<keylen-1; j++)
+        //    newtuple.key[j] = (*key_arr)[i][j];
+        //newtuple.key[keylen-1]='\0';
+
+        //prepare the data
+        std::string ditem;
+        getnextdata(ditem, 10*8192);
+        uint32_t datalen = ditem.length()+1;
+        newtuple.datalen = &datalen;        
+        newtuple.data = (datatuple::data_t) malloc(datalen);
+        memcpy((byte*)newtuple.data, ditem.c_str(), datalen);
+//        for(int j=0; j<datalen-1; j++)
+//            newtuple.data[j] = (*data_arr)[i][j];
+//        newtuple.data[datalen-1]='\0';        
+        
+        
+        datasize += newtuple.byte_length();
+
+        gettimeofday(&ti_st,0);        
+        ltable.insertTuple(newtuple);
+        gettimeofday(&ti_end,0);
+        insert_time += tv_to_double(ti_end) - tv_to_double(ti_st);
+
+        // NOTE(review): the buffers are released as soon as insertTuple()
+        // returns; this assumes insertTuple() copies the tuple - confirm.
+        free(newtuple.key);
+        free(newtuple.data);
+        
+    }
+    gettimeofday(&stop_tv,0);
+    printf("insert time: %6.1f\n", insert_time);
+    printf("insert time: %6.1f\n", (tv_to_double(stop_tv) - tv_to_double(start_tv)));
+
+    printf("\nTREE STRUCTURE\n");
+    //ltable.get_tree_c1()->print_tree(xid);
+    // int64_t is not necessarily long long; cast so the argument matches %lld
+    printf("datasize: %lld\n", (long long)datasize);
+    //sleep(20);
+
+    /*
+    //Tcommit(xid);
+    xid = Tbegin();
+
+
+    printf("Stage 2: Looking up %d keys:\n", NUM_ENTRIES);
+
+    int found_tuples=0;
+    for(int i=NUM_ENTRIES-1; i>=0; i--)
+    {        
+        int ri = i;
+        //printf("key index%d\n", i);
+        fflush(stdout);
+
+        //get the key
+        uint32_t keylen = (*key_arr)[ri].length()+1;        
+        datatuple::key_t rkey = (datatuple::key_t) malloc(keylen);
+        memcpy((byte*)rkey, (*key_arr)[ri].c_str(), keylen);
+        //for(int j=0; j<keylen-1; j++)
+        //rkey[j] = (*key_arr)[ri][j];
+        //rkey[keylen-1]='\0';
+
+        //find the key with the given tuple
+        datatuple *dt = ltable.findTuple(xid, rkey, keylen);
+
+        assert(dt!=0);
+        //if(dt!=0)
+        {
+        found_tuples++;
+        assert(*(dt->keylen) == (*key_arr)[ri].length()+1);
+        //assert(*(dt->datalen) == (*data_arr)[ri].length()+1);
+        free(dt->keylen);
+        free(dt);
+        }
+        dt = 0;
+        free(rkey);
+    }
+    printf("found %d\n", found_tuples);
+    */
+
+    // Stage 2 is disabled, but the key array still has to be released;
+    // every key was already copied into its tuple in the insert loop.
+    delete key_arr;
+    
+    
+    mscheduler.shutdown();
+    printf("merge threads finished.\n");
+    gettimeofday(&stop_tv,0);
+    printf("run time: %6.1f\n", (tv_to_double(stop_tv) - tv_to_double(start_tv)));
+    
+    //Tcommit(xid);
+    
+    Tdeinit();
+    
+    
+}
+
+
+
+/** @test
+ *  Test entry point: one insertProbeIter() run sized at 25000 entries.
+ */
+int main()
+{
+    int entry_count = 25000;
+
+    insertProbeIter(entry_count);
+    return 0;
+}
+
diff --git a/check_mergetuple.cpp b/check_mergetuple.cpp
new file mode 100644
index 0000000..914515a
--- /dev/null
+++ b/check_mergetuple.cpp
@@ -0,0 +1,409 @@
+#include <string>
+#include <vector>
+#include <iostream>
+#include <sstream>
+#include "logstore.h"
+#include "datapage.cpp"
+#include "logiterators.cpp"
+#include "merger.h"
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/time.h>
+#include <time.h>
+
+#undef begin
+#undef end
+
+
+
+
+// Strict "less than" on two keys, compared as NUL-terminated C strings
+// (strcmp stops at the first '\0', so any bytes after it are ignored).
+bool mycmp(const std::string & k1,const std::string & k2)
+{
+    const char *lhs = k1.c_str();
+    const char *rhs = k2.c_str();
+    const int order = strcmp(lhs, rhs);
+    return order < 0;
+}
+
+// Must be given a sorted array: removes every entry equal to its predecessor.
+void removeduplicates(std::vector<std::string> *arr)
+{
+    // back-to-front, so an erase never shifts a slot that is still unvisited
+    int idx = arr->size() - 1;
+    while (idx > 0)
+    {
+        const bool differ = mycmp((*arr)[idx], (*arr)[idx-1]) || mycmp((*arr)[idx-1], (*arr)[idx]);
+        if (!differ) arr->erase(arr->begin() + idx);
+        --idx;
+    }
+}
+
+// Fills `data` with a pseudo-random payload: one decimal digit character
+// repeated (rand() % (2*avg_len)) + 3 times.  Two rand() draws are made,
+// first for the length and then for the digit.
+//
+// data    - output string, overwritten
+// avg_len - scales the length range; must be non-zero since it feeds the
+//           modulus
+void getnextdata(std::string &data, int avg_len)
+{
+    const int span = avg_len * 2;
+    const int length = (rand() % span) + 3;
+
+    // 48 is ASCII '0', so the fill character is one of '0'..'9'
+    const char fill = rand() % 10 + 48;
+
+    std::string payload(length, fill);
+    data = payload;
+}
+
+// Appends `count` pseudo-random strings of decimal digits to *arr.
+//
+// count   - number of strings to append
+// arr     - destination vector; existing contents are kept
+// avg_len - each string gets (rand() % (2*avg_len)) + 2 digits; must be
+//           non-zero because it feeds the modulus
+//
+// Built directly as a std::string rather than through an unchecked malloc()
+// scratch buffer; the rand() call order (length first, then one draw per
+// digit) is what determines the strings produced for a given seed.
+void preprandstr(int count, std::vector<std::string> *arr, int avg_len=50)
+{
+    for (int j = 0; j < count; j++)
+    {
+        const int ndigits = (rand() % (avg_len*2)) + 2;
+        std::string str;
+        str.reserve(ndigits);
+        for (int i = 0; i < ndigits; i++)
+            str.push_back(rand() % 10 + 48);  // 48 is ASCII '0'
+        arr->push_back(str);
+    }
+}
+
+// Merge test with deletes and updates: builds a sorted list of unique
+// random keys, inserts them into a logtable while its merge threads run
+// (randomly deleting or updating one of the 50 most recent keys along the
+// way), then looks every key up again and checks it against the record of
+// deletions.
+//
+// NUM_ENTRIES - requested number of keys; it is overwritten with the number
+//               of unique keys actually generated, which may be smaller.
+void insertProbeIter(int  NUM_ENTRIES)
+{
+    srand(1000);
+    //unlink("storefile.txt");
+    //unlink("logfile.txt");
+
+    sync();
+    double delete_freq = .05;
+    double update_freq = .15;
+    
+    //data generation
+    typedef std::vector<std::string> key_v_t;
+    const static int max_partition_size = 100000;
+    int KEY_LEN = 100;
+    std::vector<key_v_t*> *key_v_list = new std::vector<key_v_t*>;
+    int list_size = NUM_ENTRIES / max_partition_size + 1;
+    for(int i =0; i<list_size; i++)
+    {
+        key_v_t * key_arr = new key_v_t;
+        if(NUM_ENTRIES < max_partition_size*(i+1))
+            preprandstr(NUM_ENTRIES-max_partition_size*i, key_arr, KEY_LEN);
+        else
+            preprandstr(max_partition_size, key_arr, KEY_LEN);
+    
+        std::sort(key_arr->begin(), key_arr->end(), &mycmp);
+        key_v_list->push_back(key_arr);
+        printf("size partition %d is %d\n", i+1, (int)key_arr->size());
+    }
+
+
+    
+    key_v_t * key_arr = new key_v_t;
+    
+    std::vector<key_v_t::iterator*> iters;
+    for(int i=0; i<list_size; i++)
+    {
+        iters.push_back(new key_v_t::iterator((*key_v_list)[i]->begin()));
+    }
+
+    // merge the sorted partitions into key_arr, skipping repeated keys
+    int lc = 0;
+    while(true)
+    {
+        int list_index = -1;
+        for(int i=0; i<list_size; i++)
+        {
+            if(*iters[i] == (*key_v_list)[i]->end())
+                continue;
+            
+            if(list_index == -1 || mycmp(**iters[i], **iters[list_index]))
+                list_index = i;
+        }
+
+        if(list_index == -1)
+            break;
+        
+        if(key_arr->size() == 0 || mycmp(key_arr->back(), **iters[list_index]))
+            key_arr->push_back(**iters[list_index]);
+
+        (*iters[list_index])++;        
+        lc++;
+        if(lc % max_partition_size == 0)
+            printf("%d/%d completed.\n", lc, NUM_ENTRIES);
+    }
+
+    for(int i=0; i<list_size; i++)
+    {
+        (*key_v_list)[i]->clear();
+        delete (*key_v_list)[i];
+        delete iters[i];
+    }
+    key_v_list->clear();
+    delete key_v_list;
+    
+//    preprandstr(NUM_ENTRIES, data_arr, 10*8192);
+
+    printf("key arr size: %d\n", (int)key_arr->size());
+
+    //removeduplicates(key_arr);
+    if(key_arr->size() > NUM_ENTRIES)
+        key_arr->erase(key_arr->begin()+NUM_ENTRIES, key_arr->end());
+    
+    NUM_ENTRIES=key_arr->size();
+    
+    bufferManagerNonBlockingSlowHandleType = IO_HANDLE_PFILE;
+
+    Tinit();
+
+    int xid = Tbegin();
+
+    merge_scheduler mscheduler;    
+    logtable ltable;
+
+    int pcount = 40;
+    ltable.set_fixed_page_count(pcount);
+
+    recordid table_root = ltable.allocTable(xid);
+
+    Tcommit(xid);
+    
+    xid = Tbegin();
+
+    int lindex = mscheduler.addlogtable(&ltable);
+    ltable.setMergeData(mscheduler.getMergeData(lindex));
+    
+    mscheduler.startlogtable(lindex);
+
+    printf("Stage 1: Writing %d keys\n", NUM_ENTRIES);
+    
+    struct timeval start_tv, stop_tv, ti_st, ti_end;
+    double insert_time = 0;
+    int delcount = 0, upcount = 0;
+    int64_t datasize = 0;
+    // is_deleted[k] != 0 once key k has been deleted; indexed lookup, no scan
+    std::vector<char> is_deleted(NUM_ENTRIES, 0);
+    gettimeofday(&start_tv,0);
+    for(int i = 0; i < NUM_ENTRIES; i++)
+    {
+        //prepare the key
+        datatuple newtuple;        
+        uint32_t keylen = (*key_arr)[i].length()+1;
+        newtuple.keylen = &keylen;
+        
+        newtuple.key = (datatuple::key_t) malloc(keylen);
+        memcpy((byte*)newtuple.key, (*key_arr)[i].c_str(), keylen);
+        //for(int j=0; j<keylen-1; j++)
+        //    newtuple.key[j] = (*key_arr)[i][j];
+        //newtuple.key[keylen-1]='\0';
+
+        //prepare the data
+        std::string ditem;
+        getnextdata(ditem, 8192);
+        uint32_t datalen = ditem.length()+1;
+        newtuple.datalen = &datalen;        
+        newtuple.data = (datatuple::data_t) malloc(datalen);
+        memcpy((byte*)newtuple.data, ditem.c_str(), datalen);
+//        for(int j=0; j<datalen-1; j++)
+//            newtuple.data[j] = (*data_arr)[i][j];
+//        newtuple.data[datalen-1]='\0';        
+        
+        
+        datasize += newtuple.byte_length();
+
+        gettimeofday(&ti_st,0);        
+        ltable.insertTuple(newtuple);
+        gettimeofday(&ti_end,0);
+        insert_time += tv_to_double(ti_end) - tv_to_double(ti_st);
+
+        free(newtuple.key);
+        free(newtuple.data);
+
+        double rval = ((rand() % 100)+.0)/100;        
+        if( rval < delete_freq) //delete a key 
+        {
+            int del_index = i - (rand()%50); //delete one of the last inserted 50 elements
+            if(del_index >= 0 && !is_deleted[del_index])
+            {
+                delcount++;
+                datatuple deltuple;        
+                keylen = (*key_arr)[del_index].length()+1;
+                deltuple.keylen = &keylen;
+        
+                deltuple.key = (datatuple::key_t) malloc(keylen);
+                memcpy((byte*)deltuple.key, (*key_arr)[del_index].c_str(), keylen);
+
+                deltuple.datalen = &datalen;        
+                deltuple.setDelete();
+
+                gettimeofday(&ti_st,0);        
+                ltable.insertTuple(deltuple);
+                gettimeofday(&ti_end,0);
+                insert_time += tv_to_double(ti_end) - tv_to_double(ti_st);
+
+                free(deltuple.key);
+
+                is_deleted[del_index] = 1;
+                
+            }            
+        }
+        else if(rval < delete_freq + update_freq) //update a record
+        {
+            int up_index = i - (rand()%50); //update one of the last inserted 50 elements
+            if(up_index >= 0 && !is_deleted[up_index])
+            {//only update non-deleted elements
+                upcount++;
+                datatuple uptuple;        
+                keylen = (*key_arr)[up_index].length()+1;
+                uptuple.keylen = &keylen;
+        
+                uptuple.key = (datatuple::key_t) malloc(keylen);
+                memcpy((byte*)uptuple.key, (*key_arr)[up_index].c_str(), keylen);
+                
+                getnextdata(ditem, 512);
+                datalen = ditem.length()+1;
+                uptuple.datalen = &datalen;        
+                uptuple.data = (datatuple::data_t) malloc(datalen);
+                memcpy((byte*)uptuple.data, ditem.c_str(), datalen);
+                
+                gettimeofday(&ti_st,0);        
+                ltable.insertTuple(uptuple);
+                gettimeofday(&ti_end,0);
+                insert_time += tv_to_double(ti_end) - tv_to_double(ti_st);
+
+                free(uptuple.key);
+                free(uptuple.data);
+                
+            }            
+
+        }
+        
+    }
+    gettimeofday(&stop_tv,0);
+    printf("insert time: %6.1f\n", insert_time);
+    printf("insert time: %6.1f\n", (tv_to_double(stop_tv) - tv_to_double(start_tv)));
+    printf("#deletions: %d\n#updates: %d\n", delcount, upcount);
+
+    printf("\nTREE STRUCTURE\n");
+    //ltable.get_tree_c1()->print_tree(xid);
+    // int64_t is not necessarily long long; cast so the argument matches %lld
+    printf("datasize: %lld\n", (long long)datasize);
+    //sleep(20);
+
+    Tcommit(xid);
+    xid = Tbegin();
+
+
+
+    printf("Stage 2: Looking up %d keys:\n", NUM_ENTRIES);
+
+    int found_tuples=0;
+    for(int i=NUM_ENTRIES-1; i>=0; i--)
+    {        
+        int ri = i;
+        //printf("key index%d\n", i);
+        fflush(stdout);
+
+        //get the key
+        uint32_t keylen = (*key_arr)[ri].length()+1;        
+        datatuple::key_t rkey = (datatuple::key_t) malloc(keylen);
+        memcpy((byte*)rkey, (*key_arr)[ri].c_str(), keylen);
+        //for(int j=0; j<keylen-1; j++)
+        //rkey[j] = (*key_arr)[ri][j];
+        //rkey[keylen-1]='\0';
+
+        //find the key with the given tuple
+        datatuple *dt = ltable.findTuple(xid, rkey, keylen);
+
+        if(!is_deleted[i])
+        {
+            assert(dt!=0);
+            assert(!dt->isDelete());
+            found_tuples++;
+            assert(*(dt->keylen) == (*key_arr)[ri].length()+1);
+            //assert(*(dt->datalen) == (*data_arr)[ri].length()+1);
+            free(dt->keylen);
+            free(dt);
+        }
+        else
+        {
+            if(dt!=0)
+            {
+                assert(*(dt->keylen) == (*key_arr)[ri].length()+1);
+                assert(dt->isDelete());
+                free(dt->keylen);
+                free(dt);
+            }
+        }
+        dt = 0;
+        free(rkey);
+    }
+    printf("found %d\n", found_tuples);
+
+
+
+
+    
+    key_arr->clear();
+    //data_arr->clear();
+    delete key_arr;
+    //delete data_arr;
+    
+    mscheduler.shutdown();
+    printf("merge threads finished.\n");
+    gettimeofday(&stop_tv,0);
+    printf("run time: %6.1f\n", (tv_to_double(stop_tv) - tv_to_double(start_tv)));
+
+
+    
+    Tcommit(xid);
+    Tdeinit();
+    
+    
+}
+
+
+
+/** @test
+ *  Test entry point: one insertProbeIter() run sized at 400000 entries.
+ *
+ *  Alternative sizes, currently disabled (none of them is executed):
+ *      25000
+ *      5000, 2500, 1000, 500
+ *      1000, 100, 10
+ *  Always returns 0 once the run completes.
+ */
+int main()
+{
+    // size of the single run that is enabled
+    int entry_count = 400000;
+
+    insertProbeIter(entry_count);
+
+    return 0;
+}
+
diff --git a/check_rbtree.cpp b/check_rbtree.cpp
new file mode 100644
index 0000000..af17780
--- /dev/null
+++ b/check_rbtree.cpp
@@ -0,0 +1,214 @@
+#include <string>
+#include <vector>
+#include <iostream>
+#include <sstream>
+#include "logstore.h"
+#include "datapage.cpp"
+#include "logiterators.cpp"
+#include "merger.h"
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/time.h>
+#include <time.h>
+
+#undef begin
+#undef end
+
+
+
+
+// Strict "less than" on two keys, compared as NUL-terminated C strings
+// (strcmp stops at the first '\0', so any bytes after it are ignored).
+bool mycmp(const std::string & k1,const std::string & k2)
+{
+    const char *lhs = k1.c_str();
+    const char *rhs = k2.c_str();
+    const int order = strcmp(lhs, rhs);
+    return order < 0;
+}
+
+// Must be given a sorted array: removes every entry equal to its predecessor.
+void removeduplicates(std::vector<std::string> &arr)
+{
+    // back-to-front, so an erase never shifts a slot that is still unvisited
+    int idx = arr.size() - 1;
+    while (idx > 0)
+    {
+        const bool differ = mycmp(arr[idx], arr[idx-1]) || mycmp(arr[idx-1], arr[idx]);
+        if (!differ) arr.erase(arr.begin() + idx);
+        --idx;
+    }
+}
+
+// Appends `count` pseudo-random strings of decimal digits to `arr`.
+//
+// count              - number of strings to append
+// arr                - destination vector
+// avg_len            - each string gets (rand() % (2*avg_len)) + 2 digits;
+//                      must be non-zero because it feeds the modulus
+// duplicates_allowed - when false, a candidate equal to one of the first j
+//                      entries of arr (j = strings accepted so far by this
+//                      call) is thrown away and generated again
+//
+// The candidate is built directly as a std::string, so there is no scratch
+// buffer left to free when a duplicate is rejected.  The rand() call order
+// (length first, then one draw per digit) fixes the output for a given seed.
+void preprandstr(int count, std::vector<std::string> &arr, int avg_len=50, bool duplicates_allowed=false)
+{
+    for (int j = 0; j < count; j++)
+    {
+        const int ndigits = (rand() % (avg_len*2)) + 2;
+        std::string str;
+        str.reserve(ndigits);
+        for (int i = 0; i < ndigits; i++)
+            str.push_back(rand() % 10 + 48);  // 48 is ASCII '0'
+        //make sure there is no duplicate key
+        if(!duplicates_allowed)
+        {
+            bool dup = false;
+            for(int i=0; i<j; i++)
+                if(! (mycmp(arr[i], str) || mycmp(str, arr[i])))
+                {
+                    dup=true;
+                    break;
+                }
+            if(dup)
+            {
+                j--;    // the loop's j++ then retries this same slot
+                continue;
+            }
+        }
+        arr.push_back(str);
+    }
+}
+
+// In-memory test of std::set<datatuple, datatuple>: inserts one tuple per
+// unique random key, then looks every key up again and checks that the
+// stored key and data lengths match what was inserted.
+//
+// NUM_ENTRIES - requested number of tuples; it is overwritten with the number
+//               of unique keys actually generated, which may be smaller.
+void insertProbeIter(int  NUM_ENTRIES)
+{
+
+    //data generation
+    std::vector<std::string> data_arr;
+    std::vector<std::string> key_arr;
+    preprandstr(NUM_ENTRIES, data_arr, 10*8192, true);
+    preprandstr(NUM_ENTRIES+200, key_arr, 100, true);
+    
+    std::sort(key_arr.begin(), key_arr.end(), &mycmp);
+
+    removeduplicates(key_arr);
+    if(key_arr.size() > NUM_ENTRIES)
+        key_arr.erase(key_arr.begin()+NUM_ENTRIES, key_arr.end());
+    
+    NUM_ENTRIES=key_arr.size();
+    
+    if(data_arr.size() > NUM_ENTRIES)
+        data_arr.erase(data_arr.begin()+NUM_ENTRIES, data_arr.end());
+    
+    std::set<datatuple, datatuple> rbtree;
+    int64_t datasize = 0;
+    for(int i = 0; i < NUM_ENTRIES; i++)
+    {
+        //prepare the key
+        datatuple newtuple;        
+        uint32_t keylen = key_arr[i].length()+1;
+        newtuple.keylen = (uint32_t*)malloc(sizeof(uint32_t));
+        *newtuple.keylen = keylen;
+        
+        newtuple.key = (datatuple::key_t) malloc(keylen);
+        for(int j=0; j<keylen-1; j++)
+            newtuple.key[j] = key_arr[i][j];
+        newtuple.key[keylen-1]='\0';
+
+        //prepare the data
+        uint32_t datalen = data_arr[i].length()+1;
+        newtuple.datalen = (uint32_t*)malloc(sizeof(uint32_t));
+        *newtuple.datalen = datalen;
+        
+        newtuple.data = (datatuple::data_t) malloc(datalen);
+        for(int j=0; j<datalen-1; j++)
+            newtuple.data[j] = data_arr[i][j];
+        newtuple.data[datalen-1]='\0';
+
+        
+        datasize += newtuple.byte_length();
+
+        // NOTE(review): the four malloc()ed buffers are never freed in this function
+        rbtree.insert(newtuple);
+        
+        
+    }
+
+    printf("\nTREE STRUCTURE\n");
+    //ltable.get_tree_c1()->print_tree(xid);
+    // datasize is 64 bits wide, so print it with %lld and a matching cast
+    printf("datasize: %lld\n", (long long)datasize);
+
+    printf("Stage 2: Looking up %d keys:\n", NUM_ENTRIES);
+
+    int found_tuples=0;
+    for(int i=NUM_ENTRIES-1; i>=0; i--)
+    {        
+        int ri = i;
+
+        //get the key
+        uint32_t keylen = key_arr[ri].length()+1;        
+        datatuple::key_t rkey = (datatuple::key_t) malloc(keylen);
+        for(int j=0; j<keylen-1; j++)
+            rkey[j] = key_arr[ri][j];
+        rkey[keylen-1]='\0';
+
+        //find the key with the given tuple
+
+        //prepare a search tuple
+        datatuple search_tuple;
+        search_tuple.keylen = (uint32_t*)malloc(sizeof(uint32_t));
+        *(search_tuple.keylen) = keylen;
+        search_tuple.key = rkey;
+        
+        
+        datatuple *ret_tuple=0; 
+        //step 1: look in tree_c0
+
+        rbtree_t::iterator rbitr = rbtree.find(search_tuple);
+        if(rbitr != rbtree.end())
+        {
+            datatuple tuple = *rbitr;
+            byte *barr = tuple.to_bytes();
+            ret_tuple = datatuple::from_bytes(barr);
+
+            found_tuples++;
+            assert(*(ret_tuple->keylen) == key_arr[ri].length()+1);
+            assert(*(ret_tuple->datalen) == data_arr[ri].length()+1);
+            free(barr);
+            free(ret_tuple);        
+        }
+        else
+        {
+            printf("Not in scratch_tree\n");
+        }
+        
+        free(search_tuple.keylen);
+        free(rkey);
+    }
+    printf("found %d\n", found_tuples);    
+}
+
+
+
+/** @test
+ *  Test entry point: one insertProbeIter() run sized at 250 entries.
+ */
+int main()
+{
+    int entry_count = 250;
+
+    insertProbeIter(entry_count);
+    return 0;
+}
+
diff --git a/check_server.cpp b/check_server.cpp
new file mode 100644
index 0000000..60af0cf
--- /dev/null
+++ b/check_server.cpp
@@ -0,0 +1,107 @@
+#include <string>
+#include <vector>
+#include <iostream>
+#include <sstream>
+#include "logstore.h"
+#include "datapage.cpp"
+#include "logiterators.cpp"
+#include "merger.h"
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/time.h>
+#include <time.h>
+
+#include <csignal>
+
+#undef begin
+#undef end
+
+logserver *lserver=0;
+merge_scheduler *mscheduler=0;
+
+// SIGINT handler: stops the server and the merge threads, shuts Stasis down
+// and exits.  The handler is installed before lserver and mscheduler are
+// created, so both are checked for null before being used.
+void terminate (int param)
+{
+  printf ("Stopping server...\n");
+  if(lserver != 0) { lserver->stopserver(); delete lserver; lserver = 0; }
+
+  printf("Stopping merge threads...\n");
+  if(mscheduler != 0) { mscheduler->shutdown(); delete mscheduler; mscheduler = 0; }
+
+  printf("Deinitializing stasis...\n");
+  fflush(stdout);
+  Tdeinit();
+  exit(0);
+}
+
+// Brings up a log server on a freshly allocated logtable.
+//
+// Steps, in order: install terminate() as the SIGINT handler, sync(),
+// initialize Stasis, begin a transaction, create the global merge
+// scheduler and a local logtable (fixed page count 40), allocate the
+// table and commit, register the table with the scheduler and start it,
+// then create the global logserver (arguments 10 and 32432) and start it
+// on the table.
+//
+// NUM_ENTRIES is not used by this function.
+//
+// NOTE(review): the address of the local ltable is handed to the scheduler
+// and the server; presumably startserver() blocks while serving - confirm.
+void insertProbeIter(int  NUM_ENTRIES)
+{
+    // terminate() performs the shutdown when SIGINT arrives
+    signal(SIGINT, terminate);
+
+    sync();
+
+    bufferManagerNonBlockingSlowHandleType = IO_HANDLE_PFILE;
+    Tinit();
+
+    // the table is allocated inside a transaction of its own
+    int alloc_xid = Tbegin();
+
+    mscheduler = new merge_scheduler;
+    logtable ltable;
+
+    int fixed_pages = 40;
+    ltable.set_fixed_page_count(fixed_pages);
+    recordid table_root = ltable.allocTable(alloc_xid);
+
+    Tcommit(alloc_xid);
+
+    // register the table with the scheduler, then start it
+    int table_idx = mscheduler->addlogtable(&ltable);
+    ltable.setMergeData(mscheduler->getMergeData(table_idx));
+    mscheduler->startlogtable(table_idx);
+
+    lserver = new logserver(10, 32432);
+    lserver->startserver(&ltable);
+
+    // Tdeinit() is not called here; terminate() calls it
+}
+
+
+
+/** @test
+ *  Test entry point: one insertProbeIter() call with 10000 as its argument.
+ *
+ *  Alternative sizes, currently disabled (none of them is executed):
+ *      25000
+ *      5000, 2500, 1000, 500
+ *      1000, 100, 10
+ *  Returns 0 if insertProbeIter() returns.
+ */
+int main()
+{
+    // argument of the single call that is enabled
+    int entry_count = 10000;
+
+    insertProbeIter(entry_count);
+
+    return 0;
+}
+
diff --git a/check_tcpclient.cpp b/check_tcpclient.cpp
new file mode 100644
index 0000000..a505e52
--- /dev/null
+++ b/check_tcpclient.cpp
@@ -0,0 +1,415 @@
+#include <string>
+#include <vector>
+#include <iostream>
+#include <sstream>
+#include "logstore.h"
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <pthread.h>
+#include <sys/time.h>
+#include <time.h>
+#include <sys/types.h> 
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netdb.h> 
+
+
+#undef begin
+#undef end
+
+
+
+
+// Strict "less than" on two keys, compared as NUL-terminated C strings
+// (strcmp stops at the first '\0', so any bytes after it are ignored).
+bool mycmp(const std::string & k1,const std::string & k2)
+{
+    const char *lhs = k1.c_str();
+    const char *rhs = k2.c_str();
+    const int order = strcmp(lhs, rhs);
+    return order < 0;
+}
+
+// Must be given a sorted array: removes every entry equal to its predecessor.
+void removeduplicates(std::vector<std::string> *arr)
+{
+    // back-to-front, so an erase never shifts a slot that is still unvisited
+    int idx = arr->size() - 1;
+    while (idx > 0)
+    {
+        const bool differ = mycmp((*arr)[idx], (*arr)[idx-1]) || mycmp((*arr)[idx-1], (*arr)[idx]);
+        if (!differ) arr->erase(arr->begin() + idx);
+        --idx;
+    }
+}
+
+// Fills `data` with a pseudo-random payload: one decimal digit character
+// repeated (rand() % (2*avg_len)) + 3 times.  Two rand() draws are made,
+// first for the length and then for the digit.
+//
+// data    - output string, overwritten
+// avg_len - scales the length range; must be non-zero since it feeds the
+//           modulus
+void getnextdata(std::string &data, int avg_len)
+{
+    const int span = avg_len * 2;
+    const int length = (rand() % span) + 3;
+
+    // 48 is ASCII '0', so the fill character is one of '0'..'9'
+    const char fill = rand() % 10 + 48;
+
+    std::string payload(length, fill);
+    data = payload;
+}
+
+// Appends `count` pseudo-random strings of decimal digits to *arr.
+//
+// count   - number of strings to append
+// arr     - destination vector; existing contents are kept
+// avg_len - each string gets (rand() % (2*avg_len)) + 2 digits; must be
+//           non-zero because it feeds the modulus
+//
+// Built directly as a std::string rather than through an unchecked malloc()
+// scratch buffer; the rand() call order (length first, then one draw per
+// digit) is what determines the strings produced for a given seed.
+void preprandstr(int count, std::vector<std::string> *arr, int avg_len=50)
+{
+    for (int j = 0; j < count; j++)
+    {
+        const int ndigits = (rand() % (avg_len*2)) + 2;
+        std::string str;
+        str.reserve(ndigits);
+        for (int i = 0; i < ndigits; i++)
+            str.push_back(rand() % 10 + 48);  // 48 is ASCII '0'
+        arr->push_back(str);
+    }
+}
+
+// Reads exactly `count` bytes from sockd into buf, resuming after short reads.
+// A read() result of 0 (EOF) or -1 (error) ends the process: adding it to the
+// offset would either never finish or move the write position before buf.
+inline void readfromsocket(int sockd, byte *buf, int count)
+{
+    for(int n = 0, got = 0; n < count; n += got)
+        if((got = read( sockd, buf + n, count - n)) <= 0)
+            { fprintf(stderr, "ERROR reading from socket\n"); exit(1); }
+
+}
+
+// Writes all `count` bytes of buf to sockd, resuming after short writes.
+// A write() result of 0 or -1 ends the process instead of stalling the loop.
+inline void writetosocket(int sockd, byte *buf, int count)
+{
+    for(int n = 0, put = 0; n < count; n += put)
+        if((put = write( sockd, buf + n, count - n)) <= 0)
+            { fprintf(stderr, "ERROR writing to socket\n"); exit(1); }
+}
+
+// Sends one request (opcode, then the tuple) to the log server over a new
+// TCP connection and returns the server's answer.
+// tuple: keylen, datalen and key are always sent; the data only when the
+//        tuple is not a delete and datalen is non-zero.
+// Returns a malloc()ed tuple if the server replies OP_SENDING_TUPLE; returns
+// 0 on OP_SUCCESS or if the socket cannot be opened or connected.
+// An unknown host name ends the process via exit().
+datatuple * sendTuple(std::string & servername, int serverport, uint8_t opcode,  datatuple &tuple)
+{
+    struct sockaddr_in serveraddr;
+    struct hostent *server;
+    int sockfd = socket(AF_INET, SOCK_STREAM, 0);
+    if (sockfd < 0) 
+    {
+        printf("ERROR opening socket.\n");
+        return 0;
+    }
+    
+    server = gethostbyname(servername.c_str());
+    if (server == NULL) {
+        fprintf(stderr,"ERROR, no such host as %s\n", servername.c_str());
+        exit(0);
+    }
+
+    /* build the server's Internet address */
+    bzero((char *) &serveraddr, sizeof(serveraddr));
+    serveraddr.sin_family = AF_INET;
+    bcopy((char *)server->h_addr, 
+	  (char *)&serveraddr.sin_addr.s_addr, server->h_length);
+    serveraddr.sin_port = htons(serverport);
+
+    /* connect: create a connection with the server */
+    if (connect(sockfd, (sockaddr*) &serveraddr, sizeof(serveraddr)) < 0)
+    {
+        printf("ERROR connecting\n");
+        close(sockfd);   // the descriptor must not outlive a failed connect
+        return 0;
+    }
+
+    //send the opcode
+    int n = write(sockfd, (byte*) &opcode, sizeof(uint8_t));
+    assert(n == sizeof(uint8_t));
+    //send the tuple
+    n = write(sockfd, (byte*) tuple.keylen, sizeof(uint32_t));
+    assert( n == sizeof(uint32_t));
+    n = write(sockfd, (byte*) tuple.datalen, sizeof(uint32_t));
+    assert( n == sizeof(uint32_t));
+    writetosocket(sockfd, (byte*) tuple.key, *tuple.keylen);
+    if(!tuple.isDelete() && *tuple.datalen != 0)
+        writetosocket(sockfd, (byte*) tuple.data, *tuple.datalen);
+
+    //read the reply code
+    uint8_t rcode = 0;
+    n = read(sockfd, (byte*) &rcode, sizeof(uint8_t));
+    assert(n == sizeof(uint8_t));   // rcode is only meaningful after a full read
+    if(rcode == logserver::OP_SENDING_TUPLE)
+    {
+        datatuple *rcvdtuple = (datatuple*)malloc(sizeof(datatuple));
+        //read the keylen
+        rcvdtuple->keylen = (uint32_t*) malloc(sizeof(uint32_t));
+        n = read(sockfd, (byte*) rcvdtuple->keylen, sizeof(uint32_t));
+        assert(n == sizeof(uint32_t));
+        //read the datalen
+        rcvdtuple->datalen = (uint32_t*) malloc(sizeof(uint32_t));
+        n = read(sockfd, (byte*) rcvdtuple->datalen, sizeof(uint32_t));
+        assert(n == sizeof(uint32_t));
+        //read key
+        rcvdtuple->key = (byte*) malloc(*rcvdtuple->keylen);
+        readfromsocket(sockfd, (byte*) rcvdtuple->key, *rcvdtuple->keylen);
+        if(!rcvdtuple->isDelete())
+        {
+            //read data
+            rcvdtuple->data = (byte*) malloc(*rcvdtuple->datalen);
+            readfromsocket(sockfd, (byte*) rcvdtuple->data, *rcvdtuple->datalen);        
+        }
+        close(sockfd);
+        return rcvdtuple;
+    }
+    else
+        assert(rcode == logserver::OP_SUCCESS);
+    close(sockfd);
+    return 0;
+}
+
+
+/**
+ * Client-side insert/lookup test against a running log server.
+ *
+ * Stage 0 generates random keys in partitions of at most
+ * max_partition_size, sorts each partition with mycmp and merges the
+ * partitions into one list, keeping a key only when mycmp(previous, key)
+ * holds (presumably this drops duplicates -- confirm against mycmp).
+ * Stage 1 sends one OP_INSERT per key; stage 2 sends one OP_FIND per key
+ * in reverse order and asserts that every key is found.
+ *
+ * @param NUM_ENTRIES number of keys to generate; it is overwritten with the
+ *                    number of keys that survive the merge.
+ */
+void insertProbeIter(int  NUM_ENTRIES)
+{
+    srand(1000);
+    std::string servername = "sherpa4";
+    int serverport = 32432;
+
+    //data generation
+    typedef std::vector<std::string> key_v_t;
+    const static int max_partition_size = 100000;
+    int KEY_LEN = 100;
+    std::vector<key_v_t*> *key_v_list = new std::vector<key_v_t*>;
+    int list_size = NUM_ENTRIES / max_partition_size + 1;
+    for(int i =0; i<list_size; i++)
+    {
+        key_v_t * key_arr = new key_v_t;
+        //the last partition only gets the remainder of the keys
+        if(NUM_ENTRIES < max_partition_size*(i+1))
+            preprandstr(NUM_ENTRIES-max_partition_size*i, key_arr, KEY_LEN);
+        else
+            preprandstr(max_partition_size, key_arr, KEY_LEN);
+
+        std::sort(key_arr->begin(), key_arr->end(), &mycmp);
+        key_v_list->push_back(key_arr);
+        //size() is a size_t; convert explicitly so it matches the %d format
+        printf("size partition %d is %d\n", i+1, static_cast<int>(key_arr->size()));
+    }
+
+    //merged result of all partitions
+    key_v_t * key_arr = new key_v_t;
+
+    //one cursor per sorted partition
+    std::vector<key_v_t::iterator*> iters;
+    for(int i=0; i<list_size; i++)
+    {
+        iters.push_back(new key_v_t::iterator((*key_v_list)[i]->begin()));
+    }
+
+    int lc = 0;
+    while(true)
+    {
+        //pick the partition whose current key orders first under mycmp
+        int list_index = -1;
+        for(int i=0; i<list_size; i++)
+        {
+            if(*iters[i] == (*key_v_list)[i]->end())
+                continue;
+
+            if(list_index == -1 || mycmp(**iters[i], **iters[list_index]))
+                list_index = i;
+        }
+
+        if(list_index == -1) //every partition is exhausted
+            break;
+
+        //keep the key only if it orders after the last key we kept
+        if(key_arr->size() == 0 || mycmp(key_arr->back(), **iters[list_index]))
+            key_arr->push_back(**iters[list_index]);
+
+        (*iters[list_index])++;
+        lc++;
+        if(lc % max_partition_size == 0)
+            printf("%d/%d completed.\n", lc, NUM_ENTRIES);
+    }
+
+    //the partitions and their cursors are no longer needed
+    for(int i=0; i<list_size; i++)
+    {
+        (*key_v_list)[i]->clear();
+        delete (*key_v_list)[i];
+        delete iters[i];
+    }
+    key_v_list->clear();
+    delete key_v_list;
+
+//    preprandstr(NUM_ENTRIES, data_arr, 10*8192);
+
+    printf("key arr size: %d\n", static_cast<int>(key_arr->size()));
+
+    //removeduplicates(key_arr);
+    if(key_arr->size() > static_cast<size_t>(NUM_ENTRIES))
+        key_arr->erase(key_arr->begin()+NUM_ENTRIES, key_arr->end());
+
+    NUM_ENTRIES = static_cast<int>(key_arr->size());
+
+    printf("Stage 1: Writing %d keys\n", NUM_ENTRIES);
+
+    struct timeval start_tv, stop_tv, ti_st, ti_end;
+    double insert_time = 0;
+    //no deletes or updates are issued by this test; the counters stay 0
+    int delcount = 0, upcount = 0;
+    int64_t datasize = 0;
+    gettimeofday(&start_tv,0);
+    for(int i = 0; i < NUM_ENTRIES; i++)
+    {
+        //prepare the key (length includes the terminating '\0')
+        datatuple newtuple;
+        uint32_t keylen = (*key_arr)[i].length()+1;
+        newtuple.keylen = &keylen;
+
+        newtuple.key = (datatuple::key_t) malloc(keylen);
+        memcpy((byte*)newtuple.key, (*key_arr)[i].c_str(), keylen);
+
+        //prepare the data
+        std::string ditem;
+        getnextdata(ditem, 8192);
+        uint32_t datalen = ditem.length()+1;
+        newtuple.datalen = &datalen;
+        newtuple.data = (datatuple::data_t) malloc(datalen);
+        memcpy((byte*)newtuple.data, ditem.c_str(), datalen);
+
+        /*
+        printf("key: \t, keylen: %u\ndata:  datalen: %u\n",
+               //newtuple.key,
+               *newtuple.keylen,
+               //newtuple.data,
+               *newtuple.datalen);
+               */
+
+        datasize += newtuple.byte_length();
+
+        gettimeofday(&ti_st,0);
+
+        //send the data
+        sendTuple(servername, serverport, logserver::OP_INSERT, newtuple);
+
+        gettimeofday(&ti_end,0);
+        insert_time += tv_to_double(ti_end) - tv_to_double(ti_st);
+
+        free(newtuple.key);
+        free(newtuple.data);
+
+        if(i % 10000 == 0 && i > 0)
+            printf("%d / %d inserted.\n", i, NUM_ENTRIES);
+
+    }
+    gettimeofday(&stop_tv,0);
+    //first line: time spent inside sendTuple only; second line: wall-clock
+    //time of the whole stage including tuple preparation
+    printf("insert time: %6.1f\n", insert_time);
+    printf("insert time: %6.1f\n", (tv_to_double(stop_tv) - tv_to_double(start_tv)));
+    printf("#deletions: %d\n#updates: %d\n", delcount, upcount);
+
+
+
+    printf("Stage 2: Looking up %d keys:\n", NUM_ENTRIES);
+
+    int found_tuples=0;
+    for(int i=NUM_ENTRIES-1; i>=0; i--)
+    {
+        int ri = i;
+        //printf("key index%d\n", i);
+        fflush(stdout);
+
+        //get the key
+        uint32_t keylen = (*key_arr)[ri].length()+1;
+        datatuple searchtuple;
+        //a single allocation holds keylen, datalen and the key bytes, so
+        //freeing searchtuple.keylen below releases the whole search tuple
+        searchtuple.keylen = (uint32_t*)malloc(2*sizeof(uint32_t) + keylen);
+        *searchtuple.keylen = keylen;
+
+        searchtuple.datalen = searchtuple.keylen + 1;
+        *searchtuple.datalen = 0;
+
+        searchtuple.key = (datatuple::key_t)(searchtuple.keylen + 2);
+        memcpy((byte*)searchtuple.key, (*key_arr)[ri].c_str(), keylen);
+
+        //find the key with the given tuple
+        datatuple *dt = sendTuple(servername, serverport, logserver::OP_FIND,
+                                  searchtuple);
+
+        assert(dt!=0);
+        assert(!dt->isDelete());
+        found_tuples++;
+        assert(*(dt->keylen) == (*key_arr)[ri].length()+1);
+
+        //free dt: each field was allocated separately by sendTuple
+        free(dt->keylen);
+        free(dt->datalen);
+        free(dt->key);
+        free(dt->data);
+        free(dt);
+
+        dt = 0;
+
+        free(searchtuple.keylen);
+
+    }
+    printf("found %d\n", found_tuples);
+
+
+    key_arr->clear();
+    //data_arr->clear();
+    delete key_arr;
+    //delete data_arr;
+
+    gettimeofday(&stop_tv,0);
+    printf("run time: %6.1f\n", (tv_to_double(stop_tv) - tv_to_double(start_tv)));
+
+}
+
+
+
+/** @test
+ * Entry point of the test program: runs insertProbeIter once with 100000
+ * entries.  The other sizes are kept below but are commented out.
+ */
+int main()
+{
+    //insertProbeIter(25000);
+    insertProbeIter(100000);
+    /*
+    insertProbeIter(5000);
+    insertProbeIter(2500);
+    insertProbeIter(1000);
+    insertProbeIter(500);
+    insertProbeIter(1000);
+    insertProbeIter(100);
+    insertProbeIter(10);
+    */
+    
+    return 0;
+}
+
diff --git a/cmds.txt b/cmds.txt
new file mode 100644
index 0000000..5b24608
--- /dev/null
+++ b/cmds.txt
@@ -0,0 +1,9 @@
+ dd if=/dev/zero of=storefile.txt bs=1M count=20000
+
+
+/dhtRecOpsGenerator -d clientType=LogStoreClient host=sherpa4 numOps=10ls existingStartKey=100 existingEndKey=1000 insertRatio=1.0
+
+
+
+
+dhtRecOpsGeneratorWrapper startClientID=1 endClientID=4 -d clientType=LogStoreClient host=sherpa4.corp.re1.yahoo.com numOps=5000000 existingStartKey=100 existingEndKey=10000000 insertRatio=1.0 readRatio=0 numClients=3
diff --git a/datapage.cpp b/datapage.cpp
new file mode 100644
index 0000000..b931e10
--- /dev/null
+++ b/datapage.cpp
@@ -0,0 +1,507 @@
+
+#include "logstore.h"
+#include "datapage.h"
+
+// Size in bytes of the header at the start of a datapage's first page.
+// initialize() stores the int32_t page count at offset 0 and writes the
+// first record length at offset HEADER_SIZE.
+template <class TUPLE>
+const int32_t DataPage<TUPLE>::HEADER_SIZE = sizeof(int32_t);
+
+/**
+ * Opens an existing datapage whose first page is pid.
+ *
+ * The page count is read from the header of the first page and the page id
+ * array is filled with the consecutive ids pid, pid+1, ...  No allocator is
+ * attached and fix_pcount is set to -1.
+ */
+template <class TUPLE>
+DataPage<TUPLE>::DataPage(int xid, pageid_t pid)
+{
+    assert(pid!=0);
+
+    //no allocator for an existing datapage
+    alloc_region = 0;
+    alloc_state = 0;
+    fix_pcount = -1;
+
+    //reading starts right after the header
+    byte_offset = HEADER_SIZE;
+
+    pcount = readPageCount(xid, pid);
+    pidarr = (pageid_t *) malloc(pcount * sizeof(pageid_t));
+
+    //the pages of a datapage are contiguous, starting at pid
+    pageid_t next_pid = pid;
+    for(int slot = 0; slot < pcount; ++slot, ++next_pid)
+        pidarr[slot] = next_pid;
+}
+
+/**
+ * Creates a new, empty datapage.
+ *
+ * @param xid          transaction id passed through to the page calls
+ * @param fix_pcount   nominal number of pages of the datapage (>= 1)
+ * @param alloc_region callback that returns the id of a newly allocated
+ *                     page; must not be null because initialize() calls it
+ * @param alloc_state  opaque state handed to alloc_region (may be null)
+ */
+template <class TUPLE>
+DataPage<TUPLE>::DataPage(int xid, int fix_pcount, pageid_t (*alloc_region)(int, void*), void * alloc_state)
+{
+    assert(fix_pcount >= 1);
+    //initialize() invokes the allocator unconditionally
+    assert(alloc_region != 0);
+
+    byte_offset = -1;
+    //give every member a defined value before initialize() runs; the
+    //members were previously left indeterminate when a null was passed
+    pcount = 0;
+    pidarr = 0;
+
+    this->fix_pcount = fix_pcount;
+    this->alloc_region = alloc_region;
+    this->alloc_state = alloc_state;
+
+    initialize(xid);
+}
+
+/** Releases the page id array; the pages themselves are not touched. */
+template<class TUPLE>
+DataPage<TUPLE>::~DataPage()
+{
+    //free() ignores a null pointer, so no guard is required
+    free(pidarr);
+}
+
+
+/**
+ * Sets up an empty datapage: allocates its first page, writes the header
+ * (page count 1) followed by a zero record length, and initializes the
+ * in-memory bookkeeping.
+ */
+template<class TUPLE>
+void DataPage<TUPLE>::initialize(int xid)
+{
+    //obtain the first page from the allocator
+    const pageid_t first_pid = alloc_region(xid, alloc_state);
+
+    Page *first = loadPageOfType(xid, first_pid, SEGMENT_PAGE);
+    writelock(first->rwlatch,0);
+
+    //header: this datapage currently consists of a single page
+    *((int32_t*)stasis_page_byte_ptr_from_start(first, 0)) = 1;
+
+    //a record length of 0 directly after the header: no records yet
+    *((int32_t*)stasis_page_byte_ptr_from_start(first, HEADER_SIZE)) = 0;
+
+    //mark dirty, then unlatch and unpin
+    stasis_dirty_page_table_set_dirty((stasis_dirty_page_table_t*)stasis_runtime_dirty_page_table(), first);
+    unlock(first->rwlatch);
+    releasePage(first);
+
+    //in-memory state: room for fix_pcount page ids, one of them in use
+    pidarr = (pageid_t *) malloc(fix_pcount * sizeof(pageid_t));
+    pidarr[0] = first_pid;
+    pcount = 1;
+    byte_offset = HEADER_SIZE;
+}
+
+/**
+ * Appends the serialized form of dat (an int32_t length followed by
+ * dat.to_bytes()) at byte_offset.
+ *
+ * @return true if the record was written; false if it does not fit or if
+ *         writebytes() failed (in which case byte_offset is restored).
+ */
+template <class TUPLE>
+inline bool DataPage<TUPLE>::append(int xid, TUPLE const & dat)
+{
+    assert(byte_offset >= HEADER_SIZE);
+    assert(fix_pcount >= 1);
+    
+    //check if there is enough space (for the data length + data)
+    int32_t blen = dat.byte_length() + sizeof(int32_t);    
+    if(PAGE_SIZE * fix_pcount - byte_offset < blen)
+    {
+        //check if the record is too large
+        // and if so do we wanna accomodate here by going over the fix_pcount
+        if(PAGE_SIZE * fix_pcount - HEADER_SIZE < blen && //record is larger than datapage           
+           PAGE_SIZE * fix_pcount - HEADER_SIZE > 2 * byte_offset)//accept if i am less than half full
+        {
+            //nothing
+            //(fall through: an oversized record is accepted and the
+            // datapage grows beyond fix_pcount pages)
+        }
+        else
+        {
+            //printf("page has %d bytes left, we needed %d. (byte_offset %d)\n",
+            //PAGE_SIZE * fix_pcount - byte_offset, blen, byte_offset);
+            return false;   //not enough mana, return
+        }
+    }
+
+    //write the length of the data
+    int32_t dsize = blen - sizeof(int32_t);
+    
+    if(!writebytes(xid, sizeof(int32_t), (byte*)(&dsize)))    
+        return false;    
+    byte_offset += sizeof(int32_t);
+
+    //write the data
+    //to_bytes() returns a malloc'ed buffer that must be freed here
+    byte * barr = dat.to_bytes();
+    if(!writebytes(xid, dsize, barr)) //if write fails, undo the previous write
+    {        
+        byte_offset -= sizeof(int32_t);        
+        free(barr);
+        //write 0 for the next tuple size, if there is enough space in this page
+        //(this overwrites the length written above, so readers stop here)
+        if(PAGE_SIZE - (byte_offset % PAGE_SIZE) >= sizeof(int32_t))
+        {
+            dsize = 0;
+            writebytes(xid, sizeof(int32_t), (byte*)(&dsize));//this will succeed, since there is enough space on the page
+        }        
+        return false;
+    }
+    free(barr);
+    byte_offset += dsize;
+
+    //write 0 for the next tuple size, if there is enough space in this page
+    //NOTE(review): when byte_offset is exactly on a page boundary this write
+    //can trigger a page allocation in writebytes(); its result is ignored.
+    if(PAGE_SIZE - (byte_offset % PAGE_SIZE) >= sizeof(int32_t))
+    {
+        dsize = 0;
+        writebytes(xid, sizeof(int32_t), (byte*)(&dsize));//this will succeed, since there is enough space on the page
+    }
+    
+    return true;
+}
+
+/**
+ * Copies count bytes from data into the datapage starting at byte_offset,
+ * allocating further pages through alloc_region when the write runs past
+ * the last page.  byte_offset itself is not modified.
+ *
+ * @return false if a newly allocated page is not the direct successor of
+ *         the previous page; true once all bytes are copied.
+ */
+template <class TUPLE>
+bool DataPage<TUPLE>::writebytes(int xid, int count, byte *data)
+{
+
+    int32_t bytes_copied = 0;
+    while(bytes_copied < count)
+    {
+        //load the page to copy into
+        int pindex = (byte_offset + bytes_copied) / PAGE_SIZE;
+        if(pindex == pcount) //then this page must be allocated
+        {
+            pageid_t newid = alloc_region(xid, alloc_state);
+            //check continuity
+            if(pidarr[pindex-1] != newid - 1)//so we started a new region and that is not right after the prev region in the file
+            {                
+                //NOTE(review): newid is not recorded anywhere on this path,
+                //and bytes copied in earlier iterations are not undone.
+                return false;//we cant store this
+            }
+
+            //check whether we need to extend the pidarr, add fix_pcount many pageid_t slots
+            if(pindex >= fix_pcount && (pindex % fix_pcount==0))
+            {
+                pidarr = (pageid_t*)realloc(pidarr, (pindex + fix_pcount)*sizeof(pageid_t));
+            }
+            pidarr[pindex] = newid;
+            pcount++;
+            //keep the page count stored in the first page's header in sync
+            incrementPageCount(xid, pidarr[0]);
+        }
+        //Page *p = loadPage(xid, pidarr[pindex]);
+        Page *p = loadPageOfType(xid, pidarr[pindex], SEGMENT_PAGE);
+        writelock(p->rwlatch,0);
+        
+        //copy the portion of bytes we can copy in this page
+        int32_t page_offset = (byte_offset+bytes_copied) % PAGE_SIZE;
+        //copy_len = min(bytes left on this page, bytes left to copy)
+        int32_t copy_len = ( (PAGE_SIZE - page_offset < count - bytes_copied ) ? PAGE_SIZE - page_offset: count - bytes_copied);
+
+        byte * pb_ptr = stasis_page_byte_ptr_from_start(p, page_offset);
+        memcpy(pb_ptr, data+bytes_copied  ,copy_len);
+    
+        //release the page
+        stasis_dirty_page_table_set_dirty((stasis_dirty_page_table_t*)stasis_runtime_dirty_page_table(), p);
+        unlock(p->rwlatch);
+        releasePage(p);        
+        
+        //update the copied bytes_count
+        bytes_copied += copy_len;
+
+
+    }
+
+    assert(bytes_copied == count);
+    return true;
+}
+
+/**
+ * Scans the records of this datapage in order, looking for key.
+ *
+ * The scan stops at the first record whose key compares greater than key.
+ * On success *buf points to the matching tuple and true is returned; the
+ * caller then owns it.  Otherwise *buf is 0 and false is returned.
+ * keySize is not used.
+ */
+template <class TUPLE>
+bool DataPage<TUPLE>::recordRead(int xid, typename TUPLE::key_t key, size_t keySize,  TUPLE ** buf)
+{
+    RecordIterator itr(this);
+
+    for(;;)
+    {
+        *buf = itr.getnext(xid);
+        if(*buf == 0) //ran out of records
+            return false;
+
+        const int match = TUPLE::compare((*buf)->get_key(), key);
+        if(match == 0) //found
+            return true;
+
+        //not the one we want: drop the tuple (keylen is freed separately
+        //from the tuple struct itself)
+        free((*buf)->keylen);
+        free(*buf);
+        *buf = 0;
+
+        if(match > 0) //already past the position of key, it does not exist
+            return false;
+    }
+}
+
+/**
+ * Copies count bytes, starting at byte position offset of the datapage,
+ * into *data.  When *data is NULL a buffer of count bytes is malloc'ed
+ * and stored in *data; otherwise the supplied buffer is filled.
+ */
+template <class TUPLE>
+void DataPage<TUPLE>::readbytes(int xid, int32_t offset, int count, byte **data)
+{
+    if(*data==NULL)
+        *data = (byte*)malloc(count);
+
+    int32_t done = 0;
+    while(done < count)
+    {
+        //locate the current read position inside the datapage
+        const int32_t pos = offset + done;
+        const int pindex = pos / PAGE_SIZE;
+        const int32_t page_offset = pos % PAGE_SIZE;
+
+        //never read past the end of the current page
+        int32_t copy_len = count - done;
+        if(PAGE_SIZE - page_offset < copy_len)
+            copy_len = PAGE_SIZE - page_offset;
+
+        Page *p = loadPageOfType(xid, dp_unused_guard(pidarr)[pindex], SEGMENT_PAGE);
+        readlock(p->rwlatch,0);
+
+        memcpy((*data)+done, stasis_page_byte_ptr_from_start(p, page_offset), copy_len);
+
+        unlock(p->rwlatch);
+        releasePage(p);
+
+        done += copy_len;
+    }
+
+    assert(done == count);
+}
+
+
+/** Returns the int32_t page count stored at offset 0 of page pid. */
+template <class TUPLE>
+inline int DataPage<TUPLE>::readPageCount(int xid, pageid_t pid)
+{
+    Page *header_page = loadPageOfType(xid, pid, SEGMENT_PAGE);
+    readlock(header_page->rwlatch,0);
+
+    //copy the value out while the page is latched
+    const int32_t *count_ptr = (int32_t*)stasis_page_byte_ptr_from_start(header_page, 0);
+    int32_t page_count = *count_ptr;
+
+    unlock(header_page->rwlatch);
+    releasePage(header_page);
+
+    return page_count;
+}
+
+/** Adds add to the int32_t page count stored at offset 0 of page pid. */
+template <class TUPLE>
+inline void DataPage<TUPLE>::incrementPageCount(int xid, pageid_t pid, int add)
+{
+    Page *header_page = loadPageOfType(xid, pid, SEGMENT_PAGE);
+    writelock(header_page->rwlatch,0);
+
+    int32_t *count_ptr = (int32_t*)stasis_page_byte_ptr_from_start(header_page, 0);
+    *count_ptr += add;
+
+    //the header changed, so the page has to be marked dirty
+    stasis_dirty_page_table_set_dirty((stasis_dirty_page_table_t*)stasis_runtime_dirty_page_table(), header_page);
+
+    unlock(header_page->rwlatch);
+    releasePage(header_page);
+}
+
+
+// Stub: always returns 0 and does not inspect the datapage; xid is unused.
+template <class TUPLE>
+inline uint16_t DataPage<TUPLE>::recordCount(int xid)
+{
+
+    return 0;
+}
+
+/**
+ * Page allocator callback: hands out the next page of the current region
+ * described by conf (a RegionAllocConf_t*), allocating a new region first
+ * when the current one is used up.
+ *
+ * @return id of the page that was handed out
+ */
+template <class TUPLE>
+pageid_t DataPage<TUPLE>::dp_alloc_region(int xid, void *conf)
+{
+    RegionAllocConf_t* cfg = (RegionAllocConf_t*)conf;
+
+    const bool region_exhausted = (cfg->nextPage == cfg->endOfRegion);
+    if(region_exhausted)
+    {
+        //a size of -1 means the list of regions has not been created yet
+        if(cfg->regionList.size == -1)
+        {
+            //DEBUG("nextPage: %lld\n", cfg->nextPage);
+            cfg->regionList = TarrayListAlloc(xid, 1, 4, sizeof(pageid_t));
+            DEBUG("regionList.page: %lld\n", cfg->regionList.page);
+            DEBUG("regionList.slot: %d\n", cfg->regionList.slot);
+            DEBUG("regionList.size: %lld\n", cfg->regionList.size);
+
+            cfg->regionCount = 0;
+        }
+
+        //append one slot to the region list and point the rid at it
+        DEBUG("{%lld <- alloc region arraylist}\n", cfg->regionList.page);
+        TarrayListExtend(xid,cfg->regionList,1);
+        cfg->regionList.slot = cfg->regionCount;
+        DEBUG("region lst slot %d\n",cfg->regionList.slot);
+        cfg->regionCount++;
+        DEBUG("region count %lld\n",cfg->regionCount);
+
+        //allocate the region and remember its first page in the new slot
+        cfg->nextPage = TregionAlloc(xid, cfg->regionSize,12);
+        DEBUG("next page %lld\n",cfg->nextPage);
+        cfg->endOfRegion = cfg->nextPage + cfg->regionSize;
+        Tset(xid,cfg->regionList,&cfg->nextPage);
+        DEBUG("next page %lld\n",cfg->nextPage);
+    }
+
+    DEBUG("%lld ?= %lld\n", cfg->nextPage,cfg->endOfRegion);
+    pageid_t ret = cfg->nextPage;
+
+    // Ensure the page is in buffer cache without accessing disk (this
+    // sets it to clean and all zeros if the page is not in cache).
+    // Hopefully, future reads will get a cache hit, and avoid going to
+    // disk.
+    Page * fresh = loadUninitializedPage(xid, ret);
+    //writelock(fresh->rwlatch,0);
+    fresh->pageType = SEGMENT_PAGE;
+    //unlock(fresh->rwlatch);
+    releasePage(fresh);
+    DEBUG("ret %lld\n",ret);
+
+    //the page after ret is the next one to hand out
+    (cfg->nextPage)++;
+    return ret;
+}
+
+/**
+ * Variant of dp_alloc_region whose state is a recordid: the
+ * RegionAllocConf_t is read from that record, used for the allocation and
+ * written back afterwards.
+ */
+template <class TUPLE>
+pageid_t DataPage<TUPLE>::dp_alloc_region_rid(int xid, void * ridp) {
+  recordid conf_rid = *static_cast<recordid*>(ridp);
+
+  RegionAllocConf_t region_conf;
+  Tread(xid, conf_rid, &region_conf);
+
+  const pageid_t allocated = dp_alloc_region(xid, &region_conf);
+  //DEBUG("{%lld <- alloc region extend}\n", region_conf.regionList.page);
+
+  // XXX get rid of Tset by storing next page in memory, and losing it
+  //     on crash.
+  Tset(xid, conf_rid, &region_conf);
+
+  return allocated;
+}
+
+/**
+ * Deallocates every region recorded in the region list of conf.
+ * Despite the _rid suffix, conf is read directly as a RegionAllocConf_t;
+ * the function works on a private copy and leaves *conf unchanged.
+ */
+template <class TUPLE>
+void DataPage<TUPLE>::dealloc_region_rid(int xid, void *conf)
+{
+    RegionAllocConf_t region_conf = *static_cast<RegionAllocConf_t*>(conf);
+    DEBUG("{%lld <- dealloc region arraylist}\n", region_conf.regionList.page);
+
+    for(int slot = 0; slot < region_conf.regionCount; slot++)
+    {
+        //each slot of the list holds the first page id of one region
+        region_conf.regionList.slot = slot;
+        pageid_t region_start;
+        Tread(xid, region_conf.regionList, &region_start);
+        TregionDealloc(xid, region_start);
+    }
+}
+
+/**
+ * Flushes and forces the page range of every region recorded in the
+ * RegionAllocConf_t stored at the recordid that conf points to.
+ */
+template <class TUPLE>
+void DataPage<TUPLE>::force_region_rid(int xid, void *conf)
+{
+    recordid conf_rid = *static_cast<recordid*>(conf);
+    RegionAllocConf_t region_conf;
+    Tread(xid, conf_rid, &region_conf);
+
+    for(int slot = 0; slot < region_conf.regionCount; slot++)
+    {
+        //each slot of the list holds the first page id of one region
+        region_conf.regionList.slot = slot;
+        pageid_t region_start;
+        Tread(xid, region_conf.regionList, &region_start);
+
+        stasis_dirty_page_table_flush_range((stasis_dirty_page_table_t*)stasis_runtime_dirty_page_table(), region_start, region_start+region_conf.regionSize);
+        forcePageRange(region_start, region_start+region_conf.regionSize);
+    }
+}
+
+
+///////////////////////////////////////////////////////////////
+//RECORD ITERATOR
+///////////////////////////////////////////////////////////////
+
+
+/**
+ * Returns the next tuple of the datapage and advances the iterator past
+ * it, or returns 0 when there are no more records (end of the datapage or
+ * a record length of 0).
+ *
+ * The returned tuple is built by TUPLE::from_bytes from a buffer that
+ * readbytes() malloc'ed; the caller owns both.
+ */
+template <class TUPLE>
+TUPLE* DataPage<TUPLE>::RecordIterator::getnext(int xid)
+{
+    int pindex = offset / PAGE_SIZE;
+
+    if(pindex == dp->pcount)//past end
+        return 0;
+    //fewer than sizeof(int32_t) bytes are left on the last page, so no
+    //record length can start here
+    if(pindex == dp->pcount - 1 && (PAGE_SIZE - (offset % PAGE_SIZE) < sizeof(int32_t)))
+        return 0;
+
+    //Page *p = loadPage(xid, dp->pidarr[pindex]);
+    Page *p = loadPageOfType(xid, dp->pidarr[pindex], SEGMENT_PAGE);
+    readlock(p->rwlatch,0);
+
+    //read the record length into a local; previously the two-page case
+    //let readbytes() malloc a 4 byte buffer that was never freed
+    int32_t dsize = 0;
+    if(PAGE_SIZE - (offset % PAGE_SIZE) < sizeof(int32_t)) //int spread in two pages
+    {
+        //readbytes() fills the supplied buffer when it is not NULL
+        byte *dsize_buf = (byte*)(&dsize);
+        dp->readbytes(xid, offset, sizeof(int32_t), &dsize_buf);
+    }
+    else //int in a single page
+        dsize = *((int32_t*)stasis_page_byte_ptr_from_start(p, offset % PAGE_SIZE));
+
+    offset += sizeof(int32_t);
+
+    if(dsize == 0) //no more keys
+    {
+        unlock(p->rwlatch);
+        releasePage(p);
+        return 0;
+    }
+
+    //tb starts out as 0, so readbytes() allocates the record buffer
+    byte* tb=0;
+    dp->readbytes(xid, offset, dsize, &tb);
+
+    TUPLE *tup = TUPLE::from_bytes(tb);
+
+    offset += dsize;
+
+    unlock(p->rwlatch);
+    releasePage(p);
+
+    return tup;
+}
+
+
+
+/**
+ * Skips over up to count records without materializing them.  Stops early
+ * at the end of the datapage or at a record length of 0.
+ *
+ * The page that is latched while walking is always unlatched and unpinned
+ * before returning; earlier it stayed latched when the loop ended normally
+ * or when no record length fit on the last page.
+ */
+template <class TUPLE>
+void DataPage<TUPLE>::RecordIterator::advance(int xid, int count)
+{
+
+    int pindex = -1;
+    Page *p = 0;
+
+    for(int i=0; i<count; i++)
+    {
+        if(pindex != offset / PAGE_SIZE) //advance to new page if necessary
+        {
+            if(p!=NULL)
+            {
+                unlock(p->rwlatch);
+                releasePage(p);
+                p = 0;
+            }
+
+            pindex = offset / PAGE_SIZE;
+
+            if(pindex == dp->pcount)//past end
+                return;
+
+            //p = loadPage(xid, dp->pidarr[pindex]);
+            p = loadPageOfType(xid, dp->pidarr[pindex], SEGMENT_PAGE);
+            readlock(p->rwlatch,0);
+        }
+
+        //no record length fits on the rest of the last page
+        if(pindex == dp->pcount - 1 && (PAGE_SIZE - (offset % PAGE_SIZE) < sizeof(int32_t)))
+            break;
+
+        //read the record length into a local so that nothing is malloc'ed
+        //(and leaked) when the length straddles two pages
+        int32_t dsize = 0;
+        if(PAGE_SIZE - (offset % PAGE_SIZE) < sizeof(int32_t)) //int spread in two pages
+        {
+            byte *dsize_buf = (byte*)(&dsize);
+            dp->readbytes(xid, offset, sizeof(int32_t), &dsize_buf);
+        }
+        else //int in a single page
+            dsize = *((int32_t*)stasis_page_byte_ptr_from_start(p, offset % PAGE_SIZE));
+
+        offset += sizeof(int32_t);
+
+        if(dsize == 0) //no more keys
+            break;
+
+        offset += dsize;
+
+    }
+
+    //single exit point for every path that still holds a page
+    if(p!=NULL)
+    {
+        unlock(p->rwlatch);
+        releasePage(p);
+    }
+
+}
diff --git a/datapage.h b/datapage.h
new file mode 100644
index 0000000..f26f454
--- /dev/null
+++ b/datapage.h
@@ -0,0 +1,110 @@
+#ifndef _SIMPLE_DATA_PAGE_H_
+#define _SIMPLE_DATA_PAGE_H_
+
+#include <limits.h>
+
+#include <stasis/page.h>
+#include <stasis/constants.h>
+
+
+
+/**
+ * A run of consecutive pages that stores variable-length TUPLE records.
+ *
+ * The first page starts with a header (HEADER_SIZE bytes) that holds the
+ * page count; records follow as an int32_t length plus the serialized
+ * tuple, and a length of 0 ends the record sequence.
+ */
+template<class TUPLE>
+class DataPage
+{
+public:
+    
+    /**
+     * Forward cursor over the records of a DataPage.  It only stores a
+     * byte offset and a pointer to the DataPage; it does not own either.
+     */
+    class RecordIterator
+    {
+    public:
+        //starts at the first record, i.e. directly after the header
+        RecordIterator(DataPage *dp)
+            {
+                offset = HEADER_SIZE;
+                this->dp = dp;
+            }
+
+        RecordIterator(const RecordIterator &rhs)
+            {
+                this->offset = rhs.offset;
+                this->dp = rhs.dp;            
+            }
+
+        //note: returns void, so assignments cannot be chained
+        void operator=(const RecordIterator &rhs)
+            {
+                this->offset = rhs.offset;
+                this->dp = rhs.dp;
+            }
+        
+
+        //returns the next tuple and also advances the iterator
+        TUPLE *getnext(int xid);
+
+        //advance the iterator by count tuples, i.e. skip over count tuples         
+        void advance(int xid, int count=1);
+        
+        
+        int32_t offset ;  //byte position of the next record length
+        DataPage *dp;     //datapage being iterated (not owned)
+        
+        
+    };
+
+    
+public:
+
+    //to be used when reading an existing data page from disk
+    DataPage( int xid, pageid_t pid );
+
+    //to be used to create new data pages
+    DataPage( int xid, int fix_pcount, pageid_t (*alloc_region)(int, void*), void * alloc_state);
+
+    ~DataPage();
+
+    //appends a tuple; returns false if it could not be stored
+    inline bool append(int xid, TUPLE const & dat);
+    //looks up key; on success *buf points to the tuple that was found
+    bool recordRead(int xid, typename TUPLE::key_t key, size_t keySize,  TUPLE ** buf);
+
+    inline uint16_t recordCount(int xid);
+
+
+    RecordIterator begin(){return RecordIterator(this);}
+
+    pageid_t get_start_pid(){return pidarr[0];}
+    int get_page_count(){return pcount;}
+
+    //page allocator callbacks; conf is a RegionAllocConf_t*
+    static pageid_t dp_alloc_region(int xid, void *conf);
+    
+    //same, but ridp is a recordid* that stores the RegionAllocConf_t
+    static pageid_t dp_alloc_region_rid(int xid, void * ridp);
+
+    static void dealloc_region_rid(int xid, void* conf);
+
+    static void force_region_rid(int xid, void *conf);
+
+public:
+    
+private:
+
+    void initialize(int xid);
+
+    //reads the page count information from the first page
+    int readPageCount(int xid, pageid_t pid);
+    void incrementPageCount(int xid, pageid_t pid, int add=1);
+
+    bool writebytes(int xid, int count, byte *data);
+    //NOTE(review): the definition dereferences data, so the default of 0
+    //must not actually be relied upon -- confirm no caller omits it.
+    inline void readbytes(int xid, int32_t offset, int count, byte **data=0);
+
+private:
+    int fix_pcount; //number of pages in a standard data page
+    int pcount;     //number of pages currently in this data page
+    pageid_t *pidarr; //ids of those pages (malloc'ed, freed in destructor)
+    int32_t byte_offset;//points to the next free byte
+
+
+    //page alloc function
+    pageid_t (*alloc_region)(int, void*);
+    void *alloc_state;
+
+    static const int32_t HEADER_SIZE;
+    
+
+};
+
+#endif
diff --git a/datatuple.h b/datatuple.h
new file mode 100644
index 0000000..0e1e4ce
--- /dev/null
+++ b/datatuple.h
@@ -0,0 +1,147 @@
+#ifndef _DATATUPLE_H_
+#define _DATATUPLE_H_
+
+
+typedef unsigned char uchar;
+
+#include <string>
+
+//#define byte unsigned char
+typedef unsigned char byte;
+#include <cstring>
+
+//#include <stdio.h>
+//#include <stdlib.h>
+//#include <errno.h>
+
+/**
+ * A key/data pair whose four fields are pointers: the tuple struct itself
+ * owns no storage.  The serialized layout produced by to_bytes() is
+ * keylen, datalen, key bytes, data bytes.  A datalen of UINT_MAX marks a
+ * delete (tombstone), which carries no data bytes.
+ *
+ * NOTE(review): this header uses uint32_t, UINT_MAX, malloc and free but
+ * does not include the headers that declare them; it relies on the
+ * including file -- confirm.
+ */
+typedef struct datatuple
+{
+    typedef uchar* key_t;
+    typedef uchar* data_t;
+    uint32_t *keylen;    //key length should be size of string + 1 for the terminating \0
+    uint32_t *datalen; 
+    key_t key;
+    data_t data;
+
+    //this is used by the stl set
+    //strict weak ordering on the keys, compared as C strings
+    bool operator() (const datatuple& lhs, const datatuple& rhs) const
+        {
+            //std::basic_string<uchar> s1(lhs.key);
+            //std::basic_string<uchar> s2(rhs.key);
+            return strcmp((char*)lhs.key,(char*)rhs.key) < 0;
+            //return (*((int32_t*)lhs.key)) <= (*((int32_t*)rhs.key));
+        }
+
+    /**
+     * Compares two keys as C strings (result of strcmp):
+     * negative if k1 < k2
+     * 0 if k1 == k2
+     * positive if k1 > k2
+    **/
+    static int compare(const key_t k1,const key_t k2)
+        {            
+            //for char* ending with \0
+            return strcmp((char*)k1,(char*)k2);
+
+            //for int32_t
+            //printf("%d\t%d\n",(*((int32_t*)k1)) ,(*((int32_t*)k2)));
+            //return (*((int32_t*)k1)) <= (*((int32_t*)k2));
+        }
+
+    //marks this tuple as a delete by storing UINT_MAX in *datalen
+    void setDelete()
+        {
+            *datalen = UINT_MAX;
+        }
+
+    inline bool isDelete() const
+        {
+            return *datalen == UINT_MAX;
+        }
+
+    static std::string key_to_str(const byte* k)
+        {
+            //for strings
+            return std::string((char*)k);
+            //for int
+            /*
+            std::ostringstream ostr;
+            ostr << *((int32_t*)k);            
+            return ostr.str();
+            */
+        }
+
+    //returns the length of the byte array representation
+    //(two length fields + key, plus the data unless this is a delete)
+    int32_t byte_length() const{
+        static const size_t isize = sizeof(uint32_t);
+        if(isDelete())
+            return isize + *keylen + isize; 
+        else
+            return isize + *keylen + isize + (*datalen);
+    }
+
+    //format: key length _   data length _ key _ data
+    //returns a malloc'ed buffer of byte_length() bytes; the caller frees it
+    byte * to_bytes() const {
+        static const size_t isize = sizeof(uint32_t);
+        byte * ret;
+        if(!isDelete())
+            ret = (byte*) malloc(isize + *keylen + isize + *datalen);
+        else
+            ret = (byte*) malloc(isize + *keylen + isize);
+        
+        memcpy(ret, (byte*)(keylen), isize);        
+        memcpy(ret+isize, (byte*)(datalen), isize);
+        memcpy(ret+isize+isize, key, *keylen);
+        if(!isDelete())
+            memcpy(ret+isize+isize+*keylen, data, *datalen);
+        return ret;
+    }
+
+    //does not copy the data again
+    //just sets the pointers in the datatuple to
+    //right positions in the given arr
+    //the tuple struct is malloc'ed; because keylen points at the start of
+    //arr, freeing keylen frees arr when arr itself came from malloc
+    
+    static datatuple* from_bytes(const byte * arr)
+        {
+            static const size_t isize = sizeof(uint32_t);
+            datatuple *dt = (datatuple*) malloc(sizeof(datatuple));
+
+            dt->keylen = (uint32_t*) arr;
+            dt->datalen = (uint32_t*) (arr+isize);
+            dt->key = (key_t) (arr+isize+isize);
+            if(!dt->isDelete())
+                dt->data = (data_t) (arr+isize+isize+ *(dt->keylen));
+            else
+                dt->data = 0;
+
+            return dt;
+        }
+    /*
+    static datatuple form_tuple(const byte * arr)
+        {
+            static const size_t isize = sizeof(uint32_t);
+            datatuple dt;
+
+            dt.keylen = (uint32_t*) arr;
+            dt.datalen = (uint32_t*) (arr+isize);
+            dt.key = (key_t) (arr+isize+isize);
+            if(!dt.isDelete())
+                dt.data = (data_t) (arr+isize+isize+ *(dt.keylen));
+            else
+                dt.data = 0;
+
+            return dt;
+        }
+    */
+    
+    byte * get_key() { return (byte*) key; }
+    byte * get_data() { return (byte*) data; }
+
+    //releases only the tuple
+    //(the memory that keylen/datalen/key/data point to is left alone)
+    static void release(datatuple *dt)
+        {
+            free(dt);
+        }
+    
+} datatuple;
+
+
+#endif
diff --git a/hello.cpp b/hello.cpp
new file mode 100644
index 0000000..118fccb
--- /dev/null
+++ b/hello.cpp
@@ -0,0 +1,48 @@
+
+#include <string>
+#include <string.h>
+#include <iostream> 
+#include<stasis/transactional.h>
+
+typedef unsigned char uchar;
+// Scratch variant of the tuple layout in which the two lengths are stored
+// by value instead of through pointers.  The former leading "typedef" had
+// no declarator, so compilers ignored it (with a warning); it is dropped.
+struct datatuple
+{
+
+  typedef byte* key_t;
+  typedef byte* data_t;
+  uint32_t keylen;   // length of key in bytes
+  uint32_t datalen;  // length of data in bytes
+  key_t key;
+  data_t data;
+  
+
+};
+
+// Scratch program: prints a heap-allocated bool, then fills a datatuple
+// with two short strings and prints its size, lengths and contents.
+int main(int argc, char** argv) {
+
+  bool * m1 = new bool(false);
+  std::cout << *m1 << std::endl;
+  delete m1;  // was leaked before
+
+  datatuple t;
+  std::cout << "size of datatuple:\t" << sizeof(datatuple) << std::endl;
+
+  // 10 bytes leave room for the 8 characters plus the terminating '\0'
+  t.key = (datatuple::key_t) malloc(10);
+  const char * str = "12345678";
+  strcpy((char*)t.key, (str));
+
+  t.keylen = strlen((char*)t.key);
+
+  t.data = (datatuple::data_t) malloc(10);
+  const char * str2 = "1234567";
+  strcpy((char*)t.data, (str2));
+
+  t.datalen = strlen((char*)t.data);
+
+  std::cout << "size of datatuple:\t" << sizeof(datatuple) << std::endl;
+  std::cout << "keylen:\t" << t.keylen << 
+    "\tdatalen:\t" << t.datalen << 
+    "\t" << t.key << 
+    "\t" << t.data <<
+    std::endl;
+
+  // release the buffers allocated above
+  free(t.key);
+  free(t.data);
+
+  return 0;
+}
diff --git a/logiterators.cpp b/logiterators.cpp
new file mode 100644
index 0000000..80a079b
--- /dev/null
+++ b/logiterators.cpp
@@ -0,0 +1,200 @@
+
+#include "logstore.h"
+//#include "datapage.cpp"
+#include "logiterators.h"
+
+
+ 
+
+//template <class MEMTREE, class TUPLE>
+/*
+template <>
+const byte* toByteArray<std::set<datatuple,datatuple>, datatuple>(
+    memTreeIterator<std::set<datatuple,datatuple>, datatuple> * const t)
+{
+    return (*(t->it_)).to_bytes();
+}
+*/
+
+
+/////////////////////////////////////////////////////////////////////
+// tree iterator implementation
+/////////////////////////////////////////////////////////////////////
+
+// Opens an iterator over tree via logtreeIterator::open (xid -1) and lets
+// init_helper() load the first datapage, if there is one.
+template <class TUPLE>
+treeIterator<TUPLE>::treeIterator(recordid tree) :
+    tree_(tree),    
+    lsmIterator_(logtreeIterator::open(-1,tree)),
+    curr_tuple(0)
+{
+    init_helper();
+}
+
+/**
+ * Opens an iterator over tree positioned by logtreeIterator::openAt on the
+ * key of the given tuple, then lets init_helper() load the datapage there.
+ *
+ * curr_tuple is initialized to 0 here: the destructor frees it whenever it
+ * is non-null, so leaving it indeterminate could free a garbage pointer.
+ */
+template <class TUPLE>
+treeIterator<TUPLE>::treeIterator(recordid tree, TUPLE& key) :
+    tree_(tree),
+    //scratch_(),
+    lsmIterator_(logtreeIterator::openAt(-1,tree,key.get_key())),//toByteArray())),
+    //slot_(0)
+    curr_tuple(0)
+{
+    init_helper();
+
+    /*
+    treeIterator * end = this->end();
+    for(;*this != *end && **this < key; ++(*this))
+    {
+        DEBUG("treeIterator was not at the given TUPLE");
+    }
+    delete end;
+    */
+
+}
+
+/**
+ * Closes the underlying tree iterator and releases the current tuple, the
+ * datapage iterator and the current datapage.
+ *
+ * dp_itr is allocated with new in init_helper()/getnext() and was not
+ * released here before, so it leaked whenever the iterator was destroyed
+ * before reaching the end of the tree.
+ */
+template <class TUPLE>
+treeIterator<TUPLE>::~treeIterator()
+{
+    if(lsmIterator_) 
+        logtreeIterator::close(-1, lsmIterator_);
+
+    //only the tuple struct is freed here, not the bytes it points to
+    if(curr_tuple != NULL)
+        free(curr_tuple);
+
+    //the record iterator refers to curr_page, so it goes first
+    if(dp_itr != NULL)
+    {
+        delete dp_itr;
+        dp_itr = 0;
+    }
+
+    if(curr_page!=NULL)
+    {
+        delete curr_page;
+        curr_page = 0;
+    }
+
+}
+
+/**
+ * Shared constructor tail: positions the iterator on the first datapage.
+ * curr_page and dp_itr end up 0 when there is no tree iterator or when the
+ * tree iterator has no first entry.
+ */
+template <class TUPLE>
+void treeIterator<TUPLE>::init_helper()
+{
+    //start out "empty"; both are replaced below when a page is available
+    curr_page = 0;
+    dp_itr = 0;
+
+    if(!lsmIterator_)
+    {
+        printf("treeIterator:\t__error__ init_helper():\tnull lsmIterator_");
+        return;
+    }
+
+    if(logtreeIterator::next(-1, lsmIterator_) == 0)
+    {
+        //printf("treeIterator:\t__error__ init_helper():\tlogtreeIteratr::next returned 0." );
+        return;
+    }
+
+    //value() hands back a pointer to the stored page id through a byte**
+    pageid_t * stored_pid;
+    pageid_t ** stored_pid_addr = &stored_pid;
+    logtreeIterator::value(-1,lsmIterator_,(byte**)stored_pid_addr);
+
+    curr_pageid = *stored_pid;
+    curr_page = new DataPage<TUPLE>(-1, curr_pageid);
+    dp_itr = new DPITR_T(curr_page->begin());
+}
+
+/**
+ * Returns the next tuple of the tree, moving on to the next datapage when
+ * the current one is exhausted, or 0 once the tree has no more entries.
+ * The returned pointer is also remembered in curr_tuple.
+ */
+template <class TUPLE>
+TUPLE * treeIterator<TUPLE>::getnext()
+{
+    assert(this->lsmIterator_);
+
+    //no current datapage: nothing (more) to return
+    if(dp_itr == 0)
+        return 0;
+
+    TUPLE* tup = dp_itr->getnext(-1);
+    if(tup)
+    {
+        curr_tuple = tup;
+        return curr_tuple;
+    }
+
+    //current datapage is exhausted, drop it
+    delete dp_itr;
+    dp_itr = 0;
+    delete curr_page;
+    curr_page = 0;
+
+    if(!logtreeIterator::next(-1,lsmIterator_))
+    {
+        // TODO: what is this?
+        //past end of iterator!  "end" should contain the pageid of the
+        // last leaf, and 1+ numslots on that page.
+        //abort();
+        curr_tuple = tup; //tup is null here
+        return curr_tuple;
+    }
+
+    //open the next datapage and take its first tuple
+    pageid_t *stored_pid;
+    pageid_t **stored_pid_addr = &stored_pid;
+    logtreeIterator::value(-1,lsmIterator_,(byte**)stored_pid_addr);
+    curr_pageid = *stored_pid;
+    curr_page = new DataPage<TUPLE>(-1, curr_pageid);
+    dp_itr = new DPITR_T(curr_page->begin());
+
+    tup = dp_itr->getnext(-1);
+    assert(tup);
+
+    curr_tuple = tup;
+    return curr_tuple;
+}
+
+
+
+/*
+template <class TUPLE>
+treeIterator<TUPLE>::treeIterator(treeIteratorHandle* tree, TUPLE& key) :
+    tree_(tree?tree->r_:NULLRID),
+    scratch_(),
+    lsmIterator_(logtreeIterator::openAt(-1,tree?tree->r_:NULLRID,key.get_key())),//toByteArray())),
+    slot_(0)
+{
+    init_helper();
+    if(lsmIterator_) {
+        treeIterator * end = this->end();
+        for(;*this != *end && **this < key; ++(*this)) { }
+        delete end;
+    } else {
+        this->slot_ = 0;
+        this->pageid_ = 0;
+    }
+}
+
+template <class TUPLE>
+treeIterator<TUPLE>::treeIterator(recordid tree, TUPLE &scratch) :
+    tree_(tree),
+    scratch_(scratch),
+    lsmIterator_(logtreeIterator::open(-1,tree)),
+    slot_(0)
+{
+    init_helper();
+}
+
+template <class TUPLE>
+treeIterator<TUPLE>::treeIterator(treeIteratorHandle* tree) :
+    tree_(tree?tree->r_:NULLRID),
+    scratch_(),
+    lsmIterator_(logtreeIterator::open(-1,tree?tree->r_:NULLRID)),
+    slot_(0)
+{
+    init_helper();
+}
+
+template <class TUPLE>
+treeIterator<TUPLE>::treeIterator(treeIterator& t) :
+    tree_(t.tree_),
+    scratch_(t.scratch_),    
+    lsmIterator_(t.lsmIterator_?logtreeIterator::copy(-1,t.lsmIterator_):0),
+    slot_(t.slot_),
+    pageid_(t.pageid_),
+    p_((Page*)((t.p_)?loadPage(-1,t.p_->id):0))
+    //currentPage_((PAGELAYOUT*)((p_)?p_->impl:0))
+{
+    if(p_)
+        readlock(p_->rwlatch,0);
+}
+*/
diff --git a/logiterators.h b/logiterators.h
new file mode 100644
index 0000000..8d61867
--- /dev/null
+++ b/logiterators.h
@@ -0,0 +1,173 @@
+#ifndef _LOG_ITERATORS_H_
+#define _LOG_ITERATORS_H_
+
+#include <assert.h>
+#include <stasis/iterator.h>
+
+#undef begin
+#undef end
+
+template <class MEMTREE, class TUPLE> class memTreeIterator;
+
+template <class MEMTREE, class TUPLE>
+const byte* toByteArray(memTreeIterator<MEMTREE,TUPLE> * const t);
+
+template <class TUPLE>
+class DataPage;
+
+//////////////////////////////////////////////////////////////
+// memTreeIterator
+/////////////////////////////////////////////////////////////
+
+/**
+ * Cursor over an in-memory tree.  It keeps two MEMTREE::const_iterator
+ * values -- the current position and the end position -- and forwards
+ * dereference, comparison and stepping to the current one.
+ */
+template<class MEMTREE, class TUPLE>
+class memTreeIterator
+{
+    typedef typename MEMTREE::const_iterator MTITER;
+
+public:
+    typedef MEMTREE* handle;
+
+    /** Starts at s->begin() and records s->end() as the end position. */
+    memTreeIterator( MEMTREE *s ) : it_(s->begin()), itend_(s->end()) { }
+
+    /** Adopts an explicit (current, end) pair of positions. */
+    memTreeIterator( MTITER& it, MTITER& itend ) : it_(it), itend_(itend) { }
+
+    /** Copies both positions of another memTreeIterator. */
+    explicit memTreeIterator(memTreeIterator &i) : it_(i.it_), itend_(i.itend_) { }
+
+    /** Element at the current position. */
+    const TUPLE& operator* () { return *it_; }
+
+    /** Moves the current position onto the end position. */
+    void seekEnd() { it_ = itend_; }
+
+    /**
+     * Returns a newly allocated iterator whose current and end positions
+     * are both this iterator's end.  Nothing in this class deletes it, so
+     * the caller has to.
+     */
+    memTreeIterator * end()
+    {
+        return new memTreeIterator<MEMTREE,TUPLE>(itend_,itend_);
+    }
+
+    /** Equality compares the current positions only. */
+    inline bool operator==(const memTreeIterator &o) const { return it_ == o.it_; }
+    inline bool operator!=(const memTreeIterator &o) const { return !(*this == o); }
+
+    inline void operator++() { ++it_; }
+    inline void operator--() { --it_; }
+
+    /** Difference of the current positions, as computed by MTITER's operator-. */
+    inline int  operator-(memTreeIterator &i) { return it_ - i.it_; }
+
+    /** Copies both positions; unlike the usual convention this returns void. */
+    inline void operator=(memTreeIterator const &i)
+    {
+        it_ = i.it_;
+        itend_ = i.itend_;
+    }
+
+private:
+    MTITER it_;     // current position
+    MTITER itend_;  // end position
+
+    friend const byte* toByteArray<MEMTREE,TUPLE>(memTreeIterator<MEMTREE,TUPLE> * const t);
+};
+
+/**
+ * Returns whatever to_bytes() yields for the tuple at the iterator's
+ * current position.
+ */
+template <class MEMTREE, class TUPLE>
+const byte* toByteArray(memTreeIterator<MEMTREE,TUPLE> * const t)
+{
+    return t->it_->to_bytes();
+}
+
+/////////////////////////////////////////////////////////////////
+
+/**
+   Scans through an LSM tree's leaf pages, each tuple in the tree, in
+   order.  This iterator is designed for maximum forward scan
+   performance, and does not support all STL operations.
+**/
+template <class TUPLE>
+class treeIterator
+{
+
+ public:
+    //  typedef recordid handle;
+    class treeIteratorHandle // holds one recordid; default-constructed handles hold NULLRID
+    {
+    public:
+        treeIteratorHandle() : r_(NULLRID) {}
+        treeIteratorHandle(const recordid r) : r_(r) {}
+        // Stores a new recordid; note that this returns a pointer to the handle, not a reference.
+        treeIteratorHandle * operator=(const recordid &r) {
+            r_ = r;
+            return this;
+        }
+        
+        recordid r_;
+    };
+    
+    typedef treeIteratorHandle* handle;
+    // Constructors are only declared here; their definitions are out of line.
+    explicit treeIterator(recordid tree);
+    // NOTE(review): presumably starts the scan at `key` rather than at the first tuple -- confirm in the definition.
+    explicit treeIterator(recordid tree,TUPLE &key);
+    // The three constructors below are disabled (commented out).
+    //explicit treeIterator(treeIteratorHandle* tree, TUPLE& key);
+    
+    //explicit treeIterator(treeIteratorHandle* tree);
+    
+    //explicit treeIterator(treeIterator& t);
+
+    ~treeIterator();
+    // NOTE(review): ownership of the returned tuple is not visible from this declaration -- confirm in the definition.
+    TUPLE * getnext();
+
+    //void advance(int count=1);
+    
+private:
+    inline void init_helper();
+  // Default construction, assignment and subtraction are private and abort() if they are ever called.
+  explicit treeIterator() { abort(); }
+  void operator=(treeIterator & t) { abort(); }
+  int operator-(treeIterator & t) { abort(); }    
+    
+private:
+    recordid tree_; //root of the tree
+    
+    lladdIterator_t * lsmIterator_; //logtree iterator
+    
+    pageid_t curr_pageid; //current page id
+    DataPage<TUPLE>    *curr_page;   //current page
+    typedef typename DataPage<TUPLE>::RecordIterator DPITR_T;
+    DPITR_T *dp_itr; //record iterator over a data page
+    TUPLE    *curr_tuple;  //current tuple
+};
+
+
+
+
+#endif
+
diff --git a/logserver.cpp b/logserver.cpp
new file mode 100644
index 0000000..3f9eb54
--- /dev/null
+++ b/logserver.cpp
@@ -0,0 +1,649 @@
+
+
+
+#include "logserver.h"
+#include "datatuple.h"
+
+#include "logstore.h"
+
+#include <sys/types.h> 
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+#include <sys/select.h>
+#include <errno.h>
+
+#undef begin
+#undef end
+#undef try
+
+
+//server codes: one-byte status the worker writes back to the client
+uint8_t logserver::OP_SUCCESS = 1;
+uint8_t logserver::OP_FAIL = 2;
+uint8_t logserver::OP_SENDING_TUPLE = 3;
+
+//client codes: one-byte opcode the worker reads at the start of a request
+uint8_t logserver::OP_FIND = 4;
+uint8_t logserver::OP_INSERT = 5;
+//sent by a client that is finished; the worker then closes the socket
+uint8_t logserver::OP_DONE = 6;
+//exclusive upper bound: the worker asserts that a received opcode is below it
+uint8_t logserver::OP_INVALID = 32;
+//entry point of the accept thread, defined further down in this file
+void *serverLoop(void *args);
+
+void logserver::startserver(logtable *ltable)
+{
+    sys_alive = true;
+    this->ltable = ltable;
+
+    selcond = new pthread_cond_t;
+    pthread_cond_init(selcond, 0);
+    
+    //initialize threads
+    for(int i=0; i<nthreads; i++)
+    {
+        struct pthread_item *worker_th = new pthread_item;
+        th_list.push_back(worker_th);
+        
+        worker_th->th_handle = new pthread_t;
+        struct pthread_data *worker_data = new pthread_data;
+        worker_th->data = worker_data;
+
+        worker_data->idleth_queue = &idleth_queue;        
+        worker_data->ready_queue = &ready_queue;
+        worker_data->work_queue = &work_queue;
+
+        worker_data->qlock = qlock;
+
+        worker_data->selcond = selcond;
+        
+        worker_data->th_cond = new pthread_cond_t;
+        pthread_cond_init(worker_data->th_cond,0);
+        
+        worker_data->th_mut = new pthread_mutex_t;
+        pthread_mutex_init(worker_data->th_mut,0);
+
+        worker_data->workitem = new int;
+        *(worker_data->workitem) = -1;
+
+        //worker_data->table_lock = lsmlock;
+
+        worker_data->ltable = ltable;
+
+        worker_data->sys_alive = &sys_alive;
+        
+        pthread_create(worker_th->th_handle, 0, thread_work_fn, worker_th);
+
+        idleth_queue.push(*worker_th);
+                
+        
+    }
+
+    
+
+    //start server socket
+    sdata = new serverth_data;
+    sdata->server_socket = &serversocket;
+    sdata->server_port = server_port;
+    sdata->idleth_queue = &idleth_queue;
+    sdata->ready_queue = &ready_queue;
+    sdata->selcond = selcond;
+    sdata->qlock = qlock;
+    
+    pthread_create(&server_thread, 0, serverLoop, sdata);
+
+    //start monitoring loop
+    eventLoop();
+
+}
+
+/**
+ * Shuts the worker pool down and, when STATS_ENABLED is defined, prints
+ * per-thread and aggregated request statistics.
+ *
+ * Steps: stop receiving on the listening socket, wait (polling once a
+ * second) until every worker is back on the idle queue, clear sys_alive,
+ * then wake, join and free each worker in turn.
+ *
+ * NOTE(review): nothing here stops eventLoop() or releases selcond / sdata,
+ * and idleth_queue still holds copies of the freed workers afterwards.
+ */
+void logserver::stopserver()
+{
+    //stop receiving on the server socket; the descriptor itself stays open
+    shutdown(serversocket, SHUT_RD);
+    
+    //wait for all threads to be idle
+    while(idleth_queue.size() != (size_t) nthreads)
+        sleep(1);
+
+    #ifdef STATS_ENABLED
+    printf("\n\nSTATISTICS\n");
+    //totals over all threads, keyed like the per-thread maps
+    std::map<std::string, int> num_reqsc;
+    std::map<std::string, double> work_timec;
+    #endif
+    
+    //set the system running flag to false
+    sys_alive = false;
+    for(int i=0; i<nthreads; i++)    
+    {
+        pthread_item *idle_th = th_list[i];
+        
+        //wake up the thread; it sees sys_alive == false and leaves its loop
+        pthread_mutex_lock(idle_th->data->th_mut);        
+        pthread_cond_signal(idle_th->data->th_cond);
+        pthread_mutex_unlock(idle_th->data->th_mut);
+        //wait for it to join
+        pthread_join(*(idle_th->th_handle), 0);
+        //free the thread variables; the thread has exited and released
+        //th_mut, so both primitives can be destroyed safely
+        pthread_cond_destroy(idle_th->data->th_cond);
+        pthread_mutex_destroy(idle_th->data->th_mut);
+
+        #ifdef STATS_ENABLED
+        if(i == 0)
+        {
+            tot_threadwork_time = 0;
+            num_reqs = 0;
+        }
+
+        tot_threadwork_time += idle_th->data->work_time;
+        num_reqs += idle_th->data->num_reqs;
+
+        printf("thread %d: work_time %.3f\t#calls %d\tavg req process time:\t%.3f\n",
+               i,
+               idle_th->data->work_time,
+               idle_th->data->num_reqs,
+               (( idle_th->data->num_reqs == 0 ) ? 0 : idle_th->data->work_time / idle_th->data->num_reqs)
+               );
+
+        for(std::map<std::string, int>::const_iterator itr = idle_th->data->num_reqsc.begin();
+            itr != idle_th->data->num_reqsc.end(); itr++)
+        {
+            std::string ckey = (*itr).first;
+            printf("\t%s\t%d\t%.3f\t%.3f\n", ckey.c_str(), (*itr).second, idle_th->data->work_timec[ckey],
+                   idle_th->data->work_timec[ckey] / (*itr).second);
+
+            if(num_reqsc.find(ckey) == num_reqsc.end()){
+                num_reqsc[ckey] = 0;
+                work_timec[ckey] = 0;                
+            }
+            num_reqsc[ckey] += (*itr).second;
+            work_timec[ckey] += idle_th->data->work_timec[ckey];
+        }
+        #endif
+        
+        delete idle_th->data->th_cond;
+        delete idle_th->data->th_mut;
+        delete idle_th->data->workitem;
+        delete idle_th->data;
+        delete idle_th->th_handle;
+        //the pthread_item itself was allocated by startserver() as well
+        delete idle_th;
+    }
+
+    th_list.clear();
+
+    #ifdef STATS_ENABLED
+
+    printf("\n\nAggregated Stats:\n");
+    for(std::map<std::string, int>::const_iterator itr = num_reqsc.begin();
+        itr != num_reqsc.end(); itr++)
+    {
+        std::string ckey = (*itr).first;
+        printf("\t%s\t%d\t%.3f\t%.3f\n", ckey.c_str(), (*itr).second, work_timec[ckey],
+               work_timec[ckey] / (*itr).second);
+    }
+    
+    //wall-clock milliseconds between the first and the last select() pass
+    tot_time = (stop_tv.tv_sec - start_tv.tv_sec) * 1000 +
+               (stop_tv.tv_usec / 1000 - start_tv.tv_usec / 1000);
+    
+    printf("\ntot time:\t%f\n",tot_time);
+    printf("tot work time:\t%f\n", tot_threadwork_time);       
+    printf("load avg:\t%f\n", tot_threadwork_time / tot_time);
+
+    printf("tot num reqs\t%d\n", num_reqs);
+    if(num_reqs!= 0)
+    {
+        printf("tot work time / num reqs:\t%.3f\n", tot_threadwork_time / num_reqs);
+        printf("tot time / num reqs:\t%.3f\n", tot_time / num_reqs );
+    }
+    #endif
+    
+    //close(serversocket);
+
+    return;
+}
+
+/**
+ * Dispatch loop run by startserver() on the calling thread; never returns.
+ *
+ * Each pass moves the sockets queued on ready_queue (new connections from
+ * serverLoop and connections handed back by workers) into the local watch
+ * list, select()s on that list for readability, and gives every readable
+ * socket to an idle worker -- or appends it to work_queue when no worker is
+ * idle.  A dispatched socket leaves the watch list until a worker puts it
+ * back on ready_queue.
+ *
+ * NOTE(review): select() is called without a timeout, so sockets queued on
+ * ready_queue while it blocks are only picked up once one of the watched
+ * sockets becomes readable.
+ */
+void logserver::eventLoop()
+{
+
+    fd_set readfs;
+    std::vector<int> sel_list; //sockets currently watched by select()
+    
+    int maxfd;
+
+    while(true)
+    {
+        //clear readset
+        FD_ZERO(&readfs);
+        maxfd = -1;
+
+        //update select set
+        pthread_mutex_lock(qlock);
+
+        //nothing to watch: sleep until a connection is queued
+        if(sel_list.size() == 0)
+        {
+            while(ready_queue.size() == 0)
+                pthread_cond_wait(selcond, qlock);
+        }
+        
+        //new connections + processed conns are in ready_queue
+        //add them to select list
+        while(ready_queue.size() > 0)
+        {
+            sel_list.push_back(ready_queue.front());
+            ready_queue.pop();
+        }
+        pthread_mutex_unlock(qlock);
+
+        //ready select set
+        for(std::vector<int>::const_iterator itr=sel_list.begin();
+            itr != sel_list.end(); itr++)
+        {
+            if(maxfd < *itr)
+                maxfd = *itr;
+            FD_SET(*itr, &readfs);
+        }
+
+        //select events
+        int sel_res = select(maxfd+1, &readfs, NULL, NULL, NULL);
+        if(sel_res < 0 && errno == EINTR)
+        {
+            //interrupted by a signal: readfs does not describe readable
+            //sockets, so dispatching from it would hand quiet connections
+            //to workers that then block in read(); rebuild the set and retry
+            continue;
+        }
+
+        #ifdef STATS_ENABLED
+        if(num_selcalls == 0)
+            gettimeofday(&start_tv, 0);        
+        
+        num_selevents += sel_res;
+        num_selcalls++;
+        #endif
+
+        //job assignment to threads
+        pthread_mutex_lock(qlock);
+        for(size_t i=0; i<sel_list.size(); )
+        {
+            int currsock = sel_list[i];
+
+            if (!FD_ISSET(currsock, &readfs))
+            {
+                i++; //not readable yet: keep watching it
+                continue;
+            }
+
+            if(idleth_queue.size() > 0) //assign the job to an idle thread
+            {
+                pthread_item idle_th = idleth_queue.front();
+                idleth_queue.pop();
+                
+                //wake up the thread to do work
+                pthread_mutex_lock(idle_th.data->th_mut);
+                //set the job of the idle thread
+                *(idle_th.data->workitem) = currsock;
+                pthread_cond_signal(idle_th.data->th_cond);
+                pthread_mutex_unlock(idle_th.data->th_mut);
+            }
+            else
+            {
+                //insert the given element to the work queue
+                work_queue.push(currsock);                    
+            }
+
+            //remove from the sel_list; the next candidate slides into
+            //slot i, so the index is deliberately not advanced
+            sel_list.erase(sel_list.begin()+i);
+        }
+
+        pthread_mutex_unlock(qlock);
+
+        #ifdef STATS_ENABLED
+        gettimeofday(&stop_tv, 0);
+        #endif
+        
+    }
+    
+}
+
+/**
+ * Accept thread: listens on the configured port and queues every accepted
+ * connection for the event loop.
+ *
+ * Opens a TCP socket bound to INADDR_ANY:server_port, publishes its
+ * descriptor through sdata->server_socket, then loops on accept().  Each
+ * accepted socket gets TCP_NODELAY, is pushed onto ready_queue under qlock,
+ * and the event loop is signalled when the queue was empty before the push.
+ *
+ * @param args the serverth_data prepared by logserver::startserver.
+ * @return 0 when setup, accept() or setsockopt() fails; otherwise the loop
+ *         does not end.
+ */
+void *serverLoop(void *args)
+{
+
+    serverth_data *sdata = (serverth_data*)args;
+    
+    int sockfd; //socket descriptor
+    struct sockaddr_in serv_addr;
+    struct sockaddr_in cli_addr;
+    int newsockfd; //newly created 
+    socklen_t clilen;
+    
+
+    //open a socket
+    sockfd = socket(AF_INET, SOCK_STREAM, 0);
+    if (sockfd < 0) 
+    {
+        printf("ERROR opening socket\n");
+        return 0;
+    }
+    
+    bzero((char *) &serv_addr, sizeof(serv_addr));     
+    serv_addr.sin_family = AF_INET;
+    serv_addr.sin_addr.s_addr = htonl(INADDR_ANY);
+    serv_addr.sin_port = htons(sdata->server_port);
+    
+    if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) 
+    {
+        printf("ERROR on binding.\n");
+        close(sockfd); //not published yet, so nobody else can release it
+        return 0;
+    }
+    
+    //start listening on the server socket
+    //second arg is the max number of connections waiting in queue
+    if(listen(sockfd,SOMAXCONN)==-1)
+    {
+        printf("ERROR on listen.\n");
+        close(sockfd);
+        return 0;
+    }
+
+    printf("LSM Server listenning...\n");
+
+    *(sdata->server_socket) = sockfd;
+    int flag, result;
+    while(true)
+    {
+        //accept() overwrites clilen with the returned address length, so
+        //it has to be reset to the buffer size before every call
+        clilen = sizeof(cli_addr);
+        newsockfd = accept(sockfd, (struct sockaddr *) &cli_addr, &clilen);
+        if (newsockfd < 0) 
+        {
+            if (errno == EINTR)
+                continue; //a signal interrupted the call; nothing is wrong with the socket
+            printf("ERROR on accept.\n");
+            return 0; // we probably want to continue instead of return here (when not debugging)
+        }
+
+        flag = 1;
+        result = setsockopt(newsockfd,            /* socket affected */
+                            IPPROTO_TCP,     /* set option at TCP level */
+                            TCP_NODELAY,     /* name of option */
+                            (char *) &flag,  /* the cast is historical
+                                                cruft */
+                            sizeof(int));    /* length of option value */
+        if (result < 0)
+        {
+            printf("ERROR on setting socket option TCP_NODELAY.\n");
+            close(newsockfd); //never queued, so it would leak otherwise
+            return 0; 
+        }        
+
+        char clientip[20];
+        inet_ntop(AF_INET, (void*) &(cli_addr.sin_addr), clientip, sizeof(clientip));
+        printf("Connection from:\t%s\n", clientip);
+
+        pthread_mutex_lock(sdata->qlock);
+
+        //insert the given element to the ready queue
+        sdata->ready_queue->push(newsockfd);
+
+        if(sdata->ready_queue->size() == 1) //signal the event loop
+            pthread_cond_signal(sdata->selcond);
+        
+        pthread_mutex_unlock(sdata->qlock);
+        
+    }
+
+    
+}
+
+/**
+ * Reads exactly `count` bytes from `sockd` into `buf`, looping over short
+ * reads.
+ *
+ * A read interrupted by a signal is retried.  On end-of-stream or any other
+ * error the loop stops and an error line is printed; the function cannot
+ * report this to its caller, so the tail of `buf` is then left untouched.
+ *
+ * @param sockd descriptor to read from.
+ * @param buf   destination; must have room for `count` bytes.
+ * @param count number of bytes wanted.
+ */
+inline void readfromsocket(int sockd, byte *buf, int count)
+{
+
+    int n = 0;
+    while( n < count )
+    {
+        ssize_t got = read( sockd, buf + n, count - n);
+        if( got > 0 )
+        {
+            n += got;
+            continue;
+        }
+        if( got < 0 && errno == EINTR )
+            continue;
+
+        //0 is end-of-stream and would spin forever; -1 would move the
+        //offset backwards and eventually write in front of buf
+        printf("ERROR reading from socket %d (%d of %d bytes).\n", sockd, n, count);
+        break;
+    }
+    
+}
+
+/**
+ * Writes exactly `count` bytes from `buf` to `sockd`, looping over short
+ * writes.
+ *
+ * A write interrupted by a signal is retried.  On any other failure the
+ * loop stops and an error line is printed; the function cannot report this
+ * to its caller.
+ *
+ * @param sockd descriptor to write to.
+ * @param buf   source of the bytes.
+ * @param count number of bytes to send.
+ */
+inline void writetosocket(int sockd, byte *buf, int count)
+{
+    int n = 0;
+    while( n < count )
+    {
+        ssize_t put = write( sockd, buf + n, count - n);
+        if( put > 0 )
+        {
+            n += put;
+            continue;
+        }
+        if( put < 0 && errno == EINTR )
+            continue;
+
+        //adding a -1 return to n would resend earlier bytes and, repeated,
+        //read in front of buf
+        printf("ERROR writing to socket %d (%d of %d bytes).\n", sockd, n, count);
+        break;
+    }    
+}
+
+
+
+
+
+/**
+ * Body of a worker thread.
+ *
+ * The worker holds its own th_mut for its whole life, releasing it only
+ * inside pthread_cond_wait.  It sleeps until the event loop stores a socket
+ * descriptor in *workitem, then serves exactly one request from that socket:
+ *
+ *   1. read the one-byte opcode;
+ *   2. for OP_DONE (or when the opcode cannot be read) close the socket;
+ *      otherwise read key length, data length, key and -- for inserts of
+ *      non-delete tuples -- the data;
+ *   3. OP_INSERT: insert the tuple and answer OP_SUCCESS;
+ *      OP_FIND: look the key up and answer OP_SENDING_TUPLE followed by the
+ *      tuple (a delete marker for the key when nothing was found);
+ *   4. hand the socket back through ready_queue, then either take the next
+ *      socket from work_queue or queue itself as idle.
+ *
+ * The loop ends once *sys_alive is false at a wake-up.
+ *
+ * @param args the pthread_item describing this worker.
+ * @return always 0.
+ */
+void * thread_work_fn( void * args)
+{
+    pthread_item * item = (pthread_item *) args;
+
+    pthread_mutex_lock(item->data->th_mut);
+    while(true)
+    {        
+        while(*(item->data->workitem) == -1)
+        {
+            if(!*(item->data->sys_alive))
+                break;
+            pthread_cond_wait(item->data->th_cond, item->data->th_mut); //wait for job
+        }
+
+        
+        #ifdef STATS_ENABLED
+        gettimeofday(& (item->data->start_tv), 0);
+        //statistics key: "<socket>_<opcode>", completed once the opcode is known
+        std::ostringstream ostr;
+        ostr << *(item->data->workitem) << "_";
+        #endif
+        
+        if(!*(item->data->sys_alive))
+        {
+            //printf("thread quitted.\n");
+            break;
+        }
+
+        //step 1: read the opcode
+        uint8_t opcode = logserver::OP_DONE;
+        ssize_t n = read(*(item->data->workitem), &opcode, sizeof(uint8_t));
+        if( n != (ssize_t) sizeof(uint8_t) )
+        {
+            //EOF or read error: the peer is gone.  Drop the connection the
+            //same way as for an explicit OP_DONE instead of aborting the
+            //server or acting on an unread opcode.
+            opcode = logserver::OP_DONE;
+        }
+        assert( opcode < logserver::OP_INVALID );
+
+        if( opcode == logserver::OP_DONE ) //close the conn on failure
+        {
+            pthread_mutex_lock(item->data->qlock);            
+            printf("client done. conn closed. (%d, %d, %d, %d)\n",
+                   (int) n, errno, *(item->data->workitem), (int) item->data->work_queue->size());
+            close(*(item->data->workitem));
+                
+            if(item->data->work_queue->size() > 0)
+            {
+                int new_work = item->data->work_queue->front();
+                item->data->work_queue->pop();
+                //printf("work queue size:\t%d\n", item->data->work_queue->size());
+                *(item->data->workitem) = new_work;
+            }
+            else
+            {
+                //set work to -1
+                *(item->data->workitem) = -1;
+                //add self to idle queue
+                item->data->idleth_queue->push(*item);
+            }
+            
+            pthread_mutex_unlock(item->data->qlock);
+            continue;            
+        }
+
+        
+        //step 2: read the tuple from client        
+        datatuple tuple;
+        tuple.keylen = (uint32_t*)malloc(sizeof(uint32_t));
+        tuple.datalen = (uint32_t*)malloc(sizeof(uint32_t));
+        
+        //read the key length
+        n = read(*(item->data->workitem), tuple.keylen, sizeof(uint32_t));
+        assert( n == sizeof(uint32_t));
+        //read the data length
+        n = read(*(item->data->workitem), tuple.datalen, sizeof(uint32_t));
+        assert( n == sizeof(uint32_t));
+        
+        //read the key
+        tuple.key = (byte*) malloc(*tuple.keylen);
+        readfromsocket(*(item->data->workitem), (byte*) tuple.key, *tuple.keylen);
+        //read the data
+        if(!tuple.isDelete() && opcode != logserver::OP_FIND)
+        {
+            tuple.data = (byte*) malloc(*tuple.datalen);
+            readfromsocket(*(item->data->workitem), (byte*) tuple.data, *tuple.datalen);
+        }
+        else
+            tuple.data = 0;
+
+        //step 3: process the tuple
+        //pthread_mutex_lock(item->data->table_lock);
+        //readlock(item->data->table_lock,0);
+        
+        if(opcode == logserver::OP_INSERT)
+        {
+            //insert/update/delete
+            item->data->ltable->insertTuple(tuple);
+            //unlock the lsmlock
+            //pthread_mutex_unlock(item->data->table_lock);
+            //unlock(item->data->table_lock);
+            //step 4: send response
+            uint8_t rcode = logserver::OP_SUCCESS;
+            n = write(*(item->data->workitem), &rcode, sizeof(uint8_t));
+            assert(n == sizeof(uint8_t));
+            
+        }
+        else if(opcode == logserver::OP_FIND)
+        {
+            //find the tuple
+            datatuple *dt = item->data->ltable->findTuple(-1, tuple.key, *tuple.keylen);
+            //unlock the lsmlock
+            //pthread_mutex_unlock(item->data->table_lock);
+            //unlock(item->data->table_lock);
+
+            #ifdef STATS_ENABLED
+
+            if(dt == 0)
+                printf("key not found:\t%s\n", datatuple::key_to_str(tuple.key).c_str());
+            else if( *dt->datalen != 1024)
+                printf("data len for\t%s:\t%d\n", datatuple::key_to_str(tuple.key).c_str(),
+                       *dt->datalen);
+
+            //dt is null for a missing key, so it must not be dereferenced here
+            if(dt != 0 && datatuple::compare(tuple.key, dt->key) != 0)
+                printf("key not equal:\t%s\t%s\n", datatuple::key_to_str(tuple.key).c_str(),
+                       datatuple::key_to_str(dt->key).c_str());
+            
+            #endif
+            
+            if(dt == 0)  //tuple deleted
+            {
+                //build a delete marker in one allocation laid out as
+                //[keylen][datalen][key bytes]
+                dt = (datatuple*) malloc(sizeof(datatuple));
+                dt->keylen = (uint32_t*) malloc(2*sizeof(uint32_t) + *tuple.keylen);
+                *dt->keylen = *tuple.keylen;
+                dt->datalen = dt->keylen + 1;
+                dt->key = (datatuple::key_t) (dt->datalen+1);
+                memcpy((byte*) dt->key, (byte*) tuple.key, *tuple.keylen);
+                dt->setDelete();
+            }
+
+            //send the reply code
+            uint8_t rcode = logserver::OP_SENDING_TUPLE;
+            n = write(*(item->data->workitem), &rcode, sizeof(uint8_t));
+            assert(n == sizeof(uint8_t));
+
+            //send the tuple
+            writetosocket(*(item->data->workitem), (byte*) dt->keylen, dt->byte_length());
+
+            //free datatuple
+            free(dt->keylen);
+            free(dt);
+        }
+
+        //close the socket
+        //close(*(item->data->workitem));
+
+        //free the tuple
+        free(tuple.keylen);
+        free(tuple.datalen);
+        free(tuple.key);
+        free(tuple.data);
+
+        //printf("socket %d: work completed.", *(item->data->workitem));
+        
+        pthread_mutex_lock(item->data->qlock);
+        
+        //add conn desc to ready queue
+        item->data->ready_queue->push(*(item->data->workitem));
+        //printf("ready queue size: %d sock(%d)\n", item->data->ready_queue->size(), *(item->data->workitem));
+        if(item->data->ready_queue->size() == 1) //signal the event loop
+            pthread_cond_signal(item->data->selcond);
+
+        //printf("work complete, added to ready queue %d (size %d)\n", *(item->data->workitem),
+        //       item->data->ready_queue->size());
+        
+        if(item->data->work_queue->size() > 0)
+        {
+            int new_work = item->data->work_queue->front();
+            item->data->work_queue->pop();
+            //printf("work queue size:\t%d\n", item->data->work_queue->size());
+            *(item->data->workitem) = new_work;
+        }
+        else
+        {
+            //set work to -1
+            *(item->data->workitem) = -1;
+            //add self to idle queue
+            item->data->idleth_queue->push(*item);
+        }
+        
+        pthread_mutex_unlock(item->data->qlock);
+
+        #ifdef STATS_ENABLED
+        if( item->data->num_reqs == 0 )
+            item->data->work_time = 0;
+        gettimeofday(& (item->data->stop_tv), 0);
+        (item->data->num_reqs)++;
+        //item->data->work_time += tv_to_double(item->data->stop_tv) - tv_to_double(item->data->start_tv);
+        //elapsed time of this request in milliseconds
+        item->data->work_time += (item->data->stop_tv.tv_sec - item->data->start_tv.tv_sec) * 1000 +
+               (item->data->stop_tv.tv_usec / 1000 - item->data->start_tv.tv_usec / 1000);
+
+        int iopcode = opcode;
+        ostr << iopcode;
+        std::string clientkey = ostr.str();
+        if(item->data->num_reqsc.find(clientkey) == item->data->num_reqsc.end())
+        {
+            item->data->num_reqsc[clientkey]=0;
+            item->data->work_timec[clientkey]=0;
+        }
+        
+        item->data->num_reqsc[clientkey]++;
+        item->data->work_timec[clientkey] += (item->data->stop_tv.tv_sec - item->data->start_tv.tv_sec) * 1000 +
+            (item->data->stop_tv.tv_usec / 1000 - item->data->start_tv.tv_usec / 1000);;        
+        #endif
+        
+
+    }
+    pthread_mutex_unlock(item->data->th_mut);
+
+    //a function returning void* must not run off its end
+    return 0;
+}
+                       
+
diff --git a/logserver.h b/logserver.h
new file mode 100644
index 0000000..dd9888a
--- /dev/null
+++ b/logserver.h
@@ -0,0 +1,197 @@
+#ifndef _LOGSERVER_H_
+#define _LOGSERVER_H_
+
+
+#include <queue>
+#include <vector>
+
+//#include "logstore.h"
+
+#include "datatuple.h"
+
+
+
+#include <stasis/transactional.h>
+#include <pthread.h>
+
+#undef begin
+#undef try
+#undef end
+
+#define STATS_ENABLED 1
+
+#ifdef STATS_ENABLED
+#include <sys/time.h>
+#include <time.h>
+#include <map>
+#endif
+
+class logtable;
+
+
+
+struct pthread_item;
+
+struct pthread_data { // state of one worker thread; the pointers refer to objects set up by logserver::startserver
+    std::queue<pthread_item> *idleth_queue; // server-wide queue of idle workers
+    std::queue<int> *ready_queue; // sockets handed to the event loop for select()
+    std::queue<int> *work_queue; // readable sockets waiting for a free worker
+    pthread_mutex_t * qlock; // lock the worker takes around its queue accesses
+    // signalled when a push makes ready_queue non-empty, to wake the event loop
+    pthread_cond_t *selcond;
+    // th_cond / th_mut: how this worker is woken once workitem has been set
+    pthread_cond_t * th_cond;
+    pthread_mutex_t * th_mut;
+    
+    int *workitem; //descriptor of the socket to serve; -1 means no work
+
+    //pthread_mutex_t * table_lock;
+    //rwl *table_lock;
+    logtable *ltable; // table the worker calls insertTuple / findTuple on
+    bool *sys_alive; // points at the server's running flag; false makes the worker exit
+    // per-thread statistics, updated by the worker after each request; times are in milliseconds
+    #ifdef STATS_ENABLED
+    int num_reqs;
+    struct timeval start_tv, stop_tv;
+    double work_time;
+    std::map<std::string, int> num_reqsc; // request count per "<socket>_<opcode>" key
+    std::map<std::string, double> work_timec; // accumulated time per "<socket>_<opcode>" key
+    #endif
+    
+};
+
+struct pthread_item{ // one worker thread: its pthread handle plus its state; copied by value into idleth_queue
+    pthread_t * th_handle; // allocated in startserver, joined and deleted in stopserver
+    pthread_data *data; // shared by every copy of this item
+};
+
+
+//struct work_item
+//{
+//    int sockd; //socket id
+//    datatuple in_tuple; //request
+//    datatuple out_tuple; //response
+//};
+
+struct serverth_data // arguments handed to the accept thread (serverLoop)
+{
+    int *server_socket; // out: receives the listening socket descriptor
+    int server_port; // TCP port to bind to
+    std::queue<pthread_item> *idleth_queue;
+    std::queue<int> *ready_queue; // accepted sockets are pushed here
+    // signalled when a push makes ready_queue non-empty
+    pthread_cond_t *selcond;
+    // held while ready_queue is modified
+    pthread_mutex_t *qlock;
+    
+    
+
+};
+
+void * thread_work_fn( void *);    
+
+/**
+ * Multi-threaded TCP front end for a logtable.
+ *
+ * startserver() creates a pool of worker threads and an accept thread and
+ * then runs the select()-based dispatch loop; stopserver() tears the worker
+ * pool down again.  The three queues below are the hand-over points between
+ * the accept thread, the dispatch loop and the workers.
+ */
+class logserver
+{
+public:
+    //server codes
+    static uint8_t OP_SUCCESS;
+    static uint8_t OP_FAIL;
+    static uint8_t OP_SENDING_TUPLE;
+
+    //client codes
+    static uint8_t OP_FIND;
+    static uint8_t OP_INSERT;
+
+    static uint8_t OP_DONE;
+    
+    static uint8_t OP_INVALID;
+    
+public:
+    /**
+     * Records the configuration and creates the queue lock; no thread or
+     * socket is created until startserver() is called.
+     *
+     * Every member is given a defined value here so that stopserver() or
+     * the destructor never act on indeterminate pointers or descriptors.
+     *
+     * @param nthreads    number of worker threads startserver() creates.
+     * @param server_port TCP port the accept thread binds to.
+     */
+    logserver(int nthreads, int server_port)
+        : server_port(server_port),
+          nthreads(nthreads),
+          sys_alive(false),
+          serversocket(-1),
+          qlock(new pthread_mutex_t),
+          sdata(0),
+          selcond(0),
+          ltable(0)
+    {
+        pthread_mutex_init(qlock,0);
+
+        #ifdef STATS_ENABLED        
+        num_reqs = 0;
+        num_selevents = 0;
+        num_selcalls = 0;
+        tot_threadwork_time = 0;
+        tot_time = 0;
+        #endif
+    }
+
+    /**
+     * Releases the queue lock.  The mutex is destroyed before its storage
+     * is freed.
+     *
+     * NOTE(review): selcond and sdata, allocated by startserver(), are not
+     * released here because nothing visible in this file stops the threads
+     * that use them.
+     */
+    ~logserver()
+        {
+            pthread_mutex_destroy(qlock);
+            delete qlock;
+        }
+    
+    void startserver(logtable *ltable);
+
+    void stopserver();
+    
+private:
+
+    //main loop of server
+    //accept connections, assign jobs to threads
+    //void dispatchLoop();
+
+    void eventLoop();
+    
+
+private:
+
+    int server_port;
+    
+    int nthreads;
+
+    bool sys_alive; //workers exit once this is false
+    
+    int serversocket; //server socket file descriptor
+
+    //ccqueue<int> conn_queue; //list of active connections (socket list)
+
+    //ccqueue<pthread_item> idleth_queue; //list of idle threads
+
+    std::queue<int> ready_queue; //connections to go inside select
+    std::queue<int> work_queue;  //connections to be processed by worker threads
+    std::queue<pthread_item> idleth_queue; //workers waiting for a connection
+    pthread_mutex_t *qlock; //taken around accesses to the three queues
+
+    pthread_t server_thread;
+    serverth_data *sdata;
+    pthread_cond_t *selcond; //server loop cond
+    
+    std::vector<pthread_item *> th_list; // list of threads
+
+    //rwl *lsmlock; //lock for using lsm table
+
+    logtable *ltable;
+
+
+    #ifdef STATS_ENABLED
+    int num_reqs;
+    int num_selevents;
+    int num_selcalls;
+    struct timeval start_tv, stop_tv;
+    double tot_threadwork_time;
+    double tot_time;
+    #endif
+
+    
+};
+
+
+#endif
diff --git a/logserver_pers.cpp b/logserver_pers.cpp
new file mode 100644
index 0000000..4c7f2bb
--- /dev/null
+++ b/logserver_pers.cpp
@@ -0,0 +1,519 @@
+
+
+
+#include "logserver.h"
+#include "datatuple.h"
+
+#include "logstore.h"
+
+#include <sys/types.h> 
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+#include <sys/select.h>
+#include <errno.h>
+
+#undef begin
+#undef end
+#undef try
+
+
+//One-byte protocol codes exchanged with clients.
+//
+//server codes (sent by the server as the first byte of a reply)
+//  OP_SUCCESS       is written after an insert has been applied
+//  OP_SENDING_TUPLE is written before the tuple that answers a find
+uint8_t logserver::OP_SUCCESS = 1;
+uint8_t logserver::OP_FAIL = 2;
+uint8_t logserver::OP_SENDING_TUPLE = 3;
+
+//client codes (read by the worker as the first byte of a request)
+uint8_t logserver::OP_FIND = 4;
+uint8_t logserver::OP_INSERT = 5;
+
+//sent by the client when it is finished; the worker then closes the socket
+uint8_t logserver::OP_DONE = 6;
+
+//upper bound: the worker asserts that every received opcode is below this
+uint8_t logserver::OP_INVALID = 32;
+
+//accept-thread entry point, defined further down in this file
+void *serverLoop(void *args);
+
+/**
+ * Brings the server up: creates the worker pool and the accept thread, then
+ * runs eventLoop() on the calling thread.
+ *
+ * @param ltable table handed to every worker for inserts and lookups
+ */
+void logserver::startserver(logtable *ltable)
+{
+    this->ltable = ltable;
+    sys_alive = true;
+
+    //condition variable the select loop sleeps on
+    selcond = new pthread_cond_t;
+    pthread_cond_init(selcond, 0);
+
+    //spawn the workers; each one starts out idle
+    for(int idx = 0; idx < nthreads; ++idx)
+    {
+        pthread_item *th = new pthread_item;
+        th_list.push_back(th);
+
+        th->th_handle = new pthread_t;
+        th->data = new pthread_data;
+        pthread_data *cfg = th->data;
+
+        //queues and synchronisation objects owned by the server
+        cfg->idleth_queue = &idleth_queue;
+        cfg->ready_queue = &ready_queue;
+        cfg->work_queue = &work_queue;
+        cfg->qlock = qlock;
+        cfg->selcond = selcond;
+
+        //per-thread wake-up primitives
+        cfg->th_cond = new pthread_cond_t;
+        pthread_cond_init(cfg->th_cond, 0);
+        cfg->th_mut = new pthread_mutex_t;
+        pthread_mutex_init(cfg->th_mut, 0);
+
+        //-1 means "no socket assigned"
+        cfg->workitem = new int;
+        *(cfg->workitem) = -1;
+
+        //table access and shutdown flag
+        cfg->table_lock = lsmlock;
+        cfg->ltable = ltable;
+        cfg->sys_alive = &sys_alive;
+
+        pthread_create(th->th_handle, 0, thread_work_fn, th);
+
+        //the idle queue stores a copy of the item (same handle/data pointers)
+        idleth_queue.push(*th);
+    }
+
+    //describe the listening socket for the accept thread
+    sdata = new serverth_data;
+    sdata->server_port = server_port;
+    sdata->server_socket = &serversocket;
+    sdata->ready_queue = &ready_queue;
+    sdata->idleth_queue = &idleth_queue;
+    sdata->qlock = qlock;
+    sdata->selcond = selcond;
+
+    pthread_create(&server_thread, 0, serverLoop, sdata);
+
+    //start monitoring loop on this thread
+    eventLoop();
+}
+
+/**
+ * Stops input on the listening socket, waits until every worker is idle,
+ * then wakes, joins and frees all worker threads.
+ *
+ * selcond and sdata are deliberately left alone: eventLoop() never returns
+ * and may still be blocked on selcond.
+ */
+void logserver::stopserver()
+{
+    //stop receiving on the server socket
+    shutdown(serversocket, SHUT_RD);
+
+    //wait for all threads to be idle. The queue is shared with the workers
+    //and the event loop, so it is only inspected while holding qlock. Once
+    //every worker is idle the queue is emptied under the same lock, so no
+    //entry can outlive the thread data that is freed below.
+    while(true)
+    {
+        pthread_mutex_lock(qlock);
+        const bool all_idle = (idleth_queue.size() == (size_t) nthreads);
+        if(all_idle)
+        {
+            while(!idleth_queue.empty())
+                idleth_queue.pop();
+        }
+        pthread_mutex_unlock(qlock);
+
+        if(all_idle)
+            break;
+        sleep(1);
+    }
+
+    //set the system running flag to false
+    sys_alive = false;
+    for(size_t i = 0; i < th_list.size(); i++)
+    {
+        pthread_item *idle_th = th_list[i];
+
+        //wake up the thread so that it notices sys_alive == false
+        pthread_mutex_lock(idle_th->data->th_mut);
+        pthread_cond_signal(idle_th->data->th_cond);
+        pthread_mutex_unlock(idle_th->data->th_mut);
+        //wait for it to join
+        pthread_join(*(idle_th->th_handle), 0);
+        //free the thread variables
+        pthread_cond_destroy(idle_th->data->th_cond);
+        pthread_mutex_destroy(idle_th->data->th_mut);
+        delete idle_th->data->th_cond;
+        delete idle_th->data->th_mut;
+        delete idle_th->data->workitem;
+        delete idle_th->data;
+        delete idle_th->th_handle;
+        delete idle_th;
+    }
+
+    th_list.clear();
+
+    //close(serversocket);
+
+    return;
+}
+
+/**
+ * Monitoring loop: watches client sockets with select() and hands every
+ * readable socket to a worker thread. Never returns.
+ *
+ * Each pass
+ *   1. sleeps on selcond until ready_queue is non-empty and moves all queued
+ *      sockets into the local watch list,
+ *   2. blocks in select() on the whole watch list,
+ *   3. gives each readable socket to an idle worker, or parks it in
+ *      work_queue when no worker is idle, and removes it from the watch list.
+ *
+ * NOTE(review): select() is called without a timeout, so sockets queued
+ * while it is blocked are only picked up after it returns.
+ */
+void logserver::eventLoop()
+{
+    fd_set readfs;
+    std::vector<int> sel_list; //sockets currently watched by select
+
+    while(true)
+    {
+        //new connections + processed conns are in ready_queue
+        //add them to the select list
+        pthread_mutex_lock(qlock);
+
+        while(ready_queue.size() == 0)
+            pthread_cond_wait(selcond, qlock);
+
+        while(ready_queue.size() > 0)
+        {
+            const int fd = ready_queue.front();
+            ready_queue.pop();
+
+            //FD_SET on a descriptor >= FD_SETSIZE writes outside the fd_set
+            if(fd >= FD_SETSIZE)
+            {
+                printf("ERROR: socket %d cannot be used with select; closing it.\n", fd);
+                close(fd);
+                continue;
+            }
+            sel_list.push_back(fd);
+        }
+        pthread_mutex_unlock(qlock);
+
+        //select() with an empty set and no timeout would block forever
+        if(sel_list.empty())
+            continue;
+
+        //ready the select set and wait for events; the set is rebuilt
+        //before every attempt because select modifies it
+        int sel_res;
+        do
+        {
+            FD_ZERO(&readfs);
+            int maxfd = -1;
+            for(std::vector<int>::const_iterator itr = sel_list.begin();
+                itr != sel_list.end(); ++itr)
+            {
+                if(maxfd < *itr)
+                    maxfd = *itr;
+                FD_SET(*itr, &readfs);
+            }
+            sel_res = select(maxfd+1, &readfs, NULL, NULL, NULL);
+        } while(sel_res < 0 && errno == EINTR);
+
+        if(sel_res < 0)
+        {
+            //readfs does not describe readable sockets after a failure
+            printf("ERROR on select (errno %d).\n", errno);
+            continue;
+        }
+
+        //job assignment to threads; sockets that are not readable yet are
+        //collected and stay on the watch list
+        std::vector<int> still_waiting;
+        for(size_t i = 0; i < sel_list.size(); i++)
+        {
+            const int currsock = sel_list[i];
+
+            if(!FD_ISSET(currsock, &readfs))
+            {
+                still_waiting.push_back(currsock);
+                continue;
+            }
+
+            pthread_mutex_lock(qlock);
+
+            if(idleth_queue.size() > 0) //assign the job to an idle thread
+            {
+                pthread_item idle_th = idleth_queue.front();
+                idleth_queue.pop();
+
+                //set the job of the idle thread and wake it up
+                pthread_mutex_lock(idle_th.data->th_mut);
+                *(idle_th.data->workitem) = currsock;
+                pthread_cond_signal(idle_th.data->th_cond);
+                pthread_mutex_unlock(idle_th.data->th_mut);
+            }
+            else
+            {
+                //no idle worker: park the socket in the work queue
+                work_queue.push(currsock);
+                printf("work queue size:\t%d\n", (int) work_queue.size());
+            }
+
+            pthread_mutex_unlock(qlock);
+        }
+        sel_list.swap(still_waiting);
+    }
+
+}
+
+/**
+ * Accept-thread entry point.
+ *
+ * Opens a TCP listening socket on sdata->server_port (all interfaces),
+ * publishes its descriptor through sdata->server_socket and then accepts
+ * connections forever. Every accepted socket gets TCP_NODELAY and is pushed
+ * onto sdata->ready_queue under sdata->qlock; selcond is signalled when the
+ * queue goes from empty to non-empty.
+ *
+ * @param args pointer to a serverth_data
+ * @return always 0; only returns after a socket-level failure
+ */
+void *serverLoop(void *args)
+{
+
+    serverth_data *sdata = (serverth_data*)args;
+
+    struct sockaddr_in serv_addr;
+    struct sockaddr_in cli_addr;
+
+    //open a socket
+    int sockfd = socket(AF_INET, SOCK_STREAM, 0);
+    if (sockfd < 0)
+    {
+        printf("ERROR opening socket\n");
+        return 0;
+    }
+
+    bzero((char *) &serv_addr, sizeof(serv_addr));
+    serv_addr.sin_family = AF_INET;
+    serv_addr.sin_addr.s_addr = htonl(INADDR_ANY);
+    serv_addr.sin_port = htons(sdata->server_port);
+
+    if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0)
+    {
+        printf("ERROR on binding.\n");
+        close(sockfd); //not published yet, so nobody else can release it
+        return 0;
+    }
+
+    //start listening on the server socket
+    //second arg is the max number of connections waiting in queue
+    if(listen(sockfd,SOMAXCONN)==-1)
+    {
+        printf("ERROR on listen.\n");
+        close(sockfd);
+        return 0;
+    }
+
+    printf("LSM Server listenning...\n");
+
+    *(sdata->server_socket) = sockfd;
+
+    while(true)
+    {
+        //accept() overwrites the length, so it is reset for every call
+        socklen_t clilen = sizeof(cli_addr);
+        int newsockfd = accept(sockfd, (struct sockaddr *) &cli_addr, &clilen);
+        if (newsockfd < 0)
+        {
+            if(errno == EINTR) //interrupted by a signal, not a real failure
+                continue;
+            printf("ERROR on accept.\n");
+            return 0; // we probably want to continue instead of return here (when not debugging)
+        }
+
+        int flag = 1;
+        int result = setsockopt(newsockfd,        /* socket affected */
+                                IPPROTO_TCP,      /* set option at TCP level */
+                                TCP_NODELAY,      /* name of option */
+                                (char *) &flag,   /* the cast is historical
+                                                     cruft */
+                                sizeof(int));     /* length of option value */
+        if (result < 0)
+        {
+            printf("ERROR on setting socket option TCP_NODELAY.\n");
+            close(newsockfd); //never handed to anyone; do not leak it
+            return 0;
+        }
+
+        char clientip[INET_ADDRSTRLEN];
+        if(inet_ntop(AF_INET, (void*) &(cli_addr.sin_addr), clientip, sizeof(clientip)) == 0)
+            clientip[0] = '\0'; //keep the buffer printable if the conversion fails
+        printf("Connection from:\t%s\n", clientip);
+
+        pthread_mutex_lock(sdata->qlock);
+
+        //insert the new connection to the ready queue
+        sdata->ready_queue->push(newsockfd);
+
+        if(sdata->ready_queue->size() == 1) //signal the event loop
+            pthread_cond_signal(sdata->selcond);
+
+        pthread_mutex_unlock(sdata->qlock);
+
+    }
+
+    return 0;
+}
+
+/**
+ * Reads count bytes from sockd into buf, looping over short reads.
+ *
+ * A read interrupted by a signal is retried. On end-of-stream or any other
+ * error the function reports the problem and returns early, leaving the
+ * rest of buf untouched; it never moves the write offset backwards and never
+ * spins on a closed connection.
+ *
+ * @param sockd descriptor to read from
+ * @param buf   destination, at least count bytes long
+ * @param count number of bytes wanted
+ */
+inline void readfromsocket(int sockd, byte *buf, int count)
+{
+
+    int n = 0;
+    while( n < count )
+    {
+        ssize_t r = read( sockd, buf + n, count - n);
+        if(r > 0)
+        {
+            n += (int) r;
+            continue;
+        }
+        if(r < 0 && errno == EINTR)
+            continue; //nothing was transferred; try again
+
+        //r == 0: peer closed the connection; r < 0: read error
+        printf("ERROR reading from socket %d (%d of %d bytes read).\n", sockd, n, count);
+        return;
+    }
+
+}
+
+/**
+ * Writes count bytes from buf to sockd, looping over short writes.
+ *
+ * A write interrupted by a signal is retried. On any other failure the
+ * function reports the problem and returns early instead of moving the
+ * read offset backwards.
+ *
+ * @param sockd descriptor to write to
+ * @param buf   source, at least count bytes long
+ * @param count number of bytes to send
+ */
+inline void writetosocket(int sockd, byte *buf, int count)
+{
+    int n = 0;
+    while( n < count )
+    {
+        ssize_t w = write( sockd, buf + n, count - n);
+        if(w > 0)
+        {
+            n += (int) w;
+            continue;
+        }
+        if(w < 0 && errno == EINTR)
+            continue; //nothing was transferred; try again
+
+        printf("ERROR writing to socket %d (%d of %d bytes written).\n", sockd, n, count);
+        return;
+    }
+}
+
+
+
+
+
+/**
+ * Worker thread entry point.
+ *
+ * The thread keeps its own th_mut locked for its whole lifetime except
+ * while it is blocked in pthread_cond_wait(). Each loop iteration serves one
+ * request from the socket stored in *workitem:
+ *   1. read the one-byte opcode;
+ *   2. OP_DONE: close the socket, then take queued work or go idle;
+ *   3. otherwise read key length, data length, the key and (for non-delete,
+ *      non-find requests) the data, and run the insert or find on ltable;
+ *   4. hand the socket back through ready_queue, then take queued work or
+ *      go idle.
+ * The loop ends once *sys_alive is false.
+ *
+ * @param args pointer to this thread's pthread_item
+ * @return always 0
+ */
+void * thread_work_fn( void * args)
+{
+    pthread_item * item = (pthread_item *) args;
+    pthread_data * data = item->data;
+
+    pthread_mutex_lock(data->th_mut);
+    while(true)
+    {
+        while(*(data->workitem) == -1)
+        {
+            if(!*(data->sys_alive))
+                break;
+            pthread_cond_wait(data->th_cond, data->th_mut); //wait for job
+        }
+
+        if(!*(data->sys_alive))
+        {
+            //printf("thread quitted.\n");
+            break;
+        }
+
+        //*workitem is only reassigned at the end of the iteration, so the
+        //socket can be cached for the duration of this request
+        const int sock = *(data->workitem);
+
+        //step 1: read the opcode
+        uint8_t opcode;
+        ssize_t n = read(sock, &opcode, sizeof(uint8_t));
+        assert( n == sizeof(uint8_t));
+        assert( opcode < logserver::OP_INVALID );
+
+        if( opcode == logserver::OP_DONE ) //client is finished: close the conn
+        {
+            pthread_mutex_lock(data->qlock);
+            //casts keep the arguments in line with the %d conversions
+            printf("client done. conn closed. (%d, %d, %d, %d)\n",
+                   (int) n, errno, sock, (int) data->work_queue->size());
+            close(sock);
+
+            if(data->work_queue->size() > 0)
+            {
+                int new_work = data->work_queue->front();
+                data->work_queue->pop();
+                printf("work queue size:\t%d\n", (int) data->work_queue->size());
+                *(data->workitem) = new_work;
+            }
+            else
+            {
+                //set work to -1
+                *(data->workitem) = -1;
+                //add self to idle queue
+                data->idleth_queue->push(*item);
+            }
+
+            pthread_cond_signal(data->selcond);
+
+            pthread_mutex_unlock(data->qlock);
+            continue;
+        }
+
+
+        //step 2: read the tuple from client
+        datatuple tuple;
+        tuple.keylen = (uint32_t*)malloc(sizeof(uint32_t));
+        tuple.datalen = (uint32_t*)malloc(sizeof(uint32_t));
+
+        //read the key length
+        n = read(sock, tuple.keylen, sizeof(uint32_t));
+        assert( n == sizeof(uint32_t));
+        //read the data length
+        n = read(sock, tuple.datalen, sizeof(uint32_t));
+        assert( n == sizeof(uint32_t));
+
+        //read the key
+        tuple.key = (byte*) malloc(*tuple.keylen);
+        readfromsocket(sock, (byte*) tuple.key, *tuple.keylen);
+        //read the data (a find or a delete request carries no data)
+        if(!tuple.isDelete() && opcode != logserver::OP_FIND)
+        {
+            tuple.data = (byte*) malloc(*tuple.datalen);
+            readfromsocket(sock, (byte*) tuple.data, *tuple.datalen);
+        }
+        else
+            tuple.data = 0;
+
+        //step 3: process the tuple
+        //pthread_mutex_lock(data->table_lock);
+        //readlock(data->table_lock,0);
+
+        if(opcode == logserver::OP_INSERT)
+        {
+            //insert/update/delete
+            data->ltable->insertTuple(tuple);
+            //step 4: send response
+            uint8_t rcode = logserver::OP_SUCCESS;
+            n = write(sock, &rcode, sizeof(uint8_t));
+            assert(n == sizeof(uint8_t));
+
+        }
+        else if(opcode == logserver::OP_FIND)
+        {
+            //find the tuple
+            datatuple *dt = data->ltable->findTuple(-1, tuple.key, *tuple.keylen);
+
+            if(dt == 0)  //nothing found: answer with a delete-marked tuple
+            {
+                //one allocation laid out as keylen | datalen | key bytes
+                dt = (datatuple*) malloc(sizeof(datatuple));
+                dt->keylen = (uint32_t*) malloc(2*sizeof(uint32_t) + *tuple.keylen);
+                *dt->keylen = *tuple.keylen;
+                dt->datalen = dt->keylen + 1;
+                dt->key = (datatuple::key_t) (dt->datalen+1);
+                memcpy((byte*) dt->key, (byte*) tuple.key, *tuple.keylen);
+                dt->setDelete();
+            }
+
+            //send the reply code
+            uint8_t rcode = logserver::OP_SENDING_TUPLE;
+            n = write(sock, &rcode, sizeof(uint8_t));
+            assert(n == sizeof(uint8_t));
+
+            //send the tuple
+            writetosocket(sock, (byte*) dt->keylen, dt->byte_length());
+
+            //free datatuple
+            //NOTE(review): presumes findTuple returns the same single-block
+            //layout as the one built above; verify against logtable
+            free(dt->keylen);
+            free(dt);
+        }
+
+        //the connection stays open; it is handed back to the select loop below
+        //close(sock);
+
+        //free the tuple
+        free(tuple.keylen);
+        free(tuple.datalen);
+        free(tuple.key);
+        free(tuple.data);
+
+        //printf("socket %d: work completed.\n", sock);
+
+        pthread_mutex_lock(data->qlock);
+
+        //add conn desc to ready queue
+        data->ready_queue->push(sock);
+        if(data->ready_queue->size() == 1) //signal the event loop
+            pthread_cond_signal(data->selcond);
+
+        if(data->work_queue->size() > 0)
+        {
+            int new_work = data->work_queue->front();
+            data->work_queue->pop();
+            printf("work queue size:\t%d\n", (int) data->work_queue->size());
+            *(data->workitem) = new_work;
+        }
+        else
+        {
+            //set work to -1
+            *(data->workitem) = -1;
+            //add self to idle queue
+            data->idleth_queue->push(*item);
+        }
+
+        pthread_mutex_unlock(data->qlock);
+
+    }
+    pthread_mutex_unlock(data->th_mut);
+
+    //a void* function must return a value; flowing off the end is undefined
+    return 0;
+}
+                       
+
diff --git a/logserver_pers.h b/logserver_pers.h
new file mode 100644
index 0000000..94a10b7
--- /dev/null
+++ b/logserver_pers.h
@@ -0,0 +1,163 @@
+#ifndef _LOGSERVER_H_
+#define _LOGSERVER_H_
+
+
+#include <queue>
+#include <vector>
+
+//#include "logstore.h"
+
+#include "datatuple.h"
+
+
+
+#include <stasis/transactional.h>
+#include <pthread.h>
+
+#undef begin
+#undef try
+#undef end
+
+class logtable;
+
+
+
+struct pthread_item;
+
+/**
+ * Per-worker context handed to thread_work_fn through pthread_item.
+ * All members are plain pointers; nothing in this struct frees them.
+ */
+struct pthread_data {
+    //queue of pthread_item entries for idle workers
+    std::queue<pthread_item> *idleth_queue;
+    //sockets to be (re)added to the select set
+    std::queue<int> *ready_queue;
+    //sockets waiting for a worker
+    std::queue<int> *work_queue;
+    //presumably guards the three queues above; verify against the .cpp
+    pthread_mutex_t * qlock;
+
+    //condition variable of the select loop
+    pthread_cond_t *selcond;
+    
+    //per-thread condition variable and mutex used to wake this worker
+    pthread_cond_t * th_cond;
+    pthread_mutex_t * th_mut;
+    
+    int *workitem; //id of the socket to work
+
+    //pthread_mutex_t * table_lock;
+    rwl *table_lock;
+    logtable *ltable;
+    //NOTE(review): looks like the server-wide running flag; confirm in the .cpp
+    bool *sys_alive;
+};
+
+/**
+ * Pairs a thread handle with the pthread_data context of that thread.
+ * Both members are raw pointers, so copying an item copies the pointers only.
+ */
+struct pthread_item{
+    pthread_t * th_handle;
+    pthread_data *data;
+};
+
+
+//struct work_item
+//{
+//    int sockd; //socket id
+//    datatuple in_tuple; //request
+//    datatuple out_tuple; //response
+//};
+
+/**
+ * Arguments for the server (accept) thread.
+ * Every pointer refers to an object stored elsewhere; the struct itself
+ * releases nothing.
+ */
+struct serverth_data
+{
+    //location that holds the server socket descriptor
+    int *server_socket;
+    //port number of the server
+    int server_port;
+    std::queue<pthread_item> *idleth_queue;
+    //queue of sockets to go inside select
+    std::queue<int> *ready_queue;
+
+    //condition variable of the select loop
+    pthread_cond_t *selcond;
+    
+    //presumably guards the queues above; verify against the .cpp
+    pthread_mutex_t *qlock;
+    
+    
+
+};
+
+void * thread_work_fn( void *);    
+
+/**
+ * Socket front end for a logtable.
+ *
+ * Holds the protocol opcodes, the queues shared between the select loop and
+ * the worker threads, and the locks that go with them. The constructor only
+ * allocates the locks; startserver() receives the table to serve.
+ */
+class logserver
+{
+public:
+    //server codes
+    static uint8_t OP_SUCCESS;
+    static uint8_t OP_FAIL;
+    static uint8_t OP_SENDING_TUPLE;
+
+    //client codes
+    static uint8_t OP_FIND;
+    static uint8_t OP_INSERT;
+
+    static uint8_t OP_DONE;
+
+    static uint8_t OP_INVALID;
+
+public:
+    /**
+     * Creates a server that is configured but not yet running.
+     *
+     * @param nthreads    value stored in the nthreads member
+     * @param server_port value stored in the server_port member
+     */
+    logserver(int nthreads, int server_port){
+        this->nthreads = nthreads;
+        this->server_port = server_port;
+
+        lsmlock = initlock();
+
+        qlock = new pthread_mutex_t;
+        pthread_mutex_init(qlock,0);
+
+        //defined defaults for the members that are only filled in later, so
+        //that none of them holds an indeterminate value before start-up
+        sys_alive = false;
+        serversocket = -1;
+        sdata = 0;
+        selcond = 0;
+        ltable = 0;
+
+    }
+
+    /**
+     * Releases the two locks allocated by the constructor.
+     */
+    ~logserver()
+        {
+            deletelock(lsmlock);
+            //destroy the mutex before freeing its storage; delete alone
+            //does not undo pthread_mutex_init()
+            pthread_mutex_destroy(qlock);
+            delete qlock;
+        }
+
+    void startserver(logtable *ltable);
+
+    void stopserver();
+
+
+public:
+
+private:
+
+    //main loop of server
+    //accept connections, assign jobs to threads
+    //void dispatchLoop();
+
+    void eventLoop();
+
+
+private:
+
+    int server_port;
+
+    int nthreads;
+
+    bool sys_alive;
+
+    int serversocket; //server socket file descriptor
+
+    //ccqueue<int> conn_queue; //list of active connections (socket list)
+
+    //ccqueue<pthread_item> idleth_queue; //list of idle threads
+
+    std::queue<int> ready_queue; //connections to go inside select
+    std::queue<int> work_queue;  //connections to be processed by worker threads
+    std::queue<pthread_item> idleth_queue;
+    pthread_mutex_t *qlock;
+
+    pthread_t server_thread;
+    serverth_data *sdata;
+    pthread_cond_t *selcond; //server loop cond
+
+    std::vector<pthread_item *> th_list; // list of threads
+
+    rwl *lsmlock; //lock for using lsm table
+
+    logtable *ltable;
+
+};
+
+
+#endif
diff --git a/logserver_simple.cpp b/logserver_simple.cpp
new file mode 100644
index 0000000..56f9ceb
--- /dev/null
+++ b/logserver_simple.cpp
@@ -0,0 +1,409 @@
+
+
+
+#include "logserver.h"
+#include "datatuple.h"
+
+#include "logstore.h"
+
+#include <sys/types.h> 
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+
+#undef begin
+#undef end
+#undef try
+
+
+//One-byte protocol codes exchanged with clients.
+//
+//server codes (sent by the server as the first byte of a reply)
+//  OP_SUCCESS       is written after an insert has been applied
+//  OP_SENDING_TUPLE is written before the tuple that answers a find
+uint8_t logserver::OP_SUCCESS = 1;
+uint8_t logserver::OP_FAIL = 2;
+uint8_t logserver::OP_SENDING_TUPLE = 3;
+
+//client codes (read by the worker as the first byte of a request)
+uint8_t logserver::OP_FIND = 4;
+uint8_t logserver::OP_INSERT = 5;
+
+//upper bound: the worker asserts that every received opcode is below this
+uint8_t logserver::OP_INVALID = 32;
+
+
+/**
+ * Brings the server up: creates the worker pool, then runs dispatchLoop()
+ * on the calling thread.
+ *
+ * @param ltable table handed to every worker for inserts and lookups
+ */
+void logserver::startserver(logtable *ltable)
+{
+    this->ltable = ltable;
+    sys_alive = true;
+
+    //spawn the workers; each one starts out idle
+    for(int idx = 0; idx < nthreads; ++idx)
+    {
+        pthread_item *th = new pthread_item;
+        th_list.push_back(th);
+
+        th->th_handle = new pthread_t;
+        th->data = new pthread_data;
+        pthread_data *cfg = th->data;
+
+        //queues and lock owned by the server
+        cfg->idleth_queue = &idleth_queue;
+        cfg->conn_queue = &conn_queue;
+        cfg->qlock = qlock;
+
+        //per-thread wake-up primitives
+        cfg->th_cond = new pthread_cond_t;
+        pthread_cond_init(cfg->th_cond, 0);
+        cfg->th_mut = new pthread_mutex_t;
+        pthread_mutex_init(cfg->th_mut, 0);
+
+        //-1 means "no socket assigned"
+        cfg->workitem = new int;
+        *(cfg->workitem) = -1;
+
+        //table access and shutdown flag
+        cfg->table_lock = lsmlock;
+        cfg->ltable = ltable;
+        cfg->sys_alive = &sys_alive;
+
+        pthread_create(th->th_handle, 0, thread_work_fn, th);
+
+        //the idle queue stores a copy of the item (same handle/data pointers)
+        idleth_queue.push(*th);
+    }
+
+    //accept connections on this thread
+    dispatchLoop();
+}
+
+/**
+ * Stops input on the listening socket, waits until every worker is idle,
+ * then wakes, joins and frees all worker threads.
+ */
+void logserver::stopserver()
+{
+    //stop receiving on the server socket
+    shutdown(serversocket, SHUT_RD);
+
+    //wait for all threads to be idle. The queue is shared with the workers
+    //and the dispatch loop, so it is only inspected while holding qlock.
+    //Once every worker is idle the queue is emptied under the same lock, so
+    //no entry can outlive the thread data that is freed below.
+    while(true)
+    {
+        pthread_mutex_lock(qlock);
+        const bool all_idle = (idleth_queue.size() == (size_t) nthreads);
+        if(all_idle)
+        {
+            while(!idleth_queue.empty())
+                idleth_queue.pop();
+        }
+        pthread_mutex_unlock(qlock);
+
+        if(all_idle)
+            break;
+        sleep(1);
+    }
+
+    //set the system running flag to false
+    sys_alive = false;
+    for(size_t i = 0; i < th_list.size(); i++)
+    {
+        pthread_item *idle_th = th_list[i];
+
+        //wake up the thread so that it notices sys_alive == false
+        pthread_mutex_lock(idle_th->data->th_mut);
+        pthread_cond_signal(idle_th->data->th_cond);
+        pthread_mutex_unlock(idle_th->data->th_mut);
+        //wait for it to join
+        pthread_join(*(idle_th->th_handle), 0);
+        //free the thread variables
+        pthread_cond_destroy(idle_th->data->th_cond);
+        pthread_mutex_destroy(idle_th->data->th_mut);
+        delete idle_th->data->th_cond;
+        delete idle_th->data->th_mut;
+        delete idle_th->data->workitem;
+        delete idle_th->data;
+        delete idle_th->th_handle;
+        delete idle_th;
+    }
+
+    th_list.clear();
+
+    return;
+}
+
+/**
+ * Accept loop of the server.
+ *
+ * Opens a TCP listening socket on server_port (all interfaces), stores its
+ * descriptor in serversocket and then accepts connections until a
+ * socket-level failure occurs. Every accepted socket gets TCP_NODELAY and is
+ * either given to an idle worker or, when none is idle, appended to
+ * conn_queue.
+ */
+void logserver::dispatchLoop()
+{
+
+    struct sockaddr_in serv_addr;
+    struct sockaddr_in cli_addr;
+
+    //open a socket
+    int sockfd = socket(AF_INET, SOCK_STREAM, 0);
+    if (sockfd < 0)
+    {
+        printf("ERROR opening socket\n");
+        return;
+    }
+
+    bzero((char *) &serv_addr, sizeof(serv_addr));
+    serv_addr.sin_family = AF_INET;
+    serv_addr.sin_addr.s_addr = htonl(INADDR_ANY);
+    serv_addr.sin_port = htons(server_port);
+
+    if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0)
+    {
+        printf("ERROR on binding.\n");
+        close(sockfd); //not stored in serversocket yet; release it here
+        return;
+    }
+
+    //start listening on the server socket
+    //second arg is the max number of connections waiting in queue
+    if(listen(sockfd,SOMAXCONN)==-1)
+    {
+        printf("ERROR on listen.\n");
+        close(sockfd);
+        return;
+    }
+
+    printf("LSM Server listenning...\n");
+
+    serversocket = sockfd;
+
+    while(true)
+    {
+        //accept() overwrites the length, so it is reset for every call
+        socklen_t clilen = sizeof(cli_addr);
+        int newsockfd = accept(sockfd, (struct sockaddr *) &cli_addr, &clilen);
+        if (newsockfd < 0)
+        {
+            printf("ERROR on accept.\n");
+            return; // we probably want to continue instead of return here (when not debugging)
+        }
+
+        int flag = 1;
+        int result = setsockopt(newsockfd,        /* socket affected */
+                                IPPROTO_TCP,      /* set option at TCP level */
+                                TCP_NODELAY,      /* name of option */
+                                (char *) &flag,   /* the cast is historical
+                                                     cruft */
+                                sizeof(int));     /* length of option value */
+        if (result < 0)
+        {
+            printf("ERROR on setting socket option TCP_NODELAY.\n");
+            close(newsockfd); //never handed to anyone; do not leak it
+            return;
+        }
+
+        pthread_mutex_lock(qlock);
+
+        if(idleth_queue.size() > 0)
+        {
+            pthread_item idle_th = idleth_queue.front();
+            idleth_queue.pop();
+
+            //set the job of the idle thread and wake it up
+            pthread_mutex_lock(idle_th.data->th_mut);
+            *(idle_th.data->workitem) = newsockfd;
+            pthread_cond_signal(idle_th.data->th_cond);
+            pthread_mutex_unlock(idle_th.data->th_mut);
+        }
+        else
+        {
+            //no idle worker: queue the connection
+            conn_queue.push(newsockfd);
+            //printf("Number of queued connections:\t%d\n", conn_queue.size());
+        }
+
+        pthread_mutex_unlock(qlock);
+    }
+
+
+}
+
+/**
+ * Reads count bytes from sockd into buf, looping over short reads.
+ *
+ * When read() reports end-of-stream (0) or an error (-1) the function
+ * reports the problem and returns early, leaving the rest of buf untouched;
+ * it never moves the write offset backwards and never spins on a closed
+ * connection.
+ *
+ * @param sockd descriptor to read from
+ * @param buf   destination, at least count bytes long
+ * @param count number of bytes wanted
+ */
+inline void readfromsocket(int sockd, byte *buf, int count)
+{
+
+    int n = 0;
+    while( n < count )
+    {
+        ssize_t r = read( sockd, buf + n, count - n);
+        if(r <= 0)
+        {
+            //r == 0: peer closed the connection; r < 0: read error
+            printf("ERROR reading from socket %d (%d of %d bytes read).\n", sockd, n, count);
+            return;
+        }
+        n += (int) r;
+    }
+
+}
+
+/**
+ * Writes count bytes from buf to sockd, looping over short writes.
+ *
+ * When write() fails the function reports the problem and returns early
+ * instead of moving the read offset backwards.
+ *
+ * @param sockd descriptor to write to
+ * @param buf   source, at least count bytes long
+ * @param count number of bytes to send
+ */
+inline void writetosocket(int sockd, byte *buf, int count)
+{
+    int n = 0;
+    while( n < count )
+    {
+        ssize_t w = write( sockd, buf + n, count - n);
+        if(w <= 0)
+        {
+            printf("ERROR writing to socket %d (%d of %d bytes written).\n", sockd, n, count);
+            return;
+        }
+        n += (int) w;
+    }
+}
+
+
+
+
+
+/**
+ * Worker thread entry point.
+ *
+ * The thread keeps its own th_mut locked for its whole lifetime except
+ * while it is blocked in pthread_cond_wait(). Each loop iteration serves one
+ * request from the socket stored in *workitem and then closes that socket:
+ *   1. read the one-byte opcode;
+ *   2. read key length, data length, the key and (for non-delete, non-find
+ *      requests) the data;
+ *   3. run the insert or find on ltable and send the reply;
+ *   4. close the socket, then take a queued connection or go idle.
+ * The loop ends once *sys_alive is false.
+ *
+ * @param args pointer to this thread's pthread_item
+ * @return always 0
+ */
+void * thread_work_fn( void * args)
+{
+    pthread_item * item = (pthread_item *) args;
+    pthread_data * data = item->data;
+
+    pthread_mutex_lock(data->th_mut);
+    while(true)
+    {
+        while(*(data->workitem) == -1)
+        {
+            if(!*(data->sys_alive))
+                break;
+            pthread_cond_wait(data->th_cond, data->th_mut); //wait for job
+        }
+
+        if(!*(data->sys_alive))
+        {
+            //printf("thread quitted.\n");
+            break;
+        }
+
+        //*workitem is only reassigned at the end of the iteration, so the
+        //socket can be cached for the duration of this request
+        const int sock = *(data->workitem);
+
+        //step 1: read the opcode
+        uint8_t opcode;
+        ssize_t n = read(sock, &opcode, sizeof(uint8_t));
+        assert( n == sizeof(uint8_t));
+        assert( opcode < logserver::OP_INVALID );
+
+        //step 2: read the tuple from client
+        datatuple tuple;
+        tuple.keylen = (uint32_t*)malloc(sizeof(uint32_t));
+        tuple.datalen = (uint32_t*)malloc(sizeof(uint32_t));
+
+        //read the key length
+        n = read(sock, tuple.keylen, sizeof(uint32_t));
+        assert( n == sizeof(uint32_t));
+        //read the data length
+        n = read(sock, tuple.datalen, sizeof(uint32_t));
+        assert( n == sizeof(uint32_t));
+
+        //read the key
+        tuple.key = (byte*) malloc(*tuple.keylen);
+        readfromsocket(sock, (byte*) tuple.key, *tuple.keylen);
+        //read the data (a find or a delete request carries no data)
+        if(!tuple.isDelete() && opcode != logserver::OP_FIND)
+        {
+            tuple.data = (byte*) malloc(*tuple.datalen);
+            readfromsocket(sock, (byte*) tuple.data, *tuple.datalen);
+        }
+        else
+            tuple.data = 0;
+
+        //step 3: process the tuple
+        //pthread_mutex_lock(data->table_lock);
+        //readlock(data->table_lock,0);
+
+        if(opcode == logserver::OP_INSERT)
+        {
+            //insert/update/delete
+            data->ltable->insertTuple(tuple);
+            //step 4: send response
+            uint8_t rcode = logserver::OP_SUCCESS;
+            n = write(sock, &rcode, sizeof(uint8_t));
+            assert(n == sizeof(uint8_t));
+
+        }
+        else if(opcode == logserver::OP_FIND)
+        {
+            //find the tuple
+            datatuple *dt = data->ltable->findTuple(-1, tuple.key, *tuple.keylen);
+
+            if(dt == 0)  //nothing found: answer with a delete-marked tuple
+            {
+                //one allocation laid out as keylen | datalen | key bytes
+                dt = (datatuple*) malloc(sizeof(datatuple));
+                dt->keylen = (uint32_t*) malloc(2*sizeof(uint32_t) + *tuple.keylen);
+                *dt->keylen = *tuple.keylen;
+                dt->datalen = dt->keylen + 1;
+                dt->key = (datatuple::key_t) (dt->datalen+1);
+                memcpy((byte*) dt->key, (byte*) tuple.key, *tuple.keylen);
+                dt->setDelete();
+            }
+
+            //send the reply code
+            uint8_t rcode = logserver::OP_SENDING_TUPLE;
+            n = write(sock, &rcode, sizeof(uint8_t));
+            assert(n == sizeof(uint8_t));
+
+            //send the tuple
+            writetosocket(sock, (byte*) dt->keylen, dt->byte_length());
+
+            //free datatuple
+            //NOTE(review): presumes findTuple returns the same single-block
+            //layout as the one built above; verify against logtable
+            free(dt->keylen);
+            free(dt);
+        }
+
+        //close the socket
+        close(sock);
+
+        //free the tuple
+        free(tuple.keylen);
+        free(tuple.datalen);
+        free(tuple.key);
+        free(tuple.data);
+
+        //printf("socket %d: work completed.\n", sock);
+
+        pthread_mutex_lock(data->qlock);
+
+        if(data->conn_queue->size() > 0)
+        {
+            //a connection is waiting: serve it next
+            int new_work = data->conn_queue->front();
+            data->conn_queue->pop();
+            *(data->workitem) = new_work;
+        }
+        else
+        {
+            //set work to -1
+            *(data->workitem) = -1;
+            //add self to idle queue
+            data->idleth_queue->push(*item);
+        }
+
+        pthread_mutex_unlock(data->qlock);
+
+    }
+    pthread_mutex_unlock(data->th_mut);
+
+    //a void* function must return a value; flowing off the end is undefined
+    return 0;
+}
+                       
+
diff --git a/logserver_simple.h b/logserver_simple.h
new file mode 100644
index 0000000..48fbea6
--- /dev/null
+++ b/logserver_simple.h
@@ -0,0 +1,198 @@
+#ifndef _LOGSERVER_H_
+#define _LOGSERVER_H_
+
+
+#include <queue>
+#include <vector>
+
+//#include "logstore.h"
+
+#include "datatuple.h"
+
+
+
+#include <stasis/transactional.h>
+#include <pthread.h>
+
+#undef begin
+#undef try
+#undef end
+
+class logtable;
+
+/**
+ * Minimal thread-safe FIFO queue: a std::queue guarded by one pthread mutex.
+ *
+ * Each public operation holds the mutex for its whole duration, so single
+ * calls are atomic with respect to each other.  A sequence of calls (for
+ * example size() followed by pop()) is NOT atomic as a whole.
+ *
+ * The queue owns its mutex and therefore cannot be copied.
+ */
+template<class T>
+class ccqueue
+{
+public:
+    ccqueue()
+        {
+            pthread_mutex_init(&qmut, 0);
+        }
+
+    ~ccqueue()
+        {
+            // A mutex that was initialized must also be destroyed.
+            pthread_mutex_destroy(&qmut);
+        }
+
+    //returns the number of elements currently in the queue
+    int size()
+        {
+            scoped_lock guard(&qmut);
+            int qsize = m_queue.size();
+            return qsize;
+        }
+
+    //inserts a copy of the given element to the queue
+    void push(const T &item)
+        {
+            scoped_lock guard(&qmut);
+            m_queue.push(item);
+        }
+
+    //returns a copy of the next element
+    //deletes the copy in the queue
+    //throws an exception with -1 on empty queue
+    //(no dynamic exception specification: those were removed in C++17)
+    T pop()
+        {
+            scoped_lock guard(&qmut);
+
+            if(m_queue.empty())
+            {
+                // guard's destructor releases the mutex during unwinding
+                throw(-1);
+            }
+
+            T item = m_queue.front();
+            m_queue.pop();
+            return item;
+        }
+
+private:
+
+    // Locks a mutex on construction and unlocks it on destruction, so the
+    // mutex is released on every exit path, including exceptions thrown by
+    // T's copy constructor.
+    struct scoped_lock
+    {
+        explicit scoped_lock(pthread_mutex_t *m) : mut(m)
+            {
+                pthread_mutex_lock(mut);
+            }
+        ~scoped_lock()
+            {
+                pthread_mutex_unlock(mut);
+            }
+        pthread_mutex_t *mut;
+    };
+
+    // Not copyable (declared, never defined): a copy would duplicate the
+    // mutex object.
+    ccqueue(const ccqueue &);
+    ccqueue &operator=(const ccqueue &);
+
+    std::queue<T> m_queue;
+
+    pthread_mutex_t qmut;
+
+};
+
+struct pthread_item;
+
+// Per-worker-thread context.  All members are pointers; the structure itself
+// does not allocate or free anything.
+// NOTE(review): the pointed-to objects presumably belong to the logserver
+// that created the thread -- confirm against startserver().
+struct pthread_data {
+    std::queue<pthread_item> *idleth_queue; // workers currently without work
+    std::queue<int> *conn_queue;            // socket ids waiting for a worker
+    pthread_mutex_t * qlock;                // held while touching either queue above
+
+    // NOTE(review): presumably the condition/mutex pair this worker sleeps
+    // on while idle -- confirm against the worker function.
+    pthread_cond_t * th_cond;
+    pthread_mutex_t * th_mut;
+    
+    int *workitem; //id of the socket to work
+
+    //pthread_mutex_t * table_lock;
+    rwl *table_lock;   // NOTE(review): presumably guards ltable -- confirm
+    logtable *ltable;
+    bool *sys_alive;   // NOTE(review): presumably the server's shutdown flag -- confirm
+};
+
+// Pairs a worker thread's handle with its context.  Instances are stored by
+// value in the idle-thread queue, so copies share the same two pointers.
+struct pthread_item{
+    pthread_t * th_handle; // the worker's pthread handle
+    pthread_data *data;    // the worker's context (see pthread_data)
+};
+
+// One client request together with the socket it arrived on and the tuple
+// to send back.
+struct work_item
+{
+    int sockd; //socket id
+    datatuple in_tuple; //request
+    datatuple out_tuple; //response
+};
+    
+
+void * thread_work_fn( void *);    
+
+/**
+ * Network front end for a logtable.
+ *
+ * Holds the connection queue, the idle-worker queue and the lock protecting
+ * both, plus the reader/writer lock used for the table.  The server is
+ * started with startserver() and stopped with stopserver(); both are
+ * defined outside this header.
+ *
+ * NOTE(review): the class owns raw resources (qlock, lsmlock) but does not
+ * forbid copying; a copied logserver would release them twice.
+ */
+class logserver
+{
+public:
+    //server codes
+    static uint8_t OP_SUCCESS;
+    static uint8_t OP_FAIL;
+    static uint8_t OP_SENDING_TUPLE;
+
+    //client codes
+    static uint8_t OP_FIND;
+    static uint8_t OP_INSERT;
+
+    static uint8_t OP_INVALID;
+    
+public:
+    /**
+     * @param nthreads    number of worker threads
+     * @param server_port port the server uses
+     */
+    logserver(int nthreads, int server_port){
+        this->nthreads = nthreads;
+        this->server_port = server_port;
+
+        // Give every scalar member a defined value: before startserver()
+        // runs the server is not alive and owns no socket.
+        sys_alive = false;
+        serversocket = -1;
+
+        //lsmlock = new pthread_mutex_t;
+        //pthread_mutex_init(lsmlock,0);
+
+        lsmlock = initlock();
+
+        qlock = new pthread_mutex_t;
+        pthread_mutex_init(qlock,0);
+
+        ltable = 0;
+
+    }
+
+    ~logserver()
+        {
+            //delete lsmlock;
+            deletelock(lsmlock);
+
+            // destroy the mutex before its storage is released
+            pthread_mutex_destroy(qlock);
+            delete qlock;
+        }
+    
+    void startserver(logtable *ltable);
+
+    void stopserver();
+    
+    
+private:
+
+    //main loop of server
+    //accept connections, assign jobs to threads
+    void dispatchLoop();
+    
+
+private:
+
+    int server_port;
+    
+    int nthreads;
+
+    bool sys_alive;
+    
+    int serversocket; //server socket file descriptor
+
+    //ccqueue<int> conn_queue; //list of active connections (socket list)
+
+    //ccqueue<pthread_item> idleth_queue; //list of idle threads
+
+    std::queue<int> conn_queue;             // sockets waiting for a worker
+    std::queue<pthread_item> idleth_queue;  // workers waiting for a socket
+    pthread_mutex_t *qlock;                 // protects both queues above
+
+    std::vector<pthread_item *> th_list; // list of threads
+
+    rwl *lsmlock; //lock for using lsm table
+
+    logtable *ltable;
+    
+};
+
+
+#endif
diff --git a/logstore.cpp b/logstore.cpp
new file mode 100644
index 0000000..08d28b7
--- /dev/null
+++ b/logstore.cpp
@@ -0,0 +1,1606 @@
+
+
+
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+#include <ctype.h>
+
+
+#include "merger.h"
+#include "logstore.h"
+#include "logiterators.h"
+
+
+#include "datapage.cpp"
+
+
+#include <stasis/page/slotted.h>
+
+/////////////////////////////////////////////////////////////////
+// LOGTREE implementation
+/////////////////////////////////////////////////////////////////
+
+// Initial region-allocator states for tree pages and for data pages.
+// logtree::alloc_region() treats regionList.size == -1 as "region list not
+// allocated yet" and nextPage == endOfRegion as "current region exhausted",
+// so a state copied from one of these allocates its first region on first use.
+// NOTE(review): field order is presumably { regionList{page,slot,size},
+// regionCount, nextPage, endOfRegion, regionSize } -- confirm against the
+// RegionAllocConf_t declaration.
+const RegionAllocConf_t logtree::REGION_ALLOC_STATIC_INITIALIZER = { {0,0,-1}, 0, -1, -1, 1000 };
+const RegionAllocConf_t
+logtable::DATAPAGE_REGION_ALLOC_STATIC_INITIALIZER = { {0,0,-1}, 0, -1, -1, 50000 };
+
+// Debug tracing is compiled out: the trailing backslash continues the macro
+// onto the following blank line, so DEBUG(...) expands to nothing.  To enable
+// tracing, move the commented-out printf below into the macro body.
+#undef DEBUG
+#define DEBUG(...) \
+
+//printf(__VA_ARGS__); fflush(NULL)
+
+// Page type accepted for tree nodes (see the assert in appendInternalNode).
+#define LOGTREE_ROOT_PAGE SLOTTED_PAGE
+
+//LSM_ROOT_PAGE 
+
+// Slot layout of tree pages.  Every node page reserves its first two slots:
+// on the root they hold DEPTH and COMPARATOR, on leaves they hold the
+// PREV_LEAF / NEXT_LEAF links.  Index entries start at FIRST_SLOT.
+const int64_t logtree::DEPTH = 0;      //in root this is the slot num where the DEPTH (of tree) is stored
+const int64_t logtree::COMPARATOR = 1; //in root this is the slot num where the COMPARATOR id is stored
+const int64_t logtree::FIRST_SLOT = 2; //this is the first unused slot in all index pages
+const size_t logtree::root_rec_size = sizeof(int64_t); //size of each record written to the reserved slots
+const int64_t logtree::PREV_LEAF = 0; //pointer to prev leaf page
+const int64_t logtree::NEXT_LEAF = 1; //pointer to next leaf page
+
+
+
+// Constructs an empty handle; no member is set here.  create() performs the
+// actual on-disk setup.
+logtree::logtree() {}
+
+/**
+ * Releases a tree's regions through the supplied callback and then frees
+ * the record that stores the allocator configuration.
+ *
+ * @param xid             transaction id
+ * @param tree            not used by this function
+ * @param dealloc         callback, invoked as dealloc(xid, allocator_state)
+ * @param allocator_state points to the recordid of the allocator
+ *                        configuration record
+ */
+void logtree::free_region_rid(int xid, recordid tree,
+          logtree_page_deallocator_t dealloc, void *allocator_state)
+{
+  //  Tdealloc(xid,tree);
+  dealloc(xid, allocator_state);
+
+  // XXX fishy shouldn't caller do this?
+  recordid *conf_rid = (recordid*)allocator_state;
+  Tdealloc(xid, *conf_rid);
+}
+
+
+/**
+ * Deallocates every region recorded in an allocator configuration.
+ *
+ * @param xid  transaction id
+ * @param conf points to the recordid of a stored RegionAllocConf_t
+ */
+void logtree::dealloc_region_rid(int xid, void *conf)
+{
+    recordid conf_rid = *(recordid*)conf;
+    RegionAllocConf_t alloc_conf;
+    Tread(xid, conf_rid, &alloc_conf);
+    DEBUG("{%lld <- dealloc region arraylist}\n", alloc_conf.regionList.page);
+
+    // Each slot of the region list holds the first page id of one region.
+    int idx = 0;
+    while(idx < alloc_conf.regionCount) {
+        alloc_conf.regionList.slot = idx;
+        pageid_t region_start;
+        Tread(xid, alloc_conf.regionList, &region_start);
+        TregionDealloc(xid, region_start);
+        idx++;
+    }
+}
+
+
+/**
+ * Flushes and forces the page range of every region recorded in an
+ * allocator configuration.
+ *
+ * @param xid  transaction id
+ * @param conf points to the recordid of a stored RegionAllocConf_t
+ */
+void logtree::force_region_rid(int xid, void *conf)
+{
+    recordid conf_rid = *(recordid*)conf;
+    RegionAllocConf_t alloc_conf;
+    Tread(xid, conf_rid, &alloc_conf);
+
+    int idx = 0;
+    while(idx < alloc_conf.regionCount)
+    {
+        // fetch the first page id of region number idx
+        alloc_conf.regionList.slot = idx;
+        pageid_t region_start;
+        Tread(xid, alloc_conf.regionList, &region_start);
+
+        // write back dirty pages of the region, then force the range
+        stasis_dirty_page_table_flush_range((stasis_dirty_page_table_t*)stasis_runtime_dirty_page_table(), region_start, region_start+alloc_conf.regionSize);
+        forcePageRange(region_start, region_start+alloc_conf.regionSize);
+
+        idx++;
+    }
+}
+
+
+/**
+ * Hands out the next page id from the allocator state, allocating a new
+ * region (and, on first use, the region list itself) when the current
+ * region is used up.
+ *
+ * @param xid  transaction id
+ * @param conf points to an in-memory RegionAllocConf_t; it is updated in
+ *             place and NOT written back to stable storage here
+ * @return the id of the newly handed-out page
+ */
+pageid_t logtree::alloc_region(int xid, void *conf)
+{
+  RegionAllocConf_t *state = (RegionAllocConf_t*)conf;
+
+  if(state->nextPage == state->endOfRegion) {
+    // Current region exhausted (or nothing allocated yet).
+    if(state->regionList.size == -1) {
+        // First call for this state: create the list that remembers the
+        // start page of every region.
+        //DEBUG("nextPage: %lld\n", state->nextPage);
+        state->regionList = TarrayListAlloc(xid, 1, 4, sizeof(pageid_t));
+        DEBUG("regionList.page: %lld\n", state->regionList.page);
+        DEBUG("regionList.slot: %d\n", state->regionList.slot);
+        DEBUG("regionList.size: %lld\n", state->regionList.size);
+
+        state->regionCount = 0;
+    }
+    DEBUG("{%lld <- alloc region arraylist}\n", state->regionList.page);
+
+    // Grow the list by one entry and point regionList at the new slot.
+    TarrayListExtend(xid, state->regionList, 1);
+    state->regionList.slot = state->regionCount;
+    DEBUG("region lst slot %d\n",state->regionList.slot);
+    state->regionCount++;
+    DEBUG("region count %lld\n",state->regionCount);
+
+    // Allocate the region and record its first page in the list.
+    state->nextPage = TregionAlloc(xid, state->regionSize,12);
+    DEBUG("next page %lld\n",state->nextPage);
+    state->endOfRegion = state->nextPage + state->regionSize;
+    Tset(xid, state->regionList, &state->nextPage);
+    DEBUG("next page %lld\n",state->nextPage);
+  }
+
+  DEBUG("%lld ?= %lld\n", state->nextPage,state->endOfRegion);
+  pageid_t fresh_page = state->nextPage;
+
+  // Ensure the page is in buffer cache without accessing disk (this
+  // sets it to clean and all zeros if the page is not in cache).
+  // Hopefully, future reads will get a cache hit, and avoid going to
+  // disk.
+  Page *pg = loadUninitializedPage(xid, fresh_page);
+  releasePage(pg);
+  DEBUG("ret %lld\n",fresh_page);
+
+  state->nextPage++;
+  return fresh_page;
+
+}
+
+/**
+ * Same as alloc_region(), but the allocator state lives in a record: it is
+ * read, advanced, and written back.
+ *
+ * @param xid  transaction id
+ * @param ridp points to the recordid of the stored RegionAllocConf_t
+ * @return the id of the newly handed-out page
+ */
+pageid_t logtree::alloc_region_rid(int xid, void * ridp) {
+  recordid state_rid = *(recordid*)ridp;
+
+  RegionAllocConf_t state;
+  Tread(xid, state_rid, &state);
+
+  pageid_t fresh_page = alloc_region(xid, &state);
+  //DEBUG("{%lld <- alloc region extend}\n", state.regionList.page);
+
+  // XXX get rid of Tset by storing next page in memory, and losing it
+  //     on crash.
+  Tset(xid, state_rid, &state);
+
+  return fresh_page;
+}
+
+
+
+/**
+ * Creates an empty tree: allocates the allocator-state record, takes the
+ * first page of a fresh region as the root, and initializes the root's two
+ * reserved slots.
+ *
+ * Side effects: sets tree_state, lastLeaf (to -1) and root_rec.
+ *
+ * @param xid transaction id
+ * @return recordid {root page, 0, 0} identifying the root
+ */
+recordid logtree::create(int xid)
+{
+    // Persistent allocator state, starting from the static initializer.
+    tree_state = Talloc(xid,sizeof(RegionAllocConf_t));
+    Tset(xid,tree_state, &REGION_ALLOC_STATIC_INITIALIZER);
+
+    pageid_t root = alloc_region_rid(xid, &tree_state);
+    DEBUG("Root = %lld\n", root);
+    recordid ret = { root, 0, 0 };
+
+    Page *root_page = loadPage(xid, ret.page);
+    writelock(root_page->rwlatch,0);
+
+    stasis_page_slotted_initialize_page(root_page);
+
+    lastLeaf = -1;
+
+    // Initialize the root's reserved slots.  Slot DEPTH receives the value
+    // of DEPTH (0, the depth of an empty tree) and slot COMPARATOR receives
+    // the value of COMPARATOR, exactly as stored by the constants.
+    const int64_t *reserved[2] = { &DEPTH, &COMPARATOR };
+    for(int k = 0; k < 2; k++)
+    {
+        recordid slot_rid = stasis_record_alloc_begin(xid, root_page, root_rec_size);
+        stasis_record_alloc_done(xid, root_page, slot_rid);
+
+        assert(slot_rid.page == ret.page
+               && slot_rid.slot == *reserved[k]
+               && slot_rid.size == root_rec_size);
+
+        writeRecord(xid, root_page, slot_rid, (byte*)reserved[k], root_rec_size);
+    }
+
+    unlock(root_page->rwlatch);
+    releasePage(root_page);
+
+    root_rec = ret;
+
+    return ret;
+}
+
+
+/**
+ * Copies datalen bytes into the record identified by rid on page p and
+ * stamps the page with LSN 0.  The caller must hold the page latch.
+ *
+ * TODO: what happens if a record with a different size already exists
+ * there?  This should never happen in rose, but it is not checked.
+ **/
+void logtree::writeRecord(int xid, Page *p, recordid &rid,
+                          const byte *data, size_t datalen)
+{
+    byte *dest = stasis_record_write_begin(xid, p, rid);
+    memcpy(dest, data, datalen); //TODO: stasis write call
+    stasis_record_write_done(xid, p, rid, dest);
+
+    stasis_page_lsn_write(xid, p, 0); // XXX need real LSN?
+}
+
+/**
+ * Writes one index entry into record rid of page p: an indexnode_rec
+ * header whose ptr field is set to ptr, immediately followed by the key
+ * bytes.  Stamps the page with LSN 0.
+ *
+ * @param key    key bytes, copied right after the header
+ * @param keylen number of key bytes to copy
+ * @param ptr    page id stored in the entry
+ */
+void logtree::writeNodeRecord(int xid, Page * p, recordid & rid,
+                              const byte *key, size_t keylen, pageid_t ptr)
+{
+    DEBUG("writenoderecord:\tp->id\t%lld\tkey:\t%s\tkeylen: %d\tval_page\t%lld\n",
+          p->id, datatuple::key_to_str(key).c_str(), keylen, ptr);
+
+    byte *raw = stasis_record_write_begin(xid, p, rid);
+
+    indexnode_rec *header = (indexnode_rec*)raw;
+    header->ptr = ptr;
+    memcpy(raw + sizeof(indexnode_rec), key, keylen);
+
+    stasis_record_write_done(xid, p, rid, raw);
+    stasis_page_lsn_write(xid, p, 0); // XXX need real LSN?
+}
+
+/**
+ * Slot-number convenience overload: builds the recordid
+ * {p->id, slot, datalen} and writes through the recordid overload.
+ */
+void logtree::writeRecord(int xid, Page *p, slotid_t slot,
+                          const byte *data, size_t datalen)
+{
+    recordid target;
+    target.page = p->id;
+    target.slot = slot;
+    target.size = datalen;
+
+    writeRecord(xid, p, target, data, datalen);
+}
+
+/**
+ * Returns the pointer handed back by stasis_record_read_begin() for rid.
+ * No copy is made and stasis_record_read_done() is not called here.
+ */
+const byte* logtree::readRecord(int xid, Page * p, recordid &rid)
+{
+    return stasis_record_read_begin(xid, p, rid);
+}
+
+/**
+ * Slot-number convenience overload: builds the recordid
+ * {p->id, slot, size} and reads through the recordid overload.
+ */
+const byte* logtree::readRecord(int xid, Page * p, slotid_t slot, int64_t size)
+{
+    recordid target;
+    target.page = p->id;
+    target.slot = slot;
+    target.size = size;
+
+    return readRecord(xid, p, target);
+}
+
+/**
+ * Returns the length that stasis_record_length_read() reports for the
+ * given slot of page p.
+ */
+int32_t logtree::readRecordLength(int xid, Page *p, slotid_t slot)
+{
+    recordid probe;
+    probe.page = p->id;
+    probe.slot = slot;
+    probe.size = 0;
+
+    return stasis_record_length_read(xid, p, probe);
+}
+
+/**
+ * Formats p as a slotted page and allocates its two reserved records, each
+ * sizeof(indexnode_rec) bytes, so that index entries start at the third
+ * slot.
+ */
+void logtree::initializeNodePage(int xid, Page *p)
+{
+    stasis_page_slotted_initialize_page(p);
+
+    for(int reserved = 0; reserved < 2; reserved++)
+    {
+        recordid rid = stasis_record_alloc_begin(xid, p, sizeof(indexnode_rec));
+        stasis_record_alloc_done(xid, p, rid);
+    }
+}
+
+
+/**
+ * Appends the index entry (key -> val_page) at the right edge of the tree.
+ *
+ * The entry is first tried on the right-most leaf.  If that leaf is full,
+ * the insert goes through appendInternalNode() from the root; if the root
+ * itself has no room, the root's entries are moved into a newly allocated
+ * child, the stored depth is increased by one, and the insert is retried.
+ *
+ * @param xid             transaction id
+ * @param tree            recordid whose page is the root page
+ * @param rmLeafID        in/out: id of the right-most leaf.  -1 on entry
+ *                        means "unknown" and is resolved with
+ *                        findLastLeaf(); when the leaf was full it is set
+ *                        to the page that received the entry
+ * @param key, keySize    key bytes of the new entry
+ * @param allocator       callback returning a fresh page id
+ * @param allocator_state opaque argument passed to allocator
+ * @param val_page        page id stored in the new entry
+ * @return recordid of the entry that was written
+ */
+recordid logtree::appendPage(int xid, recordid tree, pageid_t & rmLeafID,
+                             const byte *key, size_t keySize,
+                             lsm_page_allocator_t allocator, void *allocator_state,
+                             long val_page)
+{
+  // The root stays write-latched for the whole operation.
+  Page *p = loadPage(xid, tree.page);
+  writelock(p->rwlatch, 0);
+  //logtree_state *s = (logtree_state*)p->impl;
+  
+  tree.slot = 0;
+  //tree.size = sizeof(lsmTreeNodeRecord)+keySize;
+
+  // The DEPTH slot of the root holds the tree depth as an int64_t.
+  const indexnode_rec *nr = (const indexnode_rec*)readRecord(xid, p , DEPTH, 0);  
+  int64_t depth = *((int64_t*)nr);
+  
+  if(rmLeafID == -1) {
+    rmLeafID = findLastLeaf(xid, p, depth);
+  }
+
+  Page *lastLeaf;
+
+  // The right-most leaf may be the root itself; in that case it is already
+  // loaded and latched.
+  if(rmLeafID != tree.page)
+  {
+    lastLeaf= loadPage(xid, rmLeafID);
+    writelock(lastLeaf->rwlatch, 0);
+  } else 
+    lastLeaf = p;
+  
+
+  recordid ret = stasis_record_alloc_begin(xid, lastLeaf,
+                                           sizeof(indexnode_rec)+keySize);
+
+  if(ret.size == INVALID_SLOT)
+  {      
+      // The right-most leaf is full.
+      if(lastLeaf->id != p->id)
+      {
+          assert(rmLeafID != tree.page);
+          unlock(lastLeaf->rwlatch);
+          releasePage(lastLeaf); // don't need that page anymore...
+          lastLeaf = 0;
+      }
+      // traverse down the root of the tree.
+      
+      tree.slot = 0;
+      
+      assert(tree.page == p->id);
+      
+      // Pass -1 as the previous leaf when the root itself was the leaf.
+      ret = appendInternalNode(xid, p, depth, key, keySize, val_page,
+                               rmLeafID == tree.page ? -1 : rmLeafID,
+                               allocator, allocator_state);
+      
+      if(ret.size == INVALID_SLOT)
+      {
+          DEBUG("Need to split root; depth = %d\n", depth);
+          
+          // The root is full as well: push its entries down into a new
+          // child so the root has room again.
+          pageid_t child = allocator(xid, allocator_state);
+          Page *lc = loadPage(xid, child);
+          writelock(lc->rwlatch,0);
+          
+          initializeNodePage(xid, lc);
+          
+          //creates a copy of the root page records in the
+          //newly allocated child page
+          for(int i = FIRST_SLOT; i < *stasis_page_slotted_numslots_ptr(p); i++)
+          {
+              //read the record from the root page
+              const indexnode_rec *nr = (const indexnode_rec*)readRecord(xid,p,i,0);
+              int reclen = readRecordLength(xid, p, i);
+              
+              recordid cnext = stasis_record_alloc_begin(xid, lc,reclen);
+          
+              // entries keep their slot numbers in the child
+              assert(i == cnext.slot);
+              assert(cnext.size != INVALID_SLOT);
+        
+              stasis_record_alloc_done(xid, lc, cnext);
+              
+              writeRecord(xid,lc,i,(byte*)(nr),reclen);              
+          }          
+      
+          // deallocate old entries, and update pointer on parent node.
+          // NOTE: stasis_record_free call goes to slottedFree in slotted.c
+          // this function only reduces the numslots when you call it
+          // with the last slot. so thats why i go backwards here.
+          // The loop stops before FIRST_SLOT, which is kept and re-pointed
+          // below.  (nr is read here but not used.)
+          for(int i = *stasis_page_slotted_numslots_ptr(p)-1; i>FIRST_SLOT; i--)
+          {
+              const indexnode_rec *nr = (const indexnode_rec*)readRecord(xid,p,i,0);
+              int reclen = readRecordLength(xid, p, i);
+              recordid tmp_rec= {p->id, i, reclen};
+              stasis_record_free(xid, p, tmp_rec);              
+          }
+          
+          //TODO: could change with stasis_slotted_page_initialize(...);
+          // reinsert first.
+          
+          recordid pFirstSlot = { p->id, FIRST_SLOT, readRecordLength(xid, p, FIRST_SLOT)};
+          
+          assert(*stasis_page_slotted_numslots_ptr(p) == FIRST_SLOT+1);
+      
+          indexnode_rec *nr
+              = (indexnode_rec*)stasis_record_write_begin(xid, p, pFirstSlot);
+
+          // don't overwrite key...
+          nr->ptr = child;
+          stasis_record_write_done(xid,p,pFirstSlot,(byte*)nr);
+          stasis_page_lsn_write(xid, p, 0); // XXX need real LSN?
+          
+          // If the root was a leaf (depth 0), the new child is now the only
+          // leaf: it becomes the right-most leaf and gets empty (-1)
+          // prev/next links.
+          if(!depth) {
+              rmLeafID = lc->id;
+              pageid_t tmpid = -1;
+              writeRecord(xid,lc,PREV_LEAF,(byte*)(&tmpid), root_rec_size);
+              writeRecord(xid,lc,NEXT_LEAF,(byte*)(&tmpid), root_rec_size);
+          }
+
+          unlock(lc->rwlatch);
+          releasePage(lc);
+          
+          //update the depth info at the root
+          depth ++;
+          writeRecord(xid,p,DEPTH,(byte*)(&depth), root_rec_size);
+          
+          // Retry; the root now holds a single entry, so this is asserted
+          // to succeed.
+          assert(tree.page == p->id);
+          ret = appendInternalNode(xid, p, depth, key, keySize, val_page,
+                                   rmLeafID == tree.page ? -1 : rmLeafID,
+                                   allocator, allocator_state);
+
+          assert(ret.size != INVALID_SLOT);
+      
+      }
+      else {
+          DEBUG("Appended new internal node tree depth = %lld key = %s\n",
+                depth, datatuple::key_to_str(key).c_str());
+      }
+      
+      // The page that received the entry is the new right-most leaf.
+      rmLeafID = ret.page;      
+      DEBUG("lastleaf is %lld\n", rmLeafID);
+      
+      
+  }
+  else
+  {
+    // write the new value to an existing page
+      DEBUG("Writing %s\t%d to existing page# %lld\n", datatuple::key_to_str(key).c_str(),
+            val_page, lastLeaf->id);
+
+      stasis_record_alloc_done(xid, lastLeaf, ret);
+
+      logtree::writeNodeRecord(xid, lastLeaf, ret, key, keySize, val_page);
+
+    // release the leaf only if it is a different page than the root
+    if(lastLeaf->id != p->id) {
+      assert(rmLeafID != tree.page);
+      unlock(lastLeaf->rwlatch);
+      releasePage(lastLeaf);
+    }
+  }
+
+  unlock(p->rwlatch);
+  releasePage(p);
+
+  return ret;
+}
+
+/* adding pages:
+
+  1) Try to append value to lsmTreeState->lastLeaf
+
+  2) If that fails, traverses down the root of the tree, split pages while
+     traversing back up.
+
+  3) Split is done by adding new page at end of row (no key
+     redistribution), except at the root, where root contents are
+     pushed into the first page of the next row, and a new path from root to
+     leaf is created starting with the root's immediate second child.
+
+*/
+
+/**
+ * Recursively appends the entry (key -> val_page) below node p.
+ *
+ * At depth 0 the entry is written into p itself.  Otherwise the call
+ * descends into the child referenced by p's last slot; if that subtree is
+ * full, a new entry is allocated on p and a fresh path down to a new leaf
+ * is built with buildPathToLeaf().
+ *
+ * The caller must hold the write latch on p.
+ *
+ * @param depth    distance of p from the leaf level (0 = leaf)
+ * @param lastLeaf id of the current right-most leaf (or -1), forwarded to
+ *                 buildPathToLeaf()
+ * @return recordid of the written entry; its size is INVALID_SLOT when
+ *         neither the subtree nor p had room
+ */
+recordid logtree::appendInternalNode(int xid, Page *p,
+                                     int64_t depth,
+                                     const byte *key, size_t key_len,
+                                     pageid_t val_page, pageid_t lastLeaf,
+                                     logtree_page_allocator_t allocator,
+                                     void *allocator_state)
+{
+//    assert(*stasis_page_type_ptr(p) == LOGTREE_ROOT_PAGE ||
+//           *stasis_page_type_ptr(p) == SLOTTED_PAGE);
+    assert(p->pageType == LOGTREE_ROOT_PAGE ||
+           p->pageType == SLOTTED_PAGE);
+
+  DEBUG("appendInternalNode\tdepth %lldkeylen%d\tnumslots %d\n", depth, key_len, *stasis_page_slotted_numslots_ptr(p));
+
+  if(depth == 0)
+  {
+      // leaf node: write here if the entry fits
+      recordid leaf_rid = stasis_record_alloc_begin(xid, p, sizeof(indexnode_rec)+key_len);
+      if(leaf_rid.size == INVALID_SLOT)
+          return leaf_rid;
+
+      stasis_record_alloc_done(xid, p, leaf_rid);
+      writeNodeRecord(xid,p,leaf_rid,key,key_len,val_page);
+      return leaf_rid;
+  }
+
+  // internal node: descend into the right-most child
+  int last_slot = *stasis_page_slotted_numslots_ptr(p)-1;//*recordcount_ptr(p)-1;
+  assert(last_slot >= FIRST_SLOT); // there should be no empty nodes
+
+  pageid_t child_id = ((const indexnode_rec*)readRecord(xid, p, last_slot, 0))->ptr;
+
+  Page *child_page = loadPage(xid, child_id);
+  writelock(child_page->rwlatch,0);
+  recordid ret = appendInternalNode(xid, child_page, depth-1, key, key_len,
+                                    val_page, lastLeaf, allocator, allocator_state);
+  unlock(child_page->rwlatch);
+  releasePage(child_page);
+
+  if(ret.size != INVALID_SLOT)
+  {
+      // we inserted the value in to a subtree rooted here.
+      return ret;
+  }
+
+  // subtree is full; split by adding a new entry on this node
+  ret = stasis_record_alloc_begin(xid, p, sizeof(indexnode_rec)+key_len);
+  DEBUG("keylen %d\tnumslots %d for page id %lld ret.size %lld  prv rec len %d\n",
+        key_len,
+        *stasis_page_slotted_numslots_ptr(p),
+        p->id,
+        ret.size,
+        readRecordLength(xid, p, last_slot));
+
+  if(ret.size == INVALID_SLOT)
+  {
+      // ret is NULLRID; this is the root of a full tree. Return
+      // NULLRID to the caller.
+      return ret;
+  }
+
+  stasis_record_alloc_done(xid, p, ret);
+  ret = buildPathToLeaf(xid, ret, p, depth, key, key_len, val_page,
+                        lastLeaf, allocator, allocator_state);
+
+  DEBUG("split tree rooted at %lld, wrote value to {%d %d %lld}\n",
+        p->id, ret.page, ret.slot, ret.size);
+
+  return ret;
+}
+
+/**
+ * Builds a chain of newly allocated pages from the entry `root` on page
+ * root_p down to a new leaf, and stores (key -> val_page) in that leaf.
+ *
+ * Every page on the path receives one entry carrying `key`.  The new leaf
+ * is linked behind lastLeaf (when lastLeaf != -1).  Finally the entry
+ * `root` on root_p is written so that it points at the first new page.
+ *
+ * @param root   already-allocated record on root_p that will reference the
+ *               new subtree
+ * @param depth  distance of root_p from the leaf level; must be non-zero
+ * @return recordid of the entry written into the new leaf
+ */
+recordid logtree::buildPathToLeaf(int xid, recordid root, Page *root_p,
+                                  int64_t depth, const byte *key, size_t key_len,
+                                  pageid_t val_page, pageid_t lastLeaf,
+                                  logtree_page_allocator_t allocator,
+                                  void *allocator_state)
+{
+  // root is the recordid on the root page that should point to the
+  // new subtree.
+  assert(depth);
+  DEBUG("buildPathToLeaf(depth=%lld) (lastleaf=%lld) called\n",depth, lastLeaf);
+
+  pageid_t child = allocator(xid,allocator_state);
+  DEBUG("new child = %lld internal? %lld\n", child, depth-1);
+
+  Page *child_p = loadPage(xid, child);
+  writelock(child_p->rwlatch,0);
+  initializeNodePage(xid, child_p);
+
+  recordid ret;
+  const bool child_is_leaf = (depth-1 == 0);
+
+  if(!child_is_leaf) {
+    // recurse: the page we just allocated is not a leaf.
+    recordid child_rec = stasis_record_alloc_begin(xid, child_p, sizeof(indexnode_rec)+key_len);
+    assert(child_rec.size != INVALID_SLOT);
+    stasis_record_alloc_done(xid, child_p, child_rec);
+
+    ret = buildPathToLeaf(xid, child_rec, child_p, depth-1, key, key_len,
+                          val_page,lastLeaf, allocator, allocator_state);
+
+    unlock(child_p->rwlatch);
+    releasePage(child_p);
+
+  } else {
+    // set leaf
+
+    // The two link slots already exist (initializeNodePage allocated
+    // them), so they are written without alloc_begin.
+    // backward link:
+    writeRecord(xid, child_p, PREV_LEAF, (byte*)(&lastLeaf), root_rec_size);
+
+    // forward link (initialize to -1)
+    pageid_t no_page = -1;
+    writeRecord(xid, child_p, NEXT_LEAF, (byte*)(&no_page), root_rec_size);
+
+    // first real entry of the new leaf
+    recordid leaf_rec = stasis_record_alloc_begin(xid, child_p,
+                                                  sizeof(indexnode_rec)+key_len);
+    assert(leaf_rec.slot == FIRST_SLOT);
+    stasis_record_alloc_done(xid, child_p, leaf_rec);
+    writeNodeRecord(xid,child_p,leaf_rec,key,key_len,val_page);
+
+    ret = leaf_rec;
+
+    unlock(child_p->rwlatch);
+    releasePage(child_p);
+
+    if(lastLeaf != -1)
+    {
+        // install forward link in previous page
+        Page *prev_leaf_p = loadPage(xid, lastLeaf);
+        writelock(prev_leaf_p->rwlatch,0);
+        writeRecord(xid,prev_leaf_p,NEXT_LEAF,(byte*)(&child),root_rec_size);
+        unlock(prev_leaf_p->rwlatch);
+        releasePage(prev_leaf_p);
+    }
+
+    DEBUG("%lld <-> %lld\n", lastLeaf, child);
+  }
+
+  // hook the new subtree into the parent entry
+  writeNodeRecord(xid, root_p, root, key, key_len, child);
+
+  return ret;
+
+}
+
+
+
+/**
+ * Walks from the given node to the right-most leaf (the one with the
+ * highest base key value) by always following the last slot.
+ *
+ * The caller must hold a latch on root; child pages are read-latched
+ * while they are visited.
+ *
+ * @param depth distance of root from the leaf level (0 = root is a leaf)
+ * @return page id of the right-most leaf
+ **/
+pageid_t logtree::findLastLeaf(int xid, Page *root, int64_t depth)
+{
+  if(depth == 0)
+  {
+      DEBUG("Found last leaf = %lld\n", root->id);
+      return root->id;
+  }
+
+  const indexnode_rec *last_entry = (indexnode_rec*) readRecord(xid, root,
+                                        (*stasis_page_slotted_numslots_ptr(root))-1, 0);
+
+  Page *child = loadPage(xid, last_entry->ptr);
+  readlock(child->rwlatch,0);
+  pageid_t leaf_id = findLastLeaf(xid,child,depth-1);
+  unlock(child->rwlatch);
+  releasePage(child);
+
+  return leaf_id;
+}
+
+
+/**
+ *  Walks from the given node to the left-most (lowest valued key) leaf by
+ *  always following slot FIRST_SLOT.
+ *
+ *  The caller must hold a latch on root; child pages are read-latched
+ *  while they are visited.
+ *
+ *  @param depth distance of root from the leaf level (0 = root is a leaf)
+ *  @return page id of the left-most leaf
+ */
+pageid_t logtree::findFirstLeaf(int xid, Page *root, int64_t depth)
+{
+    // at depth 0 the node itself is the leaf
+    if(depth == 0)
+        return root->id;
+
+    const indexnode_rec *first_entry = (indexnode_rec*)readRecord(xid,root,FIRST_SLOT,0);
+
+    Page *child = loadPage(xid, first_entry->ptr);
+    readlock(child->rwlatch,0);
+    pageid_t leaf_id = findFirstLeaf(xid,child,depth-1);
+    unlock(child->rwlatch);
+    releasePage(child);
+
+    return leaf_id;
+}
+
+
+/**
+ * Looks up key in the tree rooted at tree.page and returns the page id
+ * stored in the matching leaf entry, or -1 when lookup() finds nothing.
+ */
+pageid_t logtree::findPage(int xid, recordid tree, const byte *key, size_t keySize)
+{
+  Page *root_page = loadPage(xid, tree.page);
+  readlock(root_page->rwlatch,0);
+
+  // the DEPTH slot of the root holds the tree depth as an int64_t
+  const byte *depth_rec = readRecord(xid, root_page , DEPTH, 0);
+  int64_t depth = *((int64_t*)depth_rec);
+
+  recordid entry_rid = lookup(xid, root_page, depth, key, keySize);
+  pageid_t found = lookupLeafPageFromRid(xid,entry_rid);//,keySize);
+
+  unlock(root_page->rwlatch);
+  releasePage(root_page);
+
+  return found;
+
+}
+
+/**
+ * Returns the page id stored in the index entry identified by rid, or -1
+ * when rid has the same page and slot as NULLRID.
+ */
+pageid_t logtree::lookupLeafPageFromRid(int xid, recordid rid)
+{
+  // nothing to resolve for the null record id
+  if(rid.page == NULLRID.page && rid.slot == NULLRID.slot)
+      return -1;
+
+  Page *entry_page = loadPage(xid, rid.page);
+  readlock(entry_page->rwlatch,0);
+  const indexnode_rec *entry = (const indexnode_rec*)(readRecord(xid,entry_page,rid.slot,0));
+  pageid_t pid = entry->ptr;
+  unlock(entry_page->rwlatch);
+  releasePage(entry_page);
+
+  return pid;
+}
+
+
+/**
+ * Finds the index entry responsible for key below the given node.
+ *
+ * Within a node the match is the last entry before the first one whose
+ * key compares greater than `key`; the first entry matches by default.
+ * At depth 0 the matching entry's recordid is returned, otherwise the
+ * search continues in the child it points to.
+ *
+ * Note that the recursive call passes 0 as keySize, so the size field of
+ * the result equals keySize only when the initial node is itself a leaf.
+ *
+ * @return recordid {leaf page, slot, size}, or NULLRID for a node with no
+ *         entries
+ */
+recordid logtree::lookup(int xid,
+                            Page *node,
+                            int64_t depth,
+                            const byte *key, size_t keySize )
+{
+    //DEBUG("lookup: pid %lld\t depth %lld\n", node->id, depth);
+    if(*stasis_page_slotted_numslots_ptr(node) == FIRST_SLOT)
+        return NULLRID;
+
+    assert(*stasis_page_slotted_numslots_ptr(node) > FIRST_SLOT);
+
+    // don't need to compare w/ first item in tree.
+    const indexnode_rec *entry = (indexnode_rec*)readRecord(xid,node,FIRST_SLOT,0); //TODO: why read it then?
+
+    int match = FIRST_SLOT;
+    int slot = FIRST_SLOT+1;
+    while(slot < *stasis_page_slotted_numslots_ptr(node))
+    {
+        entry = (const indexnode_rec*)readRecord(xid,node,slot,0);
+        // the key bytes follow the entry header
+        int cmpval = datatuple::compare((datatuple::key_t) (entry+1),(datatuple::key_t) key);
+        if(cmpval>0) //changed it from >
+            break;
+        match = slot;
+        slot++;
+    }
+
+    if(!depth)
+    {
+        // leaf level: report where the match lives
+        recordid ret;
+        ret.page = node->id;
+        ret.slot = match;
+        ret.size = keySize;
+        return ret;
+    }
+
+    // internal level: continue in the matching child
+    pageid_t child_id = ((const indexnode_rec*)readRecord(xid,node,match,0))->ptr;
+    Page* child_page = loadPage(xid, child_id);
+    readlock(child_page->rwlatch,0);
+    recordid ret = lookup(xid,child_page,depth-1,key,0);
+    unlock(child_page->rwlatch);
+    releasePage(child_page);
+    return ret;
+}
+
+
+/**
+ * Prints the whole tree, starting at root_rec, to stdout.
+ *
+ * The root stays read-latched here while the recursive overload latches it
+ * again for printing.
+ */
+void logtree::print_tree(int xid)
+{
+    Page *root_page = loadPage(xid, root_rec.page);
+    readlock(root_page->rwlatch,0);
+
+    // the DEPTH slot of the root holds the tree depth as an int64_t
+    const byte *depth_rec = readRecord(xid, root_page , DEPTH, 0);
+    int64_t depth = *((int64_t*)depth_rec);
+
+    print_tree(xid, root_rec.page, depth);
+
+    unlock(root_page->rwlatch);
+    releasePage(root_page);
+
+}
+
+/**
+ * Prints the subtree rooted at page pid to stdout.
+ *
+ * For an internal node every entry is listed and every child is then
+ * printed recursively; for a leaf only the first and the last entry are
+ * shown.  The page is read-latched while it is printed, and the latch and
+ * the page pin are released on every exit path.
+ *
+ * @param xid   transaction id
+ * @param pid   page to print
+ * @param depth distance of the page from the leaf level (0 = leaf)
+ */
+void logtree::print_tree(int xid, pageid_t pid, int64_t depth)
+{
+
+    Page *node = loadPage(xid, pid);
+    readlock(node->rwlatch,0);
+    
+    //const indexnode_rec *depth_nr = (const indexnode_rec*)readRecord(xid, p , DEPTH, 0);
+    
+    printf("page_id:%lld\tnum_slots:%d\t\n", node->id, *stasis_page_slotted_numslots_ptr(node));
+    
+    if(*stasis_page_slotted_numslots_ptr(node) == FIRST_SLOT)
+    {
+        // Node without entries: nothing more to print.  The latch and the
+        // pin still have to be released before returning.
+        unlock(node->rwlatch);
+        releasePage(node);
+        return;
+    }
+
+    assert(*stasis_page_slotted_numslots_ptr(node) > FIRST_SLOT);
+
+    if(depth)
+    {
+        printf("\tnot_leaf\n");
+        
+        // first list this node's entries ...
+        for(int i = FIRST_SLOT; i < *stasis_page_slotted_numslots_ptr(node); i++)
+        {
+            const indexnode_rec *nr = (const indexnode_rec*)readRecord(xid,node,i,0);            
+            printf("\tchild_page_id:%lld\tkey:%s\n", nr->ptr,
+                   datatuple::key_to_str((byte*)(nr+1)).c_str()); 
+            
+        }
+        
+        // ... then print each child subtree
+        for(int i = FIRST_SLOT; i < *stasis_page_slotted_numslots_ptr(node); i++)
+        {
+            const indexnode_rec *nr = (const indexnode_rec*)readRecord(xid,node,i,0);
+            print_tree(xid, nr->ptr, depth-1);
+            
+        }
+
+    }
+    else
+    {
+        // leaf: show only the first and the last entry
+        printf("\tis_leaf\t\n");
+        const indexnode_rec *nr = (const indexnode_rec*)readRecord(xid,node,FIRST_SLOT,0);            
+        printf("\tdata_page_id:%lld\tkey:%s\n", nr->ptr,
+               datatuple::key_to_str((byte*)(nr+1)).c_str());
+        printf("\t...\n");
+        nr = (const indexnode_rec*)readRecord(xid,node,(*stasis_page_slotted_numslots_ptr(node))-1,0);        
+        printf("\tdata_page_id:%lld\tkey:%s\n", nr->ptr, 
+                           datatuple::key_to_str((byte*)(nr+1)).c_str());
+        
+         
+    }
+
+    
+    unlock(node->rwlatch);
+    releasePage(node);
+
+    
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG TABLE IMPLEMENTATION
+/////////////////////////////////////////////////////////////////
+
+template class DataPage<datatuple>;
+
+
+/**
+ * Creates a table object with no trees attached.  The tuple merger is
+ * created here with replace_merger and is deleted by the destructor.
+ */
+logtable::logtable()
+{
+    // no trees yet
+    tree_c0 = NULL;
+    tree_c1 = NULL;
+    tree_c2 = NULL;
+//    rbtree_mut = NULL;
+
+    mergedata = 0;
+    fixed_page_count = -1;
+
+    // counters start at zero
+    tsize = 0;
+    tree_bytes = 0;
+
+    //tmerger = new tuplemerger(&append_merger);
+    tmerger = new tuplemerger(&replace_merger);
+}
+
+/**
+ * Deletes the tree objects, frees the buffer of every tuple still held in
+ * tree_c0, and deletes the tuple merger.
+ */
+logtable::~logtable()
+{
+    // deleting a null pointer is a no-op, so no checks are needed here
+    delete tree_c1;
+    delete tree_c2;
+
+    if(tree_c0 != NULL)
+    {
+        // each tuple's storage is released through its keylen pointer
+        rbtree_t::iterator it = tree_c0->begin();
+        while(it != tree_c0->end())
+        {
+            free(it->keylen);
+            ++it;
+        }
+
+        delete tree_c0;
+    }
+
+    delete tmerger;
+
+    /*
+    if(rbtree_mut)
+        delete rbtree_mut;
+    if(tree_c0)
+        delete tree_c0;
+    if(input_needed)
+        delete input_needed;
+    */
+}
+
+/**
+ * Allocates the on-disk structures of a new table: the header record, the
+ * two trees (c2 first, then c1) and one data-page allocator state for
+ * each.  The filled-in header is written to the header record.
+ *
+ * @param xid transaction id
+ * @return recordid of the table header record (also kept in table_rec)
+ */
+recordid logtable::allocTable(int xid)
+{
+    table_rec = Talloc(xid, sizeof(tbl_header));
+
+    //create the big tree
+    tree_c2 = new logtree();
+    tree_c2->create(xid);
+
+    tbl_header.c2_dp_state = Talloc(xid, sizeof(RegionAllocConf_t));
+    Tset(xid, tbl_header.c2_dp_state, &DATAPAGE_REGION_ALLOC_STATIC_INITIALIZER);
+
+    //create the small tree
+    tree_c1 = new logtree();
+    tree_c1->create(xid);
+
+    tbl_header.c1_dp_state = Talloc(xid, sizeof(RegionAllocConf_t));
+    Tset(xid, tbl_header.c1_dp_state, &DATAPAGE_REGION_ALLOC_STATIC_INITIALIZER);
+
+    // remember where each tree's root and allocator state live
+    tbl_header.c1_root  = tree_c1->get_root_rec();
+    tbl_header.c1_state = tree_c1->get_tree_state();
+    tbl_header.c2_root  = tree_c2->get_root_rec();
+    tbl_header.c2_state = tree_c2->get_tree_state();
+
+    // persist the header
+    Tset(xid, table_rec, &tbl_header);
+
+    return table_rec;
+}
+/*
+ * Hands the current in-memory tree (tree_c0) over for merging: the tree
+ * pointer is stored in *mergedata->old_c0, input_ready_cond is signalled,
+ * and a fresh empty tree with zeroed tsize/tree_bytes takes its place.
+ *
+ * While a previously handed-over tree is still pending (*old_c0 non-null)
+ * the call waits on input_needed_cond as long as tree_bytes >= MAX_C0_SIZE.
+ * It returns without flushing when the tree is below MAX_C0_SIZE, or when
+ * merge_count changed during the wait.
+ *
+ * Prints how long the call waited and, after the first call, the time
+ * between the end of the previous wait and the start of this call.
+ */
+void logtable::flushTable()
+{
+    struct timeval start_tv, stop_tv;
+    double start, stop;
+    // function-local statics: a single copy is shared by every logtable instance
+    static double last_start;
+    static bool first = 1;
+    static int merge_count = 0;
+    
+    gettimeofday(&start_tv,0);
+    start = tv_to_double(start_tv);
+
+    // header_lock is taken before rbtree_mut, the same order findTuple and insertTuple use
+    writelock(mergedata->header_lock,0);
+    pthread_mutex_lock(mergedata->rbtree_mut);
+    // snapshot of the flush counter, compared again after every wait below
+    int expmcount = merge_count;
+
+
+    //wait for the previous merge of the mem-tree to finish;
+    //hopefully this won't happen
+    //NOTE(review): this message is printed on every call, even when no merge is pending
+    printf("prv merge not complete\n");
+
+
+    while(*mergedata->old_c0) {
+        unlock(mergedata->header_lock);
+//        pthread_mutex_lock(mergedata->rbtree_mut);
+        if(tree_bytes >= MAX_C0_SIZE)
+            pthread_cond_wait(mergedata->input_needed_cond, mergedata->rbtree_mut);
+        else
+        {
+            pthread_mutex_unlock(mergedata->rbtree_mut);
+            return;
+        }
+
+        // the wait returned holding rbtree_mut; drop it so both locks are retaken in header_lock -> rbtree_mut order
+        pthread_mutex_unlock(mergedata->rbtree_mut);
+        
+        writelock(mergedata->header_lock,0);
+        pthread_mutex_lock(mergedata->rbtree_mut);
+        // merge_count moved while this thread waited: another call already flushed
+        if(expmcount != merge_count)
+        {
+            unlock(mergedata->header_lock);
+            pthread_mutex_unlock(mergedata->rbtree_mut);
+            return;                    
+        }
+        
+    }
+
+    printf("prv merge complete\n");
+
+    gettimeofday(&stop_tv,0);
+    stop = tv_to_double(stop_tv);
+    
+    //rbtree_ptr *tmp_ptr = new rbtree_ptr_t; //(typeof(h->scratch_tree)*) malloc(sizeof(void*));
+    //*tmp_ptr = tree_c0;
+    *(mergedata->old_c0) = tree_c0; 
+
+//    pthread_mutex_lock(mergedata->rbtree_mut);
+    pthread_cond_signal(mergedata->input_ready_cond);
+//    pthread_mutex_unlock(mergedata->rbtree_mut);
+
+    merge_count ++;
+    tree_c0 = new rbtree_t;    
+    tsize = 0;
+    tree_bytes = 0;
+    
+    pthread_mutex_unlock(mergedata->rbtree_mut);
+    unlock(mergedata->header_lock);
+    if(first)
+    {
+        printf("flush waited %f sec\n", stop-start);
+        first = 0;
+    }
+    else
+    {
+        printf("flush waited %f sec (worked %f)\n",
+               stop-start, start-last_start);
+    }
+    last_start = stop; // despite its name, this keeps the time at which the wait ended
+
+}
+
+/**
+ * Copies a tuple that lives in an in-memory tree into freshly malloc'ed
+ * storage: byte_length() bytes are copied starting at the tuple's keylen
+ * pointer, and the buffer is turned into a tuple with datatuple::from_bytes.
+ */
+static datatuple * findTuple_copy_mem_tuple(datatuple src)
+{
+    byte *raw = (byte*)malloc(src.byte_length());
+    memcpy(raw, (byte*)src.keylen, src.byte_length());
+    return datatuple::from_bytes(raw);
+}
+
+/**
+ * Folds a tuple read from an on-disk tree into the running lookup result.
+ *
+ *  - found == NULL       : nothing happens
+ *  - found is a delete   : done is set, found is freed
+ *  - result already set  : result is replaced by merge(found, result); the
+ *                          previous result and found are both freed
+ *  - result still empty  : found itself becomes the result (not freed)
+ *
+ * @param merger merger used to combine the older and the newer tuple
+ * @param found  tuple returned by the disk lookup, may be NULL
+ * @param result in/out: the tuple accumulated so far
+ * @param done   out: set to true when a delete marker ends the search
+ */
+static void findTuple_fold_disk_tuple(tuplemerger *merger, datatuple *found,
+                                      datatuple *&result, bool &done)
+{
+    if(found == NULL)
+        return;
+
+    if(found->isDelete()) //tuple deleted
+        done = true;
+    else if(result != 0) //merge the two
+    {
+        datatuple *merged = merger->merge(found, result);
+        free(result->keylen); //free tuple from before
+        free(result);
+        result = merged; //set return tuple to merge result
+    }
+    else //found for the first time: hand the tuple over as it is
+    {
+        result = found;
+        return;
+    }
+
+    free(found->keylen); //the disk copy is no longer needed
+    free(found);
+}
+
+/**
+ * Looks a key up in every component of the table, newest first: c0, the c0
+ * snapshot currently being merged (if any), c1, the c1 tree currently being
+ * merged (if any) and c2.  Versions found in several components are combined
+ * with tmerger; the search stops descending once a delete marker is met in
+ * one of the older components.
+ *
+ * header_lock is held (read mode) for the whole call; rbtree_mut is held
+ * only while the two in-memory trees are inspected.
+ *
+ * @param xid     transaction id used for the disk lookups
+ * @param key     key to search for
+ * @param keySize length of key in bytes
+ * @return the tuple found, or 0 if there is none.  Elsewhere in this
+ *         function such tuples are released with free(t->keylen) followed
+ *         by free(t).
+ */
+datatuple * logtable::findTuple(int xid, datatuple::key_t key, size_t keySize)
+{
+    //prepare a search tuple
+    datatuple search_tuple;
+    search_tuple.keylen = (uint32_t*)malloc(sizeof(uint32_t));
+    *(search_tuple.keylen) = keySize;
+    search_tuple.key = key;
+
+    readlock(mergedata->header_lock,0);
+    pthread_mutex_lock(mergedata->rbtree_mut);
+
+    datatuple *ret_tuple=0;
+    bool done = false;
+
+    //step 1: look in tree_c0
+    rbtree_t::iterator rbitr = tree_c0->find(search_tuple);
+    if(rbitr != tree_c0->end())
+    {
+        DEBUG("tree_c0 size %d\n", tree_c0->size());
+        ret_tuple = findTuple_copy_mem_tuple(*rbitr);
+    }
+
+    //step 2: look into the old mem tree if it exists (a first level merge going on)
+    if(*(mergedata->old_c0) != 0)
+    {
+        DEBUG("old mem tree not null %d\n", (*(mergedata->old_c0))->size());
+        rbitr = (*(mergedata->old_c0))->find(search_tuple);
+        if(rbitr != (*(mergedata->old_c0))->end())
+        {
+            datatuple tuple = *rbitr;
+
+            if(tuple.isDelete())  //tuple deleted
+                done = true;  //return ret_tuple
+            else if(ret_tuple != 0)  //merge the two
+            {
+                datatuple *mtuple = tmerger->merge(&tuple, ret_tuple);
+                free(ret_tuple->keylen); //free tuple from current tree
+                free(ret_tuple);
+                ret_tuple = mtuple; //set return tuple to merge result
+            }
+            else //key first found in old mem tree
+                ret_tuple = findTuple_copy_mem_tuple(tuple);
+            //we cannot free tuple from old-tree 'cos it is not a copy
+        }
+    }
+
+    //release the memtree lock
+    pthread_mutex_unlock(mergedata->rbtree_mut);
+
+    //step 3: check c1
+    if(!done)
+        findTuple_fold_disk_tuple(tmerger, findTuple(xid, key, keySize, tree_c1),
+                                  ret_tuple, done);
+
+    //step 4: check old c1 if exists
+    if(!done && *(mergedata->diskmerge_args->in_tree) != 0)
+    {
+        DEBUG("old c1 tree not null\n");
+        datatuple *tuple_oc1 = findTuple(xid, key, keySize,
+                                         (logtree*)( *(mergedata->diskmerge_args->in_tree)));
+        findTuple_fold_disk_tuple(tmerger, tuple_oc1, ret_tuple, done);
+    }
+
+    //step 5: check c2
+    if(!done)
+    {
+        DEBUG("Not in old first disk tree\n");
+        findTuple_fold_disk_tuple(tmerger, findTuple(xid, key, keySize, tree_c2),
+                                  ret_tuple, done);
+    }
+
+    //pthread_mutex_unlock(mergedata->rbtree_mut);
+    unlock(mergedata->header_lock);
+    free(search_tuple.keylen);
+
+    return ret_tuple;
+
+}
+
+/*
+ * Searches the components in order -- c0, the c0 snapshot being merged,
+ * c1, the c1 tree being merged, c2 -- and returns the first tuple whose key
+ * matches, without merging it with older versions
+ * (not to be used together with diffs).
+ *
+ * rbtree_mut is held for the whole call, disk lookups included.
+ * NOTE(review): unlike the merging findTuple, header_lock is not taken
+ * here -- confirm that this is intended.
+ **/
+datatuple * logtable::findTuple_first(int xid, datatuple::key_t key, size_t keySize)
+{
+    //build the probe used for the in-memory lookups
+    datatuple probe;
+    probe.keylen = (uint32_t*)malloc(sizeof(uint32_t));
+    *(probe.keylen) = keySize;
+    probe.key = key;
+
+    pthread_mutex_lock(mergedata->rbtree_mut);
+
+    datatuple *result = 0;
+
+    //step 1: look in tree_c0
+    rbtree_t::iterator hit = tree_c0->find(probe);
+    if(hit == tree_c0->end())
+    {
+        DEBUG("Not in mem tree %d\n", tree_c0->size());
+
+        //step 2: the old mem tree, present while a first level merge is going on
+        if(*(mergedata->old_c0) != 0)
+        {
+            DEBUG("old mem tree not null %d\n", (*(mergedata->old_c0))->size());
+            hit = (*(mergedata->old_c0))->find(probe);
+            if(hit != (*(mergedata->old_c0))->end())
+            {
+                datatuple found = *hit;
+                byte *raw = (byte*)malloc(found.byte_length());
+                memcpy(raw, (byte*)found.keylen, found.byte_length());
+                result = datatuple::from_bytes(raw);
+            }
+        }
+
+        //step 3: check c1
+        if(result == 0)
+        {
+            DEBUG("Not in old mem tree\n");
+            result = findTuple(xid, key, keySize, tree_c1);
+        }
+
+        //step 4: check old c1 if exists
+        if(result == 0)
+        {
+            DEBUG("Not in first disk tree\n");
+            if( *(mergedata->diskmerge_args->in_tree) != 0)
+            {
+                DEBUG("old c1 tree not null\n");
+                result = findTuple(xid, key, keySize,
+                                   (logtree*)( *(mergedata->diskmerge_args->in_tree)));
+            }
+        }
+
+        //step 5: check c2
+        if(result == 0)
+        {
+            DEBUG("Not in old first disk tree\n");
+            result = findTuple(xid, key, keySize, tree_c2);
+        }
+    }
+    else
+    {
+        //hit in c0: return a private copy of the stored tuple
+        DEBUG("tree_c0 size %d\n", tree_c0->size());
+        datatuple found = *hit;
+        byte *raw = (byte*)malloc(found.byte_length());
+        memcpy(raw, (byte*)found.keylen, found.byte_length());
+        result = datatuple::from_bytes(raw);
+    }
+
+    pthread_mutex_unlock(mergedata->rbtree_mut);
+    free(probe.keylen);
+
+    return result;
+
+}
+
+/**
+ * Inserts a tuple into the in-memory tree (c0).
+ *
+ * If c0 already holds a tuple with the same key, the two are merged with
+ * tmerger and the merged tuple replaces the stored one.  Otherwise the
+ * tuple is deep-copied into one malloc'ed buffer laid out as
+ * [keylen][datalen][key bytes][data bytes] and that copy is stored.
+ * Once tree_bytes reaches MAX_C0_SIZE, flushTable() is called.
+ *
+ * @param tuple tuple to insert; the caller keeps ownership of it
+ */
+void logtable::insertTuple(struct datatuple &tuple)
+{
+    static const size_t isize = sizeof(uint32_t); //width of one length field
+
+    //header lock first, then the red-black tree mutex
+    readlock(mergedata->header_lock,0);
+    pthread_mutex_lock(mergedata->rbtree_mut);
+
+    rbtree_t::iterator existing = tree_c0->find(tuple);
+    if(existing == tree_c0->end())
+    {
+        //no tuple with this key in the mem-tree yet: store a private copy
+        datatuple stored;
+        byte *buf = (byte*) malloc(tuple.byte_length());
+
+        stored.keylen = (uint32_t*) buf;
+        *stored.keylen = *tuple.keylen;
+        stored.datalen = (uint32_t*) (buf+isize);
+        *stored.datalen = *tuple.datalen;
+        stored.key = (datatuple::key_t) (buf+isize+isize);
+        memcpy((byte*)stored.key, (byte*)tuple.key, *stored.keylen);
+        if(tuple.isDelete())
+            stored.data = 0; //delete markers carry no payload
+        else
+        {
+            stored.data = (datatuple::data_t) (buf+isize+isize+ *(stored.keylen));
+            memcpy((byte*)stored.data, (byte*)tuple.data, *stored.datalen);
+        }
+
+        tree_c0->insert(stored);
+        tsize++;
+        tree_bytes += stored.byte_length() + RB_TREE_OVERHEAD;
+    }
+    else
+    {
+        //same key already present: merge, then swap the stored tuple
+        datatuple old_tuple = *existing;
+        datatuple *merged = tmerger->merge(&old_tuple, &tuple);
+        tree_c0->erase(old_tuple);
+
+        tree_c0->insert( *merged);
+
+        //account for the size difference (+ merged size - old size)
+        tree_bytes += (merged->byte_length() - old_tuple.byte_length());
+
+        free(old_tuple.keylen); //storage of the replaced tuple
+        free(merged); //only the struct allocated by merge; its bytes now belong to the tree
+    }
+
+    if(tree_bytes >= MAX_C0_SIZE )
+    {
+        DEBUG("tree size before merge %d tuples %lld bytes.\n", tsize, tree_bytes);
+        //flushTable acquires both locks itself, so let go of them first
+        pthread_mutex_unlock(mergedata->rbtree_mut);
+        unlock(mergedata->header_lock);
+        flushTable();
+
+        readlock(mergedata->header_lock,0);
+        pthread_mutex_lock(mergedata->rbtree_mut);
+    }
+
+    pthread_mutex_unlock(mergedata->rbtree_mut);
+    unlock(mergedata->header_lock);
+
+    DEBUG("tree size %d tuples %lld bytes.\n", tsize, tree_bytes);
+}
+
+
+/**
+ * Starts a new data page holding the given tuple and registers it in the
+ * index tree.
+ *
+ * Data pages are created until one accepts the tuple; a page that rejects
+ * the append is deleted and a new one is tried.  The tuple's key and the
+ * start page id of the data page are then appended to ltree, with the
+ * tree's allocator state read before and written back after the append.
+ *
+ * @param xid     transaction id
+ * @param tuple   first tuple of the new data page
+ * @param dpstate record handed to the data page as allocator state
+ * @param ltree   index tree that receives the (key, start page) entry
+ * @return the data page that now contains the tuple
+ */
+DataPage<datatuple>* logtable::insertTuple(int xid, struct datatuple &tuple, recordid &dpstate, logtree *ltree)
+{
+    DataPage<datatuple> *page = 0;
+
+    for(;;)
+    {
+        page = new DataPage<datatuple>(xid, fixed_page_count,
+                                       &DataPage<datatuple>::dp_alloc_region_rid,
+                                       &dpstate );
+
+        if(page->append(xid, tuple))
+            break;
+
+        //the page did not take the record: discard it and try another one
+        delete page;
+        page = 0;
+    }
+
+    //add the record key and the first page id of the datapage to the logtree
+    RegionAllocConf_t alloc_conf;
+    Tread(xid,ltree->get_tree_state(), &alloc_conf);
+    logtree::appendPage(xid, ltree->get_root_rec(), ltree->lastLeaf,
+                        tuple.get_key(),
+                        *tuple.keylen,
+                        ltree->alloc_region,
+                        &alloc_conf,
+                        page->get_start_pid()
+                        );
+    Tset(xid,ltree->get_tree_state(),&alloc_conf);
+
+    return page;
+}
+
+/**
+ * Looks a key up in a single on-disk tree.
+ *
+ * The index tree is asked for the data page that could hold the key; when
+ * it reports one (anything but -1), that data page is opened and asked for
+ * the record.
+ *
+ * @param xid     transaction id
+ * @param key     key to search for
+ * @param keySize length of key in bytes
+ * @param ltree   tree to search
+ * @return whatever recordRead stored for the key, or 0 when the tree has no
+ *         candidate page or recordRead left the result untouched
+ */
+datatuple * logtable::findTuple(int xid, datatuple::key_t key, size_t keySize,  logtree *ltree)
+{
+    pageid_t pid = ltree->findPage(xid, ltree->get_root_rec(), (byte*)key, keySize);
+    if(pid == -1)
+        return 0;
+
+    datatuple *found = 0;
+    DataPage<datatuple> *page = new DataPage<datatuple>(xid, pid);
+    page->recordRead(xid, key, keySize, &found);
+    delete page;
+
+    return found;
+}
+
+
+/////////////////////////////////////////////////
+//logtreeIterator implementation
+/////////////////////////////////////////////////
+
+/**
+ * Opens an iterator over the tree rooted at root.
+ *
+ * The tree depth is read from the root page and the page reported by
+ * logtree::findFirstLeaf is loaded and read-latched.  The iterator starts
+ * at slot 1 of that page with size 0 and no current record.
+ *
+ * @param xid  transaction id
+ * @param root root record of the tree
+ * @return a malloc'ed iterator to be released with close(), or 0 when root
+ *         is {page 0, slot 0, size -1}
+ */
+lladdIterator_t* logtreeIterator::open(int xid, recordid root)
+{
+    if(root.page == 0 && root.slot == 0 && root.size == -1)
+        return 0;
+
+    Page *p = loadPage(xid,root.page);
+    readlock(p->rwlatch,0);
+
+    DEBUG("ROOT_REC_SIZE %d\n", logtree::root_rec_size);
+    const byte * depth_rec = logtree::readRecord(xid,p,
+                                                 logtree::DEPTH,
+                                                 logtree::root_rec_size);
+    int64_t depth = *((int64_t*)depth_rec);
+    DEBUG("DEPTH = %lld\n", depth);
+
+    pageid_t leafid = logtree::findFirstLeaf(xid, p, depth);
+    if(leafid == root.page)
+    {
+        //the root itself is the first leaf
+        assert(depth == 0);
+    }
+    else
+    {
+        //swap the latched root for the first leaf
+        unlock(p->rwlatch);
+        releasePage(p);
+        p = loadPage(xid,leafid);
+        readlock(p->rwlatch,0);
+        assert(depth != 0);
+    }
+
+    logtreeIterator_s *state = (logtreeIterator_s*)malloc(sizeof(logtreeIterator_s));
+    state->p = p;
+    state->current.page = p->id;
+    state->current.slot = 1; //TODO: why does this start from 1?
+    state->current.size = 0;
+    state->t = 0;
+    state->justOnePage = (depth == 0);
+
+    lladdIterator_t *it = (lladdIterator_t*) malloc(sizeof(lladdIterator_t));
+    it->type = -1; // XXX  LSM_TREE_ITERATOR;
+    it->impl = state;
+    return it;
+}
+
+/**
+ * Opens an iterator positioned just before the entry that logtree::lookup
+ * reports for key, so that the first call to next() lands on that entry.
+ *
+ * @param xid  transaction id
+ * @param root root record of the tree
+ * @param key  key to position at
+ * @return a malloc'ed iterator to be released with close(), or 0 when root
+ *         is the null record id or the lookup finds nothing.  On every
+ *         failure path the root page is both unlatched and released.
+ */
+lladdIterator_t* logtreeIterator::openAt(int xid, recordid root, const byte* key)
+{
+  if(root.page == NULLRID.page && root.slot == NULLRID.slot)
+      return 0;
+
+  Page *p = loadPage(xid,root.page);
+  readlock(p->rwlatch,0);
+  //size_t keySize = getKeySize(xid,p);
+  //assert(keySize);
+  const byte *nr = logtree::readRecord(xid,p,logtree::DEPTH, logtree::root_rec_size);
+  //const byte *cmp_nr = logtree::readRecord(xid, p , logtree::COMPARATOR, logtree::root_rec_size);
+
+  int64_t depth = *((int64_t*)nr);
+
+  recordid lsm_entry_rid = logtree::lookup(xid,p,depth,key,0);//keySize,comparators[cmp_nr->ptr]);
+
+  if(lsm_entry_rid.page == NULLRID.page && lsm_entry_rid.slot == NULLRID.slot) {
+    unlock(p->rwlatch);
+    releasePage(p); // the page was pinned by loadPage above; dropping only the latch leaked the pin
+    return 0;
+  }
+  assert(lsm_entry_rid.size != INVALID_SLOT);
+
+  if(root.page != lsm_entry_rid.page)
+  {
+    // the entry lives on another page: swap the latched root for it
+    unlock(p->rwlatch);
+    releasePage(p);
+    p = loadPage(xid,lsm_entry_rid.page);
+    readlock(p->rwlatch,0);
+  }
+
+  logtreeIterator_s *impl = (logtreeIterator_s*) malloc(sizeof(logtreeIterator_s));
+  impl->p = p;
+
+  impl->current.page = lsm_entry_rid.page;
+  impl->current.slot = lsm_entry_rid.slot - 1;  // slot before thing of interest
+  impl->current.size = lsm_entry_rid.size;
+
+  impl->t = 0; // must be zero so free() doesn't croak.
+  impl->justOnePage = (depth==0);
+
+  lladdIterator_t *it = (lladdIterator_t*) malloc(sizeof(lladdIterator_t));
+  it->type = -1; // XXX LSM_TREE_ITERATOR
+  it->impl = impl;
+  return it;
+}
+
+/**
+ * Advances the iterator to the next record.
+ *
+ * When the current page has no further record, the page's NEXT_LEAF record
+ * is read, the page is unlatched and released, and -- unless the tree
+ * consists of a single page or there is no next leaf (-1) -- the next leaf
+ * is loaded, read-latched and iteration continues at its slot 2.
+ *
+ * The record the iterator lands on is copied into impl->t; the previous
+ * copy is freed.
+ *
+ * @param xid transaction id
+ * @param it  iterator obtained from open() or openAt()
+ * @return 1 when a record is available through key()/value(), 0 when the
+ *         iteration is over.  Calling next() again after it returned 0
+ *         because the last page was released keeps returning 0.
+ **/
+int logtreeIterator::next(int xid, lladdIterator_t *it)
+{
+    logtreeIterator_s *impl = (logtreeIterator_s*) it->impl;
+
+    // The last page has already been released: nothing left to visit.
+    // Without this guard a null page would be handed to stasis_record_next.
+    if(impl->p == 0)
+        return 0;
+
+    impl->current = stasis_record_next(xid, impl->p, impl->current);
+
+    if(impl->current.size == INVALID_SLOT)
+    {
+        const indexnode_rec next_rec = *(const indexnode_rec*)logtree::readRecord(xid,impl->p,
+                                                                   logtree::NEXT_LEAF,
+                                                                   0);
+
+        // report before the page is released; impl->p must not be touched afterwards
+        DEBUG("done with page %lld next = %lld\n", impl->p->id, next_rec.ptr);
+
+        unlock(impl->p->rwlatch);
+        releasePage(impl->p);
+
+        if(next_rec.ptr != -1 && ! impl->justOnePage)
+        {
+            impl->p = loadPage(xid, next_rec.ptr);
+            readlock(impl->p->rwlatch,0);
+            impl->current.page = next_rec.ptr;
+            impl->current.slot = 2;
+            impl->current.size = stasis_record_length_read(xid, impl->p, impl->current); //keySize;
+        } else {
+            impl->p = 0;
+            impl->current.size = INVALID_SLOT;
+        }
+    }
+
+    // the previous record copy is dropped in every case; free(NULL) is a no-op
+    free(impl->t);
+    impl->t = 0;
+
+    if(impl->current.size == INVALID_SLOT)
+        return 0;
+
+    impl->t = (indexnode_rec*)malloc(impl->current.size);
+    memcpy(impl->t, logtree::readRecord(xid,impl->p,impl->current), impl->current.size);
+
+    return 1;
+}
+
+/*
+lladdIterator_t *logtreeIterator::copy(int xid, lladdIterator_t* i)
+{
+    logtreeIterator_s *it = (logtreeIterator_s*) i->impl;
+    logtreeIterator_s *mine = (logtreeIterator_s*) malloc(sizeof(logtreeIterator_s));
+    
+    if(it->p)
+    {
+        mine->p = loadPage(xid, it->p->id);
+        readlock(mine->p->rwlatch,0);
+    }
+    else 
+        mine->p = 0;
+  
+    memcpy(&mine->current, &it->current,sizeof(recordid));
+    
+    if(it->t)
+    {
+        mine->t = (datatuple*)malloc(sizeof(*it->t)); //TODO: DATA IS NOT COPIED, MIGHT BE WRONG
+        //mine->t = malloc(sizeof(*it->t) + it->current.size);
+        memcpy(mine->t, it->t, sizeof(*it->t));// + it->current.size);
+    }
+    else 
+        mine->t = 0;
+    
+    mine->justOnePage = it->justOnePage;
+    lladdIterator_t * ret = (lladdIterator_t*)malloc(sizeof(lladdIterator_t));
+    ret->type = -1; // XXX LSM_TREE_ITERATOR
+    ret->impl = mine;
+    return ret;
+}
+*/
+
+/**
+ * Releases everything an iterator holds: the latch and pin on its current
+ * page (if it still has one), the copy of the current record, the iterator
+ * state and the iterator itself.
+ *
+ * @param xid transaction id (unused)
+ * @param it  iterator obtained from open() or openAt()
+ */
+void logtreeIterator::close(int xid, lladdIterator_t *it)
+{
+    logtreeIterator_s *state = (logtreeIterator_s*)it->impl;
+
+    if(state->p)
+    {
+        unlock(state->p->rwlatch);
+        releasePage(state->p);
+    }
+
+    if(state->t)
+        free(state->t);
+
+    free(state);
+    free(it);
+}
+
+
+/////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////
+
+
+
+
+/**
+ * Converts a timeval into seconds expressed as a double: the whole seconds
+ * plus the microsecond part divided by one million.
+ */
+double tv_to_double(struct timeval tv)
+{
+    const double whole_seconds = static_cast<double>(tv.tv_sec);
+    const double micros = static_cast<double>(tv.tv_usec);
+    return whole_seconds + (micros / 1000000.0);
+}
+
+
+///////////////////////////////////////////////////////////////////                       
+
diff --git a/logstore.h b/logstore.h
new file mode 100644
index 0000000..5230a67
--- /dev/null
+++ b/logstore.h
@@ -0,0 +1,302 @@
+#ifndef _LOGSTORE_H_
+#define _LOGSTORE_H_
+
+#undef end
+#undef begin
+
+#include <string>
+#include <set>
+#include <sstream>
+#include <iostream>
+#include <queue>
+#include <vector>
+
+#include "logserver.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+
+#include <pthread.h>
+
+
+
+#include <stasis/transactional.h>
+
+#include <stasis/operations.h>
+#include <stasis/bufferManager.h>
+#include <stasis/allocationPolicy.h>
+#include <stasis/blobManager.h>
+#include <stasis/page.h>
+#include <stasis/truncation.h>
+
+
+#include "datapage.h"
+#include "tuplemerger.h"
+#include "datatuple.h"
+
+
+double tv_to_double(struct timeval tv);
+
+
+struct logtable_mergedata;
+
+
+/*
+ * State of a region allocator.  Values of this type are stored in Stasis
+ * records: logtable::allocTable allocates one record of this size per tree
+ * for its data pages, and logtable::insertTuple reads the tree-state record
+ * into one of these, passes it to logtree::appendPage as allocator state
+ * and writes it back.
+ */
+typedef struct RegionAllocConf_t
+{
+  recordid regionList;   // presumably the record listing the allocated regions -- TODO confirm
+  pageid_t regionCount;  // presumably the number of regions allocated so far -- TODO confirm
+  pageid_t nextPage;     // presumably the next page to hand out -- TODO confirm
+  pageid_t endOfRegion;  // presumably the end of the current region -- TODO confirm
+  pageid_t regionSize;   // presumably the size of one region in pages -- TODO confirm
+} RegionAllocConf_t;
+
+
+//struct logtree_state {
+//  pageid_t lastLeaf;
+//};
+
+/*
+ * Fixed-size head of an index record.  logtreeIterator::key() treats the
+ * bytes that follow this struct in memory as the key, and
+ * logtreeIterator::value() exposes ptr as the value.
+ */
+struct indexnode_rec {
+    pageid_t ptr; // page id carried by the record
+};
+// callback types for page allocation / deallocation: (xid, allocator state)
+typedef pageid_t(*logtree_page_allocator_t)(int, void *);
+typedef void(*logtree_page_deallocator_t)(int, void *);
+
+/*
+ * Index tree kept in Stasis pages.  Most operations are static and take the
+ * tree's root record (or an already loaded page) as an argument; the
+ * per-instance state is the root record, the tree-state record and lastLeaf.
+ */
+class logtree{
+public:
+    logtree();
+    // called once per new tree by logtable::allocTable, right after construction
+    recordid create(int xid);
+
+    void print_tree(int xid);
+    // page allocation helpers; alloc_region is the allocator logtable::insertTuple passes to appendPage
+    static pageid_t alloc_region(int xid, void *conf);
+    static pageid_t alloc_region_rid(int xid, void * ridp);
+    static void force_region_rid(int xid, void *conf);
+    static void dealloc_region_rid(int xid, void *conf);
+    static void free_region_rid(int xid, recordid tree,
+                                logtree_page_deallocator_t dealloc,
+                                void *allocator_state);
+
+    static void writeNodeRecord(int xid, Page *p, recordid &rid, 
+                         const byte *key, size_t keylen, pageid_t ptr);
+
+    static void writeRecord(int xid, Page *p, recordid &rid,
+                            const byte *data, size_t datalen);
+
+    static void writeRecord(int xid, Page *p, slotid_t slot,
+                            const byte *data, size_t datalen);
+
+    static const byte* readRecord(int xid, Page * p, recordid &rid);
+    static const byte* readRecord(int xid, Page * p, slotid_t slot, int64_t size);
+
+    static int32_t readRecordLength(int xid, Page *p, slotid_t slot);
+
+    //return the left-most leaf, these are not data pages, although referred to as leaf
+    static pageid_t findFirstLeaf(int xid, Page *root, int64_t depth);
+    //return the right-most leaf
+    static pageid_t findLastLeaf(int xid, Page *root, int64_t depth) ;
+
+    //reads the given record and returns the page id stored in it
+    static pageid_t lookupLeafPageFromRid(int xid, recordid rid);
+    
+    //returns a record that stores the pageid where the given key should be in, i.e. if it exists
+    static recordid lookup(int xid, Page *node, int64_t depth, const byte *key,
+                              size_t keySize);
+
+    //returns the id of the data page that could contain the given key
+    //(logtable::findTuple treats a result of -1 as "no such page")
+    static pageid_t findPage(int xid, recordid tree, const byte *key, size_t keySize);
+
+
+    //appends a leaf page, val_page is the id of the leaf page
+    //rmLeafID --> rightmost leaf id
+    //NOTE(review): val_page is declared long here but pageid_t in the two functions below -- confirm
+    static recordid appendPage(int xid, recordid tree, pageid_t & rmLeafID,
+                               const byte *key,size_t keySize,
+                               logtree_page_allocator_t allocator, void *allocator_state,
+                               long val_page);
+
+    static recordid appendInternalNode(int xid, Page *p,
+                                       int64_t depth,
+                                       const byte *key, size_t key_len,
+                                       pageid_t val_page, pageid_t lastLeaf,
+                                       logtree_page_allocator_t allocator,
+                                       void *allocator_state);
+
+    static recordid buildPathToLeaf(int xid, recordid root, Page *root_p,
+                                    int64_t depth, const byte *key, size_t key_len,
+                                    pageid_t val_page, pageid_t lastLeaf,
+                                    logtree_page_allocator_t allocator,
+                                    void *allocator_state);
+
+
+
+    /**
+       Initialize a page for use as an internal node of the tree.
+     */
+    inline static void initializeNodePage(int xid, Page *p);
+    // accessors return references to the stored record ids
+    recordid &get_tree_state(){return tree_state;}
+    recordid &get_root_rec(){return root_rec;}
+    
+public:
+    // DEPTH and NEXT_LEAF are passed to readRecord as slot numbers by logtreeIterator
+    const static RegionAllocConf_t REGION_ALLOC_STATIC_INITIALIZER;
+    const static int64_t DEPTH;
+    const static int64_t COMPARATOR;
+    const static int64_t FIRST_SLOT;
+    const static size_t root_rec_size;
+    const static int64_t PREV_LEAF;
+    const static int64_t NEXT_LEAF;
+    // passed by reference as appendPage's rmLeafID; presumably kept up to date by it -- TODO confirm
+    pageid_t lastLeaf;    
+private:
+
+    void print_tree(int xid, pageid_t pid, int64_t depth);
+    
+private:
+    recordid tree_state; // record read and written as a RegionAllocConf_t by logtable::insertTuple
+    recordid root_rec;   // root record of the tree
+
+
+
+    
+};
+
+/*
+ * A table made of three components: an in-memory tree (c0), a small
+ * on-disk tree (c1) and a big on-disk tree (c2).
+ *
+ * NOTE(review): the destructor deletes tree_c0, tree_c1, tree_c2 and
+ * tmerger, but no copy constructor or assignment operator is declared, so
+ * copying a logtable would lead to double deletion -- confirm that tables
+ * are never copied.
+ */
+class logtable
+{
+public:
+    logtable();
+    ~logtable();
+
+    //user access functions
+    datatuple * findTuple(int xid, datatuple::key_t key, size_t keySize);
+
+    datatuple * findTuple_first(int xid, datatuple::key_t key, size_t keySize);
+    
+    void insertTuple(struct datatuple &tuple);
+
+
+    //other class functions
+    recordid allocTable(int xid);
+
+    void flushTable();    
+    
+    DataPage<datatuple>* insertTuple(int xid, struct datatuple &tuple, recordid &dpstate,logtree *ltree);
+
+    datatuple * findTuple(int xid, datatuple::key_t key, size_t keySize,  logtree *ltree);
+
+    inline recordid & get_table_rec(){return table_rec;}
+    
+    inline logtree * get_tree_c2(){return tree_c2;}
+    inline logtree * get_tree_c1(){return tree_c1;}
+
+    inline void set_tree_c1(logtree *t){tree_c1=t;}
+    inline void set_tree_c2(logtree *t){tree_c2=t;}
+    // datatuple serves as both the element type and the comparator of the set
+    typedef std::set<datatuple, datatuple> rbtree_t;
+    typedef rbtree_t* rbtree_ptr_t;
+    inline rbtree_ptr_t get_tree_c0(){return tree_c0;}
+    
+    void set_tree_c0(rbtree_ptr_t newtree){tree_c0 = newtree;}
+
+    inline recordid & get_dpstate1(){return tbl_header.c1_dp_state;}
+    inline recordid & get_dpstate2(){return tbl_header.c2_dp_state;}
+
+    int get_fixed_page_count(){return fixed_page_count;}
+    void set_fixed_page_count(int count){fixed_page_count = count;}
+
+    void setMergeData(logtable_mergedata * mdata) { this->mergedata = mdata;}
+    logtable_mergedata* getMergeData(){return mergedata;}
+
+    inline tuplemerger * gettuplemerger(){return tmerger;}
+    
+public:
+    // contents of the table header record written by allocTable
+    struct table_header {
+        recordid c2_root;     //tree root record --> points to the root of the b-tree
+        recordid c2_state;    //tree state --> describes the regions used by the index tree
+        recordid c2_dp_state; //data pages state --> regions used by the data pages
+        recordid c1_root;
+        recordid c1_state;
+        recordid c1_dp_state;
+        //epoch_t beginning;
+        //epoch_t end;
+
+    };
+
+    const static RegionAllocConf_t DATAPAGE_REGION_ALLOC_STATIC_INITIALIZER;
+    // locks, condition variables and merge arguments shared with the merge code; public, and also reachable through setMergeData/getMergeData
+    logtable_mergedata * mergedata;
+    
+private:
+
+    
+
+private:    
+    recordid table_rec;             // header record returned by allocTable
+    struct table_header tbl_header; // in-memory copy of the header
+    
+    logtree *tree_c2; //big tree
+    logtree *tree_c1; //small tree
+    rbtree_ptr_t tree_c0; // in-mem red black tree
+
+
+    int tsize; //number of tuples
+    int64_t tree_bytes; //number of bytes
+
+    
+    //DATA PAGE SETTINGS
+    int fixed_page_count;//number of pages in a datapage
+
+//    logtable_mergedata * mergedata;
+
+    tuplemerger *tmerger; // combines tuples that share a key; created in the constructor
+};
+
+/* State behind an lladdIterator_t created by logtreeIterator (stored in it->impl). */
+typedef struct logtreeIterator_s {
+    Page * p;            // page currently latched by the iterator; 0 once the last page has been released
+    recordid current;    // record the iterator is positioned on
+    indexnode_rec *t;    // malloc'ed copy of the current record, or 0
+    int justOnePage;     // set when the tree depth is 0
+} logtreeIterator_s;
+
+
+/**
+ * Static functions that create, advance, read and destroy iterators over
+ * the records of a logtree.  The iterator state lives in a
+ * logtreeIterator_s reachable through lladdIterator_t::impl.
+ */
+class logtreeIterator
+{
+
+public:
+    static lladdIterator_t* open(int xid, recordid root);
+    static lladdIterator_t* openAt(int xid, recordid root, const byte* key);
+    static int next(int xid, lladdIterator_t *it);
+    //static lladdIterator_t *copy(int xid, lladdIterator_t* i);
+    static void close(int xid, lladdIterator_t *it);
+
+    /**
+     * Points *key at the key bytes of the current record, i.e. the bytes
+     * stored right behind the indexnode_rec head.
+     * @return size of the current record minus sizeof(indexnode_rec)
+     */
+    static inline int key (int xid, lladdIterator_t *it, byte **key)
+        {
+            logtreeIterator_s * state = (logtreeIterator_s*)it->impl;
+            indexnode_rec * head = state->t;
+            *key = (byte*)(head+1);
+            return (int) state->current.size - sizeof(indexnode_rec);
+        }
+
+    /**
+     * Points *value at the page id stored in the current record.
+     * @return size of that page id in bytes
+     */
+    static inline int value(int xid, lladdIterator_t *it, byte **value)
+        {
+            logtreeIterator_s * state = (logtreeIterator_s*)it->impl;
+            indexnode_rec * head = state->t;
+            *value = (byte*)&(head->ptr);
+            return sizeof(head->ptr);
+        }
+
+    // both are intentionally empty
+    static inline void tupleDone(int xid, void *it) { }
+    static inline void releaseLock(int xid, void *it) { }
+
+};
+
+
+#endif
diff --git a/merger.cpp b/merger.cpp
new file mode 100644
index 0000000..bcdced0
--- /dev/null
+++ b/merger.cpp
@@ -0,0 +1,836 @@
+
+#include <math.h>
+#include "merger.h"
+#include "logiterators.cpp"
+#include "datapage.cpp"
+//pageid_t merge_scheduler::C0_MEM_SIZE = 1000 * 1000 * 1000;
+
+//template <> struct merger_args<rbtree_t>;
+//template <> struct merger_args<logtree>;
+inline DataPage<datatuple>*   // forward declaration; the definition follows both merge_iterators() overloads further down
+insertTuple(int xid, DataPage<datatuple> *dp, datatuple &t,
+            logtable *ltable,
+            logtree * ltree,
+            recordid & dpstate,
+            int64_t &dpages, int64_t &npages);   // dpages / npages are running counters updated in place
+
+// Registers ltable and allocates its merge bookkeeping; no thread is started here. Returns the index for startlogtable().
+int merge_scheduler::addlogtable(logtable *ltable)
+{
+    struct logtable_mergedata * mdata = new logtable_mergedata;
+    // initialize merge data
+    mdata->header_lock = initlock();
+    mdata->rbtree_mut = new pthread_mutex_t;
+    pthread_mutex_init(mdata->rbtree_mut,0);
+    mdata->old_c0 = new rbtree_ptr_t;
+    *mdata->old_c0 = 0;
+
+    mdata->input_needed = new bool(false);
+
+    mdata->input_ready_cond = new pthread_cond_t;
+    pthread_cond_init(mdata->input_ready_cond,0);
+
+    mdata->input_needed_cond = new pthread_cond_t;
+    pthread_cond_init(mdata->input_needed_cond,0);
+
+    mdata->input_size = new int64_t(100);
+
+    // Value-initialize (note the "()") so every member of the argument blocks is zero/null
+    // until startlogtable() fills them in; a plain "new T" would leave them indeterminate.
+    mdata->diskmerge_args = new merger_args<logtree>();
+    mdata->memmerge_args = new merger_args<rbtree_t>();
+
+    mergedata.push_back(std::make_pair(ltable, mdata));
+    return mergedata.size()-1;
+}
+
+// Releases the per-table merge bookkeeping. The merge threads must no longer be running (see shutdown()).
+merge_scheduler::~merge_scheduler()
+{
+    for(size_t i=0; i<mergedata.size(); i++)
+    {
+        logtable_mergedata *mdata = mergedata[i].second;
+
+        //delete the mergedata fields; objects set up with pthread_*_init are destroyed before their storage is freed
+        deletelock(mdata->header_lock);
+        pthread_mutex_destroy(mdata->rbtree_mut);
+        delete mdata->rbtree_mut;
+        delete mdata->old_c0;
+        delete mdata->input_needed;
+        pthread_cond_destroy(mdata->input_ready_cond);
+        delete mdata->input_ready_cond;
+        pthread_cond_destroy(mdata->input_needed_cond);
+        delete mdata->input_needed_cond;
+        delete mdata->input_size;
+
+        //delete the merge thread structure variables
+        delete (recordid*) mdata->memmerge_args->pageAllocState;
+        delete (recordid*) mdata->memmerge_args->oldAllocState;
+        delete mdata->memmerge_args->still_open;
+
+        delete (recordid*) mdata->diskmerge_args->pageAllocState;
+        delete (recordid*) mdata->diskmerge_args->oldAllocState;
+
+        pthread_cond_destroy(mdata->diskmerge_args->in_block_needed_cond);
+        delete mdata->diskmerge_args->in_block_needed_cond;
+        delete mdata->diskmerge_args->in_block_needed;
+
+        pthread_cond_destroy(mdata->diskmerge_args->out_block_needed_cond);
+        delete mdata->diskmerge_args->out_block_needed_cond;
+        delete mdata->diskmerge_args->out_block_needed;
+
+        pthread_cond_destroy(mdata->diskmerge_args->in_block_ready_cond);
+        delete mdata->diskmerge_args->in_block_ready_cond;
+        pthread_cond_destroy(mdata->diskmerge_args->out_block_ready_cond);
+        delete mdata->diskmerge_args->out_block_ready_cond;
+
+        delete mdata->diskmerge_args->my_tree_size;
+        delete mdata->diskmerge_args;
+        delete mdata->memmerge_args;
+        //NOTE(review): mdata itself, the in_tree scratch slot/allocer and the my_tree handles are not released here
+    }
+    mergedata.clear();
+}
+
+// Stops the merge threads of every registered table: flushes each table,
+// clears the still_open flag, then joins both threads of each table.
+void merge_scheduler::shutdown()
+{
+    typedef std::vector<std::pair<logtable *, logtable_mergedata*> >::iterator entry_itr;
+
+    //phase 1: signal shutdown
+    for(entry_itr e = mergedata.begin(); e != mergedata.end(); ++e)
+    {
+        logtable_mergedata *md = e->second;
+
+        //flush the in memory table to write any tuples still in memory
+        e->first->flushTable();
+
+        //both argument blocks hold the same still_open pointer,
+        //so clearing it through memmerge_args is enough
+        pthread_mutex_lock(md->rbtree_mut);
+        *(md->memmerge_args->still_open) = false;
+        pthread_cond_signal(md->input_ready_cond);
+        pthread_mutex_unlock(md->rbtree_mut);
+    }
+
+    //phase 2: wait until both merge threads of every table have exited
+    for(entry_itr e = mergedata.begin(); e != mergedata.end(); ++e)
+    {
+        logtable_mergedata *md = e->second;
+
+        pthread_join(md->memmerge_thread,0);
+        pthread_join(md->diskmerge_thread,0);
+    }
+}
+
+void merge_scheduler::startlogtable(int index) // fills in both merger_args blocks for entry `index` of mergedata and starts its two merge threads
+{
+    logtable * ltable = mergedata[index].first;
+    struct logtable_mergedata *mdata = mergedata[index].second;
+    // block1_*: shared by the memory merger (as its output side) and the disk merger (as its input side); block2_*: the disk merger's output side
+    pthread_cond_t * block1_needed_cond = new pthread_cond_t;
+    pthread_cond_init(block1_needed_cond,0);
+    pthread_cond_t * block2_needed_cond = new pthread_cond_t;
+    pthread_cond_init(block2_needed_cond,0);
+
+    pthread_cond_t * block1_ready_cond = new pthread_cond_t;
+    pthread_cond_init(block1_ready_cond,0);
+    pthread_cond_t * block2_ready_cond = new pthread_cond_t;
+    pthread_cond_init(block2_ready_cond,0);
+
+    bool *block1_needed = new bool(false);
+    bool *block2_needed = new bool(false);
+    bool *system_running = new bool(true); // handed to both argument blocks as still_open
+    
+    //wait to merge the next block until we have merged block FUDGE times.
+    static const int FUDGE = 1;
+    static double R = MIN_R; // NOTE(review): function-local static, so the &R given to the threads below is shared by every table started here
+    int64_t * block1_size = new int64_t;
+    *block1_size = FUDGE * ((int)R) * (*(mdata->input_size));
+
+    //initialize rb-tree
+    ltable->set_tree_c0(new rbtree_t);
+
+    //disk merger args
+    recordid * ridp = new recordid;
+    *ridp = ltable->get_tree_c2()->get_tree_state(); //h.bigTreeAllocState;
+    recordid * oldridp = new recordid;
+    *oldridp = NULLRID;
+
+    logtree ** block1_scratch = new logtree*; // slot through which the memory merger passes a finished tree to the disk merger
+    *block1_scratch=0;
+
+    //recordid * allocer_scratch = new recordid;
+    RegionAllocConf_t *allocer_scratch = new RegionAllocConf_t;
+
+    // positional initializer: the order below must match the member order of merger_args
+    struct merger_args<logtree> diskmerge_args= {
+        ltable, 
+            1,  //worker id 
+            logtree::alloc_region_rid, //pageAlloc
+                ridp,   // pageAllocState
+            oldridp, // oldAllocState
+            mdata->rbtree_mut, //block_ready_mutex
+            block1_needed_cond, //in_block_needed_cond
+            block1_needed,      //in_block_needed
+            block2_needed_cond, //out_block_needed_cond
+            block2_needed,      //out_block_needed
+            block1_ready_cond,  //in_block_ready_cond
+            block2_ready_cond,  //out_block_ready_cond
+        system_running,    //still_open i.e. system running
+            block1_size, //mytree_size ?
+            0, //out_tree_size,  biggest component computes its size directly.
+        0, //max_tree_size No max size for biggest component
+        &R, //r_i
+            block1_scratch, //in-tree
+            allocer_scratch, //in_tree_allocer
+            0, //out_tree
+            0, //out_tree_allocer
+            new treeIterator<datatuple>::treeIteratorHandle(ltable->get_tree_c2()->get_root_rec()),        // my_tree
+            ltable->get_table_rec() //tree
+                };
+    // copy the filled-in block into the heap object that is later passed to the disk merge thread
+    *mdata->diskmerge_args = diskmerge_args;
+
+    DEBUG("Tree C2 is %lld\n", (long long)ltable->get_tree_c2()->get_root_rec().page);
+
+
+    //memory merger args
+    ridp = new recordid;
+    *ridp = ltable->get_tree_c1()->get_tree_state(); 
+    oldridp = new recordid;
+    *oldridp = NULLRID;
+
+    DEBUG("Tree C1 is %lld\n", (long long)ltable->get_tree_c1()->get_root_rec().page);
+
+    struct merger_args<rbtree_t> memmerge_args =
+        {
+            ltable,
+            2, //worker id
+            logtree::alloc_region_rid, //pageAlloc
+            ridp,   // pageAllocState
+            oldridp, // oldAllocState
+            mdata->rbtree_mut, //block_ready_mutex
+            mdata->input_needed_cond, //in_block_needed_cond
+            mdata->input_needed, //in_block_needed
+            block1_needed_cond, //out_block_needed_cond
+            block1_needed, //out_block_needed
+            mdata->input_ready_cond, //in_block_ready_cond
+            block1_ready_cond, //out_block_ready_cond
+            system_running, //still_open
+            mdata->input_size, //my_tree_size
+            block1_size, //out_tree_size
+            (int64_t)(R * R * MAX_C0_SIZE), //max_size
+            &R, //r_i
+            mdata->old_c0, //in_tree
+            0, //in_tree_allocer
+            block1_scratch, //out_tree
+            allocer_scratch, //out_tree_allocer
+            new treeIterator<datatuple>::treeIteratorHandle(ltable->get_tree_c1()->get_root_rec()), //my_tree
+            ltable->get_table_rec() //tree
+        };
+    
+    *mdata->memmerge_args = memmerge_args;
+
+    void * (*diskmerger)(void*) = diskMergeThread;
+    void * (*memmerger)(void*) = memMergeThread;
+    // start both merge threads; each one receives a pointer to its own argument block
+    pthread_create(&mdata->diskmerge_thread, 0, diskmerger, mdata->diskmerge_args);
+    pthread_create(&mdata->memmerge_thread, 0, memmerger, mdata->memmerge_args);
+    
+}
+
+//TODO: flush the data pages
+//      deallocate/free their region
+//      create new data region for new data pages
+void* memMergeThread(void*arg) // thread entry point; arg is a merger_args<rbtree_t>*. Loops merging *a->in_tree into a new C1 until still_open is cleared
+{
+
+    int xid;// = Tbegin();
+
+    merger_args<rbtree_t> * a = (merger_args<rbtree_t>*)(arg);    
+    assert(a->my_tree->r_.size != -1);
+
+    logtable * ltable = a->ltable;
+    
+    int merge_count =0;
+//    pthread_mutex_lock(a->block_ready_mut);
+    
+    while(true)
+    {
+        writelock(ltable->mergedata->header_lock,0);
+        int done = 0; // becomes 1 when still_open is found cleared while waiting for input
+        // get a new input for merge
+        while(!*(a->in_tree))
+        {            
+            pthread_mutex_lock(a->block_ready_mut);
+            *a->in_block_needed = true;
+            //pthread_cond_signal(a->in_block_needed_cond);
+            pthread_cond_broadcast(a->in_block_needed_cond);
+
+            if(!*(a->still_open)){
+                done = 1;
+                pthread_mutex_unlock(a->block_ready_mut);
+                break;
+            }
+            // no input yet: release the header lock before sleeping on the condition variable, re-take it afterwards
+            printf("mmt:\twaiting for block ready cond\n");
+            unlock(ltable->mergedata->header_lock);
+            
+            pthread_cond_wait(a->in_block_ready_cond, a->block_ready_mut);
+            pthread_mutex_unlock(a->block_ready_mut);
+            
+            writelock(ltable->mergedata->header_lock,0);
+            printf("mmt:\tblock ready\n");
+            
+        }        
+        *a->in_block_needed = false;
+
+        if(done==1) // shutting down: signal out_block_ready_cond once, drop the header lock and leave the loop
+        {
+            pthread_mutex_lock(a->block_ready_mut);
+            pthread_cond_signal(a->out_block_ready_cond);
+            pthread_mutex_unlock(a->block_ready_mut);
+            unlock(ltable->mergedata->header_lock);
+            break;
+        }
+
+        if((*a->in_tree)->size()==0) //input empty, this can only happen during shutdown
+        {
+            delete *a->in_tree;
+            *a->in_tree = 0;
+            unlock(ltable->mergedata->header_lock);
+            continue;
+        }
+      
+        uint64_t insertedTuples=0;
+        int64_t mergedPages=0;
+        
+        assert(a->my_tree->r_.size != -1);
+        
+        //create the iterators
+        treeIterator<datatuple> *itrA = new treeIterator<datatuple>(a->my_tree->r_);
+        memTreeIterator<rbtree_t, datatuple> *itrB =
+            new memTreeIterator<rbtree_t, datatuple>(*a->in_tree);
+        memTreeIterator<rbtree_t, datatuple> *itrBend = itrB->end();
+        
+        //Tcommit(xid);
+        xid = Tbegin();
+        
+        //create a new tree
+        logtree * scratch_tree = new logtree;
+        recordid scratch_root = scratch_tree->create(xid);
+
+        //save the old dp state values
+        RegionAllocConf_t olddp_state;        
+        Tread(xid, ltable->get_dpstate1(), &olddp_state);  
+        //reinitialize the dp state        
+        Tset(xid, ltable->get_dpstate1(), &logtable::DATAPAGE_REGION_ALLOC_STATIC_INITIALIZER);
+        
+        //pthread_mutex_unlock(a->block_ready_mut);        
+        unlock(ltable->mergedata->header_lock);
+        
+        //: do the merge (runs without the header lock held)
+        printf("mmt:\tMerging:\n");
+
+        int64_t npages = 0;
+        mergedPages = merge_iterators(xid, itrA, itrB, ltable, scratch_tree, npages);
+      
+        delete itrA;
+        delete itrB;
+        delete itrBend;
+        
+        //force write the new region to disk
+        recordid scratch_alloc_state = scratch_tree->get_tree_state();        
+        //TlsmForce(xid,scratch_root,logtree::force_region_rid, &scratch_alloc_state);
+        logtree::force_region_rid(xid, &scratch_alloc_state);
+        //force write the new datapages
+        DataPage<datatuple>::force_region_rid(xid, &ltable->get_dpstate1());
+
+        //writes complete
+        //now atomically replace the old c1 with new c1
+        //pthread_mutex_lock(a->block_ready_mut);
+
+        writelock(ltable->mergedata->header_lock,0);
+        merge_count++;        
+        *a->my_tree_size = mergedPages;      
+        printf("mmt:\tmerge_count %d #pages written %lld\n", merge_count, npages);      
+
+        delete ltable->get_tree_c1();
+        ltable->set_tree_c1(scratch_tree);
+
+        logtable::table_header h;
+        void * oldAllocState = a->pageAllocState;
+        Tread(xid, a->tree, &h);
+        
+        h.c1_root = scratch_root;
+        h.c1_state = scratch_alloc_state;
+        //note we already updated the dpstate before the merge
+        printf("mmt:\tUpdated C1's position on disk to %lld\n",scratch_root.page);      
+        Tset(xid, a->tree, &h);
+        
+        //Tcommit(xid);
+        //xid = Tbegin();
+        
+        // free old my_tree here
+        //TODO: check
+        logtree::free_region_rid(xid, a->my_tree->r_, logtree::dealloc_region_rid, oldAllocState);
+
+        
+        //TlsmFree(xid,a->my_tree->r_,logtree::dealloc_region_rid,oldAllocState);
+        //TODO: check
+        //free the old data pages
+        DataPage<datatuple>::dealloc_region_rid(xid, &olddp_state);
+
+        Tcommit(xid);
+        //xid = Tbegin();
+
+        
+        //TODO: this is simplistic for now
+        //signal the other merger if necessary
+        double target_R = *(a->r_i);
+        double new_c1_size = npages * PAGE_SIZE;
+        assert(target_R >= MIN_R);
+        if( (new_c1_size / MAX_C0_SIZE > target_R) ||
+            (a->max_size && new_c1_size > a->max_size ) )
+        {
+            printf("mmt:\tsignaling C2 for merge\n");
+            printf("mmt:\tnew_c1_size %.2f\tMAX_C0_SIZE %lld\ta->max_size %lld\t targetr %.2f \n", new_c1_size,
+                   MAX_C0_SIZE, a->max_size, target_R);
+            
+            // XXX need to report backpressure here!
+            while(*a->out_tree) {
+                pthread_mutex_lock(a->block_ready_mut);
+                unlock(ltable->mergedata->header_lock);
+                
+                pthread_cond_wait(a->out_block_needed_cond, a->block_ready_mut);
+                pthread_mutex_unlock(a->block_ready_mut);
+                writelock(ltable->mergedata->header_lock,0);
+            }
+
+            // the output slot is free: publish the new C1 through it, then replace C1 by a fresh empty tree
+            *a->out_tree = scratch_tree;
+            xid = Tbegin();
+            Tread(xid, ltable->get_dpstate1(), a->out_tree_allocer);  
+
+            pthread_cond_signal(a->out_block_ready_cond);
+
+
+            logtree *empty_tree = new logtree;
+            empty_tree->create(xid);
+            
+            *(recordid*)(a->pageAllocState) = empty_tree->get_tree_state();
+
+            a->my_tree->r_ = empty_tree->get_root_rec();
+
+            ltable->set_tree_c1(empty_tree);
+
+            logtable::table_header h;
+            Tread(xid, a->tree, &h);            
+            h.c1_root = empty_tree->get_root_rec(); //update root
+            h.c1_state = empty_tree->get_tree_state(); //update index alloc state
+            printf("mmt:\tUpdated C1's position on disk to %lld\n",empty_tree->get_root_rec().page);      
+            Tset(xid, a->tree, &h);
+            //update datapage alloc state
+            Tset(xid, ltable->get_dpstate1(), &logtable::DATAPAGE_REGION_ALLOC_STATIC_INITIALIZER);
+        
+            Tcommit(xid);
+            //xid = Tbegin();
+
+        }
+        else //not signaling the C2 for merge yet
+        {
+            printf("mmt:\tnot signaling C2 for merge\n");
+            *(recordid*)a->pageAllocState = scratch_alloc_state;      
+            a->my_tree->r_ = scratch_root;
+        }
+
+        rbtree_ptr_t deltree = *a->in_tree; // keep the consumed input so it can be freed after the slot is cleared
+        *a->in_tree = 0;
+
+        
+        //Tcommit(xid);
+        unlock(ltable->mergedata->header_lock);
+        
+        //TODO: get the freeing outside of the lock
+        //// ----------- Free in_tree
+        for(rbtree_t::iterator delitr=deltree->begin();
+            delitr != deltree->end(); delitr++)
+            free((*delitr).keylen);
+        
+        delete deltree;        
+        //deltree = 0;       
+
+
+        /*
+        for(rbtree_t::iterator delitr=(*a->in_tree)->begin();
+            delitr != (*a->in_tree)->end(); delitr++)
+            free((*delitr).keylen);
+        
+        delete *a->in_tree;        
+        *a->in_tree = 0;       
+        */
+    }
+
+    //pthread_mutex_unlock(a->block_ready_mut);
+    
+    return 0;
+
+}
+
+void *diskMergeThread(void*arg) // thread entry point; arg is a merger_args<logtree>*. Loops merging *a->in_tree into a new C2 until still_open is cleared
+{
+    int xid;// = Tbegin();
+
+    merger_args<logtree> * a = (merger_args<logtree>*)(arg);    
+    assert(a->my_tree->r_.size != -1);
+
+    logtable * ltable = a->ltable;
+    
+    int merge_count =0;
+    //pthread_mutex_lock(a->block_ready_mut);
+    
+    while(true)
+    {
+        writelock(ltable->mergedata->header_lock,0);
+        int done = 0; // becomes 1 when still_open is found cleared while waiting for input
+        // get a new input for merge
+        while(!*(a->in_tree))
+        {
+            pthread_mutex_lock(a->block_ready_mut);
+            *a->in_block_needed = true;
+            pthread_cond_signal(a->in_block_needed_cond);
+
+            if(!*(a->still_open)){
+                done = 1;
+                pthread_mutex_unlock(a->block_ready_mut);
+                break;
+            }
+            // no input yet: release the header lock before sleeping on the condition variable, re-take it afterwards
+            printf("dmt:\twaiting for block ready cond\n");
+            unlock(ltable->mergedata->header_lock);
+            
+            pthread_cond_wait(a->in_block_ready_cond, a->block_ready_mut);
+            pthread_mutex_unlock(a->block_ready_mut);
+
+            printf("dmt:\tblock ready\n");
+            writelock(ltable->mergedata->header_lock,0);
+        }        
+        *a->in_block_needed = false;
+        if(done==1) // shutting down: signal out_block_ready_cond once, drop the header lock and leave the loop
+        {
+            pthread_cond_signal(a->out_block_ready_cond);
+            unlock(ltable->mergedata->header_lock);
+            break;
+        }
+        
+      
+        uint64_t insertedTuples=0;
+        int64_t mergedPages=0;
+        
+        assert(a->my_tree->r_.size != -1);
+        
+        //create the iterators
+        treeIterator<datatuple> *itrA = new treeIterator<datatuple>(a->my_tree->r_);
+        treeIterator<datatuple> *itrB =
+            new treeIterator<datatuple>((*a->in_tree)->get_root_rec());
+        
+        //Tcommit(xid);
+        xid = Tbegin();
+        
+        //create a new tree
+        logtree * scratch_tree = new logtree;
+        recordid scratch_root = scratch_tree->create(xid);
+
+        //save the old dp state values
+        RegionAllocConf_t olddp_state;  
+        Tread(xid, ltable->get_dpstate2(), &olddp_state);  
+        //reinitialize the dp state
+        //TODO: maybe you want larger regions for the second tree?
+        Tset(xid, ltable->get_dpstate2(), &logtable::DATAPAGE_REGION_ALLOC_STATIC_INITIALIZER);
+        
+        //pthread_mutex_unlock(a->block_ready_mut);
+        unlock(ltable->mergedata->header_lock);
+        
+        
+        //do the merge (runs without the header lock held)
+        printf("dmt:\tMerging:\n");
+
+        int64_t npages = 0;
+        mergedPages = merge_iterators(xid, itrA, itrB, ltable, scratch_tree, npages);
+      
+        delete itrA;
+        delete itrB;        
+        
+        //force write the new region to disk
+        recordid scratch_alloc_state = scratch_tree->get_tree_state();
+        //TODO:
+        //TlsmForce(xid,scratch_root,logtree::force_region_rid, &scratch_alloc_state);
+        logtree::force_region_rid(xid, &scratch_alloc_state);
+        //force write the new datapages
+        DataPage<datatuple>::force_region_rid(xid, &ltable->get_dpstate2());
+
+
+        //writes complete
+        //now atomically replace the old c2 with new c2
+        //pthread_mutex_lock(a->block_ready_mut);
+        writelock(ltable->mergedata->header_lock,0);
+        
+        merge_count++;        
+        *a->my_tree_size = mergedPages;
+        //update the current optimal R value
+        *(a->r_i) = std::max(MIN_R, sqrt( (npages * 1.0) / (MAX_C0_SIZE/PAGE_SIZE) ) );
+        
+        printf("dmt:\tmerge_count %d\t#written pages: %lld\n optimal r %.2f", merge_count, npages, *(a->r_i)); // NOTE(review): the newline sits before "optimal r", so the message ends without one
+
+        delete ltable->get_tree_c2();
+        ltable->set_tree_c2(scratch_tree);
+
+        logtable::table_header h;
+        void * oldAllocState = a->pageAllocState;
+        Tread(xid, a->tree, &h);
+        
+        h.c2_root = scratch_root;
+        h.c2_state = scratch_alloc_state;
+        //note we already updated the dpstate before the merge
+        printf("dmt:\tUpdated C2's position on disk to %lld\n",scratch_root.page);
+        Tset(xid, a->tree, &h);
+        
+
+        
+        // free old my_tree here
+        //TODO: check
+        logtree::free_region_rid(xid, a->my_tree->r_, logtree::dealloc_region_rid, oldAllocState);        
+        //TlsmFree(xid,a->my_tree->r_,logtree::dealloc_region_rid,oldAllocState);
+        
+        //TODO: check
+        //free the old data pages
+        DataPage<datatuple>::dealloc_region_rid(xid, &olddp_state);        
+
+        
+        // from here on this merger's own tree is the freshly written C2
+        *(recordid*)a->pageAllocState = scratch_alloc_state;      
+        a->my_tree->r_ = scratch_root;
+        
+        //// ----------- Free in_tree
+        //TODO: check
+        logtree::free_region_rid(xid, (*a->in_tree)->get_root_rec(),
+                                 logtree::dealloc_region_rid,
+                                 &((*a->in_tree)->get_tree_state()));
+        //TlsmFree(xid,a->my_tree->r_,logtree::dealloc_region_rid,oldAllocState);
+        
+        //TODO: check
+        //free the old data pages
+        DataPage<datatuple>::dealloc_region_rid(xid, a->in_tree_allocer);//TODO:    
+
+        Tcommit(xid);
+        
+        //xid = Tbegin();
+        //Tcommit(xid);
+        delete *a->in_tree;        
+        *a->in_tree = 0; // clearing the slot makes the wait loop at the top ask for the next input
+
+        unlock(ltable->mergedata->header_lock);
+        
+    }
+
+    //pthread_mutex_unlock(a->block_ready_mut);
+    
+    return 0;
+
+
+}
+
+// Merges the tuples of itrA (tree) and itrB (in-memory tree) in key order into scratch_tree; equal keys go through the
+// table's tuplemerger. Returns the number of datapages created and adds the pages they span to npages.
+int64_t merge_iterators(int xid,
+                     treeIterator<datatuple> *itrA,
+                     memTreeIterator<rbtree_t, datatuple> * itrB,
+                     logtable *ltable,
+                    logtree *scratch_tree,
+                    int64_t &npages )
+{
+    int64_t dpages = 0;
+    int64_t ntuples = 0;
+    DataPage<datatuple> *dp = 0;
+
+    memTreeIterator<rbtree_t, datatuple> *itrBend = itrB->end();
+    datatuple *t1 = itrA->getnext();
+
+    while(*itrB != *itrBend)
+    {
+        datatuple t2 = **itrB;
+        DEBUG("tuple\t%lld: keylen %d datalen %d\n", (long long)ntuples, *t2.keylen,*t2.datalen );
+
+        while(t1 != 0 && datatuple::compare(t1->key, t2.key) < 0) // t1 is less than t2
+        {
+            //insert t1
+            dp = insertTuple(xid, dp, *t1, ltable, scratch_tree, ltable->get_dpstate1(),
+                         dpages, npages);
+            free(t1->keylen);
+            free(t1);
+            ntuples++;
+            //advance itrA
+            t1 = itrA->getnext();
+        }
+
+        if(t1 != 0 && datatuple::compare(t1->key, t2.key) == 0)
+        {
+            datatuple *mtuple = ltable->gettuplemerger()->merge(t1,&t2);
+            //insert merged tuple
+            dp = insertTuple(xid, dp, *mtuple, ltable, scratch_tree, ltable->get_dpstate1(),
+                             dpages, npages);
+            free(t1->keylen);
+            free(t1);
+            t1 = itrA->getnext();  //advance itrA
+            free(mtuple->keylen);
+            free(mtuple);
+        }
+        else
+        {
+            //insert t2
+            dp = insertTuple(xid, dp, t2, ltable, scratch_tree, ltable->get_dpstate1(),
+                             dpages, npages);
+            //free(t2.keylen); //cannot free here it may still be read through a lookup
+        }
+
+        ntuples++;
+        ++(*itrB);
+    }
+
+    while(t1 != 0) // itrB is exhausted: copy whatever is left in itrA
+    {
+        dp = insertTuple(xid, dp, *t1, ltable, scratch_tree, ltable->get_dpstate1(),
+                         dpages, npages);
+        free(t1->keylen);
+        free(t1);
+        ntuples++;
+        //advance itrA
+        t1 = itrA->getnext();
+    }
+
+    delete itrBend;
+    if(dp!=NULL)
+    {
+        //insertTuple() only adds a datapage's pages to npages when it rolls over, so the last one is counted here
+        npages += dp->get_page_count();
+        delete dp;
+    }
+    DEBUG("dpages: %lld\tnpages: %lld\tntuples: %lld\n", (long long)dpages, (long long)npages, (long long)ntuples);
+    fflush(stdout);
+
+    return dpages;
+}
+
+
+// Merges the tuples of itrA and itrB in key order into scratch_tree; equal keys go through the table's tuplemerger and a
+// merged tuple that is a delete is dropped. Returns the number of datapages created and adds their pages to npages.
+int64_t merge_iterators(int xid,
+                        treeIterator<datatuple> *itrA, //iterator on c2
+                        treeIterator<datatuple> *itrB, //iterator on c1
+                        logtable *ltable,
+                        logtree *scratch_tree,
+                        int64_t &npages)
+{
+    int64_t dpages = 0;
+    int64_t ntuples = 0;
+    DataPage<datatuple> *dp = 0;
+
+    datatuple *t1 = itrA->getnext();
+    datatuple *t2 = 0;
+
+    while( (t2=itrB->getnext()) != 0)
+    {
+        DEBUG("tuple\t%lld: keylen %d datalen %d\n",
+               (long long)ntuples, *(t2->keylen),*(t2->datalen) );
+
+        while(t1 != 0 && datatuple::compare(t1->key, t2->key) < 0) // t1 is less than t2
+        {
+            //insert t1
+            dp = insertTuple(xid, dp, *t1, ltable, scratch_tree,
+                             ltable->get_dpstate2(),
+                             dpages, npages);
+            free(t1->keylen);
+            free(t1);
+            ntuples++;
+            //advance itrA
+            t1 = itrA->getnext();
+        }
+
+        if(t1 != 0 && datatuple::compare(t1->key, t2->key) == 0)
+        {
+            datatuple *mtuple = ltable->gettuplemerger()->merge(t1,t2);
+            //insert merged tuple, drop deletes
+            if(!mtuple->isDelete())
+                dp = insertTuple(xid, dp, *mtuple, ltable, scratch_tree, ltable->get_dpstate2(),
+                                 dpages, npages);
+            free(t1->keylen);
+            free(t1);
+            t1 = itrA->getnext();  //advance itrA
+            free(mtuple->keylen);
+            free(mtuple);
+        }
+        else
+        {
+            //insert t2
+            dp = insertTuple(xid, dp, *t2, ltable, scratch_tree, ltable->get_dpstate2(),
+                             dpages, npages);
+        }
+
+        free(t2->keylen);
+        free(t2);
+        ntuples++;
+    }
+
+    while(t1 != 0) // itrB is exhausted: copy whatever is left in itrA
+    {
+        dp = insertTuple(xid, dp, *t1, ltable, scratch_tree, ltable->get_dpstate2(),
+                         dpages, npages);
+        free(t1->keylen);
+        free(t1);
+        ntuples++;
+        //advance itrA
+        t1 = itrA->getnext();
+    }
+
+    if(dp!=NULL)
+    {
+        //insertTuple() only adds a datapage's pages to npages when it rolls over, so the last one is counted here
+        npages += dp->get_page_count();
+        delete dp;
+    }
+    DEBUG("dpages: %lld\tnpages: %lld\tntuples: %lld\n", (long long)dpages, (long long)npages, (long long)ntuples);
+    fflush(stdout);
+
+    return dpages;
+}
+
+
+
+// Appends t to the open datapage dp; when there is none, or dp does not accept the tuple, a new
+// datapage is obtained from ltable->insertTuple(). Returns the datapage to use for the next tuple.
+inline DataPage<datatuple>*
+insertTuple(int xid, DataPage<datatuple> *dp, datatuple &t,
+            logtable *ltable,
+            logtree * ltree,
+            recordid & dpstate,
+            int64_t &dpages, int64_t &npages)
+{
+    if(dp != 0)
+    {
+        if(dp->append(xid, t))
+            return dp;   //the tuple went into the current datapage
+        //the current datapage did not take the tuple: count its pages and retire it
+        npages += dp->get_page_count();
+        delete dp;
+    }
+
+    DataPage<datatuple> *fresh = ltable->insertTuple(xid, t, dpstate, ltree);
+    ++dpages;
+    return fresh;
+}
+
+
+
+
diff --git a/merger.h b/merger.h
new file mode 100644
index 0000000..def1859
--- /dev/null
+++ b/merger.h
@@ -0,0 +1,127 @@
+#ifndef _MERGER_H_
+#define _MERGER_H_
+
+#include <vector>
+#include <utility>
+
+#include "logstore.h"
+#include "logiterators.h"
+
+typedef std::set<datatuple, datatuple> rbtree_t; // in-memory tuple set; datatuple is also passed as the comparator type
+typedef rbtree_t* rbtree_ptr_t;
+
+//TODO: 400 bytes overhead per tuple, this is nuts, check if this is true...
+static const int RB_TREE_OVERHEAD = 400;
+static const int64_t MAX_C0_SIZE = 800 *1024*1024; //max size of c0 (merger.cpp compares it against npages * PAGE_SIZE)
+static const double MIN_R = 3.0; //lower bound for R: the disk merger stores max(MIN_R, ...) into *r_i
+//T is either logtree or red-black tree
+template <class T>
+struct merger_args // argument block of one merge thread; filled positionally in merge_scheduler::startlogtable, so member order matters
+{
+    logtable * ltable;
+    int worker_id; // 1 for the disk merger, 2 for the memory merger
+
+    //page allocation information
+    pageid_t(*pageAlloc)(int,void*);
+    void *pageAllocState; // heap recordid*: allocation state of this merger's own tree
+    void *oldAllocState; // heap recordid*, initialised to NULLRID
+
+    pthread_mutex_t * block_ready_mut; // mutex used with all four condition variables below
+
+    pthread_cond_t * in_block_needed_cond; // signalled by this merger when it wants a new input
+    bool * in_block_needed;
+
+    pthread_cond_t * out_block_needed_cond; // this merger waits on it while *out_tree is still occupied
+    bool * out_block_needed;
+
+    pthread_cond_t * in_block_ready_cond; // this merger waits on it for a new input
+    pthread_cond_t * out_block_ready_cond; // signalled by this merger after it filled *out_tree (and once at shutdown)
+
+    bool * still_open; // cleared by merge_scheduler::shutdown(); both mergers of a table share the same flag
+
+    int64_t * my_tree_size; // overwritten with the return value of merge_iterators() after each merge
+    int64_t * out_tree_size;
+    int64_t max_size; //pageid_t; 0 means "no limit" in the memory merger's size check
+    double * r_i; // current target R; written by the disk merger, read by the memory merger
+
+    T ** in_tree; // input slot: non-null while there is something to merge, reset to 0 by the merger
+    void * in_tree_allocer; // disk merger: datapage region state of the input tree; 0 for the memory merger
+
+    logtree ** out_tree; // output slot towards the next merger; 0 for the disk merger
+    void * out_tree_allocer;
+
+    treeIterator<datatuple>::treeIteratorHandle *my_tree; // holds the root record (r_) of this merger's own tree
+    
+    recordid tree; // record of the table header (ltable->get_table_rec())
+};
+
+
+
+struct logtable_mergedata // per-table merge bookkeeping; allocated in merge_scheduler::addlogtable
+{
+    //merge threads (created in startlogtable, joined in shutdown)
+    pthread_t diskmerge_thread;
+    pthread_t memmerge_thread;
+
+    rwl *header_lock; // created with initlock(); the merge threads take it with writelock() around tree/header updates
+    
+    pthread_mutex_t * rbtree_mut; // handed to both mergers as block_ready_mut
+    rbtree_ptr_t *old_c0; //in-mem red black tree being merged / to be merged
+
+    bool *input_needed; // memmerge-input needed
+    
+    pthread_cond_t * input_ready_cond; // the memory merger waits on it for input; shutdown() signals it
+    pthread_cond_t * input_needed_cond; // broadcast by the memory merger when it wants input
+    int64_t * input_size; // initialised to 100; passed to the memory merger as my_tree_size
+
+    //merge args 1
+    struct merger_args<logtree> *diskmerge_args;    
+    //merge args 2
+    struct merger_args<rbtree_t> *memmerge_args;
+    
+};
+
+
+class merge_scheduler // owns the merge bookkeeping and the two merge threads of every registered logtable
+{
+    std::vector<std::pair<logtable *, logtable_mergedata*> > mergedata; // one entry per addlogtable() call, addressed by index
+
+public:
+    //static pageid_t C0_MEM_SIZE; 
+    ~merge_scheduler(); // frees the bookkeeping; does not join threads itself
+    
+    int addlogtable(logtable * ltable); // returns the index of the new entry
+    void startlogtable(int index); // fills in the merger arguments and starts both merge threads
+
+    struct logtable_mergedata *getMergeData(int index){return mergedata[index].second;} // no bounds check on index
+
+    void shutdown(); // flushes every table, clears still_open and joins the merge threads
+
+    
+
+};
+
+
+void* memMergeThread(void* arg); // pthread entry point; arg is a merger_args<rbtree_t>*
+
+//merges and returns the number of data pages used
+int64_t merge_iterators(int xid,
+                     treeIterator<datatuple> *itrA,
+                     memTreeIterator<rbtree_t, datatuple> * itrB,
+                     logtable *ltable,
+                    logtree *scratch_tree,
+                    int64_t &npages);
+
+// same as above, but both inputs are tree iterators (used by diskMergeThread)
+int64_t merge_iterators(int xid,
+                    treeIterator<datatuple> *itrA,
+                    treeIterator<datatuple> *itrB,
+                    logtable *ltable,
+                    logtree *scratch_tree,
+                    int64_t &npages);
+
+
+void* diskMergeThread(void* arg); // pthread entry point; arg is a merger_args<logtree>*
+
+
+#endif
diff --git a/tuplemerger.cpp b/tuplemerger.cpp
new file mode 100644
index 0000000..0adbf22
--- /dev/null
+++ b/tuplemerger.cpp
@@ -0,0 +1,84 @@
+#include "tuplemerger.h"
+#include "logstore.h"
+
+/**
+ * Produces the tuple that results from combining t1 with t2.
+ *
+ * When either input is a delete the result is built from t2 alone, by
+ * passing t2->to_bytes() to datatuple::from_bytes(); otherwise the merge
+ * function stored in merge_fp decides.  Both inputs being deletes is
+ * rejected by the assert.
+ **/
+datatuple* tuplemerger::merge(datatuple *t1, datatuple *t2)
+{
+    assert(!(t1->isDelete() && t2->isDelete())); //at most one may be a delete
+
+    // a delete on either side: the result is built from t2 alone
+    if(t1->isDelete() || t2->isDelete())
+    {
+        return datatuple::from_bytes(t2->to_bytes());
+    }
+
+    // neither is a delete: defer to the configured merge function
+    datatuple *merged = (*merge_fp)(t1,t2);
+    return merged;
+}
+
+/**
+ * Builds a new tuple carrying t1's key and t1's data followed by t2's data.
+ *
+ * tuplemerger::merge deals with deletes before calling a merge function, so
+ * neither argument is a delete tuple here.  Header and buffer are malloc'ed.
+ **/
+datatuple* append_merger(datatuple *t1, datatuple *t2)
+{
+    const uint32_t klen  = *(t1->keylen);
+    const uint32_t dlen1 = *(t1->datalen);
+    const uint32_t dlen2 = *(t2->datalen);
+
+    datatuple *merged = (datatuple*) malloc(sizeof(datatuple));
+    byte *cursor = (byte*) malloc(t1->byte_length() + dlen2);
+    // lay out keylen | datalen | key | data in the single buffer
+    merged->keylen = (uint32_t*) cursor;
+    *(merged->keylen) = klen;
+    cursor += sizeof(uint32_t);
+    merged->datalen = (uint32_t*) cursor;
+    *(merged->datalen) = dlen1 + dlen2;
+    cursor += sizeof(uint32_t);
+    merged->key = (datatuple::key_t) cursor;
+    memcpy(cursor, (byte*)t1->key, klen);
+    cursor += klen;
+    merged->data = (datatuple::data_t) cursor;
+    memcpy(cursor, (byte*)t1->data, dlen1);
+    memcpy(cursor + dlen1, (byte*)t2->data, dlen2);
+    return merged;
+}
+
+/**
+ * Builds a new tuple holding t2's key and t2's data; nothing is read from t1.
+ *
+ * tuplemerger::merge deals with deletes before calling a merge function, so
+ * neither argument is a delete tuple here.  Header and buffer are malloc'ed.
+ **/
+datatuple* replace_merger(datatuple *t1, datatuple *t2)
+{
+    const uint32_t klen = *(t2->keylen);
+    const uint32_t dlen = *(t2->datalen);
+
+    datatuple *copy = (datatuple*) malloc(sizeof(datatuple));
+    byte *cursor = (byte*) malloc(t2->byte_length());
+
+    // lay out keylen | datalen | key | data, advancing the cursor as we go
+    copy->keylen = (uint32_t*) cursor;
+    *(copy->keylen) = klen;
+    cursor += sizeof(uint32_t);
+    copy->datalen = (uint32_t*) cursor;
+    *(copy->datalen) = dlen;
+    cursor += sizeof(uint32_t);
+    copy->key = (datatuple::key_t) cursor;
+    memcpy(cursor, (byte*)t2->key, klen);
+    cursor += klen;
+    copy->data = (datatuple::data_t) cursor;
+    memcpy(cursor, (byte*)t2->data, dlen);
+    return copy;
+}
diff --git a/tuplemerger.h b/tuplemerger.h
new file mode 100644
index 0000000..b8314ba
--- /dev/null
+++ b/tuplemerger.h
@@ -0,0 +1,34 @@
+#ifndef _TUPLE_MERGER_H_
+#define _TUPLE_MERGER_H_
+
+struct datatuple; // forward declaration; this header only uses pointers to datatuple
+// Signature shared by the merge functions below: two tuple pointers in, one tuple pointer out.
+typedef datatuple* (*merge_fn_t) (datatuple*, datatuple *);
+// Appends the data in t2 to the data from t1 (defined in tuplemerger.cpp).
+datatuple* append_merger(datatuple *t1, datatuple *t2);
+// Replaces the data with the data from t2 (defined in tuplemerger.cpp).
+datatuple* replace_merger(datatuple *t1, datatuple *t2);
+
+
+/**
+ * Pairs a caller-chosen merge function (such as append_merger or
+ * replace_merger) with the delete handling implemented in merge().
+ **/
+class tuplemerger
+{
+public:
+    // Stores the function that merge() hands non-delete tuple pairs to.
+    tuplemerger(merge_fn_t merge_fp) : merge_fp(merge_fp) {}
+
+    // Returns the tuple resulting from combining t1 with t2; defined in
+    // tuplemerger.cpp.
+    datatuple* merge(datatuple *t1, datatuple *t2);
+
+private:
+    // merge function supplied at construction
+    merge_fn_t merge_fp;
+};
+
+
+
+#endif