From d016498f8d621fa897e0f1442d6f49981d902854 Mon Sep 17 00:00:00 2001 From: sears Date: Sat, 23 Jan 2010 02:13:59 +0000 Subject: [PATCH] initial import; removed cruft from mert's tarball, tweaked make's clean targets git-svn-id: svn+ssh://svn.corp.yahoo.com/yahoo/yrl/labs/pnuts/code/logstore@520 8dad8b1f-cf64-0410-95b6-bcf113ffbcfe --- FwCode.h | 165 +++++ LogUtils.cc | 77 ++ LogUtils.h | 130 ++++ Makefile | 74 ++ NOTES | 152 ++++ StringUtils.h | 345 +++++++++ UCharUtils.cc | 326 +++++++++ UCharUtils.h | 139 ++++ adriana-lima.awk | 130 ++++ check_datapage.cpp | 321 +++++++++ check_gen.cpp | 39 + check_logtable.cpp | 276 ++++++++ check_logtree.cpp | 331 +++++++++ check_merge.cpp | 246 +++++++ check_mergelarge.cpp | 264 +++++++ check_mergetuple.cpp | 409 +++++++++++ check_rbtree.cpp | 214 ++++++ check_server.cpp | 107 +++ check_tcpclient.cpp | 415 +++++++++++ cmds.txt | 9 + datapage.cpp | 507 +++++++++++++ datapage.h | 110 +++ datatuple.h | 147 ++++ hello.cpp | 48 ++ logiterators.cpp | 200 ++++++ logiterators.h | 173 +++++ logserver.cpp | 649 +++++++++++++++++ logserver.h | 197 ++++++ logserver_pers.cpp | 519 ++++++++++++++ logserver_pers.h | 163 +++++ logserver_simple.cpp | 409 +++++++++++ logserver_simple.h | 198 ++++++ logstore.cpp | 1606 ++++++++++++++++++++++++++++++++++++++++++ logstore.h | 302 ++++++++ merger.cpp | 836 ++++++++++++++++++++++ merger.h | 127 ++++ tuplemerger.cpp | 84 +++ tuplemerger.h | 34 + 38 files changed, 10478 insertions(+) create mode 100644 FwCode.h create mode 100644 LogUtils.cc create mode 100644 LogUtils.h create mode 100644 Makefile create mode 100644 NOTES create mode 100644 StringUtils.h create mode 100644 UCharUtils.cc create mode 100644 UCharUtils.h create mode 100755 adriana-lima.awk create mode 100644 check_datapage.cpp create mode 100644 check_gen.cpp create mode 100644 check_logtable.cpp create mode 100644 check_logtree.cpp create mode 100644 check_merge.cpp create mode 100644 check_mergelarge.cpp create mode 100644 
check_mergetuple.cpp create mode 100644 check_rbtree.cpp create mode 100644 check_server.cpp create mode 100644 check_tcpclient.cpp create mode 100644 cmds.txt create mode 100644 datapage.cpp create mode 100644 datapage.h create mode 100644 datatuple.h create mode 100644 hello.cpp create mode 100644 logiterators.cpp create mode 100644 logiterators.h create mode 100644 logserver.cpp create mode 100644 logserver.h create mode 100644 logserver_pers.cpp create mode 100644 logserver_pers.h create mode 100644 logserver_simple.cpp create mode 100644 logserver_simple.h create mode 100644 logstore.cpp create mode 100644 logstore.h create mode 100644 merger.cpp create mode 100644 merger.h create mode 100644 tuplemerger.cpp create mode 100644 tuplemerger.h diff --git a/FwCode.h b/FwCode.h new file mode 100644 index 0000000..5af3d06 --- /dev/null +++ b/FwCode.h @@ -0,0 +1,165 @@ +/* Copyright (C) 2008 Yahoo! Inc. All Rights Reserved. */ + +#ifndef __FW_CODE__H +#define __FW_CODE__H + +#include + +/** + * Global framework response codes. + */ +class FwCode { + public: + + typedef int ResponseCode; + + static const std::string unknownCodeStr; + + /** + * The convention here is to keep related codes grouped together, so + * that it is easier to find all existing codes for a particular + * module. Each section is given a range of 50 codes, so that adding + * a new code to an existing section won't invalidate all of the codes + * following it in the enum (causing binary incompatibility). + */ + + //----------- Generic section ------------- + static const ResponseCode FwOk = 0; //!< All successes + static const ResponseCode FwError = 1; //!< General error code + + static const ResponseCode FwCrit = 2; //!< General critical error. could be originated by low level library to indicate some nasty error has occurred. 
+ + static const ResponseCode MdbmOpenFailed = 3; //!< Any kind of mdbm open failure + static const ResponseCode MdbmOperationFailed = 4; //!< Any store/fetch/lock from mdbm failed + static const ResponseCode NoMem = 5; //!< Out Of Memory + static const ResponseCode InvalidParam = 6; //!< Invalid parameter + static const ResponseCode NotFound = 7; //!< Fail to find the specified info; usuall returned by access methods + static const ResponseCode InvalidState = 8; //!< Invalid state + static const ResponseCode ConnReset = 9; //!< connection reset + static const ResponseCode Timeout = 10; //!< operation timed out + static const ResponseCode InvalidData = 11; //!< buffer data is invalid + static const ResponseCode BufTooSmall = 12; //!< Buffer size is smaller than required + static const ResponseCode MalformedRequest = 13; //!< Request data (like the URI) is malformed + static const ResponseCode RequestTooLarge = 14; //!< Request data (like the body) is too big + static const ResponseCode ConvertToDhtDataFailed = 15; // !< Failed convert json string to DHT::Data + static const ResponseCode ConvertFromDhtDataFailed = 16; // !< Failed to convert DHT::Data to json string + static const ResponseCode BadHexString = 17; //!< Failed to parse a hex string + static const ResponseCode ShmemCorrupted = 18; //!< A shared mem corruption has been detected. + static const ResponseCode ParseError = 19; //!< Generic parsing problem + /// If mdbm unlock fails, most of the time we want to shut off the + /// system automatically, without letting the caller know that we did + /// so. On specific instances where the caller is the FaultHandler, or + /// Oversight Fault counter (there may be other examples), we don't want + /// to do this because we want to avoid cross-dependency. 
+ static const ResponseCode MdbmUnlockFailed = 20; + + //----------- Generic section ------------- + // Config + static const ResponseCode ConfigFailure = 50; //!< Failure to find or parse a config entry + + //----------- UChar section ------------- + // UCharUtils + static const ResponseCode UcnvOpenFailed = 100; //!< Failed to open ucnv converter for utf-8 + static const ResponseCode DataNotUtf8 = 101; //!< Data is not in utf-8 format + static const ResponseCode ConvertToUCharFailed = 102; //!< Failed to convert utf-8 string to UChar string + static const ResponseCode CompileRegExFailed = 103; //!< Failed to compile the regular expression + + //----------- Yca section ------------- + // YcaClient + static const ResponseCode YcaOpenFailed = 150; //!< Failed to open the yca database + static const ResponseCode YcaCertInvalid = 151; //!< Validation of presented cert failed + static const ResponseCode YcaCertNotFound = 152; //!< certificate for the requested appID was not found + + //----------- Broker section ------------- + static const ResponseCode BrokerClientOpenFailed = 200; //!< Failed to connect to broker + static const ResponseCode UncertainPublish = 201; //!< Publish was uncertain - unknown if it happened + static const ResponseCode PublishFailed = 202; //!< Publish failed (for certain :)) + static const ResponseCode SubscribeFailed = 203; //!< Failed to subscribe to a topic + static const ResponseCode NoSubscriptionFound = 204; //!< Operation on a sub failed because we (locally) + // don't know about it + static const ResponseCode RegisterFailed = 205; //!< Failed to register handler for subscription + static const ResponseCode UnsubscribeFailed = 206; //!< Failed to unsubscribe from sub + static const ResponseCode ListTopicsFailed = 207; //!< Failed to list subscribed topics + static const ResponseCode ConsumeFailed = 208; //!< Failed to consume messages for a topic + static const ResponseCode TopicInvalid = 209; //!< Topic is invalid (was usurped or ymb 
'lost' it) + static const ResponseCode NoMessageDelivered = 210; //!< Call to deliver() found no messages ready + static const ResponseCode ConsumeFailedBadTopic = 211; //!< The topic is bad - our handle is bad, + // or it got usurped + static const ResponseCode ConsumeFailedBadHandle = 212; //!< Our ymb handle is bad - not usable anymore + static const ResponseCode ConsumeFailedConnectionError = 213; //!< a recoverable connection error + static const ResponseCode ConsumeFailedServerBusy = 214; //!< ymb server is having a temporary issue, + // not a failure per se + // second argument to messageProcessed() + static const ResponseCode ConsumeMessage = 215; //!< consume this message + static const ResponseCode ConsumeAndUnsubscribe = 216; //!< end this channel + // Internal to ymb implementation + static const ResponseCode YmbSubscribeTempFailure = 217; //!< A failure that might be resolved on a retry + static const ResponseCode YmbSubscribeTimedout = 218; //!< A timeout failure + static const ResponseCode YmbSubscriptionExists = 219; //!< Attempt to create a sub that already exists + static const ResponseCode NoSuchSubscription = 220; //!< Attempt to attach to a sub that does not exist + static const ResponseCode AttachNoSuchSubscription = 221; //!< Specific to attach, no subscription to attach to (not necessarily an error) + static const ResponseCode BrokerInitFailed = 222; //!< Config or allocation failed + static const ResponseCode BrokerConnectionLost = 223; //!< Lost connection to broker + static const ResponseCode BrokerFatalError = 224; //!< Generally shared mem corruption + + + //----------- Daemon section ------------- + // Daemon + static const ResponseCode NoImpl = 250; //!< No op + static const ResponseCode Restart = 251; //!< Exit the daemon so that it is restarted right away. + // request that the daemon do a soft restart + static const ResponseCode Exit = 252; //!< Exit the daemon so that it is NOT restarted right away. 
A monitoring process may restart the entire system later. + static const ResponseCode StopDelivery = 253; //!< Stop delivery on the topic, returned by Broker handlers only. + static const ResponseCode RetryDelivery = 254; //!< Stop delivery on the topic but retry after sometime, returned by Broker handlers only. + + //----------- Lock section ------------- + // LockManager + //ALL these lock errors are handled in SuFaulHandler.cc + //Any addition to these error codes requires update to the SuFaultHandler + static const ResponseCode LockSyserr = 301; //!< System error during lock/unlock op + static const ResponseCode LockInconsis = 302; //!< Inconsistency detected in LockManager. + static const ResponseCode LockNested = 303; //!< Nested locking of same key not allowed. + static const ResponseCode LockNosuchpid = 304; //!< This pid does not hold the lock. + static const ResponseCode LockUnavail = 305; //!< Outa lock + static const ResponseCode LockInitfail = 306; //!< Initialization failure of the lock subsystem + static const ResponseCode LockInvalidarg = 307; //!< Invalid arguments to lock subsystem. 
+ + //----------- Message section ------------- + //Message and Message serialization + static const ResponseCode SerializeFailed = 350; //!< Message Serialization Failed + static const ResponseCode DeserializeFailed = 351; //!< Message Deserialization failed + static const ResponseCode NoResponseCodeInMessage = 352; + + //----------- Transport Errors ------------- + static const ResponseCode TransportSendError = 400; //!< Curl error in communicating with other server + static const ResponseCode TransportSetHeaderFailed = 401; //!< Error in setting header in curl request + static const ResponseCode TransportCurlInitError = 402; // !< Error initializing curl handle -- should be curl specific + static const ResponseCode TransportUncertain = 403; //!< Send came back uncertain (timeout, usually) + static const ResponseCode TransportInvalidResponseBody = 404; //!< Send came back unparsable body + + //----------- Apache/Web section ------------- + static const ResponseCode EndOfBody = 450; //!< Normal end of incoming request body + static const ResponseCode BodyReadFailed = 451; //!< Failed reading incoming request body + static const ResponseCode BodyWriteFailed = 452; //!< Failed writing outgoing request body + static const ResponseCode EncryptionFailed = 453; //!< Failed to encrypt body or header + static const ResponseCode DecryptionFailed = 454; //!< Failed to decrypt body or header + + /** + * Give back a basic, generic string description of the response code. + * + * @param rc The response code to convert. + * @return The string describing it. + */ + static std::string toString(ResponseCode rc); + +}; + +/* For customized vim control + * Local variables: + * tab-width: 4 + * c-basic-offset: 4 + * End: + * vim600: sw=4:ts=4:et + * vim<600: sw=4:ts=4:et + */ +#endif diff --git a/LogUtils.cc b/LogUtils.cc new file mode 100644 index 0000000..3dea981 --- /dev/null +++ b/LogUtils.cc @@ -0,0 +1,77 @@ +/*! 
\file log4_util.cc + * \brief This file has the helper functions for log4cpp; + * + * Copyright (c) 2008 Yahoo, Inc. + * All rights reserved. + */ +#include +#include + +#include "LogUtils.h" + +using namespace log4cpp; +using namespace std; + +// hacked link to actioncontext +std::string s_trackPathLog; + +LogMethod:: +LogMethod(log4cpp::Category& log, log4cpp::Priority::Value priority, + const char *function) : + log_(log), priority_(priority), function_(function) +{ + if(log_.isPriorityEnabled(priority_)) { + log_.getStream(priority_) << "Entering: " << function_; + } +} + + +LogMethod:: +~LogMethod() +{ + if(log_.isPriorityEnabled(priority_)) { + log_.getStream(priority_) << "Exiting: " << function_; + } +} + +// Protects against multiple calls (won't try to re-init) and gives +// back the same answer the original call got. +static int log4cppInitResult = -1; + +bool +initLog4cpp(const string &confFile) +{ + + if (log4cppInitResult != -1) { + return (log4cppInitResult == 0 ? true : false); + } + + log4cppInitResult = 0; // Assume success. + try { + PropertyConfigurator::configure(confFile); + } catch (log4cpp::ConfigureFailure &e) { + cerr << "log4cpp configuration failure while loading '" << + confFile << "' : " << e.what() << endl; + log4cppInitResult = 1; + } catch (std::exception &e) { + cerr << "exception caught while configuring log4cpp via '" << + confFile << "': " << e.what() << endl; + log4cppInitResult = 1; + } catch (...) { + cerr << "unknown exception while configuring log4cpp via '" << + confFile << "'." << endl; + log4cppInitResult = 1; + } + + return (log4cppInitResult == 0 ? true : false); +} + +/* + * For customized vim control + * Local variables: + * tab-width: 4 + * c-basic-offset: 4 + * End: + * vim600: sw=4:ts=4:et + * vim<600: sw=4:ts=4:et + */ diff --git a/LogUtils.h b/LogUtils.h new file mode 100644 index 0000000..73c0af6 --- /dev/null +++ b/LogUtils.h @@ -0,0 +1,130 @@ +/* Copyright (C) 2007 Yahoo! Inc. All Rights Reserved. 
*/ + +#ifndef LOG_UTIL_H +#define LOG_UTIL_H + +#include +#include "StringUtils.h" + +/** + * Quick and dirty link between LogUtils and ActionContext without having to + * resolve cross-inclusion issues, or force all components to start including + * ActionContext if they don't already. + */ +extern std::string s_trackPathLog; + +// These macros cannot be protected by braces because of the trailing stream +// arguments that get appended. Care must taken not to use them inside if/else +// blocks that do not use curly braces. +// I.e., the following will give unexpected results: +// if(foo) +// DHT_DEBUG_STREAM() << "heyheyhey"; +// else +// blah(); +// The 'else' will end up applying to the 'if' within the debug macro. +// Regardless of this, our standards say to always use curly brackets +// on every block anyway, no matter what. + +#define DHT_DEBUG_STREAM() if(log.isDebugEnabled()) log.debugStream() << __FUNCTION__ << "():" << __LINE__ << ":" +#define DHT_INFO_STREAM() if(log.isInfoEnabled()) log.infoStream() << __FUNCTION__ << "():" << __LINE__ << ":" +#define DHT_INFO_WITH_STACK_STREAM() if(log.isInfoEnabled()) log.infoStream() << __FUNCTION__ << "():" << __LINE__ << ":" << s_trackPathLog +#define DHT_WARN_STREAM() if(log.isWarnEnabled()) log.warnStream() << __FUNCTION__ << "():" << __LINE__ << ":" << s_trackPathLog +#define DHT_ERROR_STREAM() if(log.isErrorEnabled()) log.errorStream() << __FUNCTION__ << "():" << __LINE__ << ":" << s_trackPathLog +#define DHT_CRIT_STREAM() if(log.isCritEnabled()) log.critStream() << __FUNCTION__ << "():" << __LINE__ << ":" << s_trackPathLog +#define DHT_TRACE_PRIORITY log4cpp::Priority::DEBUG + 50 +#define DHT_TRACE_STREAM() if (log.isPriorityEnabled(DHT_TRACE_PRIORITY)) log.getStream(DHT_TRACE_PRIORITY) << __FUNCTION__ << "():" << __LINE__ << ":" + +// Sadly, sometimes 'log' is reserved by someone else so the code needs to +// use a different name for log. In that case, it can be passed in to these. 
+#define DHT_DEBUG_STREAML(x_log_hdl_x) if((x_log_hdl_x).isDebugEnabled()) (x_log_hdl_x).debugStream() << __FUNCTION__ << "():" << __LINE__ << ":" +#define DHT_INFO_STREAML(x_log_hdl_x) if((x_log_hdl_x).isInfoEnabled()) (x_log_hdl_x).infoStream() << __FUNCTION__ << "():" << __LINE__ << ":" +#define DHT_INFO_WITH_STACK_STREAML(x_log_hdl_x) if((x_log_hdl_x).isInfoEnabled()) (x_log_hdl_x).infoStream() << __FUNCTION__ << "():" << __LINE__ << ":" << s_trackPathLog +#define DHT_WARN_STREAML(x_log_hdl_x) if((x_log_hdl_x).isWarnEnabled()) (x_log_hdl_x).warnStream() << __FUNCTION__ << "():" << __LINE__ << ":" << s_trackPathLog +#define DHT_ERROR_STREAML(x_log_hdl_x) if((x_log_hdl_x).isErrorEnabled()) (x_log_hdl_x).errorStream() << __FUNCTION__ << "():" << __LINE__ << ":" << s_trackPathLog +#define DHT_CRIT_STREAML(x_log_hdl_x) if((x_log_hdl_x).isCritEnabled()) (x_log_hdl_x).critStream() << __FUNCTION__ << "():" << __LINE__ << ":" << s_trackPathLog +#define DHT_TRACE_STREAML(x_log_hdl_x) if ((x_log_hdl_x).isPriorityEnabled(DHT_TRACE_PRIORITY)) (x_log_hdl_x).getStream(DHT_TRACE_PRIORITY) << __FUNCTION__ << "():" << __LINE__ << ":" + +//Macros to use when a function returns on error without writing any log message +// or error translation +#define RETURN_IF_NOT_OK(x_call_x) \ +{ \ + FwCode::ResponseCode rcx___ = (x_call_x); \ + if(rcx___ != FwCode::FwOk) { \ + return rcx___; \ + } \ +} + +#define RETURN_THIS_IF_NOT_OK(x_othercode_x, x_call_x) \ +{ \ + FwCode::ResponseCode rcx___ = (x_call_x); \ + if(rcx___ != FwCode::FwOk) { \ + return (x_othercode_x); \ + } \ +} + +/// Caution! Only use in checks for 'impossible' code conditions. 
Regular errors +/// should be handled regularly +#define BAD_CODE_ABORT() \ + { \ + std::string x_msg_x("Bad code at " __FILE__ ":"); \ + x_msg_x.append(StringUtils::toString(__LINE__)); \ + throw std::runtime_error(x_msg_x); \ + } + +#define BAD_CODE_IF_NOT_OK(x_call_x) \ + do {\ + if((x_call_x) != FwCode::FwOk) { \ + BAD_CODE_ABORT(); \ + } \ + } while(0) + +/* + * Above macros are meant to be used by all components. + */ + +/** + * Class that allows for method entry/exit logging with a single declaration. + * Always uses debug. + */ +class LogMethod +{ + public: + LogMethod(log4cpp::Category& log, log4cpp::Priority::Value priority, + const char *function); + virtual ~LogMethod(); + + private: + log4cpp::Category& log_; + log4cpp::Priority::Value priority_; + const char *function_; +}; + +// convenience macros to use the above class +#define LOG_METHOD() LogMethod log_method_entry_exit(log, log4cpp::Priority::DEBUG, __FUNCTION__) +#define TRACE_METHOD() LogMethod log_method_entry_exit(log, DHT_TRACE_PRIORITY, __FUNCTION__) + +/** Initialize log4cpp config file. + * This function needs to be called once for each executable. Multiple + * initializations will return the result of the first initialization (IOW, + * an executable can be initialized with exactly one config file). Errors + * encountered by this function are printed onto cerr. See log4cpp + * documentation for what happens when PropertyConfigurator::configure() + * fails. + * \param confFile is the path name of the log4cpp config file. + * Depending on the machine that the executable is running in, the path + * will be different. + * \return true if the initialization succeeds, false if it fails. 
+ */ +bool initLog4cpp(const std::string & confFile); + +#endif + +/* + * For customized vim control + * Local variables: + * tab-width: 4 + * c-basic-offset: 4 + * End: + * vim600: sw=4:ts=4:et + * vim<600: sw=4:ts=4:et + */ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..7fcb172 --- /dev/null +++ b/Makefile @@ -0,0 +1,74 @@ +STASIS_DIR=../stasis + +LIB=$(STASIS_DIR)/build/src/stasis \ + -L/home/y/lib +INCLUDE=-I$(STASIS_DIR)/src/ -I$(STASIS_DIR) -I./ \ + -I/home/y/include + +LIBLIST=-lpthread \ + -lstasis \ + -lm +# -licui18n \ +# -licuuc \ +# -licudata \ +# -licuio \ +# -llog4cpp_y \ +# -lthoth + +FLAGS=-pg -g -O1 +#FLAGS=-O3 + +HFILES=logserver.h logstore.h logiterators.h datapage.h merger.h tuplemerger.h datatuple.h +CFILES=logserver.cpp logstore.cpp logiterators.cpp datapage.cpp merger.cpp tuplemerger.cpp + + +# STASIS_DIR=../stasis +# LD_LIBRARY_PATH=$STASIS_DIR/build/src/stasis +# LD_LIBRARY_PATH=$STASIS_DIR/build/src/stasis ./hello + + +logstore: check_gen.cpp $(HFILES) $(CFILES) + g++ -o $@ $^ -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS) + +test: dp_check lt_check ltable_check merger_check rb_check \ + lmerger_check tmerger_check server_check tcpclient_check + +lt_check: check_logtree.cpp $(HFILES) $(CFILES) + g++ -o $@ $^ -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS) + +dp_check: check_datapage.cpp $(HFILES) $(CFILES) + g++ -o $@ $^ -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS) + +ltable_check: check_logtable.cpp $(HFILES) $(CFILES) + g++ -o $@ $^ -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS) + +merger_check: check_merge.cpp $(HFILES) $(CFILES) + g++ -o $@ $^ -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS) + +lmerger_check: check_mergelarge.cpp $(HFILES) $(CFILES) + g++ -o $@ $^ -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS) + +tmerger_check: check_mergetuple.cpp $(HFILES) $(CFILES) + g++ -o $@ $^ -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS) + +rb_check: check_rbtree.cpp $(HFILES) $(CFILES) + g++ -o $@ $^ -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS) + +server_check: 
check_server.cpp $(HFILES) $(CFILES) + g++ -o $@ $^ -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS) + +tcpclient_check: check_tcpclient.cpp $(HFILES) $(CFILES) + g++ -o $@ $^ -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS) + + +hello : hello.cpp UCharUtils.cc LogUtils.cc + g++ -o $@ $^ -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS) + +clean: + rm -f logstore server_check hello lt_check merger_check lmerger_check rb_check \ + dp_check ltable_check tmerger_check rose tcpclient_check +veryclean: clean + rm -f *~ gmon.out prof.res + + + diff --git a/NOTES b/NOTES new file mode 100644 index 0000000..cc6008a --- /dev/null +++ b/NOTES @@ -0,0 +1,152 @@ +###################################################################################### +constants.h +###################################################################################### + +#define PAGE_SIZE 4096 +#define BLOB_THRESHOLD_SIZE (PAGE_SIZE-30) + +SLOT TYPES + +#define INVALID_SLOT (-1) +/** This constant is used as a placeholder to mark slot locations that contain blobs. + @see slotted.c, indirect.c, blobManager.c */ +#define BLOB_SLOT (-2) +#define NORMAL_SLOT (-3) +#define SLOT_TYPE_END (-4) + +###################################################################################### +allocationPolicy.h +###################################################################################### + +struct allocationPolicy { + struct LH_ENTRY(table) * xidAlloced; + struct LH_ENTRY(table) * xidDealloced; + struct RB_ENTRY(tree) * availablePages; + struct LH_ENTRY(table) * pageOwners; + struct LH_ENTRY(table) * allPages; +}; + +typedef struct allocationPolicy stasis_allocation_policy_t; + +typedef struct availablePage { + int freespace; + pageid_t pageid; + int lockCount; // Number of active transactions that have alloced or dealloced from this page. 
+} availablePage; + +availablePage * stasis_allocation_policy_pick_suitable_page(stasis_allocation_policy_t * ap, int xid, int freespace); + +//////////////////////////////////////////////////////////////////////////////////// + +==15277== Thread 4: +==15277== Invalid free() / delete / delete[] +==15277== at 0x401BEFA: free (vg_replace_malloc.c:235) +==15277== by 0x4FD60FB: free_mem (in /lib/tls/libc-2.3.4.so) +==15277== by 0x4FD5B21: __libc_freeres (in /lib/tls/libc-2.3.4.so) +==15277== by 0x4017336: _vgw_freeres (vg_preloaded.c:62) +==15277== by 0x4030B25: pthread_cond_wait@@GLIBC_2.3.2 (in /lib/tls/libpthread-2.3.4.so) +==15277== by 0x402E370: start_thread (in /lib/tls/libpthread-2.3.4.so) +==15277== by 0x4F96FFD: clone (in /lib/tls/libc-2.3.4.so) +==15277== Address 0x4EC66B8 is not stack'd, malloc'd or (recently) free'd +==15277== +==15277== ERROR SUMMARY: 1 errors from 1 contexts (suppressed: 40 from 1) +==15277== malloc/free: in use at exit: 8,540,389 bytes in 912 blocks. +==15277== malloc/free: 1,815,016 allocs, 1,814,105 frees, 1,121,769,405 bytes allocated. +==15277== For counts of detected errors, rerun with: -v +==15277== searching for pointers to 912 not-freed blocks. +==15277== checked 43,383,184 bytes. 
+==15277== +==15277== Thread 1: +==15277== +==15277== 4,883,561 (32 direct, 4,883,529 indirect) bytes in 1 blocks are definitely lost in loss record 16 of 46 +==15277== at 0x401B790: operator new(unsigned) (vg_replace_malloc.c:164) +==15277== by 0x8052C01: __gnu_cxx::new_allocator >::allocate(unsigned, void const*) (new_allocator.h:81) +==15277== by 0x8052B79: std::_Rb_tree, datatuple, std::allocator >::_M_get_node() (stl_tree.h:356) +==15277== by 0x8052ACC: std::_Rb_tree, datatuple, std::allocator >::_M_create_node(datatuple const&) (stl_tree.h:365) +==15277== by 0x8052978: std::_Rb_tree, datatuple, std::allocator >::_M_insert(std::_Rb_tree_node_base*, std::_Rb_tree_node_base*, datatuple const&) (stl_tree.h:783) +==15277== by 0x805270C: std::_Rb_tree, datatuple, std::allocator >::insert_unique(datatuple const&) (stl_tree.h:881) +==15277== by 0x8052332: std::set >::insert(datatuple const&) (stl_set.h:314) +==15277== by 0x8050077: logtable::insertTuple(datatuple&) (logstore.cpp:1030) +==15277== by 0x804A641: insertProbeIter(int) (check_merge.cpp:160) +==15277== by 0x804AB9B: main (check_merge.cpp:235) +==15277== +==15277== +==15277== 336 (28 direct, 308 indirect) bytes in 1 blocks are definitely lost in loss record 17 of 46 +==15277== at 0x401B405: malloc (vg_replace_malloc.c:149) +==15277== by 0x404D906: stasis_dirty_page_table_init (dirtyPageTable.c:133) +==15277== by 0x404BFA5: Tinit (transactional2.c:66) +==15277== by 0x804A2AE: insertProbeIter(int) (check_merge.cpp:97) +==15277== by 0x804AB9B: main (check_merge.cpp:235) +==15277== +==15277== +==15277== 40 bytes in 1 blocks are definitely lost in loss record 20 of 46 +==15277== at 0x401B790: operator new(unsigned) (vg_replace_malloc.c:164) +==15277== by 0x8053025: merge_scheduler::addlogtable(logtable*) (merger.cpp:20) +==15277== by 0x804A33E: insertProbeIter(int) (check_merge.cpp:113) +==15277== by 0x804AB9B: main (check_merge.cpp:235) +==15277== +==15277== +==15277== 80 bytes in 10 blocks are definitely lost 
in loss record 32 of 46 +==15277== at 0x401B405: malloc (vg_replace_malloc.c:149) +==15277== by 0x804D75E: logtree::create(int) (logstore.cpp:169) +==15277== by 0x8053BD5: memMergeThread(void*) (merger.cpp:236) +==15277== by 0x402E370: start_thread (in /lib/tls/libpthread-2.3.4.so) +==15277== by 0x4F96FFD: clone (in /lib/tls/libc-2.3.4.so) +==15277== +==15277== +==15277== 4,792 (432 direct, 4,360 indirect) bytes in 18 blocks are definitely lost in loss record 40 of 46 +==15277== at 0x401B790: operator new(unsigned) (vg_replace_malloc.c:164) +==15277== by 0x80501C5: logtable::insertTuple(int, datatuple&, recordid&, logtree*) (logstore.cpp:1064) +==15277== by 0x8054FA7: insertTuple(int, DataPage*, datatuple&, logtable*, logtree*, recordid&, int&, int&) (merger.cpp:643) +==15277== by 0x8054AFF: merge_iterators(int, treeIterator*, memTreeIterator >, datatuple>*, logtable*, logtree*, int&) (merger.cpp:534) +==15277== by 0x8053C8F: memMergeThread(void*) (merger.cpp:251) +==15277== by 0x402E370: start_thread (in /lib/tls/libpthread-2.3.4.so) +==15277== by 0x4F96FFD: clone (in /lib/tls/libc-2.3.4.so) +==15277== +==15277== +==15277== 576 bytes in 4 blocks are possibly lost in loss record 41 of 46 +==15277== at 0x401C6BF: calloc (vg_replace_malloc.c:279) +==15277== by 0x400E71A: _dl_allocate_tls (in /lib/ld-2.3.4.so) +==15277== by 0x402E91E: pthread_create@@GLIBC_2.1 (in /lib/tls/libpthread-2.3.4.so) +==15277== by 0x80538FF: merge_scheduler::startlogtable(int) (merger.cpp:184) +==15277== by 0x804A37E: insertProbeIter(int) (check_merge.cpp:116) +==15277== by 0x804AB9B: main (check_merge.cpp:235) +==15277== +==15277== +==15277== 3,175 bytes in 1 blocks are possibly lost in loss record 42 of 46 +==15277== at 0x401B405: malloc (vg_replace_malloc.c:149) +==15277== by 0x8051BC7: DataPage::readbytes(int, int, int, unsigned char**) (datapage.cpp:235) +==15277== by 0x8051F7F: DataPage::RecordIterator::getnext(int) (datapage.cpp:442) +==15277== by 0x80512E0: DataPage::recordRead(int, 
unsigned char*, unsigned, datatuple**) (datapage.cpp:206) +==15277== by 0x8050449: logtable::findTuple(int, unsigned char*, unsigned, logtree*) (logstore.cpp:1104) +==15277== by 0x804FF48: logtable::findTuple(int, unsigned char*, unsigned) (logstore.cpp:979) +==15277== by 0x804A8D3: insertProbeIter(int) (check_merge.cpp:198) +==15277== by 0x804AB9B: main (check_merge.cpp:235) +==15277== +==15277== +==15277== 173,599 bytes in 2 blocks are possibly lost in loss record 43 of 46 +==15277== at 0x401B405: malloc (vg_replace_malloc.c:149) +==15277== by 0x804FFD0: logtable::insertTuple(datatuple&) (logstore.cpp:1014) +==15277== by 0x804A641: insertProbeIter(int) (check_merge.cpp:160) +==15277== by 0x804AB9B: main (check_merge.cpp:235) +==15277== +==15277== +==15277== 2,281,057 bytes in 681 blocks are definitely lost in loss record 45 of 46 +==15277== at 0x401B405: malloc (vg_replace_malloc.c:149) +==15277== by 0x8051BC7: DataPage::readbytes(int, int, int, unsigned char**) (datapage.cpp:235) +==15277== by 0x8051F7F: DataPage::RecordIterator::getnext(int) (datapage.cpp:442) +==15277== by 0x80512E0: DataPage::recordRead(int, unsigned char*, unsigned, datatuple**) (datapage.cpp:206) +==15277== by 0x8050449: logtable::findTuple(int, unsigned char*, unsigned, logtree*) (logstore.cpp:1104) +==15277== by 0x804FF81: logtable::findTuple(int, unsigned char*, unsigned) (logstore.cpp:990) +==15277== by 0x804A8D3: insertProbeIter(int) (check_merge.cpp:198) +==15277== by 0x804AB9B: main (check_merge.cpp:235) +==15277== +==15277== LEAK SUMMARY: +==15277== definitely lost: 2,281,669 bytes in 712 blocks. +==15277== indirectly lost: 4,888,197 bytes in 150 blocks. +==15277== possibly lost: 177,350 bytes in 7 blocks. +==15277== still reachable: 1,193,173 bytes in 43 blocks. +==15277== suppressed: 0 bytes in 0 blocks. +==15277== Reachable blocks (those to which a pointer was found) are not shown. 
+==15277== To see them, rerun with: --show-reachable=yes +Killed diff --git a/StringUtils.h b/StringUtils.h new file mode 100644 index 0000000..d098b76 --- /dev/null +++ b/StringUtils.h @@ -0,0 +1,345 @@ +/* $Id: StringUtils.h,v 1.17 2009/03/25 20:32:51 dlomax Exp $ */ +/* Copyright (C) 2008 Yahoo! Inc. All Rights Reserved. */ + +#ifndef __STRING_UTIL_H +#define __STRING_UTIL_H +#include +#include +#include +#include "FwCode.h" + +/** + * Container for static string manipulation utilities. + */ +class StringUtils +{ + public: + + /** + * Our replacement for yax_getroot(). Allows our code to have a different + * root than components we use or link with. Is nice for unit testing. + * @return Copy of the value in a std::string + */ + static std::string getDhtRoot(); + + /** + * Parse a tablet name into left and right limits. + * @return true if parsing successful, false if incorrect format + */ + static bool parseTabletName(const std::string& tablet, std::string& leftLimit, + std::string& rightLimit); + + /** + * Construct a tablet name from left and right limits. + */ + static void buildTabletName(const std::string& leftLimit, + const std::string& rightLimit, + std::string& tablet); + + /** + * General purpose method to assemble a full path name, using + * getDhtRoot() so that + * the root will be configurable. DO NOT supply "/home/y" in path1. + */ + static std::string makePath(const std::string& path1 = "", + const std::string& path2 = "", + const std::string& path3 = "", + const std::string& path4 = "", + const std::string& path5 = "", + const std::string& path6 = ""); + + /** + * Append additional paths to an existing one - does not prepend ROOT. + */ + static void appendPath(std::string& base_path, const std::string& path2 = "", + const std::string& path3 = "", + const std::string& path4 = ""); + + /** + * Construct a topic name from a table/tablet. 
+ * + * @return the topic name + */ + static std::string buildTopicName(const std::string& table, + const std::string& tablet); + + /** + * Construct a topic name from a table/tablet. + * @param topic Is filled with the topic name. + */ + static void buildTopicName(const std::string& table, + const std::string& tablet, + std::string &topic); + + /** + * Parses topic into table and tablet portions. + * + * @param table Filled with the table name. + * @param tablet Filled with the tablet name. + * @param true if the parsing succeeded, false if not. + */ + static bool parseTopicName(const std::string& topic, + std::string& table, + std::string &tablet); + + /** + * Only for use in log statements - this is slow. Produce a printable + * string where binary (<32) characters are hex encoded, but all others + * are left alone. + * + * @param str string to encode + * @param len length of string + * @return encoded string. + */ + static std::string toPrintable(const char *str, size_t len); + + /** + * Convert a formatted hex string back into its original + * 64-bit value + * + * @param value the hex-encoded string + * @param out the value + * @return FwCode::FwOk on success, FwCode::BadHexString on parse failure + */ + static FwCode::ResponseCode + convertHexStringToUI64(const std::string& value, uint64_t& out); + + /** + * Convert a formatted hex string back into its original + * 32-bit value + * + * @param value the hex-encoded string + * @param out the value + * @return FwCode::FwOk on success, FwCode::BadHexString on parse failure + */ + static FwCode::ResponseCode + convertHexStringToUI32(const std::string& value, uint32_t& out); + + /** + * Standard means for formatting a 0x prefixed hex string from a + * 64-bit unsigned value. Will produce upper-case letters. Will + * pad with zeros at the beginning to fill out 16 hex chars. 
+ * + * @param the value to format + * @return the formatted value, like "0xDEADBEEF00000000" + */ + static std::string convertUI64ToHexString( uint64_t val ); + + /** + * Standard means for formatting a 0x prefixed hex string from a + * 32-bit unsigned value. Will produce upper-case letters. Will + * pad with zeros at the beginning to fill out 8 hex chars. + * + * @param the value to format + * @return the formatted value, like "0xDEADBEEF" + */ + static std::string convertUI32ToHexString( unsigned int val ); + + /** + * Standard means for formatting a small hex string from a + * 32-bit unsigned value. The "0x" will NOT be included. + * Will produce upper-case letters. Will NOT pad with zeros + * at the beginning. + * + * @param the value to format + * @return the formatted value, like "DEADBEEF" + */ + static std::string convertUI32ToMinimalHexString( unsigned int val ); + + /** + * Assemble the fields of ENCRYPTED_BODY_HEADER and encrypt it for + * sending to the remote side. + * @param result is the out parameter having the resulting string. + * @param encKeyName is the name of the key in keydb whose value will be + * used as the encryption key + * @param bodyEncVersion is the version of the encryption scheme used to + * encrypt the body (not the encryption scheme of this header itself). + * @param expireTime is the time (in usecs) after which the request + * should not be processed by the receiver of this header. + */ + static FwCode::ResponseCode makeEncryptedBodyHdr(std::string & result, + const char *encKeyName, uint32_t bodyEncVersion, uint64_t expireTime); + + /** + * Parse the incoming ENCRYPTED_BODY_HEADER, decrypting it, and + * separating the fields in it. + * @param inval is the incoming encrypted string. 
+ * @param encKeyName is the name of the key in keydb whose value will be + * used as the decryption key + * @param bodyEncVersion is the version of the encryption scheme to be + * used to * decrypt the body (not for the decryption of this header + * itself). + * @param expireTime is the time (in usecs) after which the response + * should not be processed by the receiver of this header. + */ + static FwCode::ResponseCode parseEncryptedBodyHdr(const std::string & inval, + const char *encKeyName, uint32_t & bodyEncVersion, uint64_t & expireTime); + + /** + * Get the hash for an un-normalized record name. + * + * @param unnormalizedRecordName a raw record name from user input + * @param (output) hashResult the hex string of the hash value. + * @return FwCode::FwOk on success, else an error relating to normalization + */ + static FwCode::ResponseCode normalizeAndHashRecordName + ( const std::string& unnormalizedRecordName, + std::string & hashResult /* out */ ); + + /** + * Get the hash for a normalized record name. + * + * @param recordName the record name. MUST be previously normalized. + * @return hashResult the uint32_t of the hash value. + */ + static uint32_t hashRecordName(const std::string& recordName); + + /** + * Get the hash for a normalized record name. + * + * @param recordName the record name. MUST be previously normalized. + * @param (output) hashResult the hex string of the hash value. + */ + static void hashRecordName( const std::string& recordName, + std::string & hashResult /* out */ ); + /** + * Get the hash for a normalized record name in string and int form + * + * @param recordName the record name. MUST be previously normalized. + * @param (output) hashResult the hex string of the hash value. + * @param (output) hexNum numerical value of hash + */ + static void hashRecordName( const std::string& recordName, + std::string & hashResult /* out */, + uint32_t& hexNum); + + /** + * Method to hash a string using crc32. 
+ * + * @param buf data to hash + * @param len length of buf + * @return hash value + */ + static uint32_t crcHash(const char * buf, uint32_t len); + + /** + * util function to convert any type to a string + */ + template static inline std::string toString(T item); + + /** + * convert string to any type of value + * @param strValue string value to parse + * @param value(out) value to read from strValue + * @return FwCode::FwOk on success + * FwCode::FwError on failure (error is *not* logged) + */ + template static inline + FwCode::ResponseCode fromString(const std::string& strValue, + T& value); + + /** + * convert a hexadecimal number to string representation + * of fixed width ( 2 * sizeof(T) ) + * @param value number to convert to string + * @return string representation of value + */ + template static inline + std::string numberToHexString(T value); + + /** + * convert a hexadecimal number to minimal string representation + * @param value number to convert to string + * @return string representation of value + */ + template static inline + std::string numberToMinimalHexString(T value); + + /** + * convert a hexadecimal string to a number + * @param strvalue input string to read from + * @param value(out) output number + * @return FwCode::FwOk on successful conversion + * FwCode::FwError on failure to convert strvalue + * to number + */ + template static inline + FwCode::ResponseCode hexStringToNumber(const std::string& strvalue, + T& value); + + + static const std::string EMPTY_STRING; +}; + +template +std::string StringUtils:: +toString(T item) +{ + std::ostringstream buf; + buf << item; + return buf.str(); +} + +template +FwCode::ResponseCode StringUtils:: +fromString(const std::string& strValue, + T& value) +{ + std::istringstream buf(strValue); + buf >> value; + if(buf.fail()|| + (strValue.length() != buf.tellg() )) + { + return FwCode::FwError; + } + return FwCode::FwOk; +} + +template +std::string StringUtils:: +numberToHexString(T value) +{ + 
std::ostringstream buf; + buf << "0x" << std::hex + << std::setw(sizeof(T) * 2) << std::setfill('0') + << std::uppercase << value; + return buf.str(); + +} + +template +std::string StringUtils:: +numberToMinimalHexString(T value) +{ + std::ostringstream buf; + buf << std::hex << std::uppercase << value; + return buf.str(); + +} + +template +FwCode::ResponseCode StringUtils:: +hexStringToNumber(const std::string& strvalue, + T& value) +{ + std::istringstream buf(strvalue); + buf >> std::hex >> value; + if(buf.fail() || + (strvalue.length() != buf.tellg() )) + { + return FwCode::FwError; + } + return FwCode::FwOk; + +} + +/* + * For customized vim control + * Local variables: + * tab-width: 4 + * c-basic-offset: 4 + * End: + * vim600: sw=4:ts=4:et + * vim<600: sw=4:ts=4:et + */ +#endif diff --git a/UCharUtils.cc b/UCharUtils.cc new file mode 100644 index 0000000..2133034 --- /dev/null +++ b/UCharUtils.cc @@ -0,0 +1,326 @@ +/* $Id: UCharUtils.cc,v 1.16 2009/03/03 20:19:18 dlomax Exp $ */ +/* Copyright (C) 2008 Yahoo! Inc. All Rights Reserved. */ + +//#include +#include "UCharUtils.h" +#include +#include "LogUtils.h" +//#include "ActionContext.h" +#include +#include +#include // To make sure we have UTF-8 + +static log4cpp::Category &log = + log4cpp::Category::getInstance("dht.framework." 
__FILE__); + + +UCharUtilsImpl *UCharUtils::instance_ = NULL; + +UCharUtilsImpl:: +UCharUtilsImpl() : uconv_(NULL) { + LOG_METHOD(); + + ucBuffLen = 0; + ucBuff = NULL; + + ucNormBuffLen = 0; + ucNormBuff = NULL; + + charBuffLen = 0; + charBuff = NULL; +} + +FwCode::ResponseCode UCharUtilsImpl:: +init() +{ + UErrorCode erc = U_ZERO_ERROR; + + uconv_ = ucnv_open("utf-8", &erc); + if (uconv_ == NULL) { + DHT_ERROR_STREAM() << "EC:UNICODE:Problem geting utf-8 converter, erc:" << erc + << ", " << u_errorName(erc); + return FwCode::UcnvOpenFailed; + } + return FwCode::FwOk; +} + +UCharUtilsImpl:: +~UCharUtilsImpl() { + reset(); + if (uconv_ != NULL) { + ucnv_close(uconv_); + uconv_ = NULL; + } +} + +void UCharUtilsImpl:: +reset() { + LOG_METHOD(); + + if (ucBuff != NULL) { + delete[] ucBuff; + ucBuffLen = 0; + ucBuff = NULL; + } + if (ucNormBuff != NULL) { + delete[] ucNormBuff; + ucNormBuffLen = 0; + ucNormBuff = NULL; + } + if (charBuff != NULL) { + delete[] charBuff; + charBuffLen = 0; + charBuff = NULL; + } +} + +/** + * Small wrapper to hide multi-line thoth api inside single-line call. + */ +bool UCharUtils:: +isUTF8(const std::string& value) +{ + size_t pos = 0; + thoth_result result = thoth_validate_utf8(value.c_str(), value.length(), + &pos); + + if(result != UTF8_VALID) { + std::cerr + //RESPONSE_DEBUG_STREAM(FwCode::DataNotUtf8) + << "value (" << value << ") is not UTF-8. thoth_result:" << result + << ", position=" << pos; + return false; + } + return true; +} + +/** + * Small wrapper to hide multi-line thoth api inside single-line call. + */ +bool UCharUtils:: +isUTF8(const char * value, size_t value_len) +{ + size_t pos = 0; + thoth_result result = thoth_validate_utf8(value, value_len, &pos); + + if(result != UTF8_VALID) { + //RESPONSE_DEBUG_STREAM(FwCode::DataNotUtf8) + std::cerr + << "value (" << std::string(value, value_len) + << ") is not UTF-8. 
thoth_result:" << result + << ", position=" << pos; + return false; + } + return true; +} + +// Convert an input string (expected to be UTF-8) into unicode UChars +// The result of the conversion will be sitting in our ucBuff area. +FwCode::ResponseCode UCharUtilsImpl:: +convert(const std::string &input, int32_t &len) +{ + LOG_METHOD(); + + //UTF-8 validation + if(!UCharUtils::isUTF8(input)) { + return FwCode::DataNotUtf8; + } + + int size = input.length() * 2; + + // Check if we already have a big enough buffer + if (ucBuffLen < size) { + // Nope, first check if we need to release what we've been using + if (ucBuff) { + delete[] ucBuff; + } + ucBuffLen = size; + ucBuff = new UChar[ucBuffLen]; + } + + UErrorCode erc = U_ZERO_ERROR; + len = ucnv_toUChars(uconv_, + ucBuff, + ucBuffLen, + input.data(), + input.length(), &erc); + + if (U_FAILURE(erc)) { + //RESPONSE_ERROR_STREAM(FwCode::ConvertToUCharFailed) + std::cerr + << "EC:UNICODE:error:" << erc + << ", " << u_errorName(erc) + << " from converting input:'" << input << "'"; + len = 0; + return FwCode::ConvertToUCharFailed; + } + return FwCode::FwOk; +} + +// Normalize an input string. Note that all three internal buffers will +// be used by this operation, but by the time we finish, we'll be done +// with them. +FwCode::ResponseCode UCharUtilsImpl:: +normalize(const std::string &input, std::string &result /* out */) +{ + LOG_METHOD(); + + // convert our UTF-8 into UChar + int32_t inLen = 0; + FwCode::ResponseCode rc = convert(input, inLen); + + if (rc != FwCode::FwOk) { + result.erase(); + return rc; + } + + // Do a quick check if the input is already normalized so that + // we can duck out early + UErrorCode status = U_ZERO_ERROR; + if (unorm_quickCheck(ucBuff, inLen, + UNORM_NFC, &status) == UNORM_YES) { + DHT_DEBUG_STREAM() << "already normalized input:" << input; + result = input; + return FwCode::FwOk; + } + + // Check if we have enough space for the normalized result. 
+ // We'll make the output space twice as big as the input (although + // it's more likely that the normalized result will be shorter + // as it combines characters. E.g. 'A' 'put an accent on the previous' + int32_t newSize = inLen * 2; + if (newSize > ucNormBuffLen) { + DHT_DEBUG_STREAM() << "newSize:" << newSize + << " ucNormBuffLen:" << ucNormBuffLen; + if (ucNormBuff) { + delete[] ucNormBuff; + } + ucNormBuffLen = newSize; + ucNormBuff = new UChar[ucNormBuffLen]; + } + + // Do the actual normalization + status = U_ZERO_ERROR; + int32_t normLen = unorm_normalize(ucBuff, inLen, + UNORM_NFC, 0, + ucNormBuff, + ucNormBuffLen, + &status); + if(U_FAILURE(status)) { + //RESPONSE_ERROR_STREAM(FwCode::FwError) + std::cerr + << "EC:UNICODE:error:" << status << ", " << u_errorName(status) + <<" in unorm_normalize, inLen:" << inLen + << " ucNormBuffLen:" << ucNormBuffLen; + return FwCode::FwError; + } + + // Make sure we have some space to convert back to UTF-8 + int32_t resultLen = normLen * 4; + if (resultLen > charBuffLen) { + DHT_DEBUG_STREAM() << "resultLen:" << resultLen + << " charBuffLen:" << charBuffLen; + if (charBuff) { + delete[] charBuff; + charBuff= NULL; + } + charBuffLen = resultLen; + charBuff = new char[charBuffLen]; + } + + DHT_DEBUG_STREAM() <<"calling ucnv_fromUChars, normLen:" << normLen; + + // Go from UChar array to UTF-8 + int32_t actualLen = ucnv_fromUChars(uconv_, + charBuff, charBuffLen, + ucNormBuff, normLen, + &status); + if(U_FAILURE(status)) { + //RESPONSE_ERROR_STREAM(FwCode::FwError) + std::cerr + << "EC:UNICODE:error:" << status << ", " << u_errorName(status) + << " in ucnv_fromUChars charBuffLen:" << charBuffLen + << " normLen:" << normLen; + return FwCode::FwError; + } + + // Smack our UTF-8 characters into the result string + result.assign(charBuff, actualLen); + DHT_DEBUG_STREAM() << "leaving actualLen:" << actualLen + << " result:" << result; + return FwCode::FwOk; +} + + +FwCode::ResponseCode UCharUtils:: +init() +{ + if (instance_ 
== NULL) { + instance_ = new UCharUtilsImpl(); + return instance_->init(); + } + return FwCode::FwOk; // already initialized +} + +void UCharUtils:: +close() +{ + if(instance_ != NULL) { + delete instance_; + instance_ = NULL; + } +} + +// Given an input string, return a unicode UChar array. Note that the +// return value is a pointer to our internal buffer. +UChar * UCharUtils:: +getUChar(const std::string &input, int32_t& len) { + LOG_METHOD(); + + // do the conversion...somehow need 2x input len for utf8 to utf16 + if(instance_->convert(input, len) != FwCode::FwOk) { + len = 0; + return NULL; + } + + return instance_->ucBuff; +} + +FwCode::ResponseCode UCharUtils:: +normalize(const std::string &input, std::string &result) { + LOG_METHOD(); + return(instance_->normalize(input, result)); +} + + +FwCode::ResponseCode UCharUtils:: +parseRegExpPattern(const std::string &pattern, + URegularExpression * & result /* out */) +{ + UParseError perr; + UErrorCode erc = U_ZERO_ERROR; + int32_t ureglen = 0; + + // Do not delete uregexp, it's a static reusable buffer inside UCharUtils + UChar *uregexp = UCharUtils::getUChar(pattern, ureglen); + if (uregexp == NULL) { + //RESPONSE_ERROR_STREAM(FwCode::ConvertToUCharFailed) + std::cerr + << "EC:UNICODE|IMPOSSIBLE:Unable to convert pattern to unicode: " << pattern; + return FwCode::ConvertToUCharFailed; + } + + URegularExpression *regexp= uregex_open(uregexp, ureglen, 0, + &perr, + &erc); + if(erc != U_ZERO_ERROR) { + //RESPONSE_DEBUG_STREAM(FwCode::CompileRegExFailed) + std::cerr + << "Compiling regex failed at: " << perr.offset + << "; re=" << pattern; + return FwCode::CompileRegExFailed; + } + + result = regexp; + return FwCode::FwOk; +} diff --git a/UCharUtils.h b/UCharUtils.h new file mode 100644 index 0000000..4f751be --- /dev/null +++ b/UCharUtils.h @@ -0,0 +1,139 @@ +/* Copyright (C) 2008 Yahoo! Inc. All Rights Reserved. 
*/ + +#ifndef UCHAR_UTILS_H +#define UCHAR_UTILS_H + +#include +#include +#include "FwCode.h" +#include + +// Forward declaration +class UCharUtilsImpl; + +/** + * Some handy utilities for working with unicode characters. Yes, these + * could have just been some regular routines instead of static methods + * in a class, but doing it this way gives us some containment of what + * other static tidbits might be necessary (like reusable buffer space). + * which are all hidden within the UCharUtilsImpl class. + * + * This is a singleton - do not use in a threaded program. + */ +class UCharUtils { + private: + + /** + * Our pointer to all sorts of goodness. + */ + static UCharUtilsImpl *instance_; + public: + + /** + * Initialize the utilities. Primarily opens the utf-8 converter. + * Calling this is required prior to using the converter. + * + * @return FwCode::FwOk on success, FwCode::UcnvOpenFailed on + * failure. + */ + static FwCode::ResponseCode init(); + + /** + * Release all resources. init() must be called again + * in order to use again. + */ + static void close(); + + /** + * Small wrapper to hide multi-line thoth api inside single-line call. + * + * @param value string to be tested for utf-8-ness + * @return true if it is utf-8, false if not + */ + static bool isUTF8(const std::string& value); + + /** + * Small wrapper to hide multi-line thoth api inside single-line call. + * + * @param value char string to be tested for utf-8-ness + * @param value_len length of value + * @return true if it is utf-8, false if not + */ + static bool isUTF8(const char * value, size_t value_len); + + /** + * Convert utf-8 strings into UChar strings. Note that the + * result is an internal reusable buffer so the caller should + * *not* release it. + * @param input utf-8 string to convert + * @param len set to length of output string + * @return NULL if anything bad happens, otherwise an allocated UChar * + * the caller must *NEVER* free this pointer. 
+ */ + static UChar * getUChar(const std::string &input, int32_t& len); + + /** + * Do a NFC normalization so that different yet equivalent strings + * will have a single representation. See + * http://www.unicode.org/unicode/reports/tr15/ + * for more information. + * @param input A UTF-8 string that we want to normalize + * @param result (output) the normalized UTF-8 string + * @return FwCode::FwOk on success, + * FwCode::FwError on conversion failure, + * FwCode::InvalidData if input was not utf-8 + */ + static FwCode::ResponseCode normalize(const std::string &input, + std::string &result); + + /** + * Compile a regular expression in a unicode-friendly way. + * + * @param pattern the regexp pattern to compile. Assumed to + * be utf-8. + * @param result (output) Set to point to the compiled regexp. + * Must be released by the caller via uregex_close() when + * finished with it. + * @return FwCode::FwOk if compilation succeeded, + * FwCode::CompileRegExFailed or FwCode::ConvertToUCharFailed + * on failure. + */ + static FwCode::ResponseCode parseRegExpPattern + (const std::string &pattern, + URegularExpression * & result /* out */); + +}; + +/** + * Bug 2574599 - Impl exposed for use by multiple threads; singleton not + * appropriate for multi-threaded program. 
+ */ +class UCharUtilsImpl +{ +private: + UConverter *uconv_; + +public: + UCharUtilsImpl(); + ~UCharUtilsImpl(); + + FwCode::ResponseCode init(); + void reset(); + FwCode::ResponseCode convert(const std::string &input, int32_t &len); + + FwCode::ResponseCode normalize(const std::string &nput, std::string &result); + + // Buffer used to convert from UTF-* into UChar + int32_t ucBuffLen; + UChar *ucBuff; + + // Buffer used for UChar normalization output + int32_t ucNormBuffLen; + UChar *ucNormBuff; + + // Buffer used to convert UChars back to UTF-8 + int32_t charBuffLen; + char *charBuff; +}; + +#endif // _DHT_UCHAR_UTILS_ diff --git a/adriana-lima.awk b/adriana-lima.awk new file mode 100755 index 0000000..4454496 --- /dev/null +++ b/adriana-lima.awk @@ -0,0 +1,130 @@ +#! /usr/bin/awk -f + +BEGIN{ + + READ_SLA = 500; + WRITE_SLA = 750; + + readcnt = 0; + writecnt = 0; + + wlat_tot = 0; + wlat_max = 0; + wlat_sqtot = 0; + wlat_slafail = 0; + + DIST_BUCKET_LENGTH = 100; + DIST_BUCKET_COUNT = 20; + for(i=1; i<=DIST_BUCKET_COUNT; i++) + { + rlat_dist[i] = 0; + wlat_dist[i] = 0; + } + + + rlat_tot = 0; + rlat_max = 0; + rlat_sqtot = 0; + rlat_slafail = 0; + + printf("READ SLA:\t%d\n", READ_SLA); + printf("WRITE SLA:\t%d\n", WRITE_SLA); + printf("\n"); + +} + +/INFO - doRead()/ { readcnt = readcnt + 1; + + split(substr($0, match($0, "latency:")+ length("latency:")+1), tmp_arr, " "); + #printf("%d\n", strtonum(tmp_arr[1])); + + lat_val = strtonum(tmp_arr[1]); + + dist_index = int(lat_val / DIST_BUCKET_LENGTH) + 1; + if(dist_index > DIST_BUCKET_COUNT) + dist_index = DIST_BUCKET_COUNT; + rlat_dist[dist_index]++; + + rlat_tot = rlat_tot + lat_val; + + rlat_sqtot = rlat_sqtot + lat_val*lat_val; + + if(lat_val > rlat_max) + rlat_max = lat_val; + + if(lat_val > READ_SLA) + rlat_slafail = rlat_slafail + 1; + +} + + +/INFO - doInsert()/ { writecnt = writecnt + 1; + + split(substr($0, match($0, "latency:")+ length("latency:")+1), tmp_arr, " "); + + lat_val = tmp_arr[1]; + + 
if(index(tmp_arr[1], ",")!= 0) + lat_val = substr(tmp_arr[1],1,index(tmp_arr[1],",")-1); + + #printf("%d\n", strtonum(lat_val)); + lat_val = strtonum(lat_val); + + dist_index = int(lat_val / DIST_BUCKET_LENGTH) + 1; + if(dist_index > DIST_BUCKET_COUNT) + dist_index = DIST_BUCKET_COUNT; + wlat_dist[dist_index]++; + + wlat_tot = wlat_tot + lat_val; + + wlat_sqtot = wlat_sqtot + lat_val*lat_val; + + if(lat_val > wlat_max) + wlat_max = lat_val; + + if(lat_val > WRITE_SLA) + wlat_slafail = wlat_slafail + 1; + + +} + + +END{ + + printf("R/W ratio:\t%.2f\n", strtonum(readcnt) / strtonum(writecnt)); + + printf("\n"); + + printf("#reads:\t%d\n",readcnt); + if(strtonum(readcnt) != 0) + { + printf("avg read latency:\t%.2f\n", (rlat_tot / readcnt)); + printf("var read latency:\t%.2f\n", (rlat_sqtot/readcnt) - (rlat_tot/readcnt)*(rlat_tot/readcnt)); + printf("max read latency:\t%.2f\n", rlat_max); + printf("read SLA fail:\t%d\n", rlat_slafail); + + printf("\nREAD LATENCY DISTRIBUTION\n"); + for(i=1; i +#include +#include +#include +#include "logstore.h" +#include "datapage.cpp" + +#include +#include +#include +#include +#include +#include + +#undef begin +#undef end + +template class DataPage; + +bool mycmp(const std::string & k1,const std::string & k2) +{ + //for char* ending with \0 + return strcmp(k1.c_str(),k2.c_str()) < 0; + + //for int32_t + //printf("%d\t%d\n",(*((int32_t*)k1)) ,(*((int32_t*)k2))); + //return (*((int32_t*)k1)) <= (*((int32_t*)k2)); +} + +//must be given a sorted array +void removeduplicates(std::vector &arr) +{ + + for(int i=arr.size()-1; i>0; i--) + { + if(! 
(mycmp(arr[i], arr[i-1]) || mycmp(arr[i-1], arr[i]))) + arr.erase(arr.begin()+i); + + } + +} + +void preprandstr(int count, std::vector &arr, int avg_len=50, bool duplicates_allowed=false) +{ + + for ( int j=0; jnextPage == a->endOfRegion) { + if(a->regionList.size == -1) { + //DEBUG("nextPage: %lld\n", a->nextPage); + a->regionList = TarrayListAlloc(xid, 1, 4, sizeof(pageid_t)); + DEBUG("regionList.page: %lld\n", a->regionList.page); + DEBUG("regionList.slot: %d\n", a->regionList.slot); + DEBUG("regionList.size: %lld\n", a->regionList.size); + + a->regionCount = 0; + } + DEBUG("{%lld <- alloc region arraylist}\n", a->regionList.page); + TarrayListExtend(xid,a->regionList,1); + a->regionList.slot = a->regionCount; + DEBUG("region lst slot %d\n",a->regionList.slot); + a->regionCount++; + DEBUG("region count %lld\n",a->regionCount); + a->nextPage = TregionAlloc(xid, a->regionSize,12); + DEBUG("next page %lld\n",a->nextPage); + a->endOfRegion = a->nextPage + a->regionSize; + Tset(xid,a->regionList,&a->nextPage); + DEBUG("next page %lld\n",a->nextPage); + } + + DEBUG("%lld ?= %lld\n", a->nextPage,a->endOfRegion); + pageid_t ret = a->nextPage; + // Ensure the page is in buffer cache without accessing disk (this + // sets it to clean and all zeros if the page is not in cache). + // Hopefully, future reads will get a cache hit, and avoid going to + // disk. 
+ + Page * p = loadUninitializedPage(xid, ret); + releasePage(p); + DEBUG("ret %lld\n",ret); + (a->nextPage)++; + return ret; + +} + + +pageid_t alloc_region_rid(int xid, void * ridp) { + recordid rid = *(recordid*)ridp; + RegionAllocConf_t conf; + Tread(xid,rid,&conf); + pageid_t ret = alloc_region(xid,&conf); + DEBUG("{%lld <- alloc region extend}\n", conf.regionList.page); + + Tset(xid,rid,&conf); + return ret; +} + + +void insertProbeIter(int NUM_ENTRIES) +{ + srand(1000); + unlink("storefile.txt"); + unlink("logfile.txt"); + + sync(); + + bufferManagerNonBlockingSlowHandleType = IO_HANDLE_PFILE; + + Tinit(); + + int xid = Tbegin(); + + std::vector data_arr; + std::vector key_arr; + preprandstr(NUM_ENTRIES, data_arr, 5*4096, true); + preprandstr(NUM_ENTRIES+200, key_arr, 50, true);//well i can handle upto 200 + + std::sort(key_arr.begin(), key_arr.end(), &mycmp); + + removeduplicates(key_arr); + if(key_arr.size() > NUM_ENTRIES) + key_arr.erase(key_arr.begin()+NUM_ENTRIES, key_arr.end()); + + NUM_ENTRIES=key_arr.size(); + + if(data_arr.size() > NUM_ENTRIES) + data_arr.erase(data_arr.begin()+NUM_ENTRIES, data_arr.end()); + + //for(int i = 0; i < NUM_ENTRIES; i++) + //{ + // printf("%s\t", arr[i].c_str()); + // int keylen = arr[i].length()+1; + // printf("%d\n", keylen); + //} + + + + recordid alloc_state = Talloc(xid,sizeof(RegionAllocConf_t)); + + Tset(xid,alloc_state, &logtree::REGION_ALLOC_STATIC_INITIALIZER); + + + + + + + printf("Stage 1: Writing %d keys\n", NUM_ENTRIES); + + int pcount = 10; + int dpages = 0; + DataPage *dp=0; + int64_t datasize = 0; + std::vector dsp; + for(int i = 0; i < NUM_ENTRIES; i++) + { + //prepare the key + datatuple newtuple; + uint32_t keylen = key_arr[i].length()+1; + newtuple.keylen = &keylen; + + newtuple.key = (datatuple::key_t) malloc(keylen); + for(int j=0; jappend(xid, newtuple)) + { + dpages++; + if(dp) + delete dp; + + dp = new DataPage(xid, pcount, &DataPage::dp_alloc_region_rid, &alloc_state ); + + if(!dp->append(xid, 
newtuple)) + { + delete dp; + dp = new DataPage(xid, pcount, &DataPage::dp_alloc_region_rid, &alloc_state ); + assert(dp->append(xid, newtuple)); + } + + dsp.push_back(dp->get_start_pid()); + } + + + } + + printf("Total data set length: %d\n", datasize); + printf("Storage utilization: %.2f\n", (datasize+.0) / (PAGE_SIZE * pcount * dpages)); + printf("Number of datapages: %d\n", dpages); + printf("Writes complete.\n"); + + Tcommit(xid); + xid = Tbegin(); + + + printf("Stage 2: Reading %d tuples\n", NUM_ENTRIES); + + + int tuplenum = 0; + for(int i = 0; i < dpages ; i++) + { + DataPage dp(xid, dsp[i]); + DataPage::RecordIterator itr = dp.begin(); + datatuple *dt=0; + while( (dt=itr.getnext(xid)) != NULL) + { + assert(*(dt->keylen) == key_arr[tuplenum].length()+1); + assert(*(dt->datalen) == data_arr[tuplenum].length()+1); + tuplenum++; + free(dt->keylen); + free(dt); + dt = 0; + } + + } + + printf("Reads completed.\n"); +/* + + int64_t count = 0; + lladdIterator_t * it = logtreeIterator::open(xid, tree); + + while(logtreeIterator::next(xid, it)) { + byte * key; + byte **key_ptr = &key; + int keysize = logtreeIterator::key(xid, it, (byte**)key_ptr); + + pageid_t *value; + pageid_t **value_ptr = &value; + int valsize = lsmTreeIterator_value(xid, it, (byte**)value_ptr); + //printf("keylen %d key %s\n", keysize, (char*)(key)) ; + assert(valsize == sizeof(pageid_t)); + assert(!mycmp(std::string((char*)key), arr[count]) && !mycmp(arr[count],std::string((char*)key))); + assert(keysize == arr[count].length()+1); + count++; + } + assert(count == NUM_ENTRIES); + + logtreeIterator::close(xid, it); + + + */ + + + Tcommit(xid); + Tdeinit(); +} + + +/** @test + */ +int main() +{ + insertProbeIter(10000); + + + + return 0; +} + diff --git a/check_gen.cpp b/check_gen.cpp new file mode 100644 index 0000000..100d9d0 --- /dev/null +++ b/check_gen.cpp @@ -0,0 +1,39 @@ + + +#include "logstore.h" + +int main(int argc, char **argv) +{ + unlink("storefile.txt"); + unlink("logfile.txt"); + + 
sync(); + + // PAGELAYOUT::initPageLayout(); + + bufferManagerNonBlockingSlowHandleType = IO_HANDLE_PFILE; + + Tinit(); + + int xid = Tbegin(); + + logtable ltable; + + recordid table_root = ltable.allocTable(xid); + + Tcommit(xid); + + //ltable.startTable(); + +// lsmTableHandle* h = TlsmTableStart(lsmTable, INVALID_COL); + + xid = Tbegin(); + logtreeIterator::open(xid,ltable.get_tree_c2()->get_root_rec() ); + Tcommit(xid); + + + Tdeinit(); + + + +} diff --git a/check_logtable.cpp b/check_logtable.cpp new file mode 100644 index 0000000..5d01500 --- /dev/null +++ b/check_logtable.cpp @@ -0,0 +1,276 @@ + +#include +#include +#include +#include +#include "logstore.h" +#include "datapage.cpp" +#include "logiterators.cpp" +#include +#include +#include +#include +#include +#include + +#undef begin +#undef end + + + +//template class DataPage; +template class treeIterator; + +bool mycmp(const std::string & k1,const std::string & k2) +{ + //for char* ending with \0 + return strcmp(k1.c_str(),k2.c_str()) < 0; + + //for int32_t + //printf("%d\t%d\n",(*((int32_t*)k1)) ,(*((int32_t*)k2))); + //return (*((int32_t*)k1)) <= (*((int32_t*)k2)); +} + +//must be given a sorted array +void removeduplicates(std::vector &arr) +{ + + for(int i=arr.size()-1; i>0; i--) + { + if(! 
(mycmp(arr[i], arr[i-1]) || mycmp(arr[i-1], arr[i]))) + arr.erase(arr.begin()+i); + + } + +} + +void preprandstr(int count, std::vector &arr, int avg_len=50, bool duplicates_allowed=false) +{ + + for ( int j=0; jget_root_rec(); + + + std::vector data_arr; + std::vector key_arr; + preprandstr(NUM_ENTRIES, data_arr, 5*4096, true); + preprandstr(NUM_ENTRIES+200, key_arr, 50, true);//well i can handle upto 200 + + std::sort(key_arr.begin(), key_arr.end(), &mycmp); + + removeduplicates(key_arr); + if(key_arr.size() > NUM_ENTRIES) + key_arr.erase(key_arr.begin()+NUM_ENTRIES, key_arr.end()); + + NUM_ENTRIES=key_arr.size(); + + if(data_arr.size() > NUM_ENTRIES) + data_arr.erase(data_arr.begin()+NUM_ENTRIES, data_arr.end()); + + + + + printf("Stage 1: Writing %d keys\n", NUM_ENTRIES); + + + int dpages = 0; + int npages = 0; + DataPage *dp=0; + int64_t datasize = 0; + std::vector dsp; + for(int i = 0; i < NUM_ENTRIES; i++) + { + //prepare the key + datatuple newtuple; + uint32_t keylen = key_arr[i].length()+1; + newtuple.keylen = &keylen; + + newtuple.key = (datatuple::key_t) malloc(keylen); + for(int j=0; jget_start_pid()); + } + else + { + if(!dp->append(xid, newtuple)) + { + npages += dp->get_page_count(); + delete dp; + dp = ltable.insertTuple(xid, newtuple, ltable.get_dpstate1(), lt); + dpages++; + dsp.push_back(dp->get_start_pid()); + } + } + + free(newtuple.key); + free(newtuple.data); + + + } + + printf("\nTREE STRUCTURE\n"); + lt->print_tree(xid); + + printf("Total data set length: %d\n", datasize); + printf("Storage utilization: %.2f\n", (datasize+.0) / (PAGE_SIZE * npages)); + printf("Number of datapages: %d\n", dpages); + printf("Writes complete.\n"); + + Tcommit(xid); + xid = Tbegin(); + + + + + + printf("Stage 2: Sequentially reading %d tuples\n", NUM_ENTRIES); + + + int tuplenum = 0; + treeIterator tree_itr(tree_root); + + + datatuple *dt=0; + while( (dt=tree_itr.getnext()) != NULL) + { + assert(*(dt->keylen) == key_arr[tuplenum].length()+1); + 
assert(*(dt->datalen) == data_arr[tuplenum].length()+1); + tuplenum++; + free(dt->keylen); + free(dt); + dt = 0; + } + + assert(tuplenum == key_arr.size()); + + printf("Sequential Reads completed.\n"); + + int rrsize=key_arr.size() / 3; + printf("Stage 3: Randomly reading %d tuples by key\n", rrsize); + + for(int i=0; ikeylen) == key_arr[ri].length()+1); + assert(*(dt->datalen) == data_arr[ri].length()+1); + free(dt->keylen); + free(dt); + dt = 0; + } + + printf("Random Reads completed.\n"); + Tcommit(xid); + Tdeinit(); + +} + +/** @test + */ +int main() +{ + insertProbeIter(15000); + + + + return 0; +} + diff --git a/check_logtree.cpp b/check_logtree.cpp new file mode 100644 index 0000000..6e4a3c1 --- /dev/null +++ b/check_logtree.cpp @@ -0,0 +1,331 @@ + +#include +#include +#include +#include +#include "logstore.h" + +#include +#include +#include +#include +#include +#include + +#define LOG_NAME "check_logTree.log" +#define NUM_ENTRIES_A 10000 +#define NUM_ENTRIES_B 10 +#define NUM_ENTRIES_C 0 + +#define OFFSET (NUM_ENTRIES * 10) + +#undef begin +#undef end + +bool mycmp(const std::string & k1,const std::string & k2) +{ + //for char* ending with \0 + return strcmp(k1.c_str(),k2.c_str()) < 0; + + //for int32_t + //printf("%d\t%d\n",(*((int32_t*)k1)) ,(*((int32_t*)k2))); + //return (*((int32_t*)k1)) <= (*((int32_t*)k2)); +} + +void preprandstr(int count, std::vector &arr) +{ + + for ( int j=0; jget_root_rec(); + + long oldpagenum = -1; + + std::vector arr; + preprandstr(NUM_ENTRIES, arr); + std::sort(arr.begin(), arr.end(), &mycmp); + + //for(int i = 0; i < NUM_ENTRIES; i++) + //{ + // printf("%s\t", arr[i].c_str()); + // int keylen = arr[i].length()+1; + // printf("%d\n", keylen); + //} + + + printf("Stage 1: Writing %d keys\n", NUM_ENTRIES); + + + for(int i = 0; i < NUM_ENTRIES; i++) + { + int keylen = arr[i].length()+1; + byte *currkey = (byte*)malloc(keylen); + for(int j=0; jget_tree_state(); + RegionAllocConf_t alloc_conf; + Tread(xid,rid,&alloc_conf); + + 
logtree::appendPage(xid, tree, lt->lastLeaf, currkey, keylen, lt->alloc_region, &alloc_conf, i + OFFSET); + + //DEBUG("{%lld <- alloc region extend}\n", conf.regionList.page); + // XXX get rid of Tset by storing next page in memory, and losing it + // on crash. + Tset(xid,rid,&alloc_conf); + + + pagenum = logtree::findPage(xid, tree, currkey,keylen); + oldpagenum = pagenum; + //printf("pagenum:%d\n", pagenum); + assert(pagenum == i + OFFSET); + free(currkey); + + + } + + printf("Writes complete."); + + tree = lt->get_root_rec(); + Tcommit(xid); + xid = Tbegin(); + + printf("\nTREE STRUCTURE\n"); + lt->print_tree(xid); + + printf("Stage 2: Looking up %d keys\n", NUM_ENTRIES); + + for(int i = 0; i < NUM_ENTRIES; i++) { + int keylen = arr[i].length()+1; + byte *currkey = (byte*)malloc(keylen); + for(int j=0; jget_root_rec(); + + long oldpagenum = -1; + + for(int32_t i = 0; i < NUM_ENTRIES; i++) { + int keylen = sizeof(int32_t); + byte *currkey = (byte*)malloc(keylen); + memcpy(currkey, (byte*)(&i), keylen); + //currkey[]='\0'; + + printf("\n#########\ni=%d\nkey:\t%d\nkeylen:%d\n",i,*((int32_t*)currkey),keylen); + long pagenum = logtree::findPage(xid, tree, currkey, keylen); + printf("pagenum:%d\n", pagenum); + assert(pagenum == -1 || pagenum == oldpagenum || oldpagenum == -1); + printf("TlsmAppendPage %d\n",i); + + recordid rid = lt->get_tree_state(); + RegionAllocConf_t alloc_conf; + Tread(xid,rid,&alloc_conf); + + logtree::appendPage(xid, tree, lt->lastLeaf, currkey, keylen, lt->alloc_region, &alloc_conf, i + OFFSET); + + //DEBUG("{%lld <- alloc region extend}\n", conf.regionList.page); + // XXX get rid of Tset by storing next page in memory, and losing it + // on crash. 
+ Tset(xid,rid,&alloc_conf); + + + pagenum = logtree::findPage(xid, tree, currkey,keylen); + oldpagenum = pagenum; + printf("pagenum:%d\n", pagenum); + assert(pagenum == i + OFFSET); + free(currkey); + } + + printf("Writes complete."); + + tree = lt->get_root_rec(); + Tcommit(xid); + xid = Tbegin(); + + printf("\nTREE STRUCTURE\n"); + lt->print_tree(xid); + + for(int32_t i = 1; i < NUM_ENTRIES; i++) { + int keylen = sizeof(int32_t); + byte *currkey = (byte*)malloc(keylen); + memcpy(currkey, (byte*)(&i), keylen); + + printf("\n#########\ni=%d\nkey:\t%d\nkeylen:%d\n",i,*((int32_t*)currkey),keylen); + long pagenum = logtree::findPage(xid, tree, currkey, keylen); + printf("pagenum:%d\n", pagenum); + assert(pagenum == i + OFFSET); + free(currkey); + } + + /* + int64_t count = 0; + + lladdIterator_t * it = lsmTreeIterator_open(xid, tree); + + while(lsmTreeIterator_next(xid, it)) { + lsmkey_t * key; + lsmkey_t **key_ptr = &key; + int size = lsmTreeIterator_key(xid, it, (byte**)key_ptr); + assert(size == sizeof(lsmkey_t)); + long *value; + long **value_ptr = &value; + size = lsmTreeIterator_value(xid, it, (byte**)value_ptr); + assert(size == sizeof(pageid_t)); + assert(*key + OFFSET == *value); + assert(*key == count); + count++; + } + assert(count == NUM_ENTRIES); + + lsmTreeIterator_close(xid, it); + + */ + Tcommit(xid); + Tdeinit(); +} + +/** @test + */ +int main() +{ + insertProbeIter_str(NUM_ENTRIES_A); + //insertProbeIter_int(NUM_ENTRIES_A); + + + + return 0; +} + + diff --git a/check_merge.cpp b/check_merge.cpp new file mode 100644 index 0000000..79a6bee --- /dev/null +++ b/check_merge.cpp @@ -0,0 +1,246 @@ +#include +#include +#include +#include +#include "logstore.h" +#include "datapage.cpp" +#include "logiterators.cpp" +#include "merger.h" +#include +#include +#include +#include +#include +#include + +#undef begin +#undef end + + + + +bool mycmp(const std::string & k1,const std::string & k2) +{ + //for char* ending with \0 + return strcmp(k1.c_str(),k2.c_str()) < 
0; + + //for int32_t + //printf("%d\t%d\n",(*((int32_t*)k1)) ,(*((int32_t*)k2))); + //return (*((int32_t*)k1)) <= (*((int32_t*)k2)); +} + +//must be given a sorted array +void removeduplicates(std::vector *arr) +{ + + for(int i=arr->size()-1; i>0; i--) + { + if(! (mycmp((*arr)[i], (*arr)[i-1]) || mycmp((*arr)[i-1], (*arr)[i]))) + arr->erase(arr->begin()+i); + + } + +} + +void preprandstr(int count, std::vector *arr, int avg_len=50) +{ + + for ( int j=0; jpush_back(str); + + } + +} + +void insertProbeIter(int NUM_ENTRIES) +{ + srand(1000); + unlink("storefile.txt"); + unlink("logfile.txt"); + + sync(); + + //data generation + std::vector * data_arr = new std::vector; + std::vector * key_arr = new std::vector; + + preprandstr(NUM_ENTRIES, data_arr, 10*8192); + preprandstr(NUM_ENTRIES+200, key_arr, 100); + + std::sort(key_arr->begin(), key_arr->end(), &mycmp); + + removeduplicates(key_arr); + if(key_arr->size() > NUM_ENTRIES) + key_arr->erase(key_arr->begin()+NUM_ENTRIES, key_arr->end()); + + NUM_ENTRIES=key_arr->size(); + + if(data_arr->size() > NUM_ENTRIES) + data_arr->erase(data_arr->begin()+NUM_ENTRIES, data_arr->end()); + + + bufferManagerNonBlockingSlowHandleType = IO_HANDLE_PFILE; + + Tinit(); + + int xid = Tbegin(); + + merge_scheduler mscheduler; + logtable ltable; + + int pcount = 5; + ltable.set_fixed_page_count(pcount); + + recordid table_root = ltable.allocTable(xid); + + Tcommit(xid); + + xid = Tbegin(); + + int lindex = mscheduler.addlogtable(<able); + ltable.setMergeData(mscheduler.getMergeData(lindex)); + + mscheduler.startlogtable(lindex); + + printf("Stage 1: Writing %d keys\n", NUM_ENTRIES); + + struct timeval start_tv, stop_tv, ti_st, ti_end; + double insert_time = 0; + int dpages = 0; + int npages = 0; + DataPage *dp=0; + int64_t datasize = 0; + std::vector dsp; + gettimeofday(&start_tv,0); + for(int i = 0; i < NUM_ENTRIES; i++) + { + //prepare the key + datatuple newtuple; + uint32_t keylen = (*key_arr)[i].length()+1; + newtuple.keylen = 
&keylen; + + newtuple.key = (datatuple::key_t) malloc(keylen); + memcpy((byte*)newtuple.key, (*key_arr)[i].c_str(), keylen); + //for(int j=0; jprint_tree(xid); + printf("datasize: %d\n", datasize); + //sleep(20); + + Tcommit(xid); + xid = Tbegin(); + + + printf("Stage 2: Looking up %d keys:\n", NUM_ENTRIES); + + int found_tuples=0; + for(int i=NUM_ENTRIES-1; i>=0; i--) + { + int ri = i; + //printf("key index%d\n", i); + fflush(stdout); + + //get the key + uint32_t keylen = (*key_arr)[ri].length()+1; + datatuple::key_t rkey = (datatuple::key_t) malloc(keylen); + memcpy((byte*)rkey, (*key_arr)[ri].c_str(), keylen); + //for(int j=0; jkeylen) == (*key_arr)[ri].length()+1); + assert(*(dt->datalen) == (*data_arr)[ri].length()+1); + free(dt->keylen); + free(dt); + } + dt = 0; + free(rkey); + } + printf("found %d\n", found_tuples); + + key_arr->clear(); + data_arr->clear(); + delete key_arr; + delete data_arr; + + mscheduler.shutdown(); + printf("merge threads finished.\n"); + gettimeofday(&stop_tv,0); + printf("run time: %6.1f\n", (tv_to_double(stop_tv) - tv_to_double(start_tv))); + + + + Tcommit(xid); + Tdeinit(); + + +} + + + +/** @test + */ +int main() +{ + insertProbeIter(5000); + + + + return 0; +} + diff --git a/check_mergelarge.cpp b/check_mergelarge.cpp new file mode 100644 index 0000000..692b360 --- /dev/null +++ b/check_mergelarge.cpp @@ -0,0 +1,264 @@ +#include +#include +#include +#include +#include "logstore.h" +#include "datapage.cpp" +#include "logiterators.cpp" +#include "merger.h" +#include +#include +#include +#include +#include +#include + +#undef begin +#undef end + + + + +bool mycmp(const std::string & k1,const std::string & k2) +{ + //for char* ending with \0 + return strcmp(k1.c_str(),k2.c_str()) < 0; + + //for int32_t + //printf("%d\t%d\n",(*((int32_t*)k1)) ,(*((int32_t*)k2))); + //return (*((int32_t*)k1)) <= (*((int32_t*)k2)); +} + +//must be given a sorted array +void removeduplicates(std::vector *arr) +{ + + for(int i=arr->size()-1; i>0; i--) + 
{ + if(! (mycmp((*arr)[i], (*arr)[i-1]) || mycmp((*arr)[i-1], (*arr)[i]))) + arr->erase(arr->begin()+i); + + } + +} + +void getnextdata(std::string &data, int avg_len) +{ + int str_len = (rand()%(avg_len*2)) + 3; + + data = std::string(str_len, rand()%10+48); + /* + char *rc = (char*)malloc(str_len); + + for(int i=0; i *arr, int avg_len=50) +{ + + for ( int j=0; jpush_back(str); + + } + +} + +void insertProbeIter(int NUM_ENTRIES) +{ + srand(1000); + unlink("storefile.txt"); + unlink("logfile.txt"); + + sync(); + + //data generation +// std::vector * data_arr = new std::vector; + std::vector * key_arr = new std::vector; + +// preprandstr(NUM_ENTRIES, data_arr, 10*8192); + preprandstr(NUM_ENTRIES+200, key_arr, 100); + + std::sort(key_arr->begin(), key_arr->end(), &mycmp); + + removeduplicates(key_arr); + if(key_arr->size() > NUM_ENTRIES) + key_arr->erase(key_arr->begin()+NUM_ENTRIES, key_arr->end()); + + NUM_ENTRIES=key_arr->size(); + + bufferManagerNonBlockingSlowHandleType = IO_HANDLE_PFILE; + + Tinit(); + + int xid = Tbegin(); + + merge_scheduler mscheduler; + logtable ltable; + + int pcount = 100; + ltable.set_fixed_page_count(pcount); + + recordid table_root = ltable.allocTable(xid); + + Tcommit(xid); + + //xid = Tbegin(); + + int lindex = mscheduler.addlogtable(<able); + ltable.setMergeData(mscheduler.getMergeData(lindex)); + + mscheduler.startlogtable(lindex); + + printf("Stage 1: Writing %d keys\n", NUM_ENTRIES); + + struct timeval start_tv, stop_tv, ti_st, ti_end; + double insert_time = 0; + int dpages = 0; + int npages = 0; + DataPage *dp=0; + int64_t datasize = 0; + std::vector dsp; + gettimeofday(&start_tv,0); + for(int i = 0; i < NUM_ENTRIES; i++) + { + //prepare the key + datatuple newtuple; + uint32_t keylen = (*key_arr)[i].length()+1; + newtuple.keylen = &keylen; + + newtuple.key = (datatuple::key_t) malloc(keylen); + memcpy((byte*)newtuple.key, (*key_arr)[i].c_str(), keylen); + //for(int j=0; jprint_tree(xid); + printf("datasize: %lld\n", datasize); 
+ //sleep(20); + + /* + //Tcommit(xid); + xid = Tbegin(); + + + printf("Stage 2: Looking up %d keys:\n", NUM_ENTRIES); + + int found_tuples=0; + for(int i=NUM_ENTRIES-1; i>=0; i--) + { + int ri = i; + //printf("key index%d\n", i); + fflush(stdout); + + //get the key + uint32_t keylen = (*key_arr)[ri].length()+1; + datatuple::key_t rkey = (datatuple::key_t) malloc(keylen); + memcpy((byte*)rkey, (*key_arr)[ri].c_str(), keylen); + //for(int j=0; jkeylen) == (*key_arr)[ri].length()+1); + //assert(*(dt->datalen) == (*data_arr)[ri].length()+1); + free(dt->keylen); + free(dt); + } + dt = 0; + free(rkey); + } + printf("found %d\n", found_tuples); + + key_arr->clear(); + //data_arr->clear(); + delete key_arr; + //delete data_arr; + */ + + mscheduler.shutdown(); + printf("merge threads finished.\n"); + gettimeofday(&stop_tv,0); + printf("run time: %6.1f\n", (tv_to_double(stop_tv) - tv_to_double(start_tv))); + + //Tcommit(xid); + + Tdeinit(); + + +} + + + +/** @test + */ +int main() +{ + insertProbeIter(25000); + + + + return 0; +} + diff --git a/check_mergetuple.cpp b/check_mergetuple.cpp new file mode 100644 index 0000000..914515a --- /dev/null +++ b/check_mergetuple.cpp @@ -0,0 +1,409 @@ +#include +#include +#include +#include +#include "logstore.h" +#include "datapage.cpp" +#include "logiterators.cpp" +#include "merger.h" +#include +#include +#include +#include +#include +#include + +#undef begin +#undef end + + + + +bool mycmp(const std::string & k1,const std::string & k2) +{ + //for char* ending with \0 + return strcmp(k1.c_str(),k2.c_str()) < 0; + + //for int32_t + //printf("%d\t%d\n",(*((int32_t*)k1)) ,(*((int32_t*)k2))); + //return (*((int32_t*)k1)) <= (*((int32_t*)k2)); +} + +//must be given a sorted array +void removeduplicates(std::vector *arr) +{ + + for(int i=arr->size()-1; i>0; i--) + { + if(! 
(mycmp((*arr)[i], (*arr)[i-1]) || mycmp((*arr)[i-1], (*arr)[i]))) + arr->erase(arr->begin()+i); + + } + +} + +void getnextdata(std::string &data, int avg_len) +{ + int str_len = (rand()%(avg_len*2)) + 3; + + data = std::string(str_len, rand()%10+48); + /* + char *rc = (char*)malloc(str_len); + + for(int i=0; i *arr, int avg_len=50) +{ + + for ( int j=0; jpush_back(str); + + } + +} + +void insertProbeIter(int NUM_ENTRIES) +{ + srand(1000); + //unlink("storefile.txt"); + //unlink("logfile.txt"); + + sync(); + double delete_freq = .05; + double update_freq = .15; + + //data generation + typedef std::vector key_v_t; + const static int max_partition_size = 100000; + int KEY_LEN = 100; + std::vector *key_v_list = new std::vector; + int list_size = NUM_ENTRIES / max_partition_size + 1; + for(int i =0; ibegin(), key_arr->end(), &mycmp); + key_v_list->push_back(key_arr); + printf("size partition %d is %d\n", i+1, key_arr->size()); + } + + + + key_v_t * key_arr = new key_v_t; + + std::vector iters; + for(int i=0; ibegin())); + } + + int lc = 0; + while(true) + { + int list_index = -1; + for(int i=0; iend()) + continue; + + if(list_index == -1 || mycmp(**iters[i], **iters[list_index])) + list_index = i; + } + + if(list_index == -1) + break; + + if(key_arr->size() == 0 || mycmp(key_arr->back(), **iters[list_index])) + key_arr->push_back(**iters[list_index]); + + (*iters[list_index])++; + lc++; + if(lc % max_partition_size == 0) + printf("%d/%d completed.\n", lc, NUM_ENTRIES); + } + + for(int i=0; iclear(); + delete (*key_v_list)[i]; + delete iters[i]; + } + key_v_list->clear(); + delete key_v_list; + +// preprandstr(NUM_ENTRIES, data_arr, 10*8192); + + printf("key arr size: %d\n", key_arr->size()); + + //removeduplicates(key_arr); + if(key_arr->size() > NUM_ENTRIES) + key_arr->erase(key_arr->begin()+NUM_ENTRIES, key_arr->end()); + + NUM_ENTRIES=key_arr->size(); + + bufferManagerNonBlockingSlowHandleType = IO_HANDLE_PFILE; + + Tinit(); + + int xid = Tbegin(); + + 
merge_scheduler mscheduler; + logtable ltable; + + int pcount = 40; + ltable.set_fixed_page_count(pcount); + + recordid table_root = ltable.allocTable(xid); + + Tcommit(xid); + + xid = Tbegin(); + + int lindex = mscheduler.addlogtable(<able); + ltable.setMergeData(mscheduler.getMergeData(lindex)); + + mscheduler.startlogtable(lindex); + + printf("Stage 1: Writing %d keys\n", NUM_ENTRIES); + + struct timeval start_tv, stop_tv, ti_st, ti_end; + double insert_time = 0; + int dpages = 0; + int npages = 0; + int delcount = 0, upcount = 0; + DataPage *dp=0; + int64_t datasize = 0; + std::vector dsp; + std::vector del_list; + gettimeofday(&start_tv,0); + for(int i = 0; i < NUM_ENTRIES; i++) + { + //prepare the key + datatuple newtuple; + uint32_t keylen = (*key_arr)[i].length()+1; + newtuple.keylen = &keylen; + + newtuple.key = (datatuple::key_t) malloc(keylen); + memcpy((byte*)newtuple.key, (*key_arr)[i].c_str(), keylen); + //for(int j=0; j= 0 && std::find(del_list.begin(), del_list.end(), del_index) == del_list.end()) + { + delcount++; + datatuple deltuple; + keylen = (*key_arr)[del_index].length()+1; + deltuple.keylen = &keylen; + + deltuple.key = (datatuple::key_t) malloc(keylen); + memcpy((byte*)deltuple.key, (*key_arr)[del_index].c_str(), keylen); + + deltuple.datalen = &datalen; + deltuple.setDelete(); + + gettimeofday(&ti_st,0); + ltable.insertTuple(deltuple); + gettimeofday(&ti_end,0); + insert_time += tv_to_double(ti_end) - tv_to_double(ti_st); + + free(deltuple.key); + + del_list.push_back(del_index); + + } + } + else if(rval < delete_freq + update_freq) //update a record + { + int up_index = i - (rand()%50); //update one of the last inserted 50 elements + if(up_index >= 0 && std::find(del_list.begin(), del_list.end(), up_index) == del_list.end()) + {//only update non-deleted elements + upcount++; + datatuple uptuple; + keylen = (*key_arr)[up_index].length()+1; + uptuple.keylen = &keylen; + + uptuple.key = (datatuple::key_t) malloc(keylen); + 
memcpy((byte*)uptuple.key, (*key_arr)[up_index].c_str(), keylen); + + getnextdata(ditem, 512); + datalen = ditem.length()+1; + uptuple.datalen = &datalen; + uptuple.data = (datatuple::data_t) malloc(datalen); + memcpy((byte*)uptuple.data, ditem.c_str(), datalen); + + gettimeofday(&ti_st,0); + ltable.insertTuple(uptuple); + gettimeofday(&ti_end,0); + insert_time += tv_to_double(ti_end) - tv_to_double(ti_st); + + free(uptuple.key); + free(uptuple.data); + + } + + } + + } + gettimeofday(&stop_tv,0); + printf("insert time: %6.1f\n", insert_time); + printf("insert time: %6.1f\n", (tv_to_double(stop_tv) - tv_to_double(start_tv))); + printf("#deletions: %d\n#updates: %d\n", delcount, upcount); + + printf("\nTREE STRUCTURE\n"); + //ltable.get_tree_c1()->print_tree(xid); + printf("datasize: %lld\n", datasize); + //sleep(20); + + Tcommit(xid); + xid = Tbegin(); + + + + printf("Stage 2: Looking up %d keys:\n", NUM_ENTRIES); + + int found_tuples=0; + for(int i=NUM_ENTRIES-1; i>=0; i--) + { + int ri = i; + //printf("key index%d\n", i); + fflush(stdout); + + //get the key + uint32_t keylen = (*key_arr)[ri].length()+1; + datatuple::key_t rkey = (datatuple::key_t) malloc(keylen); + memcpy((byte*)rkey, (*key_arr)[ri].c_str(), keylen); + //for(int j=0; jisDelete()); + found_tuples++; + assert(*(dt->keylen) == (*key_arr)[ri].length()+1); + //assert(*(dt->datalen) == (*data_arr)[ri].length()+1); + free(dt->keylen); + free(dt); + } + else + { + if(dt!=0) + { + assert(*(dt->keylen) == (*key_arr)[ri].length()+1); + assert(dt->isDelete()); + free(dt->keylen); + free(dt); + } + } + dt = 0; + free(rkey); + } + printf("found %d\n", found_tuples); + + + + + + key_arr->clear(); + //data_arr->clear(); + delete key_arr; + //delete data_arr; + + mscheduler.shutdown(); + printf("merge threads finished.\n"); + gettimeofday(&stop_tv,0); + printf("run time: %6.1f\n", (tv_to_double(stop_tv) - tv_to_double(start_tv))); + + + + Tcommit(xid); + Tdeinit(); + + +} + + + +/** @test + */ +int main() +{ + 
//insertProbeIter(25000); + insertProbeIter(400000); + /* + insertProbeIter(5000); + insertProbeIter(2500); + insertProbeIter(1000); + insertProbeIter(500); + insertProbeIter(1000); + insertProbeIter(100); + insertProbeIter(10); + */ + + return 0; +} + diff --git a/check_rbtree.cpp b/check_rbtree.cpp new file mode 100644 index 0000000..af17780 --- /dev/null +++ b/check_rbtree.cpp @@ -0,0 +1,214 @@ +#include +#include +#include +#include +#include "logstore.h" +#include "datapage.cpp" +#include "logiterators.cpp" +#include "merger.h" +#include +#include +#include +#include +#include +#include + +#undef begin +#undef end + + + + +bool mycmp(const std::string & k1,const std::string & k2) +{ + //for char* ending with \0 + return strcmp(k1.c_str(),k2.c_str()) < 0; + + //for int32_t + //printf("%d\t%d\n",(*((int32_t*)k1)) ,(*((int32_t*)k2))); + //return (*((int32_t*)k1)) <= (*((int32_t*)k2)); +} + +//must be given a sorted array +void removeduplicates(std::vector &arr) +{ + + for(int i=arr.size()-1; i>0; i--) + { + if(! 
(mycmp(arr[i], arr[i-1]) || mycmp(arr[i-1], arr[i]))) + arr.erase(arr.begin()+i); + + } + +} + +void preprandstr(int count, std::vector &arr, int avg_len=50, bool duplicates_allowed=false) +{ + + for ( int j=0; j data_arr; + std::vector key_arr; + preprandstr(NUM_ENTRIES, data_arr, 10*8192, true); + preprandstr(NUM_ENTRIES+200, key_arr, 100, true); + + std::sort(key_arr.begin(), key_arr.end(), &mycmp); + + removeduplicates(key_arr); + if(key_arr.size() > NUM_ENTRIES) + key_arr.erase(key_arr.begin()+NUM_ENTRIES, key_arr.end()); + + NUM_ENTRIES=key_arr.size(); + + if(data_arr.size() > NUM_ENTRIES) + data_arr.erase(data_arr.begin()+NUM_ENTRIES, data_arr.end()); + + std::set rbtree; + int64_t datasize = 0; + std::vector dsp; + for(int i = 0; i < NUM_ENTRIES; i++) + { + //prepare the key + datatuple newtuple; + uint32_t keylen = key_arr[i].length()+1; + newtuple.keylen = (uint32_t*)malloc(sizeof(uint32_t)); + *newtuple.keylen = keylen; + + newtuple.key = (datatuple::key_t) malloc(keylen); + for(int j=0; jprint_tree(xid); + printf("datasize: %d\n", datasize); + + printf("Stage 2: Looking up %d keys:\n", NUM_ENTRIES); + + int found_tuples=0; + for(int i=NUM_ENTRIES-1; i>=0; i--) + { + int ri = i; + + //get the key + uint32_t keylen = key_arr[ri].length()+1; + datatuple::key_t rkey = (datatuple::key_t) malloc(keylen); + for(int j=0; jkeylen) == key_arr[ri].length()+1); + assert(*(ret_tuple->datalen) == data_arr[ri].length()+1); + free(barr); + free(ret_tuple); + } + else + { + printf("Not in scratch_tree\n"); + } + + free(search_tuple.keylen); + free(rkey); + } + printf("found %d\n", found_tuples); +} + + + +/** @test + */ +int main() +{ + insertProbeIter(250); + + + + return 0; +} + diff --git a/check_server.cpp b/check_server.cpp new file mode 100644 index 0000000..60af0cf --- /dev/null +++ b/check_server.cpp @@ -0,0 +1,107 @@ +#include +#include +#include +#include +#include "logstore.h" +#include "datapage.cpp" +#include "logiterators.cpp" +#include "merger.h" 
+#include +#include +#include +#include +#include +#include + +#include + +#undef begin +#undef end + +logserver *lserver=0; +merge_scheduler *mscheduler=0; + +void terminate (int param) +{ + printf ("Stopping server...\n"); + lserver->stopserver(); + delete lserver; + + printf("Stopping merge threads...\n"); + mscheduler->shutdown(); + delete mscheduler; + + printf("Deinitializing stasis...\n"); + fflush(stdout); + Tdeinit(); + + exit(0); +} + +void insertProbeIter(int NUM_ENTRIES) +{ + //signal handling + void (*prev_fn)(int); + + prev_fn = signal (SIGINT,terminate); + //if (prev_fn==SIG_IGN) + //signal (SIGTERM,SIG_IGN); + + + sync(); + + bufferManagerNonBlockingSlowHandleType = IO_HANDLE_PFILE; + + Tinit(); + + int xid = Tbegin(); + + mscheduler = new merge_scheduler; + logtable ltable; + + + + int pcount = 40; + ltable.set_fixed_page_count(pcount); + + recordid table_root = ltable.allocTable(xid); + + Tcommit(xid); + + int lindex = mscheduler->addlogtable(<able); + ltable.setMergeData(mscheduler->getMergeData(lindex)); + + mscheduler->startlogtable(lindex); + + + lserver = new logserver(10, 32432); + + lserver->startserver(<able); + + +// Tdeinit(); + + +} + + + +/** @test + */ +int main() +{ + //insertProbeIter(25000); + insertProbeIter(10000); + /* + insertProbeIter(5000); + insertProbeIter(2500); + insertProbeIter(1000); + insertProbeIter(500); + insertProbeIter(1000); + insertProbeIter(100); + insertProbeIter(10); + */ + + return 0; +} + diff --git a/check_tcpclient.cpp b/check_tcpclient.cpp new file mode 100644 index 0000000..a505e52 --- /dev/null +++ b/check_tcpclient.cpp @@ -0,0 +1,415 @@ +#include +#include +#include +#include +#include "logstore.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#undef begin +#undef end + + + + +bool mycmp(const std::string & k1,const std::string & k2) +{ + //for char* ending with \0 + return strcmp(k1.c_str(),k2.c_str()) < 0; + + //for int32_t + 
//printf("%d\t%d\n",(*((int32_t*)k1)) ,(*((int32_t*)k2))); + //return (*((int32_t*)k1)) <= (*((int32_t*)k2)); +} + +//must be given a sorted array +void removeduplicates(std::vector *arr) +{ + + for(int i=arr->size()-1; i>0; i--) + { + if(! (mycmp((*arr)[i], (*arr)[i-1]) || mycmp((*arr)[i-1], (*arr)[i]))) + arr->erase(arr->begin()+i); + + } + +} + +void getnextdata(std::string &data, int avg_len) +{ + int str_len = (rand()%(avg_len*2)) + 3; + + data = std::string(str_len, rand()%10+48); + /* + char *rc = (char*)malloc(str_len); + + for(int i=0; i *arr, int avg_len=50) +{ + + for ( int j=0; jpush_back(str); + + } + +} + +inline void readfromsocket(int sockd, byte *buf, int count) +{ + + int n = 0; + while( n < count ) + { + n += read( sockd, buf + n, count - n); + } + +} + +inline void writetosocket(int sockd, byte *buf, int count) +{ + int n = 0; + while( n < count ) + { + n += write( sockd, buf + n, count - n); + } +} + +datatuple * sendTuple(std::string & servername, int serverport, uint8_t opcode, datatuple &tuple) +{ + struct sockaddr_in serveraddr; + struct hostent *server; + + int sockfd = socket(AF_INET, SOCK_STREAM, 0); + + if (sockfd < 0) + { + printf("ERROR opening socket.\n"); + return 0; + } + + server = gethostbyname(servername.c_str()); + if (server == NULL) { + fprintf(stderr,"ERROR, no such host as %s\n", servername.c_str()); + exit(0); + } + + /* build the server's Internet address */ + bzero((char *) &serveraddr, sizeof(serveraddr)); + serveraddr.sin_family = AF_INET; + bcopy((char *)server->h_addr, + (char *)&serveraddr.sin_addr.s_addr, server->h_length); + serveraddr.sin_port = htons(serverport); + + /* connect: create a connection with the server */ + if (connect(sockfd, (sockaddr*) &serveraddr, sizeof(serveraddr)) < 0) + { + printf("ERROR connecting\n"); + return 0; + } + + + //send the opcode + int n = write(sockfd, (byte*) &opcode, sizeof(uint8_t)); + assert(n == sizeof(uint8_t)); + + //send the tuple + n = write(sockfd, (byte*) 
tuple.keylen, sizeof(uint32_t)); + assert( n == sizeof(uint32_t)); + + n = write(sockfd, (byte*) tuple.datalen, sizeof(uint32_t)); + assert( n == sizeof(uint32_t)); + + writetosocket(sockfd, (byte*) tuple.key, *tuple.keylen); + if(!tuple.isDelete() && *tuple.datalen != 0) + writetosocket(sockfd, (byte*) tuple.data, *tuple.datalen); + + //read the reply code + uint8_t rcode; + n = read(sockfd, (byte*) &rcode, sizeof(uint8_t)); + + if(rcode == logserver::OP_SENDING_TUPLE) + { + datatuple *rcvdtuple = (datatuple*)malloc(sizeof(datatuple)); + //read the keylen + rcvdtuple->keylen = (uint32_t*) malloc(sizeof(uint32_t)); + n = read(sockfd, (byte*) rcvdtuple->keylen, sizeof(uint32_t)); + assert(n == sizeof(uint32_t)); + //read the datalen + rcvdtuple->datalen = (uint32_t*) malloc(sizeof(uint32_t)); + n = read(sockfd, (byte*) rcvdtuple->datalen, sizeof(uint32_t)); + assert(n == sizeof(uint32_t)); + //read key + rcvdtuple->key = (byte*) malloc(*rcvdtuple->keylen); + readfromsocket(sockfd, (byte*) rcvdtuple->key, *rcvdtuple->keylen); + if(!rcvdtuple->isDelete()) + { + //read key + rcvdtuple->data = (byte*) malloc(*rcvdtuple->datalen); + readfromsocket(sockfd, (byte*) rcvdtuple->data, *rcvdtuple->datalen); + } + + close(sockfd); + return rcvdtuple; + } + else + assert(rcode == logserver::OP_SUCCESS); + + close(sockfd); + return 0; +} + + +void insertProbeIter(int NUM_ENTRIES) +{ + srand(1000); + std::string servername = "sherpa4"; + int serverport = 32432; + + double delete_freq = .05; + double update_freq = .15; + + //data generation + typedef std::vector key_v_t; + const static int max_partition_size = 100000; + int KEY_LEN = 100; + std::vector *key_v_list = new std::vector; + int list_size = NUM_ENTRIES / max_partition_size + 1; + for(int i =0; ibegin(), key_arr->end(), &mycmp); + key_v_list->push_back(key_arr); + printf("size partition %d is %d\n", i+1, key_arr->size()); + } + + + + key_v_t * key_arr = new key_v_t; + + std::vector iters; + for(int i=0; ibegin())); + } + + 
int lc = 0; + while(true) + { + int list_index = -1; + for(int i=0; iend()) + continue; + + if(list_index == -1 || mycmp(**iters[i], **iters[list_index])) + list_index = i; + } + + if(list_index == -1) + break; + + if(key_arr->size() == 0 || mycmp(key_arr->back(), **iters[list_index])) + key_arr->push_back(**iters[list_index]); + + (*iters[list_index])++; + lc++; + if(lc % max_partition_size == 0) + printf("%d/%d completed.\n", lc, NUM_ENTRIES); + } + + for(int i=0; iclear(); + delete (*key_v_list)[i]; + delete iters[i]; + } + key_v_list->clear(); + delete key_v_list; + +// preprandstr(NUM_ENTRIES, data_arr, 10*8192); + + printf("key arr size: %d\n", key_arr->size()); + + //removeduplicates(key_arr); + if(key_arr->size() > NUM_ENTRIES) + key_arr->erase(key_arr->begin()+NUM_ENTRIES, key_arr->end()); + + NUM_ENTRIES=key_arr->size(); + + printf("Stage 1: Writing %d keys\n", NUM_ENTRIES); + + struct timeval start_tv, stop_tv, ti_st, ti_end; + double insert_time = 0; + int dpages = 0; + int npages = 0; + int delcount = 0, upcount = 0; + int64_t datasize = 0; + std::vector dsp; + std::vector del_list; + gettimeofday(&start_tv,0); + for(int i = 0; i < NUM_ENTRIES; i++) + { + //prepare the key + datatuple newtuple; + uint32_t keylen = (*key_arr)[i].length()+1; + newtuple.keylen = &keylen; + + newtuple.key = (datatuple::key_t) malloc(keylen); + memcpy((byte*)newtuple.key, (*key_arr)[i].c_str(), keylen); + + //prepare the data + std::string ditem; + getnextdata(ditem, 8192); + uint32_t datalen = ditem.length()+1; + newtuple.datalen = &datalen; + newtuple.data = (datatuple::data_t) malloc(datalen); + memcpy((byte*)newtuple.data, ditem.c_str(), datalen); + + /* + printf("key: \t, keylen: %u\ndata: datalen: %u\n", + //newtuple.key, + *newtuple.keylen, + //newtuple.data, + *newtuple.datalen); + */ + + datasize += newtuple.byte_length(); + + gettimeofday(&ti_st,0); + + //send the data + sendTuple(servername, serverport, logserver::OP_INSERT, newtuple); + + 
gettimeofday(&ti_end,0); + insert_time += tv_to_double(ti_end) - tv_to_double(ti_st); + + free(newtuple.key); + free(newtuple.data); + + if(i % 10000 == 0 && i > 0) + printf("%d / %d inserted.\n", i, NUM_ENTRIES); + + } + gettimeofday(&stop_tv,0); + printf("insert time: %6.1f\n", insert_time); + printf("insert time: %6.1f\n", (tv_to_double(stop_tv) - tv_to_double(start_tv))); + printf("#deletions: %d\n#updates: %d\n", delcount, upcount); + + + + printf("Stage 2: Looking up %d keys:\n", NUM_ENTRIES); + + int found_tuples=0; + for(int i=NUM_ENTRIES-1; i>=0; i--) + { + int ri = i; + //printf("key index%d\n", i); + fflush(stdout); + + //get the key + uint32_t keylen = (*key_arr)[ri].length()+1; + datatuple searchtuple; + searchtuple.keylen = (uint32_t*)malloc(2*sizeof(uint32_t) + keylen); + *searchtuple.keylen = keylen; + + searchtuple.datalen = searchtuple.keylen + 1; + *searchtuple.datalen = 0; + + searchtuple.key = (datatuple::key_t)(searchtuple.keylen + 2); + memcpy((byte*)searchtuple.key, (*key_arr)[ri].c_str(), keylen); + + //find the key with the given tuple + datatuple *dt = sendTuple(servername, serverport, logserver::OP_FIND, + searchtuple); + + assert(dt!=0); + assert(!dt->isDelete()); + found_tuples++; + assert(*(dt->keylen) == (*key_arr)[ri].length()+1); + + //free dt + free(dt->keylen); + free(dt->datalen); + free(dt->key); + free(dt->data); + free(dt); + + dt = 0; + + free(searchtuple.keylen); + + } + printf("found %d\n", found_tuples); + + + + + + key_arr->clear(); + //data_arr->clear(); + delete key_arr; + //delete data_arr; + + gettimeofday(&stop_tv,0); + printf("run time: %6.1f\n", (tv_to_double(stop_tv) - tv_to_double(start_tv))); + +} + + + +/** @test + */ +int main() +{ + //insertProbeIter(25000); + insertProbeIter(100000); + /* + insertProbeIter(5000); + insertProbeIter(2500); + insertProbeIter(1000); + insertProbeIter(500); + insertProbeIter(1000); + insertProbeIter(100); + insertProbeIter(10); + */ + + return 0; +} + diff --git a/cmds.txt 
b/cmds.txt new file mode 100644 index 0000000..5b24608 --- /dev/null +++ b/cmds.txt @@ -0,0 +1,9 @@ + dd if=/dev/zero of=storefile.txt bs=1M count=20000 + + +/dhtRecOpsGenerator -d clientType=LogStoreClient host=sherpa4 numOps=10ls existingStartKey=100 existingEndKey=1000 insertRatio=1.0 + + + + +dhtRecOpsGeneratorWrapper startClientID=1 endClientID=4 -d clientType=LogStoreClient host=sherpa4.corp.re1.yahoo.com numOps=5000000 existingStartKey=100 existingEndKey=10000000 insertRatio=1.0 readRatio=0 numClients=3 diff --git a/datapage.cpp b/datapage.cpp new file mode 100644 index 0000000..b931e10 --- /dev/null +++ b/datapage.cpp @@ -0,0 +1,507 @@ + +#include "logstore.h" +#include "datapage.h" + +template +const int32_t DataPage::HEADER_SIZE = sizeof(int32_t); + +template +DataPage::DataPage(int xid, pageid_t pid): + alloc_region(0), + alloc_state(0), + fix_pcount(-1) +{ + assert(pid!=0); + + pcount = readPageCount(xid, pid); + + pidarr = (pageid_t *) malloc(sizeof(pageid_t) * pcount); + + for(int i=0; i +DataPage::DataPage(int xid, int fix_pcount, pageid_t (*alloc_region)(int, void*), void * alloc_state) +{ + assert(fix_pcount >= 1); + byte_offset = -1; + + this->fix_pcount = fix_pcount; + + if(alloc_region != 0) + this->alloc_region = alloc_region; + if(alloc_state != 0) + this->alloc_state = alloc_state; + + initialize(xid); +} + +template +DataPage::~DataPage() +{ + if(pidarr) + free(pidarr); +} + + +template +void DataPage::initialize(int xid) +{ + //initializes to an empty datapage + //alloc a new page + pageid_t pid = alloc_region(xid, alloc_state); + + //load the first page + //Page *p = loadPage(xid, pid); + Page *p = loadPageOfType(xid, pid, SEGMENT_PAGE); + writelock(p->rwlatch,0); + + //initialize header + + //set number of pages to 1 + int32_t * numpages_ptr = (int32_t*)stasis_page_byte_ptr_from_start(p, 0); + *numpages_ptr = 1; + + //write 0 to first data size + int32_t * size_ptr = (int32_t*)stasis_page_byte_ptr_from_start(p, HEADER_SIZE); + *size_ptr = 
0; + + //set the page dirty + stasis_dirty_page_table_set_dirty((stasis_dirty_page_table_t*)stasis_runtime_dirty_page_table(), p); + + //release the page + unlock(p->rwlatch); + releasePage(p); + + //set the class variables + byte_offset = HEADER_SIZE; + pcount = 1; + pidarr = (pageid_t *) malloc(fix_pcount * sizeof(pageid_t)); + pidarr[0] = pid; + +} + +template +inline bool DataPage::append(int xid, TUPLE const & dat) +{ + assert(byte_offset >= HEADER_SIZE); + assert(fix_pcount >= 1); + + //check if there is enough space (for the data length + data) + int32_t blen = dat.byte_length() + sizeof(int32_t); + if(PAGE_SIZE * fix_pcount - byte_offset < blen) + { + //check if the record is too large + // and if so do we wanna accomodate here by going over the fix_pcount + if(PAGE_SIZE * fix_pcount - HEADER_SIZE < blen && //record is larger than datapage + PAGE_SIZE * fix_pcount - HEADER_SIZE > 2 * byte_offset)//accept if i am less than half full + { + //nothing + } + else + { + //printf("page has %d bytes left, we needed %d. 
(byte_offset %d)\n", + //PAGE_SIZE * fix_pcount - byte_offset, blen, byte_offset); + return false; //not enough mana, return + } + } + + //write the length of the data + int32_t dsize = blen - sizeof(int32_t); + + if(!writebytes(xid, sizeof(int32_t), (byte*)(&dsize))) + return false; + byte_offset += sizeof(int32_t); + + //write the data + byte * barr = dat.to_bytes(); + if(!writebytes(xid, dsize, barr)) //if write fails, undo the previous write + { + byte_offset -= sizeof(int32_t); + free(barr); + //write 0 for the next tuple size, if there is enough space in this page + if(PAGE_SIZE - (byte_offset % PAGE_SIZE) >= sizeof(int32_t)) + { + dsize = 0; + writebytes(xid, sizeof(int32_t), (byte*)(&dsize));//this will succeed, since there is enough space on the page + } + return false; + } + free(barr); + byte_offset += dsize; + + //write 0 for the next tuple size, if there is enough space in this page + if(PAGE_SIZE - (byte_offset % PAGE_SIZE) >= sizeof(int32_t)) + { + dsize = 0; + writebytes(xid, sizeof(int32_t), (byte*)(&dsize));//this will succeed, since there is enough space on the page + } + + return true; +} + +template +bool DataPage::writebytes(int xid, int count, byte *data) +{ + + int32_t bytes_copied = 0; + while(bytes_copied < count) + { + //load the page to copy into + int pindex = (byte_offset + bytes_copied) / PAGE_SIZE; + if(pindex == pcount) //then this page must be allocated + { + pageid_t newid = alloc_region(xid, alloc_state); + //check continuity + if(pidarr[pindex-1] != newid - 1)//so we started a new region and that is not right after the prev region in the file + { + return false;//we cant store this + } + + //check whether we need to extend the pidarr, add fix_pcount many pageid_t slots + if(pindex >= fix_pcount && (pindex % fix_pcount==0)) + { + pidarr = (pageid_t*)realloc(pidarr, (pindex + fix_pcount)*sizeof(pageid_t)); + } + pidarr[pindex] = newid; + pcount++; + incrementPageCount(xid, pidarr[0]); + } + //Page *p = loadPage(xid, 
pidarr[pindex]); + Page *p = loadPageOfType(xid, pidarr[pindex], SEGMENT_PAGE); + writelock(p->rwlatch,0); + + //copy the portion of bytes we can copy in this page + int32_t page_offset = (byte_offset+bytes_copied) % PAGE_SIZE; + int32_t copy_len = ( (PAGE_SIZE - page_offset < count - bytes_copied ) ? PAGE_SIZE - page_offset: count - bytes_copied); + + byte * pb_ptr = stasis_page_byte_ptr_from_start(p, page_offset); + memcpy(pb_ptr, data+bytes_copied ,copy_len); + + //release the page + stasis_dirty_page_table_set_dirty((stasis_dirty_page_table_t*)stasis_runtime_dirty_page_table(), p); + unlock(p->rwlatch); + releasePage(p); + + //update the copied bytes_count + bytes_copied += copy_len; + + + } + + assert(bytes_copied == count); + return true; +} + +template +bool DataPage::recordRead(int xid, typename TUPLE::key_t key, size_t keySize, TUPLE ** buf) +{ + RecordIterator itr(this); + + int match = -1; + while((*buf=itr.getnext(xid)) != 0) + { + match = TUPLE::compare((*buf)->get_key(), key); + + if(match<0) //keep searching + { + free((*buf)->keylen); + free(*buf); + *buf=0; + } + else if(match==0) //found + { + return true; + } + else // match > 0, then does not exist + { + free((*buf)->keylen); + free(*buf); + *buf = 0; + break; + } + } + + return false; +} + +template +void DataPage::readbytes(int xid, int32_t offset, int count, byte **data) +{ + + if(*data==NULL) + *data = (byte*)malloc(count); + + int32_t bytes_copied = 0; + while(bytes_copied < count) + { + //load the page to copy from + int pindex = (offset + bytes_copied) / PAGE_SIZE; + + //Page *p = loadPage(xid, pidarr[pindex]); + Page *p = loadPageOfType(xid, pidarr[pindex], SEGMENT_PAGE); + readlock(p->rwlatch,0); + + //copy the portion of bytes we can copy from this page + int32_t page_offset = (offset+bytes_copied) % PAGE_SIZE; + int32_t copy_len = ( (PAGE_SIZE - page_offset < count - bytes_copied ) ? 
PAGE_SIZE - page_offset : count - bytes_copied); + + byte * pb_ptr = stasis_page_byte_ptr_from_start(p, page_offset); + memcpy((*data)+bytes_copied, pb_ptr, copy_len); + + //release the page + unlock(p->rwlatch); + releasePage(p); + + //update the copied bytes_count + bytes_copied += copy_len; + } + + assert(bytes_copied == count); +} + + +template +inline int DataPage::readPageCount(int xid, pageid_t pid) +{ + + //Page *p = loadPage(xid, pid); + Page *p = loadPageOfType(xid, pid, SEGMENT_PAGE); + readlock(p->rwlatch,0); + + int32_t numpages = *((int32_t*)stasis_page_byte_ptr_from_start(p, 0)); + + unlock(p->rwlatch); + releasePage(p); + + return numpages; +} + +template +inline void DataPage::incrementPageCount(int xid, pageid_t pid, int add) +{ + //Page *p = loadPage(xid, pid); + Page *p = loadPageOfType(xid, pid, SEGMENT_PAGE); + writelock(p->rwlatch,0); + + int32_t *numpages_ptr = (int32_t*)stasis_page_byte_ptr_from_start(p, 0); + + *numpages_ptr = *numpages_ptr + add; + + stasis_dirty_page_table_set_dirty((stasis_dirty_page_table_t*)stasis_runtime_dirty_page_table(), p); + + unlock(p->rwlatch); + releasePage(p); + + + +} + + +template +inline uint16_t DataPage::recordCount(int xid) +{ + + return 0; +} + +template +pageid_t DataPage::dp_alloc_region(int xid, void *conf) +{ + RegionAllocConf_t* a = (RegionAllocConf_t*)conf; + + + if(a->nextPage == a->endOfRegion) { + if(a->regionList.size == -1) { + //DEBUG("nextPage: %lld\n", a->nextPage); + a->regionList = TarrayListAlloc(xid, 1, 4, sizeof(pageid_t)); + DEBUG("regionList.page: %lld\n", a->regionList.page); + DEBUG("regionList.slot: %d\n", a->regionList.slot); + DEBUG("regionList.size: %lld\n", a->regionList.size); + + a->regionCount = 0; + } + DEBUG("{%lld <- alloc region arraylist}\n", a->regionList.page); + TarrayListExtend(xid,a->regionList,1); + a->regionList.slot = a->regionCount; + DEBUG("region lst slot %d\n",a->regionList.slot); + a->regionCount++; + DEBUG("region count %lld\n",a->regionCount); + 
a->nextPage = TregionAlloc(xid, a->regionSize,12); + DEBUG("next page %lld\n",a->nextPage); + a->endOfRegion = a->nextPage + a->regionSize; + Tset(xid,a->regionList,&a->nextPage); + DEBUG("next page %lld\n",a->nextPage); + } + + DEBUG("%lld ?= %lld\n", a->nextPage,a->endOfRegion); + pageid_t ret = a->nextPage; + // Ensure the page is in buffer cache without accessing disk (this + // sets it to clean and all zeros if the page is not in cache). + // Hopefully, future reads will get a cache hit, and avoid going to + // disk. + + Page * p = loadUninitializedPage(xid, ret); + //writelock(p->rwlatch,0); + p->pageType = SEGMENT_PAGE; + //unlock(p->rwlatch); + releasePage(p); + DEBUG("ret %lld\n",ret); + (a->nextPage)++; + return ret; + +} + +template +pageid_t DataPage::dp_alloc_region_rid(int xid, void * ridp) { + recordid rid = *(recordid*)ridp; + RegionAllocConf_t conf; + Tread(xid,rid,&conf); + pageid_t ret = dp_alloc_region(xid,&conf); + //DEBUG("{%lld <- alloc region extend}\n", conf.regionList.page); + // XXX get rid of Tset by storing next page in memory, and losing it + // on crash. 
+ Tset(xid,rid,&conf); + return ret; +} + +template +void DataPage::dealloc_region_rid(int xid, void *conf) +{ + RegionAllocConf_t a = *((RegionAllocConf_t*)conf); + DEBUG("{%lld <- dealloc region arraylist}\n", a.regionList.page); + + for(int i = 0; i < a.regionCount; i++) { + a.regionList.slot = i; + pageid_t pid; + Tread(xid,a.regionList,&pid); + TregionDealloc(xid,pid); + } +} + +template +void DataPage::force_region_rid(int xid, void *conf) +{ + recordid rid = *(recordid*)conf; + RegionAllocConf_t a; + Tread(xid,rid,&a); + + for(int i = 0; i < a.regionCount; i++) + { + a.regionList.slot = i; + pageid_t pid; + Tread(xid,a.regionList,&pid); + stasis_dirty_page_table_flush_range((stasis_dirty_page_table_t*)stasis_runtime_dirty_page_table(), pid, pid+a.regionSize); + forcePageRange(pid, pid+a.regionSize); + } +} + + +/////////////////////////////////////////////////////////////// +//RECORD ITERATOR +/////////////////////////////////////////////////////////////// + + +template +TUPLE* DataPage::RecordIterator::getnext(int xid) +{ + + + int pindex = offset / PAGE_SIZE; + + if(pindex == dp->pcount)//past end + return 0; + if(pindex == dp->pcount - 1 && (PAGE_SIZE - (offset % PAGE_SIZE) < sizeof(int32_t))) + return 0; + + //Page *p = loadPage(xid, dp->pidarr[pindex]); + Page *p = loadPageOfType(xid, dp->pidarr[pindex], SEGMENT_PAGE); + readlock(p->rwlatch,0); + + int32_t *dsize_ptr; + if(PAGE_SIZE - (offset % PAGE_SIZE) < sizeof(int32_t)) //int spread in two pages + { + dsize_ptr = 0; + dp->readbytes(xid, offset, sizeof(int32_t), (byte**)(&dsize_ptr)); + } + else //int in a single page + dsize_ptr = (int32_t*)stasis_page_byte_ptr_from_start(p, offset % PAGE_SIZE); + + offset += sizeof(int32_t); + + if(*dsize_ptr == 0) //no more keys + { + unlock(p->rwlatch); + releasePage(p); + return 0; + } + + byte* tb=0; + dp->readbytes(xid, offset, *dsize_ptr, &tb); + + TUPLE *tup = TUPLE::from_bytes(tb); + + offset += *dsize_ptr; + + unlock(p->rwlatch); + releasePage(p); + + 
return tup; +} + + + +template +void DataPage::RecordIterator::advance(int xid, int count) +{ + + int pindex = -1; + Page *p = 0; + + for(int i=0; irwlatch); + releasePage(p); + } + + pindex = offset / PAGE_SIZE; + + if(pindex == dp->pcount)//past end + return; + + //p = loadPage(xid, dp->pidarr[pindex]); + p = loadPageOfType(xid, dp->pidarr[pindex], SEGMENT_PAGE); + readlock(p->rwlatch,0); + } + + if(pindex == dp->pcount - 1 && (PAGE_SIZE - (offset % PAGE_SIZE) < sizeof(int32_t))) + return; + + int32_t *dsize_ptr=0; + if(PAGE_SIZE - (offset % PAGE_SIZE) < sizeof(int32_t)) //int spread in two pages + dp->readbytes(xid, offset, sizeof(int32_t), (byte**)(&dsize_ptr)); + else //int in a single page + dsize_ptr = (int32_t*)stasis_page_byte_ptr_from_start(p, offset % PAGE_SIZE); + + offset += sizeof(int32_t); + + if(*dsize_ptr == 0) //no more keys + { + unlock(p->rwlatch); + releasePage(p); + return; + } + + offset += *dsize_ptr; + + } + +} diff --git a/datapage.h b/datapage.h new file mode 100644 index 0000000..f26f454 --- /dev/null +++ b/datapage.h @@ -0,0 +1,110 @@ +#ifndef _SIMPLE_DATA_PAGE_H_ +#define _SIMPLE_DATA_PAGE_H_ + +#include + +#include +#include + + + +template +class DataPage +{ +public: + + class RecordIterator + { + public: + RecordIterator(DataPage *dp) + { + offset = HEADER_SIZE; + this->dp = dp; + } + + RecordIterator(const RecordIterator &rhs) + { + this->offset = rhs.offset; + this->dp = rhs.dp; + } + + void operator=(const RecordIterator &rhs) + { + this->offset = rhs.offset; + this->dp = rhs.dp; + } + + + //returns the next tuple and also advances the iterator + TUPLE *getnext(int xid); + + //advance the iterator by count tuples, i.e. 
skip over count tuples + void advance(int xid, int count=1); + + + int32_t offset ; + DataPage *dp; + + + }; + + +public: + + //to be used when reading an existing data page from disk + DataPage( int xid, pageid_t pid ); + + //to be used to create new data pages + DataPage( int xid, int fix_pcount, pageid_t (*alloc_region)(int, void*), void * alloc_state); + + ~DataPage(); + + inline bool append(int xid, TUPLE const & dat); + bool recordRead(int xid, typename TUPLE::key_t key, size_t keySize, TUPLE ** buf); + + inline uint16_t recordCount(int xid); + + + RecordIterator begin(){return RecordIterator(this);} + + pageid_t get_start_pid(){return pidarr[0];} + int get_page_count(){return pcount;} + + static pageid_t dp_alloc_region(int xid, void *conf); + + static pageid_t dp_alloc_region_rid(int xid, void * ridp); + + static void dealloc_region_rid(int xid, void* conf); + + static void force_region_rid(int xid, void *conf); + +public: + +private: + + void initialize(int xid); + + //reads the page count information from the first page + int readPageCount(int xid, pageid_t pid); + void incrementPageCount(int xid, pageid_t pid, int add=1); + + bool writebytes(int xid, int count, byte *data); + inline void readbytes(int xid, int32_t offset, int count, byte **data=0); + +private: + int fix_pcount; //number of pages in a standard data page + int pcount; + pageid_t *pidarr; + int32_t byte_offset;//points to the next free byte + + + //page alloc function + pageid_t (*alloc_region)(int, void*); + void *alloc_state; + + static const int32_t HEADER_SIZE; + + +}; + +#endif diff --git a/datatuple.h b/datatuple.h new file mode 100644 index 0000000..0e1e4ce --- /dev/null +++ b/datatuple.h @@ -0,0 +1,147 @@ +#ifndef _DATATUPLE_H_ +#define _DATATUPLE_H_ + + +typedef unsigned char uchar; + +#include + +//#define byte unsigned char +typedef unsigned char byte; +#include + +//#include +//#include +//#include + +typedef struct datatuple +{ + typedef uchar* key_t; + typedef uchar* data_t; + 
uint32_t *keylen; //key length should be size of string + 1 for \n + uint32_t *datalen; + key_t key; + data_t data; + + //this is used by the stl set + bool operator() (const datatuple& lhs, const datatuple& rhs) const + { + //std::basic_string s1(lhs.key); + //std::basic_string s2(rhs.key); + return strcmp((char*)lhs.key,(char*)rhs.key) < 0; + //return (*((int32_t*)lhs.key)) <= (*((int32_t*)rhs.key)); + } + + /** + * return -1 if k1 < k2 + * 0 if k1 == k2 + * 1 of k1 > k2 + **/ + static int compare(const key_t k1,const key_t k2) + { + //for char* ending with \0 + return strcmp((char*)k1,(char*)k2); + + //for int32_t + //printf("%d\t%d\n",(*((int32_t*)k1)) ,(*((int32_t*)k2))); + //return (*((int32_t*)k1)) <= (*((int32_t*)k2)); + } + + void setDelete() + { + *datalen = UINT_MAX; + } + + inline bool isDelete() const + { + return *datalen == UINT_MAX; + } + + static std::string key_to_str(const byte* k) + { + //for strings + return std::string((char*)k); + //for int + /* + std::ostringstream ostr; + ostr << *((int32_t*)k); + return ostr.str(); + */ + } + + //returns the length of the byte array representation + int32_t byte_length() const{ + static const size_t isize = sizeof(uint32_t); + if(isDelete()) + return isize + *keylen + isize; + else + return isize + *keylen + isize + (*datalen); + } + + //format: key length _ data length _ key _ data + byte * to_bytes() const { + static const size_t isize = sizeof(uint32_t); + byte * ret; + if(!isDelete()) + ret = (byte*) malloc(isize + *keylen + isize + *datalen); + else + ret = (byte*) malloc(isize + *keylen + isize); + + memcpy(ret, (byte*)(keylen), isize); + memcpy(ret+isize, (byte*)(datalen), isize); + memcpy(ret+isize+isize, key, *keylen); + if(!isDelete()) + memcpy(ret+isize+isize+*keylen, data, *datalen); + return ret; + } + + //does not copy the data again + //just sets the pointers in the datatuple to + //right positions in the given arr + + static datatuple* from_bytes(const byte * arr) + { + static const size_t 
isize = sizeof(uint32_t); + datatuple *dt = (datatuple*) malloc(sizeof(datatuple)); + + dt->keylen = (uint32_t*) arr; + dt->datalen = (uint32_t*) (arr+isize); + dt->key = (key_t) (arr+isize+isize); + if(!dt->isDelete()) + dt->data = (data_t) (arr+isize+isize+ *(dt->keylen)); + else + dt->data = 0; + + return dt; + } + /* + static datatuple form_tuple(const byte * arr) + { + static const size_t isize = sizeof(uint32_t); + datatuple dt; + + dt.keylen = (uint32_t*) arr; + dt.datalen = (uint32_t*) (arr+isize); + dt.key = (key_t) (arr+isize+isize); + if(!dt.isDelete()) + dt.data = (data_t) (arr+isize+isize+ *(dt.keylen)); + else + dt.data = 0; + + return dt; + } + */ + + byte * get_key() { return (byte*) key; } + byte * get_data() { return (byte*) data; } + + //releases only the tuple + static void release(datatuple *dt) + { + free(dt); + } + +} datatuple; + + +#endif diff --git a/hello.cpp b/hello.cpp new file mode 100644 index 0000000..118fccb --- /dev/null +++ b/hello.cpp @@ -0,0 +1,48 @@ + +#include +#include +#include +#include + +typedef unsigned char uchar; +typedef struct datatuple +{ + + typedef byte* key_t; + typedef byte* data_t; + uint32_t keylen; + uint32_t datalen; + key_t key; + data_t data; + + +}; + +int main(int argc, char** argv) { + +bool * m1 = new bool(false); +std::cout << *m1 << std::endl; + + datatuple t; + std::cout << "size of datatuple:\t" << sizeof(datatuple) << std::endl; + + t.key = (datatuple::key_t) malloc(10); + const char * str = "12345678"; + strcpy((char*)t.key, (str)); + + t.keylen = strlen((char*)t.key); + + t.data = (datatuple::data_t) malloc(10); + const char * str2 = "1234567"; + strcpy((char*)t.data, (str2)); + + t.datalen = strlen((char*)t.data); + + std::cout << "size of datatuple:\t" << sizeof(datatuple) << std::endl; + std::cout << "keylen:\t" << t.keylen << + "\tdatalen:\t" << t.datalen << + "\t" << t.key << + "\t" << t.data << + std::endl; + +} diff --git a/logiterators.cpp b/logiterators.cpp new file mode 100644 index 
0000000..80a079b --- /dev/null +++ b/logiterators.cpp @@ -0,0 +1,200 @@ + +#include "logstore.h" +//#include "datapage.cpp" +#include "logiterators.h" + + + + +//template +/* +template <> +const byte* toByteArray, datatuple>( + memTreeIterator, datatuple> * const t) +{ + return (*(t->it_)).to_bytes(); +} +*/ + + +///////////////////////////////////////////////////////////////////// +// tree iterator implementation +///////////////////////////////////////////////////////////////////// + +template +treeIterator::treeIterator(recordid tree) : + tree_(tree), + lsmIterator_(logtreeIterator::open(-1,tree)), + curr_tuple(0) +{ + init_helper(); +} + +template +treeIterator::treeIterator(recordid tree, TUPLE& key) : + tree_(tree), + //scratch_(), + lsmIterator_(logtreeIterator::openAt(-1,tree,key.get_key()))//toByteArray())), + //slot_(0) +{ + init_helper(); + + /* + treeIterator * end = this->end(); + for(;*this != *end && **this < key; ++(*this)) + { + DEBUG("treeIterator was not at the given TUPLE"); + } + delete end; + */ + +} + +template +treeIterator::~treeIterator() +{ + if(lsmIterator_) + logtreeIterator::close(-1, lsmIterator_); + + if(curr_tuple != NULL) + free(curr_tuple); + + if(curr_page!=NULL) + { + delete curr_page; + curr_page = 0; + } + + +} + +template +void treeIterator::init_helper() +{ + if(!lsmIterator_) + { + printf("treeIterator:\t__error__ init_helper():\tnull lsmIterator_"); + curr_page = 0; + dp_itr = 0; + } + else + { + if(logtreeIterator::next(-1, lsmIterator_) == 0) + { + //printf("treeIterator:\t__error__ init_helper():\tlogtreeIteratr::next returned 0." 
); + curr_page = 0; + dp_itr = 0; + } + else + { + pageid_t * pid_tmp; + pageid_t ** hack = &pid_tmp; + logtreeIterator::value(-1,lsmIterator_,(byte**)hack); + + curr_pageid = *pid_tmp; + curr_page = new DataPage(-1, curr_pageid); + dp_itr = new DPITR_T(curr_page->begin()); + } + + } +} + +template +TUPLE * treeIterator::getnext() +{ + assert(this->lsmIterator_); + + if(dp_itr == 0) + return 0; + + TUPLE* readTuple = dp_itr->getnext(-1); + + + if(!readTuple) + { + delete dp_itr; + dp_itr = 0; + delete curr_page; + curr_page = 0; + + if(logtreeIterator::next(-1,lsmIterator_)) + { + pageid_t *pid_tmp; + + pageid_t **hack = &pid_tmp; + logtreeIterator::value(-1,lsmIterator_,(byte**)hack); + curr_pageid = *pid_tmp; + curr_page = new DataPage(-1, curr_pageid); + dp_itr = new DPITR_T(curr_page->begin()); + + + readTuple = dp_itr->getnext(-1); + assert(readTuple); + } + else + { + // TODO: what is this? + //past end of iterator! "end" should contain the pageid of the + // last leaf, and 1+ numslots on that page. 
+ //abort(); + } + } + + return curr_tuple=readTuple; +} + + + +/* +template +treeIterator::treeIterator(treeIteratorHandle* tree, TUPLE& key) : + tree_(tree?tree->r_:NULLRID), + scratch_(), + lsmIterator_(logtreeIterator::openAt(-1,tree?tree->r_:NULLRID,key.get_key())),//toByteArray())), + slot_(0) +{ + init_helper(); + if(lsmIterator_) { + treeIterator * end = this->end(); + for(;*this != *end && **this < key; ++(*this)) { } + delete end; + } else { + this->slot_ = 0; + this->pageid_ = 0; + } +} + +template +treeIterator::treeIterator(recordid tree, TUPLE &scratch) : + tree_(tree), + scratch_(scratch), + lsmIterator_(logtreeIterator::open(-1,tree)), + slot_(0) +{ + init_helper(); +} + +template +treeIterator::treeIterator(treeIteratorHandle* tree) : + tree_(tree?tree->r_:NULLRID), + scratch_(), + lsmIterator_(logtreeIterator::open(-1,tree?tree->r_:NULLRID)), + slot_(0) +{ + init_helper(); +} + +template +treeIterator::treeIterator(treeIterator& t) : + tree_(t.tree_), + scratch_(t.scratch_), + lsmIterator_(t.lsmIterator_?logtreeIterator::copy(-1,t.lsmIterator_):0), + slot_(t.slot_), + pageid_(t.pageid_), + p_((Page*)((t.p_)?loadPage(-1,t.p_->id):0)) + //currentPage_((PAGELAYOUT*)((p_)?p_->impl:0)) +{ + if(p_) + readlock(p_->rwlatch,0); +} +*/ diff --git a/logiterators.h b/logiterators.h new file mode 100644 index 0000000..8d61867 --- /dev/null +++ b/logiterators.h @@ -0,0 +1,173 @@ +#ifndef _LOG_ITERATORS_H_ +#define _LOG_ITERATORS_H_ + +#include +#include + +#undef begin +#undef end + +template class memTreeIterator; + +template +const byte* toByteArray(memTreeIterator * const t); + +template +class DataPage; + +////////////////////////////////////////////////////////////// +// memTreeIterator +///////////////////////////////////////////////////////////// + +template +class memTreeIterator{ + +private: + typedef typename MEMTREE::const_iterator MTITER; + +public: + memTreeIterator( MEMTREE *s ) + { + it_ = s->begin(); + itend_ = s->end(); + } + + + 
memTreeIterator( MTITER& it, MTITER& itend ) + { + it_ = it; + itend_ = itend; + } + + explicit memTreeIterator(memTreeIterator &i) + { + it_ = i.it_; + itend_ = i.itend_; + } + + const TUPLE& operator* () + { + return *it_; + } + + void seekEnd() + { + it_ = itend_; + } + + + memTreeIterator * end() + { + return new memTreeIterator(itend_,itend_); + } + + inline bool operator==(const memTreeIterator &o) const { + return it_ == o.it_; + } + inline bool operator!=(const memTreeIterator &o) const { + return !(*this == o); + } + inline void operator++() { + ++it_; + } + inline void operator--() { + --it_; + } + + inline int operator-(memTreeIterator &i) { + return it_ - i.it_; + } + + inline void operator=(memTreeIterator const &i) + { + it_ = i.it_; + itend_ = i.itend_; + } + +public: + typedef MEMTREE* handle; + +private: + + MTITER it_; + MTITER itend_; + + friend const byte* toByteArray(memTreeIterator * const t); + +}; + +template +const byte* toByteArray(memTreeIterator * const t) +{ + return (*(t->it_)).to_bytes();//toByteArray(); +} + +///////////////////////////////////////////////////////////////// + +/** + Scans through an LSM tree's leaf pages, each tuple in the tree, in + order. This iterator is designed for maximum forward scan + performance, and does not support all STL operations. 
+**/ +template +class treeIterator +{ + + public: + // typedef recordid handle; + class treeIteratorHandle + { + public: + treeIteratorHandle() : r_(NULLRID) {} + treeIteratorHandle(const recordid r) : r_(r) {} + + treeIteratorHandle * operator=(const recordid &r) { + r_ = r; + return this; + } + + recordid r_; + }; + + typedef treeIteratorHandle* handle; + + explicit treeIterator(recordid tree); + + explicit treeIterator(recordid tree,TUPLE &key); + + //explicit treeIterator(treeIteratorHandle* tree, TUPLE& key); + + //explicit treeIterator(treeIteratorHandle* tree); + + //explicit treeIterator(treeIterator& t); + + ~treeIterator(); + + TUPLE * getnext(); + + //void advance(int count=1); + +private: + inline void init_helper(); + + explicit treeIterator() { abort(); } + void operator=(treeIterator & t) { abort(); } + int operator-(treeIterator & t) { abort(); } + +private: + recordid tree_; //root of the tree + + lladdIterator_t * lsmIterator_; //logtree iterator + + pageid_t curr_pageid; //current page id + DataPage *curr_page; //current page + typedef typename DataPage::RecordIterator DPITR_T; + DPITR_T *dp_itr; + TUPLE *curr_tuple; //current tuple +}; + + + + +#endif + diff --git a/logserver.cpp b/logserver.cpp new file mode 100644 index 0000000..3f9eb54 --- /dev/null +++ b/logserver.cpp @@ -0,0 +1,649 @@ + + + +#include "logserver.h" +#include "datatuple.h" + +#include "logstore.h" + +#include +#include +#include +#include +#include +#include +#include + +#undef begin +#undef end +#undef try + + +//server codes +uint8_t logserver::OP_SUCCESS = 1; +uint8_t logserver::OP_FAIL = 2; +uint8_t logserver::OP_SENDING_TUPLE = 3; + +//client codes +uint8_t logserver::OP_FIND = 4; +uint8_t logserver::OP_INSERT = 5; + +uint8_t logserver::OP_DONE = 6; + +uint8_t logserver::OP_INVALID = 32; + +void *serverLoop(void *args); + +void logserver::startserver(logtable *ltable) +{ + sys_alive = true; + this->ltable = ltable; + + selcond = new pthread_cond_t; + 
pthread_cond_init(selcond, 0); + + //initialize threads + for(int i=0; ith_handle = new pthread_t; + struct pthread_data *worker_data = new pthread_data; + worker_th->data = worker_data; + + worker_data->idleth_queue = &idleth_queue; + worker_data->ready_queue = &ready_queue; + worker_data->work_queue = &work_queue; + + worker_data->qlock = qlock; + + worker_data->selcond = selcond; + + worker_data->th_cond = new pthread_cond_t; + pthread_cond_init(worker_data->th_cond,0); + + worker_data->th_mut = new pthread_mutex_t; + pthread_mutex_init(worker_data->th_mut,0); + + worker_data->workitem = new int; + *(worker_data->workitem) = -1; + + //worker_data->table_lock = lsmlock; + + worker_data->ltable = ltable; + + worker_data->sys_alive = &sys_alive; + + pthread_create(worker_th->th_handle, 0, thread_work_fn, worker_th); + + idleth_queue.push(*worker_th); + + + } + + + + //start server socket + sdata = new serverth_data; + sdata->server_socket = &serversocket; + sdata->server_port = server_port; + sdata->idleth_queue = &idleth_queue; + sdata->ready_queue = &ready_queue; + sdata->selcond = selcond; + sdata->qlock = qlock; + + pthread_create(&server_thread, 0, serverLoop, sdata); + + //start monitoring loop + eventLoop(); + +} + +void logserver::stopserver() +{ + //close the server socket + //stops receiving data on the server socket + shutdown(serversocket, 0); + + //wait for all threads to be idle + while(idleth_queue.size() != nthreads) + sleep(1); + + #ifdef STATS_ENABLED + printf("\n\nSTATISTICS\n"); + std::map num_reqsc; + std::map work_timec; + #endif + + //set the system running flag to false + sys_alive = false; + for(int i=0; idata->th_mut); + pthread_cond_signal(idle_th->data->th_cond); + pthread_mutex_unlock(idle_th->data->th_mut); + //wait for it to join + pthread_join(*(idle_th->th_handle), 0); + //free the thread variables + pthread_cond_destroy(idle_th->data->th_cond); + + #ifdef STATS_ENABLED + if(i == 0) + { + tot_threadwork_time = 0; + num_reqs = 0; + } 
+ + tot_threadwork_time += idle_th->data->work_time; + num_reqs += idle_th->data->num_reqs; + + printf("thread %d: work_time %.3f\t#calls %d\tavg req process time:\t%.3f\n", + i, + idle_th->data->work_time, + idle_th->data->num_reqs, + (( idle_th->data->num_reqs == 0 ) ? 0 : idle_th->data->work_time / idle_th->data->num_reqs) + ); + + for(std::map::const_iterator itr = idle_th->data->num_reqsc.begin(); + itr != idle_th->data->num_reqsc.end(); itr++) + { + std::string ckey = (*itr).first; + printf("\t%s\t%d\t%.3f\t%.3f\n", ckey.c_str(), (*itr).second, idle_th->data->work_timec[ckey], + idle_th->data->work_timec[ckey] / (*itr).second); + + if(num_reqsc.find(ckey) == num_reqsc.end()){ + num_reqsc[ckey] = 0; + work_timec[ckey] = 0; + } + num_reqsc[ckey] += (*itr).second; + work_timec[ckey] += idle_th->data->work_timec[ckey]; + } + #endif + + delete idle_th->data->th_cond; + delete idle_th->data->th_mut; + delete idle_th->data->workitem; + delete idle_th->data; + delete idle_th->th_handle; + } + + th_list.clear(); + + #ifdef STATS_ENABLED + + printf("\n\nAggregated Stats:\n"); + for(std::map::const_iterator itr = num_reqsc.begin(); + itr != num_reqsc.end(); itr++) + { + std::string ckey = (*itr).first; + printf("\t%s\t%d\t%.3f\t%.3f\n", ckey.c_str(), (*itr).second, work_timec[ckey], + work_timec[ckey] / (*itr).second); + } + + tot_time = (stop_tv.tv_sec - start_tv.tv_sec) * 1000 + + (stop_tv.tv_usec / 1000 - start_tv.tv_usec / 1000); + + printf("\ntot time:\t%f\n",tot_time); + printf("tot work time:\t%f\n", tot_threadwork_time); + printf("load avg:\t%f\n", tot_threadwork_time / tot_time); + + printf("tot num reqs\t%d\n", num_reqs); + if(num_reqs!= 0) + { + printf("tot work time / num reqs:\t%.3f\n", tot_threadwork_time / num_reqs); + printf("tot time / num reqs:\t%.3f\n", tot_time / num_reqs ); + } + #endif + + //close(serversocket); + + return; +} + +void logserver::eventLoop() +{ + + fd_set readfs; + std::vector sel_list; + + int maxfd; + + struct timeval Timeout; + 
struct timespec ts; + + while(true) + { + //clear readset + FD_ZERO(&readfs); + maxfd = -1; + + ts.tv_nsec = 250000; //nanosec + ts.tv_sec = 0; + + //Timeout.tv_usec = 250; /* microseconds */ + //Timeout.tv_sec = 0; /* seconds */ + + //update select set + pthread_mutex_lock(qlock); + + //while(ready_queue.size() == 0) + if(sel_list.size() == 0) + { + while(ready_queue.size() == 0) + pthread_cond_wait(selcond, qlock); + //pthread_cond_timedwait(selcond, qlock, &ts); + //printf("awoke\n"); + } + + //new connections + processed conns are in ready_queue + //add them to select list + while(ready_queue.size() > 0) + { + sel_list.push_back(ready_queue.front()); + ready_queue.pop(); + } + pthread_mutex_unlock(qlock); + + //ready select set + for(std::vector::const_iterator itr=sel_list.begin(); + itr != sel_list.end(); itr++) + { + if(maxfd < *itr) + maxfd = *itr; + FD_SET(*itr, &readfs); + } + + //select events + int sel_res = select(maxfd+1, &readfs, NULL, NULL, NULL);// &Timeout); + //printf("sel_res %d %d\n", sel_res, errno); + //fflush(stdout); + //job assignment to threads + //printf("sel_list size:\t%d ready_cnt\t%d\n", sel_list.size(), sel_res); + + #ifdef STATS_ENABLED + if(num_selcalls == 0) + gettimeofday(&start_tv, 0); + + num_selevents += sel_res; + num_selcalls++; + #endif + + pthread_mutex_lock(qlock); + for(int i=0; i 0) //assign the job to an indle thread + { + pthread_item idle_th = idleth_queue.front(); + idleth_queue.pop(); + + //wake up the thread to do work + pthread_mutex_lock(idle_th.data->th_mut); + //set the job of the idle thread + *(idle_th.data->workitem) = currsock; + pthread_cond_signal(idle_th.data->th_cond); + pthread_mutex_unlock(idle_th.data->th_mut); + //printf("%d:\tconn %d assigned.\n", i, currsock); + } + else + { + //insert the given element to the work queue + work_queue.push(currsock); + //printf("work queue size:\t%d\n", work_queue.size()); + } + +// pthread_mutex_unlock(qlock); + + //remove from the sel_list + 
sel_list.erase(sel_list.begin()+i); + i--; + } + } + + pthread_mutex_unlock(qlock); + + #ifdef STATS_ENABLED + gettimeofday(&stop_tv, 0); + #endif + + } + +} + +void *serverLoop(void *args) +{ + + serverth_data *sdata = (serverth_data*)args; + + int sockfd; //socket descriptor + struct sockaddr_in serv_addr; + struct sockaddr_in cli_addr; + int newsockfd; //newly created + socklen_t clilen = sizeof(cli_addr); + + + //open a socket + sockfd = socket(AF_INET, SOCK_STREAM, 0); + if (sockfd < 0) + { + printf("ERROR opening socket\n"); + return 0; + } + + bzero((char *) &serv_addr, sizeof(serv_addr)); + serv_addr.sin_family = AF_INET; + serv_addr.sin_addr.s_addr = htonl(INADDR_ANY); + serv_addr.sin_port = htons(sdata->server_port); + + if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) + { + printf("ERROR on binding.\n"); + return 0; + } + + //start listening on the server socket + //second arg is the max number of coonections waiting in queue + if(listen(sockfd,SOMAXCONN)==-1) + { + printf("ERROR on listen.\n"); + return 0; + } + + printf("LSM Server listenning...\n"); + + *(sdata->server_socket) = sockfd; + int flag, result; + while(true) + { + newsockfd = accept(sockfd, (struct sockaddr *) &cli_addr, &clilen); + if (newsockfd < 0) + { + printf("ERROR on accept.\n"); + return 0; // we probably want to continue instead of return here (when not debugging) + } + + flag = 1; + result = setsockopt(newsockfd, /* socket affected */ + IPPROTO_TCP, /* set option at TCP level */ + TCP_NODELAY, /* name of option */ + (char *) &flag, /* the cast is historical + cruft */ + sizeof(int)); /* length of option value */ + if (result < 0) + { + printf("ERROR on setting socket option TCP_NODELAY.\n"); + return 0; + } + + char clientip[20]; + inet_ntop(AF_INET, (void*) &(cli_addr.sin_addr), clientip, 20); + printf("Connection from:\t%s\n", clientip); + + //printf("Number of idle threads %d\n", idleth_queue.size()); + + pthread_mutex_lock(sdata->qlock); + + //insert 
the given element to the ready queue + sdata->ready_queue->push(newsockfd); + + if(sdata->ready_queue->size() == 1) //signal the event loop + pthread_cond_signal(sdata->selcond); + + pthread_mutex_unlock(sdata->qlock); + + } + + +} + +inline void readfromsocket(int sockd, byte *buf, int count) +{ + + int n = 0; + while( n < count ) + { + n += read( sockd, buf + n, count - n); + } + +} + +inline void writetosocket(int sockd, byte *buf, int count) +{ + int n = 0; + while( n < count ) + { + n += write( sockd, buf + n, count - n); + } +} + + + + + +void * thread_work_fn( void * args) +{ + pthread_item * item = (pthread_item *) args; + + pthread_mutex_lock(item->data->th_mut); + while(true) + { + while(*(item->data->workitem) == -1) + { + if(!*(item->data->sys_alive)) + break; + pthread_cond_wait(item->data->th_cond, item->data->th_mut); //wait for job + } + + + #ifdef STATS_ENABLED + gettimeofday(& (item->data->start_tv), 0); + std::ostringstream ostr; + ostr << *(item->data->workitem) << "_"; + #endif + + if(!*(item->data->sys_alive)) + { + //printf("thread quitted.\n"); + break; + } + + //step 1: read the opcode + uint8_t opcode; + ssize_t n = read(*(item->data->workitem), &opcode, sizeof(uint8_t)); + assert( n == sizeof(uint8_t)); + assert( opcode < logserver::OP_INVALID ); + + if( opcode == logserver::OP_DONE ) //close the conn on failure + { + pthread_mutex_lock(item->data->qlock); + printf("client done. conn closed. 
(%d, %d, %d, %d)\n", + n, errno, *(item->data->workitem), item->data->work_queue->size()); + close(*(item->data->workitem)); + + if(item->data->work_queue->size() > 0) + { + int new_work = item->data->work_queue->front(); + item->data->work_queue->pop(); + //printf("work queue size:\t%d\n", item->data->work_queue->size()); + *(item->data->workitem) = new_work; + } + else + { + //set work to -1 + *(item->data->workitem) = -1; + //add self to idle queue + item->data->idleth_queue->push(*item); + } + + pthread_mutex_unlock(item->data->qlock); + continue; + } + + + //step 2: read the tuple from client + datatuple tuple; + tuple.keylen = (uint32_t*)malloc(sizeof(uint32_t)); + tuple.datalen = (uint32_t*)malloc(sizeof(uint32_t)); + + //read the key length + n = read(*(item->data->workitem), tuple.keylen, sizeof(uint32_t)); + assert( n == sizeof(uint32_t)); + //read the data length + n = read(*(item->data->workitem), tuple.datalen, sizeof(uint32_t)); + assert( n == sizeof(uint32_t)); + + //read the key + tuple.key = (byte*) malloc(*tuple.keylen); + readfromsocket(*(item->data->workitem), (byte*) tuple.key, *tuple.keylen); + //read the data + if(!tuple.isDelete() && opcode != logserver::OP_FIND) + { + tuple.data = (byte*) malloc(*tuple.datalen); + readfromsocket(*(item->data->workitem), (byte*) tuple.data, *tuple.datalen); + } + else + tuple.data = 0; + + //step 3: process the tuple + //pthread_mutex_lock(item->data->table_lock); + //readlock(item->data->table_lock,0); + + if(opcode == logserver::OP_INSERT) + { + //insert/update/delete + item->data->ltable->insertTuple(tuple); + //unlock the lsmlock + //pthread_mutex_unlock(item->data->table_lock); + //unlock(item->data->table_lock); + //step 4: send response + uint8_t rcode = logserver::OP_SUCCESS; + n = write(*(item->data->workitem), &rcode, sizeof(uint8_t)); + assert(n == sizeof(uint8_t)); + + } + else if(opcode == logserver::OP_FIND) + { + //find the tuple + datatuple *dt = item->data->ltable->findTuple(-1, tuple.key, 
*tuple.keylen); + //unlock the lsmlock + //pthread_mutex_unlock(item->data->table_lock); + //unlock(item->data->table_lock); + + #ifdef STATS_ENABLED + + if(dt == 0) + printf("key not found:\t%s\n", datatuple::key_to_str(tuple.key).c_str()); + else if( *dt->datalen != 1024) + printf("data len for\t%s:\t%d\n", datatuple::key_to_str(tuple.key).c_str(), + *dt->datalen); + + if(datatuple::compare(tuple.key, dt->key) != 0) + printf("key not equal:\t%s\t%s\n", datatuple::key_to_str(tuple.key).c_str(), + datatuple::key_to_str(dt->key).c_str()); + + #endif + + if(dt == 0) //tuple deleted + { + dt = (datatuple*) malloc(sizeof(datatuple)); + dt->keylen = (uint32_t*) malloc(2*sizeof(uint32_t) + *tuple.keylen); + *dt->keylen = *tuple.keylen; + dt->datalen = dt->keylen + 1; + dt->key = (datatuple::key_t) (dt->datalen+1); + memcpy((byte*) dt->key, (byte*) tuple.key, *tuple.keylen); + dt->setDelete(); + } + + //send the reply code + uint8_t rcode = logserver::OP_SENDING_TUPLE; + n = write(*(item->data->workitem), &rcode, sizeof(uint8_t)); + assert(n == sizeof(uint8_t)); + + //send the tuple + writetosocket(*(item->data->workitem), (byte*) dt->keylen, dt->byte_length()); + + //free datatuple + free(dt->keylen); + free(dt); + } + + //close the socket + //close(*(item->data->workitem)); + + //free the tuple + free(tuple.keylen); + free(tuple.datalen); + free(tuple.key); + free(tuple.data); + + //printf("socket %d: work completed.", *(item->data->workitem)); + + pthread_mutex_lock(item->data->qlock); + + //add conn desc to ready queue + item->data->ready_queue->push(*(item->data->workitem)); + //printf("ready queue size: %d sock(%d)\n", item->data->ready_queue->size(), *(item->data->workitem)); + if(item->data->ready_queue->size() == 1) //signal the event loop + pthread_cond_signal(item->data->selcond); + + //printf("work complete, added to ready queue %d (size %d)\n", *(item->data->workitem), + // item->data->ready_queue->size()); + + if(item->data->work_queue->size() > 0) + { + int 
new_work = item->data->work_queue->front(); + item->data->work_queue->pop(); + //printf("work queue size:\t%d\n", item->data->work_queue->size()); + *(item->data->workitem) = new_work; + } + else + { + //set work to -1 + *(item->data->workitem) = -1; + //add self to idle queue + item->data->idleth_queue->push(*item); + } + + pthread_mutex_unlock(item->data->qlock); + + #ifdef STATS_ENABLED + if( item->data->num_reqs == 0 ) + item->data->work_time = 0; + gettimeofday(& (item->data->stop_tv), 0); + (item->data->num_reqs)++; + //item->data->work_time += tv_to_double(item->data->stop_tv) - tv_to_double(item->data->start_tv); + item->data->work_time += (item->data->stop_tv.tv_sec - item->data->start_tv.tv_sec) * 1000 + + (item->data->stop_tv.tv_usec / 1000 - item->data->start_tv.tv_usec / 1000); + + int iopcode = opcode; + ostr << iopcode; + std::string clientkey = ostr.str(); + if(item->data->num_reqsc.find(clientkey) == item->data->num_reqsc.end()) + { + item->data->num_reqsc[clientkey]=0; + item->data->work_timec[clientkey]=0; + } + + item->data->num_reqsc[clientkey]++; + item->data->work_timec[clientkey] += (item->data->stop_tv.tv_sec - item->data->start_tv.tv_sec) * 1000 + + (item->data->stop_tv.tv_usec / 1000 - item->data->start_tv.tv_usec / 1000);; + #endif + + + } + pthread_mutex_unlock(item->data->th_mut); + + +} + + diff --git a/logserver.h b/logserver.h new file mode 100644 index 0000000..dd9888a --- /dev/null +++ b/logserver.h @@ -0,0 +1,197 @@ +#ifndef _LOGSERVER_H_ +#define _LOGSERVER_H_ + + +#include +#include + +//#include "logstore.h" + +#include "datatuple.h" + + + +#include +#include + +#undef begin +#undef try +#undef end + +#define STATS_ENABLED 1 + +#ifdef STATS_ENABLED +#include +#include +#include +#endif + +class logtable; + + + +struct pthread_item; + +struct pthread_data { + std::queue *idleth_queue; + std::queue *ready_queue; + std::queue *work_queue; + pthread_mutex_t * qlock; + + pthread_cond_t *selcond; + + pthread_cond_t * th_cond; + 
pthread_mutex_t * th_mut; + + int *workitem; //id of the socket to work + + //pthread_mutex_t * table_lock; + //rwl *table_lock; + logtable *ltable; + bool *sys_alive; + + #ifdef STATS_ENABLED + int num_reqs; + struct timeval start_tv, stop_tv; + double work_time; + std::map num_reqsc; + std::map work_timec; + #endif + +}; + +struct pthread_item{ + pthread_t * th_handle; + pthread_data *data; +}; + + +//struct work_item +//{ +// int sockd; //socket id +// datatuple in_tuple; //request +// datatuple out_tuple; //response +//}; + +struct serverth_data +{ + int *server_socket; + int server_port; + std::queue *idleth_queue; + std::queue *ready_queue; + + pthread_cond_t *selcond; + + pthread_mutex_t *qlock; + + + +}; + +void * thread_work_fn( void *); + +class logserver +{ +public: + //server codes + static uint8_t OP_SUCCESS; + static uint8_t OP_FAIL; + static uint8_t OP_SENDING_TUPLE; + + //client codes + static uint8_t OP_FIND; + static uint8_t OP_INSERT; + + static uint8_t OP_DONE; + + static uint8_t OP_INVALID; + +public: + logserver(int nthreads, int server_port){ + this->nthreads = nthreads; + this->server_port = server_port; + //lsmlock = new pthread_mutex_t; + //pthread_mutex_init(lsmlock,0); + + //lsmlock = initlock(); + + qlock = new pthread_mutex_t; + pthread_mutex_init(qlock,0); + + ltable = 0; + + #ifdef STATS_ENABLED + num_selevents = 0; + num_selcalls = 0; + #endif + + + } + + ~logserver() + { + //delete lsmlock; + //deletelock(lsmlock); + delete qlock; + } + + void startserver(logtable *ltable); + + void stopserver(); + + +public: + +private: + + //main loop of server + //accept connections, assign jobs to threads + //void dispatchLoop(); + + void eventLoop(); + + +private: + + int server_port; + + int nthreads; + + bool sys_alive; + + int serversocket; //server socket file descriptor + + //ccqueue conn_queue; //list of active connections (socket list) + + //ccqueue idleth_queue; //list of idle threads + + std::queue ready_queue; //connections to go 
inside select + std::queue work_queue; //connections to be processed by worker threads + std::queue idleth_queue; + pthread_mutex_t *qlock; + + pthread_t server_thread; + serverth_data *sdata; + pthread_cond_t *selcond; //server loop cond + + std::vector th_list; // list of threads + + //rwl *lsmlock; //lock for using lsm table + + logtable *ltable; + + + #ifdef STATS_ENABLED + int num_reqs; + int num_selevents; + int num_selcalls; + struct timeval start_tv, stop_tv; + double tot_threadwork_time; + double tot_time; + #endif + + +}; + + +#endif diff --git a/logserver_pers.cpp b/logserver_pers.cpp new file mode 100644 index 0000000..4c7f2bb --- /dev/null +++ b/logserver_pers.cpp @@ -0,0 +1,519 @@ + + + +#include "logserver.h" +#include "datatuple.h" + +#include "logstore.h" + +#include +#include +#include +#include +#include +#include +#include + +#undef begin +#undef end +#undef try + + +//server codes +uint8_t logserver::OP_SUCCESS = 1; +uint8_t logserver::OP_FAIL = 2; +uint8_t logserver::OP_SENDING_TUPLE = 3; + +//client codes +uint8_t logserver::OP_FIND = 4; +uint8_t logserver::OP_INSERT = 5; + +uint8_t logserver::OP_DONE = 6; + +uint8_t logserver::OP_INVALID = 32; + +void *serverLoop(void *args); + +void logserver::startserver(logtable *ltable) +{ + sys_alive = true; + this->ltable = ltable; + + selcond = new pthread_cond_t; + pthread_cond_init(selcond, 0); + + //initialize threads + for(int i=0; ith_handle = new pthread_t; + struct pthread_data *worker_data = new pthread_data; + worker_th->data = worker_data; + + worker_data->idleth_queue = &idleth_queue; + worker_data->ready_queue = &ready_queue; + worker_data->work_queue = &work_queue; + + worker_data->qlock = qlock; + + worker_data->selcond = selcond; + + worker_data->th_cond = new pthread_cond_t; + pthread_cond_init(worker_data->th_cond,0); + + worker_data->th_mut = new pthread_mutex_t; + pthread_mutex_init(worker_data->th_mut,0); + + worker_data->workitem = new int; + *(worker_data->workitem) = -1; + + 
worker_data->table_lock = lsmlock; + + worker_data->ltable = ltable; + + worker_data->sys_alive = &sys_alive; + + pthread_create(worker_th->th_handle, 0, thread_work_fn, worker_th); + + idleth_queue.push(*worker_th); + + + } + + + + //start server socket + sdata = new serverth_data; + sdata->server_socket = &serversocket; + sdata->server_port = server_port; + sdata->idleth_queue = &idleth_queue; + sdata->ready_queue = &ready_queue; + sdata->selcond = selcond; + sdata->qlock = qlock; + + pthread_create(&server_thread, 0, serverLoop, sdata); + + //start monitoring loop + eventLoop(); + +} + +void logserver::stopserver() +{ + //close the server socket + //stops receiving data on the server socket + shutdown(serversocket, 0); + + //wait for all threads to be idle + while(idleth_queue.size() != nthreads) + sleep(1); + + //set the system running flag to false + sys_alive = false; + for(int i=0; idata->th_mut); + pthread_cond_signal(idle_th->data->th_cond); + pthread_mutex_unlock(idle_th->data->th_mut); + //wait for it to join + pthread_join(*(idle_th->th_handle), 0); + //free the thread variables + pthread_cond_destroy(idle_th->data->th_cond); + delete idle_th->data->th_cond; + delete idle_th->data->th_mut; + delete idle_th->data->workitem; + delete idle_th->data; + delete idle_th->th_handle; + } + + th_list.clear(); + + //close(serversocket); + + return; +} + +void logserver::eventLoop() +{ + + fd_set readfs; + std::vector sel_list; + + int maxfd; + + struct timeval Timeout; + struct timespec ts; + + while(true) + { + //clear readset + FD_ZERO(&readfs); + maxfd = -1; + + ts.tv_nsec = 250000; //nanosec + ts.tv_sec = 0; + + //Timeout.tv_usec = 250; /* microseconds */ + //Timeout.tv_sec = 0; /* seconds */ + + //update select set + pthread_mutex_lock(qlock); + + while(ready_queue.size() == 0) + { + pthread_cond_wait(selcond, qlock); + //pthread_cond_timedwait(selcond, qlock, &ts); + //printf("awoke\n"); + } + + //new connections + processed conns are in ready_queue + //add 
them to select list + while(ready_queue.size() > 0) + { + sel_list.push_back(ready_queue.front()); + ready_queue.pop(); + } + pthread_mutex_unlock(qlock); + + //ready select set + for(std::vector::const_iterator itr=sel_list.begin(); + itr != sel_list.end(); itr++) + { + if(maxfd < *itr) + maxfd = *itr; + FD_SET(*itr, &readfs); + } + + //select events + int sel_res = select(maxfd+1, &readfs, NULL, NULL, NULL);// &Timeout); + //printf("sel_res %d %d\n", sel_res, errno); + //fflush(stdout); + //job assignment to threads + + for(int i=0; i 0) //assign the job to an indle thread + { + pthread_item idle_th = idleth_queue.front(); + idleth_queue.pop(); + + //wake up the thread to do work + pthread_mutex_lock(idle_th.data->th_mut); + //set the job of the idle thread + *(idle_th.data->workitem) = currsock; + pthread_cond_signal(idle_th.data->th_cond); + pthread_mutex_unlock(idle_th.data->th_mut); + } + else + { + //insert the given element to the work queue + work_queue.push(currsock); + printf("work queue size:\t%d\n", work_queue.size()); + } + + //remove from the sel_list + sel_list.erase(sel_list.begin()+i); + i--; + + pthread_mutex_unlock(qlock); + + } + } + } + +} + +void *serverLoop(void *args) +{ + + serverth_data *sdata = (serverth_data*)args; + + int sockfd; //socket descriptor + struct sockaddr_in serv_addr; + struct sockaddr_in cli_addr; + int newsockfd; //newly created + socklen_t clilen = sizeof(cli_addr); + + + //open a socket + sockfd = socket(AF_INET, SOCK_STREAM, 0); + if (sockfd < 0) + { + printf("ERROR opening socket\n"); + return 0; + } + + bzero((char *) &serv_addr, sizeof(serv_addr)); + serv_addr.sin_family = AF_INET; + serv_addr.sin_addr.s_addr = htonl(INADDR_ANY); + serv_addr.sin_port = htons(sdata->server_port); + + if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) + { + printf("ERROR on binding.\n"); + return 0; + } + + //start listening on the server socket + //second arg is the max number of coonections waiting in queue + 
if(listen(sockfd,SOMAXCONN)==-1) + { + printf("ERROR on listen.\n"); + return 0; + } + + printf("LSM Server listenning...\n"); + + *(sdata->server_socket) = sockfd; + int flag, result; + while(true) + { + newsockfd = accept(sockfd, (struct sockaddr *) &cli_addr, &clilen); + if (newsockfd < 0) + { + printf("ERROR on accept.\n"); + return 0; // we probably want to continue instead of return here (when not debugging) + } + + flag = 1; + result = setsockopt(newsockfd, /* socket affected */ + IPPROTO_TCP, /* set option at TCP level */ + TCP_NODELAY, /* name of option */ + (char *) &flag, /* the cast is historical + cruft */ + sizeof(int)); /* length of option value */ + if (result < 0) + { + printf("ERROR on setting socket option TCP_NODELAY.\n"); + return 0; + } + + char clientip[20]; + inet_ntop(AF_INET, (void*) &(cli_addr.sin_addr), clientip, 20); + printf("Connection from:\t%s\n", clientip); + + //printf("Number of idle threads %d\n", idleth_queue.size()); + + pthread_mutex_lock(sdata->qlock); + + //insert the given element to the ready queue + sdata->ready_queue->push(newsockfd); + + if(sdata->ready_queue->size() == 1) //signal the event loop + pthread_cond_signal(sdata->selcond); + + pthread_mutex_unlock(sdata->qlock); + + } + + +} + +inline void readfromsocket(int sockd, byte *buf, int count) +{ + + int n = 0; + while( n < count ) + { + n += read( sockd, buf + n, count - n); + } + +} + +inline void writetosocket(int sockd, byte *buf, int count) +{ + int n = 0; + while( n < count ) + { + n += write( sockd, buf + n, count - n); + } +} + + + + + +void * thread_work_fn( void * args) +{ + pthread_item * item = (pthread_item *) args; + + pthread_mutex_lock(item->data->th_mut); + while(true) + { + while(*(item->data->workitem) == -1) + { + if(!*(item->data->sys_alive)) + break; + pthread_cond_wait(item->data->th_cond, item->data->th_mut); //wait for job + } + + + if(!*(item->data->sys_alive)) + { + //printf("thread quitted.\n"); + break; + } + + //step 1: read the 
opcode + uint8_t opcode; + ssize_t n = read(*(item->data->workitem), &opcode, sizeof(uint8_t)); + assert( n == sizeof(uint8_t)); + assert( opcode < logserver::OP_INVALID ); + + if( opcode == logserver::OP_DONE ) //close the conn on failure + { + pthread_mutex_lock(item->data->qlock); + printf("client done. conn closed. (%d, %d, %d, %d)\n", + n, errno, *(item->data->workitem), item->data->work_queue->size()); + close(*(item->data->workitem)); + + if(item->data->work_queue->size() > 0) + { + int new_work = item->data->work_queue->front(); + item->data->work_queue->pop(); + printf("work queue size:\t%d\n", item->data->work_queue->size()); + *(item->data->workitem) = new_work; + } + else + { + //set work to -1 + *(item->data->workitem) = -1; + //add self to idle queue + item->data->idleth_queue->push(*item); + } + + pthread_cond_signal(item->data->selcond); + + pthread_mutex_unlock(item->data->qlock); + continue; + } + + + //step 2: read the tuple from client + datatuple tuple; + tuple.keylen = (uint32_t*)malloc(sizeof(uint32_t)); + tuple.datalen = (uint32_t*)malloc(sizeof(uint32_t)); + + //read the key length + n = read(*(item->data->workitem), tuple.keylen, sizeof(uint32_t)); + assert( n == sizeof(uint32_t)); + //read the data length + n = read(*(item->data->workitem), tuple.datalen, sizeof(uint32_t)); + assert( n == sizeof(uint32_t)); + + //read the key + tuple.key = (byte*) malloc(*tuple.keylen); + readfromsocket(*(item->data->workitem), (byte*) tuple.key, *tuple.keylen); + //read the data + if(!tuple.isDelete() && opcode != logserver::OP_FIND) + { + tuple.data = (byte*) malloc(*tuple.datalen); + readfromsocket(*(item->data->workitem), (byte*) tuple.data, *tuple.datalen); + } + else + tuple.data = 0; + + //step 3: process the tuple + //pthread_mutex_lock(item->data->table_lock); + //readlock(item->data->table_lock,0); + + if(opcode == logserver::OP_INSERT) + { + //insert/update/delete + item->data->ltable->insertTuple(tuple); + //unlock the lsmlock + 
//pthread_mutex_unlock(item->data->table_lock); + //unlock(item->data->table_lock); + //step 4: send response + uint8_t rcode = logserver::OP_SUCCESS; + n = write(*(item->data->workitem), &rcode, sizeof(uint8_t)); + assert(n == sizeof(uint8_t)); + + } + else if(opcode == logserver::OP_FIND) + { + //find the tuple + datatuple *dt = item->data->ltable->findTuple(-1, tuple.key, *tuple.keylen); + //unlock the lsmlock + //pthread_mutex_unlock(item->data->table_lock); + //unlock(item->data->table_lock); + + if(dt == 0) //tuple deleted + { + dt = (datatuple*) malloc(sizeof(datatuple)); + dt->keylen = (uint32_t*) malloc(2*sizeof(uint32_t) + *tuple.keylen); + *dt->keylen = *tuple.keylen; + dt->datalen = dt->keylen + 1; + dt->key = (datatuple::key_t) (dt->datalen+1); + memcpy((byte*) dt->key, (byte*) tuple.key, *tuple.keylen); + dt->setDelete(); + } + + //send the reply code + uint8_t rcode = logserver::OP_SENDING_TUPLE; + n = write(*(item->data->workitem), &rcode, sizeof(uint8_t)); + assert(n == sizeof(uint8_t)); + + //send the tuple + writetosocket(*(item->data->workitem), (byte*) dt->keylen, dt->byte_length()); + + //free datatuple + free(dt->keylen); + free(dt); + } + + //close the socket + //close(*(item->data->workitem)); + + //free the tuple + free(tuple.keylen); + free(tuple.datalen); + free(tuple.key); + free(tuple.data); + + //printf("socket %d: work completed.\n", *(item->data->workitem)); + + pthread_mutex_lock(item->data->qlock); + + //add conn desc to ready queue + item->data->ready_queue->push(*(item->data->workitem)); + //printf("ready queue size: %d sock(%d)\n", item->data->ready_queue->size(), *(item->data->workitem)); + if(item->data->ready_queue->size() == 1) //signal the event loop + pthread_cond_signal(item->data->selcond); + + if(item->data->work_queue->size() > 0) + { + int new_work = item->data->work_queue->front(); + item->data->work_queue->pop(); + printf("work queue size:\t%d\n", item->data->work_queue->size()); + *(item->data->workitem) = 
new_work; + } + else + { + //set work to -1 + *(item->data->workitem) = -1; + //add self to idle queue + item->data->idleth_queue->push(*item); + } + + pthread_mutex_unlock(item->data->qlock); + + } + pthread_mutex_unlock(item->data->th_mut); + + +} + + diff --git a/logserver_pers.h b/logserver_pers.h new file mode 100644 index 0000000..94a10b7 --- /dev/null +++ b/logserver_pers.h @@ -0,0 +1,163 @@ +#ifndef _LOGSERVER_H_ +#define _LOGSERVER_H_ + + +#include +#include + +//#include "logstore.h" + +#include "datatuple.h" + + + +#include +#include + +#undef begin +#undef try +#undef end + +class logtable; + + + +struct pthread_item; + +struct pthread_data { + std::queue *idleth_queue; + std::queue *ready_queue; + std::queue *work_queue; + pthread_mutex_t * qlock; + + pthread_cond_t *selcond; + + pthread_cond_t * th_cond; + pthread_mutex_t * th_mut; + + int *workitem; //id of the socket to work + + //pthread_mutex_t * table_lock; + rwl *table_lock; + logtable *ltable; + bool *sys_alive; +}; + +struct pthread_item{ + pthread_t * th_handle; + pthread_data *data; +}; + + +//struct work_item +//{ +// int sockd; //socket id +// datatuple in_tuple; //request +// datatuple out_tuple; //response +//}; + +struct serverth_data +{ + int *server_socket; + int server_port; + std::queue *idleth_queue; + std::queue *ready_queue; + + pthread_cond_t *selcond; + + pthread_mutex_t *qlock; + + + +}; + +void * thread_work_fn( void *); + +class logserver +{ +public: + //server codes + static uint8_t OP_SUCCESS; + static uint8_t OP_FAIL; + static uint8_t OP_SENDING_TUPLE; + + //client codes + static uint8_t OP_FIND; + static uint8_t OP_INSERT; + + static uint8_t OP_DONE; + + static uint8_t OP_INVALID; + +public: + logserver(int nthreads, int server_port){ + this->nthreads = nthreads; + this->server_port = server_port; + //lsmlock = new pthread_mutex_t; + //pthread_mutex_init(lsmlock,0); + + lsmlock = initlock(); + + qlock = new pthread_mutex_t; + pthread_mutex_init(qlock,0); + + ltable = 0; 
+ + } + + ~logserver() + { + //delete lsmlock; + deletelock(lsmlock); + delete qlock; + } + + void startserver(logtable *ltable); + + void stopserver(); + + +public: + +private: + + //main loop of server + //accept connections, assign jobs to threads + //void dispatchLoop(); + + void eventLoop(); + + +private: + + int server_port; + + int nthreads; + + bool sys_alive; + + int serversocket; //server socket file descriptor + + //ccqueue conn_queue; //list of active connections (socket list) + + //ccqueue idleth_queue; //list of idle threads + + std::queue ready_queue; //connections to go inside select + std::queue work_queue; //connections to be processed by worker threads + std::queue idleth_queue; + pthread_mutex_t *qlock; + + pthread_t server_thread; + serverth_data *sdata; + pthread_cond_t *selcond; //server loop cond + + std::vector th_list; // list of threads + + rwl *lsmlock; //lock for using lsm table + + logtable *ltable; + +}; + + +#endif diff --git a/logserver_simple.cpp b/logserver_simple.cpp new file mode 100644 index 0000000..56f9ceb --- /dev/null +++ b/logserver_simple.cpp @@ -0,0 +1,409 @@ + + + +#include "logserver.h" +#include "datatuple.h" + +#include "logstore.h" + +#include +#include +#include +#include +#include + +#undef begin +#undef end +#undef try + + +//server codes +uint8_t logserver::OP_SUCCESS = 1; +uint8_t logserver::OP_FAIL = 2; +uint8_t logserver::OP_SENDING_TUPLE = 3; + +//client codes +uint8_t logserver::OP_FIND = 4; +uint8_t logserver::OP_INSERT = 5; + +uint8_t logserver::OP_INVALID = 32; + + +void logserver::startserver(logtable *ltable) +{ + sys_alive = true; + this->ltable = ltable; + //initialize threads + for(int i=0; ith_handle = new pthread_t; + struct pthread_data *worker_data = new pthread_data; + worker_th->data = worker_data; + + worker_data->idleth_queue = &idleth_queue; + + worker_data->conn_queue = &conn_queue; + + worker_data->qlock = qlock; + + worker_data->th_cond = new pthread_cond_t; + 
pthread_cond_init(worker_data->th_cond,0); + + worker_data->th_mut = new pthread_mutex_t; + pthread_mutex_init(worker_data->th_mut,0); + + worker_data->workitem = new int; + *(worker_data->workitem) = -1; + + worker_data->table_lock = lsmlock; + + worker_data->ltable = ltable; + + worker_data->sys_alive = &sys_alive; + + pthread_create(worker_th->th_handle, 0, thread_work_fn, worker_th); + + idleth_queue.push(*worker_th); + + + } + + dispatchLoop(); + +} + +void logserver::stopserver() +{ + //close the server socket + //stops receiving data on the server socket + shutdown(serversocket, 0); + + //wait for all threads to be idle + while(idleth_queue.size() != nthreads) + sleep(1); + + //set the system running flag to false + sys_alive = false; + for(int i=0; idata->th_mut); + pthread_cond_signal(idle_th->data->th_cond); + pthread_mutex_unlock(idle_th->data->th_mut); + //wait for it to join + pthread_join(*(idle_th->th_handle), 0); + //free the thread variables + pthread_cond_destroy(idle_th->data->th_cond); + delete idle_th->data->th_cond; + delete idle_th->data->th_mut; + delete idle_th->data->workitem; + delete idle_th->data; + delete idle_th->th_handle; + } + + th_list.clear(); + + return; +} + +void logserver::dispatchLoop() +{ + + int sockfd; //socket descriptor + struct sockaddr_in serv_addr; + struct sockaddr_in cli_addr; + int newsockfd; //newly created + socklen_t clilen = sizeof(cli_addr); + + + //open a socket + sockfd = socket(AF_INET, SOCK_STREAM, 0); + if (sockfd < 0) + { + printf("ERROR opening socket\n"); + return; + } + + bzero((char *) &serv_addr, sizeof(serv_addr)); + serv_addr.sin_family = AF_INET; + serv_addr.sin_addr.s_addr = htonl(INADDR_ANY); + serv_addr.sin_port = htons(server_port); + + if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) + { + printf("ERROR on binding.\n"); + return; + } + + //start listening on the server socket + //second arg is the max number of coonections waiting in queue + 
if(listen(sockfd,SOMAXCONN)==-1) + { + printf("ERROR on listen.\n"); + return; + } + + printf("LSM Server listenning...\n"); + + serversocket = sockfd; + int flag, result; + while(true) + { + newsockfd = accept(sockfd, (struct sockaddr *) &cli_addr, &clilen); + if (newsockfd < 0) + { + printf("ERROR on accept.\n"); + return; // we probably want to continue instead of return here (when not debugging) + } + + flag = 1; + result = setsockopt(newsockfd, /* socket affected */ + IPPROTO_TCP, /* set option at TCP level */ + TCP_NODELAY, /* name of option */ + (char *) &flag, /* the cast is historical + cruft */ + sizeof(int)); /* length of option value */ + if (result < 0) + { + printf("ERROR on setting socket option TCP_NODELAY.\n"); + return; + } + + char clientip[20]; + inet_ntop(AF_INET, (void*) &(cli_addr.sin_addr), clientip, 20); + //printf("Connection from:\t%s\n", clientip); + + //printf("Number of idle threads %d\n", idleth_queue.size()); + + pthread_mutex_lock(qlock); + + if(idleth_queue.size() > 0) + { + pthread_item idle_th = idleth_queue.front(); + idleth_queue.pop(); + + //wake up the thread to do work + pthread_mutex_lock(idle_th.data->th_mut); + //set the job of the idle thread + *(idle_th.data->workitem) = newsockfd; + pthread_cond_signal(idle_th.data->th_cond); + pthread_mutex_unlock(idle_th.data->th_mut); + } + else + { + //insert the given element to the queue + conn_queue.push(newsockfd); + //printf("Number of queued connections:\t%d\n", conn_queue.size()); + } + + pthread_mutex_unlock(qlock); + + /* + try + { + + pthread_item idle_th = idleth_queue.pop(); + //wake up the thread to do work + pthread_mutex_lock(idle_th.data->th_mut); + //set the job of the idle thread + *(idle_th.data->workitem) = newsockfd; + pthread_cond_signal(idle_th.data->th_cond); + pthread_mutex_unlock(idle_th.data->th_mut); + + } + catch(int empty_exception) + { + //insert the given element to the queue + conn_queue.push(newsockfd); + //printf("Number of queued 
connections:\t%d\n", conn_queue.size()); + } + */ + } + + +} + +inline void readfromsocket(int sockd, byte *buf, int count) +{ + + int n = 0; + while( n < count ) + { + n += read( sockd, buf + n, count - n); + } + +} + +inline void writetosocket(int sockd, byte *buf, int count) +{ + int n = 0; + while( n < count ) + { + n += write( sockd, buf + n, count - n); + } +} + + + + + +void * thread_work_fn( void * args) +{ + pthread_item * item = (pthread_item *) args; + + pthread_mutex_lock(item->data->th_mut); + while(true) + { + while(*(item->data->workitem) == -1) + { + if(!*(item->data->sys_alive)) + break; + pthread_cond_wait(item->data->th_cond, item->data->th_mut); //wait for job + } + + + if(!*(item->data->sys_alive)) + { + //printf("thread quitted.\n"); + break; + } + + //step 1: read the opcode + uint8_t opcode; + ssize_t n = read(*(item->data->workitem), &opcode, sizeof(uint8_t)); + assert( n == sizeof(uint8_t)); + assert( opcode < logserver::OP_INVALID ); + + //step 2: read the tuple from client + datatuple tuple; + tuple.keylen = (uint32_t*)malloc(sizeof(uint32_t)); + tuple.datalen = (uint32_t*)malloc(sizeof(uint32_t)); + + //read the key length + n = read(*(item->data->workitem), tuple.keylen, sizeof(uint32_t)); + assert( n == sizeof(uint32_t)); + //read the data length + n = read(*(item->data->workitem), tuple.datalen, sizeof(uint32_t)); + assert( n == sizeof(uint32_t)); + + //read the key + tuple.key = (byte*) malloc(*tuple.keylen); + readfromsocket(*(item->data->workitem), (byte*) tuple.key, *tuple.keylen); + //read the data + if(!tuple.isDelete() && opcode != logserver::OP_FIND) + { + tuple.data = (byte*) malloc(*tuple.datalen); + readfromsocket(*(item->data->workitem), (byte*) tuple.data, *tuple.datalen); + } + else + tuple.data = 0; + + //step 3: process the tuple + //pthread_mutex_lock(item->data->table_lock); + //readlock(item->data->table_lock,0); + + if(opcode == logserver::OP_INSERT) + { + //insert/update/delete + 
item->data->ltable->insertTuple(tuple); + //unlock the lsmlock + //pthread_mutex_unlock(item->data->table_lock); + //unlock(item->data->table_lock); + //step 4: send response + uint8_t rcode = logserver::OP_SUCCESS; + n = write(*(item->data->workitem), &rcode, sizeof(uint8_t)); + assert(n == sizeof(uint8_t)); + + } + else if(opcode == logserver::OP_FIND) + { + //find the tuple + datatuple *dt = item->data->ltable->findTuple(-1, tuple.key, *tuple.keylen); + //unlock the lsmlock + //pthread_mutex_unlock(item->data->table_lock); + //unlock(item->data->table_lock); + + if(dt == 0) //tuple deleted + { + dt = (datatuple*) malloc(sizeof(datatuple)); + dt->keylen = (uint32_t*) malloc(2*sizeof(uint32_t) + *tuple.keylen); + *dt->keylen = *tuple.keylen; + dt->datalen = dt->keylen + 1; + dt->key = (datatuple::key_t) (dt->datalen+1); + memcpy((byte*) dt->key, (byte*) tuple.key, *tuple.keylen); + dt->setDelete(); + } + + //send the reply code + uint8_t rcode = logserver::OP_SENDING_TUPLE; + n = write(*(item->data->workitem), &rcode, sizeof(uint8_t)); + assert(n == sizeof(uint8_t)); + + //send the tuple + writetosocket(*(item->data->workitem), (byte*) dt->keylen, dt->byte_length()); + + //free datatuple + free(dt->keylen); + free(dt); + } + + //close the socket + close(*(item->data->workitem)); + + //free the tuple + free(tuple.keylen); + free(tuple.datalen); + free(tuple.key); + free(tuple.data); + + //printf("socket %d: work completed.\n", *(item->data->workitem)); + + pthread_mutex_lock(item->data->qlock); + + if(item->data->conn_queue->size() > 0) + { + int new_work = item->data->conn_queue->front(); + item->data->conn_queue->pop(); + *(item->data->workitem) = new_work; + } + else + { + //set work to -1 + *(item->data->workitem) = -1; + //add self to idle queue + item->data->idleth_queue->push(*item); + } + + pthread_mutex_unlock(item->data->qlock); + + /* + //check if there is new work this thread can do + try + { + int new_work = item->data->conn_queue->pop(); + 
*(item->data->workitem) = new_work; //set new work + //printf("socket %d: new work found.\n", *(item->data->workitem)); + } + catch(int empty_exception) + { + //printf("socket %d: no new work found.\n", *(item->data->workitem)); + //set work to -1 + *(item->data->workitem) = -1; + //add self to idle queue + item->data->idleth_queue->push(*item); + + } + */ + + } + pthread_mutex_unlock(item->data->th_mut); + + +} + + diff --git a/logserver_simple.h b/logserver_simple.h new file mode 100644 index 0000000..48fbea6 --- /dev/null +++ b/logserver_simple.h @@ -0,0 +1,198 @@ +#ifndef _LOGSERVER_H_ +#define _LOGSERVER_H_ + + +#include +#include + +//#include "logstore.h" + +#include "datatuple.h" + + + +#include +#include + +#undef begin +#undef try +#undef end + +class logtable; + +template +class ccqueue +{ +public: + ccqueue() + { + qmut = new pthread_mutex_t; + pthread_mutex_init(qmut,0); + } + + int size() + { + pthread_mutex_lock(qmut); + int qsize = m_queue.size(); + pthread_mutex_unlock(qmut); + return qsize; + } + + //inserts a copy of the given element to the queue + void push(const T &item) + { + pthread_mutex_lock(qmut); + m_queue.push(item); + pthread_mutex_unlock(qmut); + return; + } + + //returns a copy of the next element + //deletes the copy in the queue + //throws an exception with -1 on empty queue + T pop() throw (int) + { + pthread_mutex_lock(qmut); + + if(m_queue.size() > 0) + { + T item = m_queue.front(); + m_queue.pop(); + pthread_mutex_unlock(qmut); + return item; + } + + + pthread_mutex_unlock(qmut); + throw(-1); + + + } + + + + ~ccqueue() + { + delete qmut; + } + +private: + + std::queue m_queue; + + pthread_mutex_t *qmut; + +}; + +struct pthread_item; + +struct pthread_data { + std::queue *idleth_queue; + std::queue *conn_queue; + pthread_mutex_t * qlock; + + pthread_cond_t * th_cond; + pthread_mutex_t * th_mut; + + int *workitem; //id of the socket to work + + //pthread_mutex_t * table_lock; + rwl *table_lock; + logtable *ltable; + bool 
*sys_alive; +}; + +struct pthread_item{ + pthread_t * th_handle; + pthread_data *data; +}; + +struct work_item +{ + int sockd; //socket id + datatuple in_tuple; //request + datatuple out_tuple; //response +}; + + +void * thread_work_fn( void *); + +class logserver +{ +public: + //server codes + static uint8_t OP_SUCCESS; + static uint8_t OP_FAIL; + static uint8_t OP_SENDING_TUPLE; + + //client codes + static uint8_t OP_FIND; + static uint8_t OP_INSERT; + + static uint8_t OP_INVALID; + +public: + logserver(int nthreads, int server_port){ + this->nthreads = nthreads; + this->server_port = server_port; + //lsmlock = new pthread_mutex_t; + //pthread_mutex_init(lsmlock,0); + + lsmlock = initlock(); + + qlock = new pthread_mutex_t; + pthread_mutex_init(qlock,0); + + ltable = 0; + + } + + ~logserver() + { + //delete lsmlock; + deletelock(lsmlock); + delete qlock; + } + + void startserver(logtable *ltable); + + void stopserver(); + + +public: + +private: + + //main loop of server + //accept connections, assign jobs to threads + void dispatchLoop(); + + +private: + + int server_port; + + int nthreads; + + bool sys_alive; + + int serversocket; //server socket file descriptor + + //ccqueue conn_queue; //list of active connections (socket list) + + //ccqueue idleth_queue; //list of idle threads + + std::queue conn_queue; + std::queue idleth_queue; + pthread_mutex_t *qlock; + + std::vector th_list; // list of threads + + rwl *lsmlock; //lock for using lsm table + + logtable *ltable; + +}; + + +#endif diff --git a/logstore.cpp b/logstore.cpp new file mode 100644 index 0000000..08d28b7 --- /dev/null +++ b/logstore.cpp @@ -0,0 +1,1606 @@ + + + +#include +#include +#include +#include + + +#include "merger.h" +#include "logstore.h" +#include "logiterators.h" + + +#include "datapage.cpp" + + +#include + +///////////////////////////////////////////////////////////////// +// LOGTREE implementation +///////////////////////////////////////////////////////////////// + +const 
RegionAllocConf_t logtree::REGION_ALLOC_STATIC_INITIALIZER = { {0,0,-1}, 0, -1, -1, 1000 }; +const RegionAllocConf_t +logtable::DATAPAGE_REGION_ALLOC_STATIC_INITIALIZER = { {0,0,-1}, 0, -1, -1, 50000 }; + +#undef DEBUG +#define DEBUG(...) \ + +//printf(__VA_ARGS__); fflush(NULL) + +#define LOGTREE_ROOT_PAGE SLOTTED_PAGE + +//LSM_ROOT_PAGE + +const int64_t logtree::DEPTH = 0; //in root this is the slot num where the DEPTH (of tree) is stored +const int64_t logtree::COMPARATOR = 1; //in root this is the slot num where the COMPARATOR id is stored +const int64_t logtree::FIRST_SLOT = 2; //this is the first unused slot in all index pages +const size_t logtree::root_rec_size = sizeof(int64_t); +const int64_t logtree::PREV_LEAF = 0; //pointer to prev leaf page +const int64_t logtree::NEXT_LEAF = 1; //pointer to next leaf page + + + +logtree::logtree() +{ + +} + +void logtree::free_region_rid(int xid, recordid tree, + logtree_page_deallocator_t dealloc, void *allocator_state) +{ + // Tdealloc(xid,tree); + dealloc(xid,allocator_state); + // XXX fishy shouldn't caller do this? 
+ Tdealloc(xid, *(recordid*)allocator_state); +} + + +void logtree::dealloc_region_rid(int xid, void *conf) +{ + recordid rid = *(recordid*)conf; + RegionAllocConf_t a; + Tread(xid,rid,&a); + DEBUG("{%lld <- dealloc region arraylist}\n", a.regionList.page); + + for(int i = 0; i < a.regionCount; i++) { + a.regionList.slot = i; + pageid_t pid; + Tread(xid,a.regionList,&pid); + TregionDealloc(xid,pid); + } +} + + +void logtree::force_region_rid(int xid, void *conf) +{ + recordid rid = *(recordid*)conf; + RegionAllocConf_t a; + Tread(xid,rid,&a); + + for(int i = 0; i < a.regionCount; i++) + { + a.regionList.slot = i; + pageid_t pid; + Tread(xid,a.regionList,&pid); + stasis_dirty_page_table_flush_range((stasis_dirty_page_table_t*)stasis_runtime_dirty_page_table(), pid, pid+a.regionSize); + forcePageRange(pid, pid+a.regionSize); + } +} + + +pageid_t logtree::alloc_region(int xid, void *conf) +{ + RegionAllocConf_t* a = (RegionAllocConf_t*)conf; + + + if(a->nextPage == a->endOfRegion) { + if(a->regionList.size == -1) { + //DEBUG("nextPage: %lld\n", a->nextPage); + a->regionList = TarrayListAlloc(xid, 1, 4, sizeof(pageid_t)); + DEBUG("regionList.page: %lld\n", a->regionList.page); + DEBUG("regionList.slot: %d\n", a->regionList.slot); + DEBUG("regionList.size: %lld\n", a->regionList.size); + + a->regionCount = 0; + } + DEBUG("{%lld <- alloc region arraylist}\n", a->regionList.page); + TarrayListExtend(xid,a->regionList,1); + a->regionList.slot = a->regionCount; + DEBUG("region lst slot %d\n",a->regionList.slot); + a->regionCount++; + DEBUG("region count %lld\n",a->regionCount); + a->nextPage = TregionAlloc(xid, a->regionSize,12); + DEBUG("next page %lld\n",a->nextPage); + a->endOfRegion = a->nextPage + a->regionSize; + Tset(xid,a->regionList,&a->nextPage); + DEBUG("next page %lld\n",a->nextPage); + } + + DEBUG("%lld ?= %lld\n", a->nextPage,a->endOfRegion); + pageid_t ret = a->nextPage; + // Ensure the page is in buffer cache without accessing disk (this + // sets it to 
clean and all zeros if the page is not in cache). + // Hopefully, future reads will get a cache hit, and avoid going to + // disk. + + Page * p = loadUninitializedPage(xid, ret); + releasePage(p); + DEBUG("ret %lld\n",ret); + (a->nextPage)++; + return ret; + +} + +pageid_t logtree::alloc_region_rid(int xid, void * ridp) { + recordid rid = *(recordid*)ridp; + RegionAllocConf_t conf; + Tread(xid,rid,&conf); + pageid_t ret = alloc_region(xid,&conf); + //DEBUG("{%lld <- alloc region extend}\n", conf.regionList.page); + // XXX get rid of Tset by storing next page in memory, and losing it + // on crash. + Tset(xid,rid,&conf); + return ret; +} + + + +recordid logtree::create(int xid) +{ + + tree_state = Talloc(xid,sizeof(RegionAllocConf_t)); + + //int ptype = TpageGetType(xid, tree_state.page); + //DEBUG("page type %d\n", ptype); //returns a slotted page + + Tset(xid,tree_state, ®ION_ALLOC_STATIC_INITIALIZER); + + pageid_t root = alloc_region_rid(xid, &tree_state); + DEBUG("Root = %lld\n", root); + recordid ret = { root, 0, 0 }; + + Page *p = loadPage(xid, ret.page); + writelock(p->rwlatch,0); + + stasis_page_slotted_initialize_page(p); + + //*stasis_page_type_ptr(p) = SLOTTED_PAGE; //LOGTREE_ROOT_PAGE; + + //logtree_state *state = (logtree_state*) ( malloc(sizeof(logtree_state))); + //state->lastLeaf = -1; + + //p->impl = state; + lastLeaf = -1; + + //initialize root node + recordid tmp = stasis_record_alloc_begin(xid, p, root_rec_size); + stasis_record_alloc_done(xid,p,tmp); + + assert(tmp.page == ret.page + && tmp.slot == DEPTH + && tmp.size == root_rec_size); + + writeRecord(xid, p, tmp, (byte*)&DEPTH, root_rec_size); + + tmp = stasis_record_alloc_begin(xid, p, root_rec_size); + stasis_record_alloc_done(xid,p,tmp); + + assert(tmp.page == ret.page + && tmp.slot == COMPARATOR + && tmp.size == root_rec_size); + + writeRecord(xid, p, tmp, (byte*)&COMPARATOR, root_rec_size); + + + unlock(p->rwlatch); + releasePage(p); + + root_rec = ret; + + return ret; +} + + +/** + * 
TODO: what happen if there is already such a record with a different size? + * I guess this should never happen in rose, but what if? + **/ +void logtree::writeRecord(int xid, Page *p, recordid &rid, + const byte *data, size_t datalen) +{ + byte *byte_arr = stasis_record_write_begin(xid, p, rid); + memcpy(byte_arr, data, datalen); //TODO: stasis write call + stasis_record_write_done(xid, p, rid, byte_arr); + stasis_page_lsn_write(xid, p, 0); // XXX need real LSN? + +} + +void logtree::writeNodeRecord(int xid, Page * p, recordid & rid, + const byte *key, size_t keylen, pageid_t ptr) +{ + DEBUG("writenoderecord:\tp->id\t%lld\tkey:\t%s\tkeylen: %d\tval_page\t%lld\n", + p->id, datatuple::key_to_str(key).c_str(), keylen, ptr); + indexnode_rec *nr = (indexnode_rec*)stasis_record_write_begin(xid, p, rid); + nr->ptr = ptr; + memcpy(nr+1, key, keylen); + stasis_record_write_done(xid, p, rid, (byte*)nr); + stasis_page_lsn_write(xid, p, 0); // XXX need real LSN? +} + +void logtree::writeRecord(int xid, Page *p, slotid_t slot, + const byte *data, size_t datalen) +{ + recordid rid; + rid.page = p->id; + rid.slot = slot; + rid.size = datalen; + byte *byte_arr = stasis_record_write_begin(xid, p, rid); + memcpy(byte_arr, data, datalen); //TODO: stasis write call + stasis_record_write_done(xid, p, rid, byte_arr); + stasis_page_lsn_write(xid, p, 0); // XXX need real LSN? 
+ +} + +const byte* logtree::readRecord(int xid, Page * p, recordid &rid) +{ + //byte *ret = (byte*)malloc(rid.size); + //const byte *nr = stasis_record_read_begin(xid,p,rid); + //memcpy(ret, nr, rid.size); + //stasis_record_read_done(xid,p,rid,nr); + + const byte *nr = stasis_record_read_begin(xid,p,rid); + return nr; + + //DEBUG("reading {%lld, %d, %d}\n", + // p->id, rid.slot, rid.size ); + + //return ret; +} + +const byte* logtree::readRecord(int xid, Page * p, slotid_t slot, int64_t size) +{ + recordid rid; + rid.page = p->id; + rid.slot = slot; + rid.size = size; + //byte *ret = (byte*)malloc(rid.size); + //stasis_record_read(xid,p,rid,ret); + //return ret; + const byte *nr = stasis_record_read_begin(xid,p,rid); + return nr; +// return readRecord(xid, p, rid); + +} + +int32_t logtree::readRecordLength(int xid, Page *p, slotid_t slot) +{ + recordid rec = {p->id, slot, 0}; + int32_t reclen = stasis_record_length_read(xid, p, rec); + return reclen; +} + +void logtree::initializeNodePage(int xid, Page *p) +{ + stasis_page_slotted_initialize_page(p); + recordid reserved1 = stasis_record_alloc_begin(xid, p, sizeof(indexnode_rec)); + stasis_record_alloc_done(xid, p, reserved1); + recordid reserved2 = stasis_record_alloc_begin(xid, p, sizeof(indexnode_rec)); + stasis_record_alloc_done(xid, p, reserved2); +} + + +recordid logtree::appendPage(int xid, recordid tree, pageid_t & rmLeafID, + const byte *key, size_t keySize, + lsm_page_allocator_t allocator, void *allocator_state, + long val_page) +{ + Page *p = loadPage(xid, tree.page); + writelock(p->rwlatch, 0); + //logtree_state *s = (logtree_state*)p->impl; + + tree.slot = 0; + //tree.size = sizeof(lsmTreeNodeRecord)+keySize; + + const indexnode_rec *nr = (const indexnode_rec*)readRecord(xid, p , DEPTH, 0); + int64_t depth = *((int64_t*)nr); + + if(rmLeafID == -1) { + rmLeafID = findLastLeaf(xid, p, depth); + } + + Page *lastLeaf; + + if(rmLeafID != tree.page) + { + lastLeaf= loadPage(xid, rmLeafID); + 
writelock(lastLeaf->rwlatch, 0); + } else + lastLeaf = p; + + + recordid ret = stasis_record_alloc_begin(xid, lastLeaf, + sizeof(indexnode_rec)+keySize); + + if(ret.size == INVALID_SLOT) + { + if(lastLeaf->id != p->id) + { + assert(rmLeafID != tree.page); + unlock(lastLeaf->rwlatch); + releasePage(lastLeaf); // don't need that page anymore... + lastLeaf = 0; + } + // traverse down the root of the tree. + + tree.slot = 0; + + assert(tree.page == p->id); + + ret = appendInternalNode(xid, p, depth, key, keySize, val_page, + rmLeafID == tree.page ? -1 : rmLeafID, + allocator, allocator_state); + + if(ret.size == INVALID_SLOT) + { + DEBUG("Need to split root; depth = %d\n", depth); + + pageid_t child = allocator(xid, allocator_state); + Page *lc = loadPage(xid, child); + writelock(lc->rwlatch,0); + + initializeNodePage(xid, lc); + + //creates a copy of the root page records in the + //newly allocated child page + for(int i = FIRST_SLOT; i < *stasis_page_slotted_numslots_ptr(p); i++) + { + //read the record from the root page + const indexnode_rec *nr = (const indexnode_rec*)readRecord(xid,p,i,0); + int reclen = readRecordLength(xid, p, i); + + recordid cnext = stasis_record_alloc_begin(xid, lc,reclen); + + assert(i == cnext.slot); + assert(cnext.size != INVALID_SLOT); + + stasis_record_alloc_done(xid, lc, cnext); + + writeRecord(xid,lc,i,(byte*)(nr),reclen); + } + + // deallocate old entries, and update pointer on parent node. + // NOTE: stasis_record_free call goes to slottedFree in slotted.c + // this function only reduces the numslots when you call it + // with the last slot. so thats why i go backwards here. + for(int i = *stasis_page_slotted_numslots_ptr(p)-1; i>FIRST_SLOT; i--) + { + const indexnode_rec *nr = (const indexnode_rec*)readRecord(xid,p,i,0); + int reclen = readRecordLength(xid, p, i); + recordid tmp_rec= {p->id, i, reclen}; + stasis_record_free(xid, p, tmp_rec); + } + + //TODO: could change with stasis_slotted_page_initialize(...); + // reinsert first. 
+ + recordid pFirstSlot = { p->id, FIRST_SLOT, readRecordLength(xid, p, FIRST_SLOT)}; + + assert(*stasis_page_slotted_numslots_ptr(p) == FIRST_SLOT+1); + + indexnode_rec *nr + = (indexnode_rec*)stasis_record_write_begin(xid, p, pFirstSlot); + + // don't overwrite key... + nr->ptr = child; + stasis_record_write_done(xid,p,pFirstSlot,(byte*)nr); + stasis_page_lsn_write(xid, p, 0); // XXX need real LSN? + + if(!depth) { + rmLeafID = lc->id; + pageid_t tmpid = -1; + writeRecord(xid,lc,PREV_LEAF,(byte*)(&tmpid), root_rec_size); + writeRecord(xid,lc,NEXT_LEAF,(byte*)(&tmpid), root_rec_size); + } + + unlock(lc->rwlatch); + releasePage(lc); + + //update the depth info at the root + depth ++; + writeRecord(xid,p,DEPTH,(byte*)(&depth), root_rec_size); + + assert(tree.page == p->id); + ret = appendInternalNode(xid, p, depth, key, keySize, val_page, + rmLeafID == tree.page ? -1 : rmLeafID, + allocator, allocator_state); + + assert(ret.size != INVALID_SLOT); + + } + else { + DEBUG("Appended new internal node tree depth = %lld key = %s\n", + depth, datatuple::key_to_str(key).c_str()); + } + + rmLeafID = ret.page; + DEBUG("lastleaf is %lld\n", rmLeafID); + + + } + else + { + // write the new value to an existing page + DEBUG("Writing %s\t%d to existing page# %lld\n", datatuple::key_to_str(key).c_str(), + val_page, lastLeaf->id); + + stasis_record_alloc_done(xid, lastLeaf, ret); + + logtree::writeNodeRecord(xid, lastLeaf, ret, key, keySize, val_page); + + if(lastLeaf->id != p->id) { + assert(rmLeafID != tree.page); + unlock(lastLeaf->rwlatch); + releasePage(lastLeaf); + } + } + + unlock(p->rwlatch); + releasePage(p); + + return ret; +} + +/* adding pages: + + 1) Try to append value to lsmTreeState->lastLeaf + + 2) If that fails, traverses down the root of the tree, split pages while + traversing back up. 
+ + 3) Split is done by adding new page at end of row (no key + redistribution), except at the root, where root contents are + pushed into the first page of the next row, and a new path from root to + leaf is created starting with the root's immediate second child. + +*/ + +recordid logtree::appendInternalNode(int xid, Page *p, + int64_t depth, + const byte *key, size_t key_len, + pageid_t val_page, pageid_t lastLeaf, + logtree_page_allocator_t allocator, + void *allocator_state) +{ +// assert(*stasis_page_type_ptr(p) == LOGTREE_ROOT_PAGE || +// *stasis_page_type_ptr(p) == SLOTTED_PAGE); + assert(p->pageType == LOGTREE_ROOT_PAGE || + p->pageType == SLOTTED_PAGE); + + DEBUG("appendInternalNode\tdepth %lldkeylen%d\tnumslots %d\n", depth, key_len, *stasis_page_slotted_numslots_ptr(p)); + + if(!depth) + { + // leaf node. + recordid ret = stasis_record_alloc_begin(xid, p, sizeof(indexnode_rec)+key_len); + if(ret.size != INVALID_SLOT) { + stasis_record_alloc_done(xid, p, ret); + writeNodeRecord(xid,p,ret,key,key_len,val_page); + } + return ret; + } + else + { + // recurse + int slot = *stasis_page_slotted_numslots_ptr(p)-1;//*recordcount_ptr(p)-1; + + assert(slot >= FIRST_SLOT); // there should be no empty nodes + const indexnode_rec *nr = (const indexnode_rec*)readRecord(xid, p, slot, 0); + pageid_t child_id = nr->ptr; + nr = 0; + recordid ret; + { + Page *child_page = loadPage(xid, child_id); + writelock(child_page->rwlatch,0); + ret = appendInternalNode(xid, child_page, depth-1, key, key_len, + val_page, lastLeaf, allocator, allocator_state); + + unlock(child_page->rwlatch); + releasePage(child_page); + } + + if(ret.size == INVALID_SLOT) // subtree is full; split + { + ret = stasis_record_alloc_begin(xid, p, sizeof(indexnode_rec)+key_len); + DEBUG("keylen %d\tnumslots %d for page id %lld ret.size %lld prv rec len %d\n", + key_len, + *stasis_page_slotted_numslots_ptr(p), + p->id, + ret.size, + readRecordLength(xid, p, slot)); + if(ret.size != INVALID_SLOT) + { + 
stasis_record_alloc_done(xid, p, ret); + ret = buildPathToLeaf(xid, ret, p, depth, key, key_len, val_page, + lastLeaf, allocator, allocator_state); + + DEBUG("split tree rooted at %lld, wrote value to {%d %d %lld}\n", + p->id, ret.page, ret.slot, ret.size); + } else { + // ret is NULLRID; this is the root of a full tree. Return + // NULLRID to the caller. + } + } else { + // we inserted the value in to a subtree rooted here. + } + return ret; + } +} + +recordid logtree::buildPathToLeaf(int xid, recordid root, Page *root_p, + int64_t depth, const byte *key, size_t key_len, + pageid_t val_page, pageid_t lastLeaf, + logtree_page_allocator_t allocator, + void *allocator_state) +{ + + // root is the recordid on the root page that should point to the + // new subtree. + assert(depth); + DEBUG("buildPathToLeaf(depth=%lld) (lastleaf=%lld) called\n",depth, lastLeaf); + + pageid_t child = allocator(xid,allocator_state); + DEBUG("new child = %lld internal? %lld\n", child, depth-1); + + Page *child_p = loadPage(xid, child); + writelock(child_p->rwlatch,0); + initializeNodePage(xid, child_p); + + recordid ret; + + if(depth-1) { + // recurse: the page we just allocated is not a leaf. 
+ recordid child_rec = stasis_record_alloc_begin(xid, child_p, sizeof(indexnode_rec)+key_len); + assert(child_rec.size != INVALID_SLOT); + stasis_record_alloc_done(xid, child_p, child_rec); + + ret = buildPathToLeaf(xid, child_rec, child_p, depth-1, key, key_len, + val_page,lastLeaf, allocator, allocator_state); + + unlock(child_p->rwlatch); + releasePage(child_p); + + } else { + // set leaf + + // backward link.//these writes do not need alloc_begin as it is done in page initialization + writeRecord(xid, child_p, PREV_LEAF, (byte*)(&lastLeaf), root_rec_size); + //writeNodeRecord(xid,child_p,PREV_LEAF,dummy,key_len,lastLeaf); + + // forward link (initialize to -1) + + pageid_t tmp_pid = -1; + writeRecord(xid, child_p, NEXT_LEAF, (byte*)(&tmp_pid), root_rec_size); + //writeNodeRecord(xid,child_p,NEXT_LEAF,dummy,key_len,-1); + + recordid leaf_rec = stasis_record_alloc_begin(xid, child_p, + sizeof(indexnode_rec)+key_len); + + assert(leaf_rec.slot == FIRST_SLOT); + + stasis_record_alloc_done(xid, child_p, leaf_rec); + writeNodeRecord(xid,child_p,leaf_rec,key,key_len,val_page); + + ret = leaf_rec; + + unlock(child_p->rwlatch); + releasePage(child_p); + if(lastLeaf != -1) + { + // install forward link in previous page + Page *lastLeafP = loadPage(xid, lastLeaf); + writelock(lastLeafP->rwlatch,0); + writeRecord(xid,lastLeafP,NEXT_LEAF,(byte*)(&child),root_rec_size); + unlock(lastLeafP->rwlatch); + releasePage(lastLeafP); + } + + DEBUG("%lld <-> %lld\n", lastLeaf, child); + } + + writeNodeRecord(xid, root_p, root, key, key_len, child); + + return ret; + +} + + + +/** + * Traverse from the root of the page to the right most leaf (the one + * with the higest base key value). 
+ **/ +pageid_t logtree::findLastLeaf(int xid, Page *root, int64_t depth) +{ + if(!depth) + { + DEBUG("Found last leaf = %lld\n", root->id); + return root->id; + } + else + { + const indexnode_rec *nr = (indexnode_rec*) readRecord(xid, root, + (*stasis_page_slotted_numslots_ptr(root))-1, 0); + pageid_t ret; + + Page *p = loadPage(xid, nr->ptr); + readlock(p->rwlatch,0); + ret = findLastLeaf(xid,p,depth-1); + unlock(p->rwlatch); + releasePage(p); + + return ret; + } +} + + +/** + * Traverse from the root of the tree to the left most (lowest valued + * key) leaf. + */ +pageid_t logtree::findFirstLeaf(int xid, Page *root, int64_t depth) +{ + if(!depth) //if depth is 0, then returns the id of the page + return root->id; + else + { + const indexnode_rec *nr = (indexnode_rec*)readRecord(xid,root,FIRST_SLOT,0); + Page *p = loadPage(xid, nr->ptr); + readlock(p->rwlatch,0); + pageid_t ret = findFirstLeaf(xid,p,depth-1); + unlock(p->rwlatch); + releasePage(p); + return ret; + } +} + + +pageid_t logtree::findPage(int xid, recordid tree, const byte *key, size_t keySize) +{ + Page *p = loadPage(xid, tree.page); + readlock(p->rwlatch,0); + + const indexnode_rec *depth_nr = (const indexnode_rec*)readRecord(xid, p , DEPTH, 0); + + int64_t depth = *((int64_t*)depth_nr); + + recordid rid = lookup(xid, p, depth, key, keySize); + pageid_t ret = lookupLeafPageFromRid(xid,rid);//,keySize); + unlock(p->rwlatch); + releasePage(p); + + return ret; + +} + +pageid_t logtree::lookupLeafPageFromRid(int xid, recordid rid) +{ + pageid_t pid = -1; + if(rid.page != NULLRID.page || rid.slot != NULLRID.slot) + { + Page * p2 = loadPage(xid, rid.page); + readlock(p2->rwlatch,0); + pid = ((const indexnode_rec*)(readRecord(xid,p2,rid.slot,0)))->ptr; + unlock(p2->rwlatch); + releasePage(p2); + } + return pid; +} + + +recordid logtree::lookup(int xid, + Page *node, + int64_t depth, + const byte *key, size_t keySize ) +{ + //DEBUG("lookup: pid %lld\t depth %lld\n", node->id, depth); + 
if(*stasis_page_slotted_numslots_ptr(node) == FIRST_SLOT) + return NULLRID; + + assert(*stasis_page_slotted_numslots_ptr(node) > FIRST_SLOT); + + int match = FIRST_SLOT; + + // don't need to compare w/ first item in tree. + const indexnode_rec * rec = (indexnode_rec*)readRecord(xid,node,FIRST_SLOT,0); //TODO: why read it then? + + for(int i = FIRST_SLOT+1; i < *stasis_page_slotted_numslots_ptr(node); i++) + { + rec = (const indexnode_rec*)readRecord(xid,node,i,0); + int cmpval = datatuple::compare((datatuple::key_t) (rec+1),(datatuple::key_t) key); + if(cmpval>0) //changed it from > + break; + match = i; + } + + + if(depth) + { + pageid_t child_id = ((const indexnode_rec*)readRecord(xid,node,match,0))->ptr; + Page* child_page = loadPage(xid, child_id); + readlock(child_page->rwlatch,0); + recordid ret = lookup(xid,child_page,depth-1,key,0); + unlock(child_page->rwlatch); + releasePage(child_page); + return ret; + } + else + { + recordid ret = {node->id, match, keySize}; + return ret; + } +} + + +void logtree::print_tree(int xid) +{ + Page *p = loadPage(xid, root_rec.page); + readlock(p->rwlatch,0); + + const indexnode_rec *depth_nr = (const indexnode_rec*)readRecord(xid, p , DEPTH, 0); + + int64_t depth = *((int64_t*)depth_nr); + + print_tree(xid, root_rec.page, depth); + + unlock(p->rwlatch); + releasePage(p); + +} + +void logtree::print_tree(int xid, pageid_t pid, int64_t depth) +{ + + Page *node = loadPage(xid, pid); + readlock(node->rwlatch,0); + + //const indexnode_rec *depth_nr = (const indexnode_rec*)readRecord(xid, p , DEPTH, 0); + + printf("page_id:%lld\tnum_slots:%d\t\n", node->id, *stasis_page_slotted_numslots_ptr(node)); + + if(*stasis_page_slotted_numslots_ptr(node) == FIRST_SLOT) + return; + + assert(*stasis_page_slotted_numslots_ptr(node) > FIRST_SLOT); + + if(depth) + { + printf("\tnot_leaf\n"); + + for(int i = FIRST_SLOT; i < *stasis_page_slotted_numslots_ptr(node); i++) + { + const indexnode_rec *nr = (const 
indexnode_rec*)readRecord(xid,node,i,0); + printf("\tchild_page_id:%lld\tkey:%s\n", nr->ptr, + datatuple::key_to_str((byte*)(nr+1)).c_str()); + + } + + for(int i = FIRST_SLOT; i < *stasis_page_slotted_numslots_ptr(node); i++) + { + const indexnode_rec *nr = (const indexnode_rec*)readRecord(xid,node,i,0); + print_tree(xid, nr->ptr, depth-1); + + } + + } + else + { + printf("\tis_leaf\t\n"); + const indexnode_rec *nr = (const indexnode_rec*)readRecord(xid,node,FIRST_SLOT,0); + printf("\tdata_page_id:%lld\tkey:%s\n", nr->ptr, + datatuple::key_to_str((byte*)(nr+1)).c_str()); + printf("\t...\n"); + nr = (const indexnode_rec*)readRecord(xid,node,(*stasis_page_slotted_numslots_ptr(node))-1,0); + printf("\tdata_page_id:%lld\tkey:%s\n", nr->ptr, + datatuple::key_to_str((byte*)(nr+1)).c_str()); + + + } + + + unlock(node->rwlatch); + releasePage(node); + + +} + +///////////////////////////////////////////////////////////////// +// LOG TABLE IMPLEMENTATION +///////////////////////////////////////////////////////////////// + +template class DataPage; + + +logtable::logtable() +{ + + tree_c0 = NULL; + tree_c1 = NULL; + tree_c2 = NULL; +// rbtree_mut = NULL; + this->mergedata = 0; + fixed_page_count = -1; + //tmerger = new tuplemerger(&append_merger); + tmerger = new tuplemerger(&replace_merger); + + tsize = 0; + tree_bytes = 0; + + +} + +logtable::~logtable() +{ + if(tree_c1 != NULL) + delete tree_c1; + if(tree_c2 != NULL) + delete tree_c2; + + if(tree_c0 != NULL) + { + for(rbtree_t::iterator delitr=tree_c0->begin(); + delitr != tree_c0->end(); delitr++) + free((*delitr).keylen); + + delete tree_c0; + } + + delete tmerger; + + /* + if(rbtree_mut) + delete rbtree_mut; + if(tree_c0) + delete tree_c0; + if(input_needed) + delete input_needed; + */ +} + +recordid logtable::allocTable(int xid) +{ + + table_rec = Talloc(xid, sizeof(tbl_header)); + + //create the big tree + tree_c2 = new logtree(); + tree_c2->create(xid); + + tbl_header.c2_dp_state = Talloc(xid, 
sizeof(RegionAllocConf_t)); + Tset(xid, tbl_header.c2_dp_state, &DATAPAGE_REGION_ALLOC_STATIC_INITIALIZER); + + + //create the small tree + tree_c1 = new logtree(); + tree_c1->create(xid); + tbl_header.c1_dp_state = Talloc(xid, sizeof(RegionAllocConf_t)); + Tset(xid, tbl_header.c1_dp_state, &DATAPAGE_REGION_ALLOC_STATIC_INITIALIZER); + + tbl_header.c2_root = tree_c2->get_root_rec(); + tbl_header.c2_state = tree_c2->get_tree_state(); + tbl_header.c1_root = tree_c1->get_root_rec(); + tbl_header.c1_state = tree_c1->get_tree_state(); + + Tset(xid, table_rec, &tbl_header); + + return table_rec; +} + +void logtable::flushTable() +{ + struct timeval start_tv, stop_tv; + double start, stop; + + static double last_start; + static bool first = 1; + static int merge_count = 0; + + gettimeofday(&start_tv,0); + start = tv_to_double(start_tv); + + + writelock(mergedata->header_lock,0); + pthread_mutex_lock(mergedata->rbtree_mut); + + int expmcount = merge_count; + + + //this is for waiting the previous merger of the mem-tree + //hopefullly this wont happen + printf("prv merge not complete\n"); + + + while(*mergedata->old_c0) { + unlock(mergedata->header_lock); +// pthread_mutex_lock(mergedata->rbtree_mut); + if(tree_bytes >= MAX_C0_SIZE) + pthread_cond_wait(mergedata->input_needed_cond, mergedata->rbtree_mut); + else + { + pthread_mutex_unlock(mergedata->rbtree_mut); + return; + } + + + pthread_mutex_unlock(mergedata->rbtree_mut); + + writelock(mergedata->header_lock,0); + pthread_mutex_lock(mergedata->rbtree_mut); + + if(expmcount != merge_count) + { + unlock(mergedata->header_lock); + pthread_mutex_unlock(mergedata->rbtree_mut); + return; + } + + } + + printf("prv merge complete\n"); + + gettimeofday(&stop_tv,0); + stop = tv_to_double(stop_tv); + + //rbtree_ptr *tmp_ptr = new rbtree_ptr_t; //(typeof(h->scratch_tree)*) malloc(sizeof(void*)); + //*tmp_ptr = tree_c0; + *(mergedata->old_c0) = tree_c0; + +// pthread_mutex_lock(mergedata->rbtree_mut); + 
pthread_cond_signal(mergedata->input_ready_cond); +// pthread_mutex_unlock(mergedata->rbtree_mut); + + merge_count ++; + tree_c0 = new rbtree_t; + tsize = 0; + tree_bytes = 0; + + pthread_mutex_unlock(mergedata->rbtree_mut); + unlock(mergedata->header_lock); + if(first) + { + printf("flush waited %f sec\n", stop-start); + first = 0; + } + else + { + printf("flush waited %f sec (worked %f)\n", + stop-start, start-last_start); + } + last_start = stop; + +} + +datatuple * logtable::findTuple(int xid, datatuple::key_t key, size_t keySize) +{ + //prepare a search tuple + datatuple search_tuple; + search_tuple.keylen = (uint32_t*)malloc(sizeof(uint32_t)); + *(search_tuple.keylen) = keySize; + search_tuple.key = key; + + readlock(mergedata->header_lock,0); + pthread_mutex_lock(mergedata->rbtree_mut); + + datatuple *ret_tuple=0; + + //step 1: look in tree_c0 + rbtree_t::iterator rbitr = tree_c0->find(search_tuple); + if(rbitr != tree_c0->end()) + { + DEBUG("tree_c0 size %d\n", tree_c0->size()); + datatuple tuple = *rbitr; + byte *barr = (byte*)malloc(tuple.byte_length()); + memcpy(barr, (byte*)tuple.keylen, tuple.byte_length()); + ret_tuple = datatuple::from_bytes(barr); + } + + bool done = false; + //step: 2 look into first in tree if exists (a first level merge going on) + if(*(mergedata->old_c0) != 0) + { + DEBUG("old mem tree not null %d\n", (*(mergedata->old_c0))->size()); + rbitr = (*(mergedata->old_c0))->find(search_tuple); + if(rbitr != (*(mergedata->old_c0))->end()) + { + datatuple tuple = *rbitr; + + if(tuple.isDelete()) //tuple deleted + done = true; //return ret_tuple + else if(ret_tuple != 0) //merge the two + { + datatuple *mtuple = tmerger->merge(&tuple, ret_tuple); //merge the two + free(ret_tuple->keylen); //free tuple from current tree + free(ret_tuple); + ret_tuple = mtuple; //set return tuple to merge result + } + else //key first found in old mem tree + { + byte *barr = (byte*)malloc(tuple.byte_length()); + memcpy(barr, (byte*)tuple.keylen, 
tuple.byte_length()); + ret_tuple = datatuple::from_bytes(barr); + } + //we cannot free tuple from old-tree 'cos it is not a copy + } + } + + //release the memtree lock + pthread_mutex_unlock(mergedata->rbtree_mut); + + //step 3: check c1 + if(!done) + { + datatuple *tuple_c1 = findTuple(xid, key, keySize, tree_c1); + if(tuple_c1 != NULL) + { + bool use_copy = false; + if(tuple_c1->isDelete()) //tuple deleted + done = true; + else if(ret_tuple != 0) //merge the two + { + datatuple *mtuple = tmerger->merge(tuple_c1, ret_tuple); //merge the two + free(ret_tuple->keylen); //free tuple from before + free(ret_tuple); + ret_tuple = mtuple; //set return tuple to merge result + } + else //found for the first time + { + use_copy = true; + ret_tuple = tuple_c1; + //byte *barr = (byte*)malloc(tuple_c1->byte_length()); + //memcpy(barr, (byte*)tuple_c1->keylen, tuple_c1->byte_length()); + //ret_tuple = datatuple::from_bytes(barr); + } + + if(!use_copy) + { + free(tuple_c1->keylen); //free tuple from tree c1 + free(tuple_c1); + } + } + } + + //step 4: check old c1 if exists + if(!done && *(mergedata->diskmerge_args->in_tree) != 0) + { + DEBUG("old c1 tree not null\n"); + datatuple *tuple_oc1 = findTuple(xid, key, keySize, + (logtree*)( *(mergedata->diskmerge_args->in_tree))); + + if(tuple_oc1 != NULL) + { + bool use_copy = false; + if(tuple_oc1->isDelete()) + done = true; + else if(ret_tuple != 0) //merge the two + { + datatuple *mtuple = tmerger->merge(tuple_oc1, ret_tuple); //merge the two + free(ret_tuple->keylen); //free tuple from before + free(ret_tuple); + ret_tuple = mtuple; //set return tuple to merge result + } + else //found for the first time + { + use_copy = true; + ret_tuple = tuple_oc1; + //byte *barr = (byte*)malloc(tuple_oc1->byte_length()); + //memcpy(barr, (byte*)tuple_oc1->keylen, tuple_oc1->byte_length()); + //ret_tuple = datatuple::from_bytes(barr); + } + + if(!use_copy) + { + free(tuple_oc1->keylen); //free tuple from tree old c1 + free(tuple_oc1); + } + } 
+ } + + //step 5: check c2 + if(!done) + { + DEBUG("Not in old first disk tree\n"); + datatuple *tuple_c2 = findTuple(xid, key, keySize, tree_c2); + + if(tuple_c2 != NULL) + { + bool use_copy = false; + if(tuple_c2->isDelete()) + done = true; + else if(ret_tuple != 0) + { + datatuple *mtuple = tmerger->merge(tuple_c2, ret_tuple); //merge the two + free(ret_tuple->keylen); //free tuple from before + free(ret_tuple); + ret_tuple = mtuple; //set return tuple to merge result + } + else //found for the first time + { + use_copy = true; + ret_tuple = tuple_c2; + //byte *barr = (byte*)malloc(tuple_c2->byte_length()); + //memcpy(barr, (byte*)tuple_c2->keylen, tuple_c2->byte_length()); + //ret_tuple = datatuple::from_bytes(barr); + } + + if(!use_copy) + { + free(tuple_c2->keylen); //free tuple from tree c2 + free(tuple_c2); + } + } + } + + //pthread_mutex_unlock(mergedata->rbtree_mut); + unlock(mergedata->header_lock); + free(search_tuple.keylen); + + return ret_tuple; + +} + +/* + * returns the first record found with the matching key + * (not to be used together with diffs) + **/ +datatuple * logtable::findTuple_first(int xid, datatuple::key_t key, size_t keySize) +{ + //prepare a search tuple + datatuple search_tuple; + search_tuple.keylen = (uint32_t*)malloc(sizeof(uint32_t)); + *(search_tuple.keylen) = keySize; + search_tuple.key = key; + + pthread_mutex_lock(mergedata->rbtree_mut); + + datatuple *ret_tuple=0; + //step 1: look in tree_c0 + + rbtree_t::iterator rbitr = tree_c0->find(search_tuple); + if(rbitr != tree_c0->end()) + { + DEBUG("tree_c0 size %d\n", tree_c0->size()); + datatuple tuple = *rbitr; + byte *barr = (byte*)malloc(tuple.byte_length()); + memcpy(barr, (byte*)tuple.keylen, tuple.byte_length()); + ret_tuple = datatuple::from_bytes(barr); + + } + else + { + DEBUG("Not in mem tree %d\n", tree_c0->size()); + //step: 2 look into first in tree if exists (a first level merge going on) + if(*(mergedata->old_c0) != 0) + { + DEBUG("old mem tree not null %d\n", 
(*(mergedata->old_c0))->size()); + rbitr = (*(mergedata->old_c0))->find(search_tuple); + if(rbitr != (*(mergedata->old_c0))->end()) + { + datatuple tuple = *rbitr; + byte *barr = (byte*)malloc(tuple.byte_length()); + memcpy(barr, (byte*)tuple.keylen, tuple.byte_length()); + ret_tuple = datatuple::from_bytes(barr); + } + } + + if(ret_tuple == 0) + { + DEBUG("Not in old mem tree\n"); + + //step 3: check c1 + ret_tuple = findTuple(xid, key, keySize, tree_c1); + } + + if(ret_tuple == 0) + { + DEBUG("Not in first disk tree\n"); + + //step 4: check old c1 if exists + if( *(mergedata->diskmerge_args->in_tree) != 0) + { + DEBUG("old c1 tree not null\n"); + ret_tuple = findTuple(xid, key, keySize, + (logtree*)( *(mergedata->diskmerge_args->in_tree))); + } + + } + + if(ret_tuple == 0) + { + DEBUG("Not in old first disk tree\n"); + + //step 5: check c2 + ret_tuple = findTuple(xid, key, keySize, tree_c2); + } + } + + + + + pthread_mutex_unlock(mergedata->rbtree_mut); + free(search_tuple.keylen); + + return ret_tuple; + +} + +void logtable::insertTuple(struct datatuple &tuple) +{ + //static int count = LATCH_INTERVAL; + //static int tsize = 0; //number of tuples + //static int64_t tree_bytes = 0; //number of bytes + static const size_t isize = sizeof(uint32_t); + + //lock the red-black tree + readlock(mergedata->header_lock,0); + pthread_mutex_lock(mergedata->rbtree_mut); + //find the previous tuple with same key in the memtree if exists + rbtree_t::iterator rbitr = tree_c0->find(tuple); + if(rbitr != tree_c0->end()) + { + datatuple pre_t = *rbitr; + //do the merging + datatuple *new_t = tmerger->merge(&pre_t, &tuple); + tree_c0->erase(pre_t); //remove the previous tuple + + tree_c0->insert( *new_t); //insert the new tuple + + //update the tree size (+ new_t size - pre_t size) + tree_bytes += (new_t->byte_length() - pre_t.byte_length()); + + free(pre_t.keylen); //free the previous tuple + free(new_t); // frees the malloc(sizeof(datatuple)) coming from merge + } + else //no 
tuple with same key exists in mem-tree + { + + //create a copy + datatuple t; + byte *arr = (byte*) malloc(tuple.byte_length()); + + t.keylen = (uint32_t*) arr; + *t.keylen = *tuple.keylen; + t.datalen = (uint32_t*) (arr+isize); + *t.datalen = *tuple.datalen; + t.key = (datatuple::key_t) (arr+isize+isize); + memcpy((byte*)t.key, (byte*)tuple.key, *t.keylen); + if(!tuple.isDelete()) + { + t.data = (datatuple::data_t) (arr+isize+isize+ *(t.keylen)); + memcpy((byte*)t.data, (byte*)tuple.data, *t.datalen); + } + else + t.data = 0; + + //insert tuple into the rbtree + tree_c0->insert(t); + tsize++; + tree_bytes += t.byte_length() + RB_TREE_OVERHEAD; + + } + + //flushing logic + /* + bool go = false; + if(tree_bytes >= MAX_C0_SIZE) + { + go = *mergedata->input_needed; + DEBUG("go %d\n", go); + } + */ + + if(tree_bytes >= MAX_C0_SIZE ) + { + DEBUG("tree size before merge %d tuples %lld bytes.\n", tsize, tree_bytes); + pthread_mutex_unlock(mergedata->rbtree_mut); + unlock(mergedata->header_lock); + flushTable(); + + readlock(mergedata->header_lock,0); + pthread_mutex_lock(mergedata->rbtree_mut); + + //tsize = 0; + //tree_bytes = 0; + + } + + //unlock + pthread_mutex_unlock(mergedata->rbtree_mut); + unlock(mergedata->header_lock); + + + DEBUG("tree size %d tuples %lld bytes.\n", tsize, tree_bytes); +} + + +DataPage* logtable::insertTuple(int xid, struct datatuple &tuple, recordid &dpstate, logtree *ltree) +{ + + //create a new data page + + DataPage * dp = 0; + + while(dp==0) + { + dp = new DataPage(xid, fixed_page_count, + &DataPage::dp_alloc_region_rid, + &dpstate ); + + //insert the record into the data page + if(!dp->append(xid, tuple)) + { + delete dp; + dp = 0; + } + } + + + RegionAllocConf_t alloc_conf; + //insert the record key and id of the first page of the datapage to the logtree + Tread(xid,ltree->get_tree_state(), &alloc_conf); + logtree::appendPage(xid, ltree->get_root_rec(), ltree->lastLeaf, + tuple.get_key(), + *tuple.keylen, + ltree->alloc_region, + 
&alloc_conf, + dp->get_start_pid() + ); + Tset(xid,ltree->get_tree_state(),&alloc_conf); + + + //return the datapage + return dp; +} + +datatuple * logtable::findTuple(int xid, datatuple::key_t key, size_t keySize, logtree *ltree) +{ + datatuple * tup=0; + + //find the datapage + pageid_t pid = ltree->findPage(xid, ltree->get_root_rec(), (byte*)key, keySize); + + if(pid!=-1) + { + DataPage * dp = new DataPage(xid, pid); + dp->recordRead(xid, key, keySize, &tup); + delete dp; + } + return tup; +} + + +///////////////////////////////////////////////// +//logtreeIterator implementation +///////////////////////////////////////////////// + +lladdIterator_t* logtreeIterator::open(int xid, recordid root) +{ + if(root.page == 0 && root.slot == 0 && root.size == -1) + return 0; + + Page *p = loadPage(xid,root.page); + readlock(p->rwlatch,0); + + //size_t keySize = getKeySize(xid,p); + DEBUG("ROOT_REC_SIZE %d\n", logtree::root_rec_size); + const byte * nr = logtree::readRecord(xid,p, + logtree::DEPTH, + logtree::root_rec_size); + int64_t depth = *((int64_t*)nr); + DEBUG("DEPTH = %lld\n", depth); + + pageid_t leafid = logtree::findFirstLeaf(xid, p, depth); + if(leafid != root.page) + { + unlock(p->rwlatch); + releasePage(p); + p = loadPage(xid,leafid); + readlock(p->rwlatch,0); + assert(depth != 0); + } + else + assert(depth == 0); + + + logtreeIterator_s *impl = (logtreeIterator_s*)malloc(sizeof(logtreeIterator_s)); + impl->p = p; + { + recordid rid = { p->id, 1, 0};//keySize }; //TODO: why does this start from 1? 
+ impl->current = rid; + } + //DEBUG("keysize = %d, slot = %d\n", keySize, impl->current.slot); + impl->t = 0; + impl->justOnePage = (depth == 0); + + lladdIterator_t *it = (lladdIterator_t*) malloc(sizeof(lladdIterator_t)); + it->type = -1; // XXX LSM_TREE_ITERATOR; + it->impl = impl; + return it; +} + +lladdIterator_t* logtreeIterator::openAt(int xid, recordid root, const byte* key) +{ + if(root.page == NULLRID.page && root.slot == NULLRID.slot) + return 0; + + Page *p = loadPage(xid,root.page); + readlock(p->rwlatch,0); + //size_t keySize = getKeySize(xid,p); + //assert(keySize); + const byte *nr = logtree::readRecord(xid,p,logtree::DEPTH, logtree::root_rec_size); + //const byte *cmp_nr = logtree::readRecord(xid, p , logtree::COMPARATOR, logtree::root_rec_size); + + int64_t depth = *((int64_t*)nr); + + recordid lsm_entry_rid = logtree::lookup(xid,p,depth,key,0);//keySize,comparators[cmp_nr->ptr]); + + if(lsm_entry_rid.page == NULLRID.page && lsm_entry_rid.slot == NULLRID.slot) { + unlock(p->rwlatch); + return 0; + } + assert(lsm_entry_rid.size != INVALID_SLOT); + + if(root.page != lsm_entry_rid.page) + { + unlock(p->rwlatch); + releasePage(p); + p = loadPage(xid,lsm_entry_rid.page); + readlock(p->rwlatch,0); + } + + logtreeIterator_s *impl = (logtreeIterator_s*) malloc(sizeof(logtreeIterator_s)); + impl->p = p; + + impl->current.page = lsm_entry_rid.page; + impl->current.slot = lsm_entry_rid.slot - 1; // slot before thing of interest + impl->current.size = lsm_entry_rid.size; + + impl->t = 0; // must be zero so free() doesn't croak. 
+ impl->justOnePage = (depth==0); + + lladdIterator_t *it = (lladdIterator_t*) malloc(sizeof(lladdIterator_t)); + it->type = -1; // XXX LSM_TREE_ITERATOR + it->impl = impl; + return it; +} + +/** + * move to the next page + **/ +int logtreeIterator::next(int xid, lladdIterator_t *it) +{ + logtreeIterator_s *impl = (logtreeIterator_s*) it->impl; + + impl->current = stasis_record_next(xid, impl->p, impl->current); + + if(impl->current.size == INVALID_SLOT) + { + + const indexnode_rec next_rec = *(const indexnode_rec*)logtree::readRecord(xid,impl->p, + logtree::NEXT_LEAF, + 0); + unlock(impl->p->rwlatch); + releasePage(impl->p); + + DEBUG("done with page %lld next = %lld\n", impl->p->id, next_rec.ptr); + + + if(next_rec.ptr != -1 && ! impl->justOnePage) + { + impl->p = loadPage(xid, next_rec.ptr); + readlock(impl->p->rwlatch,0); + impl->current.page = next_rec.ptr; + impl->current.slot = 2; + impl->current.size = stasis_record_length_read(xid, impl->p, impl->current); //keySize; + } else { + impl->p = 0; + impl->current.size = INVALID_SLOT; + } + + } + else + { + /* + assert(impl->current.size == keySize + sizeof(lsmTreeNodeRecord)); + impl->current.size = keySize; + */ + } + + + if(impl->current.size != INVALID_SLOT) + { + //size_t sz = sizeof(*impl->t) + impl->current.size; + if(impl->t != NULL) + free(impl->t); + + impl->t = (indexnode_rec*)malloc(impl->current.size); + memcpy(impl->t, logtree::readRecord(xid,impl->p,impl->current), impl->current.size); + + return 1; + } + else + { + if(impl->t != NULL) + free(impl->t); + impl->t = 0; + return 0; + } + +} + +/* +lladdIterator_t *logtreeIterator::copy(int xid, lladdIterator_t* i) +{ + logtreeIterator_s *it = (logtreeIterator_s*) i->impl; + logtreeIterator_s *mine = (logtreeIterator_s*) malloc(sizeof(logtreeIterator_s)); + + if(it->p) + { + mine->p = loadPage(xid, it->p->id); + readlock(mine->p->rwlatch,0); + } + else + mine->p = 0; + + memcpy(&mine->current, &it->current,sizeof(recordid)); + + if(it->t) + { + 
mine->t = (datatuple*)malloc(sizeof(*it->t)); //TODO: DATA IS NOT COPIED, MIGHT BE WRONG + //mine->t = malloc(sizeof(*it->t) + it->current.size); + memcpy(mine->t, it->t, sizeof(*it->t));// + it->current.size); + } + else + mine->t = 0; + + mine->justOnePage = it->justOnePage; + lladdIterator_t * ret = (lladdIterator_t*)malloc(sizeof(lladdIterator_t)); + ret->type = -1; // XXX LSM_TREE_ITERATOR + ret->impl = mine; + return ret; +} +*/ + +void logtreeIterator::close(int xid, lladdIterator_t *it) +{ + logtreeIterator_s *impl = (logtreeIterator_s*)it->impl; + if(impl->p) + { + unlock(impl->p->rwlatch); + releasePage(impl->p); + } + if(impl->t) + { + free(impl->t); + } + free(impl); + free(it); +} + + +///////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////// + + + + +double tv_to_double(struct timeval tv) +{ + return static_cast(tv.tv_sec) + + (static_cast(tv.tv_usec) / 1000000.0); +} + + +/////////////////////////////////////////////////////////////////// + diff --git a/logstore.h b/logstore.h new file mode 100644 index 0000000..5230a67 --- /dev/null +++ b/logstore.h @@ -0,0 +1,302 @@ +#ifndef _LOGSTORE_H_ +#define _LOGSTORE_H_ + +#undef end +#undef begin + +#include +#include +#include +#include +#include +#include + +#include "logserver.h" + +#include +#include +#include + +#include + + + +#include + +#include +#include +#include +#include +#include +#include + + +#include "datapage.h" +#include "tuplemerger.h" +#include "datatuple.h" + + +double tv_to_double(struct timeval tv); + + +struct logtable_mergedata; + + + +typedef struct RegionAllocConf_t +{ + recordid regionList; + pageid_t regionCount; + pageid_t nextPage; + pageid_t endOfRegion; + pageid_t regionSize; +} RegionAllocConf_t; + + +//struct logtree_state { +// pageid_t lastLeaf; +//}; + + +struct indexnode_rec { + pageid_t ptr; +}; + +typedef pageid_t(*logtree_page_allocator_t)(int, void *); +typedef 
void(*logtree_page_deallocator_t)(int, void *); + + +class logtree{ +public: + logtree(); + + recordid create(int xid); + + void print_tree(int xid); + + static pageid_t alloc_region(int xid, void *conf); + static pageid_t alloc_region_rid(int xid, void * ridp); + static void force_region_rid(int xid, void *conf); + static void dealloc_region_rid(int xid, void *conf); + static void free_region_rid(int xid, recordid tree, + logtree_page_deallocator_t dealloc, + void *allocator_state); + + static void writeNodeRecord(int xid, Page *p, recordid &rid, + const byte *key, size_t keylen, pageid_t ptr); + + static void writeRecord(int xid, Page *p, recordid &rid, + const byte *data, size_t datalen); + + static void writeRecord(int xid, Page *p, slotid_t slot, + const byte *data, size_t datalen); + + static const byte* readRecord(int xid, Page * p, recordid &rid); + static const byte* readRecord(int xid, Page * p, slotid_t slot, int64_t size); + + static int32_t readRecordLength(int xid, Page *p, slotid_t slot); + + //return the left-most leaf, these are not data pages, although referred to as leaf + static pageid_t findFirstLeaf(int xid, Page *root, int64_t depth); + //return the right-most leaf + static pageid_t findLastLeaf(int xid, Page *root, int64_t depth) ; + + //reads the given record and returns the page id stored in it + static pageid_t lookupLeafPageFromRid(int xid, recordid rid); + + //returns a record that stores the pageid where the given key should be in, i.e. 
if it exists + static recordid lookup(int xid, Page *node, int64_t depth, const byte *key, + size_t keySize); + + //returns the id of the data page that could contain the given key + static pageid_t findPage(int xid, recordid tree, const byte *key, size_t keySize); + + + //appends a leaf page, val_page is the id of the leaf page + //rmLeafID --> rightmost leaf id + static recordid appendPage(int xid, recordid tree, pageid_t & rmLeafID, + const byte *key,size_t keySize, + logtree_page_allocator_t allocator, void *allocator_state, + long val_page); + + static recordid appendInternalNode(int xid, Page *p, + int64_t depth, + const byte *key, size_t key_len, + pageid_t val_page, pageid_t lastLeaf, + logtree_page_allocator_t allocator, + void *allocator_state); + + static recordid buildPathToLeaf(int xid, recordid root, Page *root_p, + int64_t depth, const byte *key, size_t key_len, + pageid_t val_page, pageid_t lastLeaf, + logtree_page_allocator_t allocator, + void *allocator_state); + + + + /** + Initialize a page for use as an internal node of the tree. 
+ */ + inline static void initializeNodePage(int xid, Page *p); + + recordid &get_tree_state(){return tree_state;} + recordid &get_root_rec(){return root_rec;} + +public: + + const static RegionAllocConf_t REGION_ALLOC_STATIC_INITIALIZER; + const static int64_t DEPTH; + const static int64_t COMPARATOR; + const static int64_t FIRST_SLOT; + const static size_t root_rec_size; + const static int64_t PREV_LEAF; + const static int64_t NEXT_LEAF; + + pageid_t lastLeaf; +private: + + void print_tree(int xid, pageid_t pid, int64_t depth); + +private: + recordid tree_state; + recordid root_rec; + + + + +}; + + +class logtable +{ +public: + logtable(); + ~logtable(); + + //user access functions + datatuple * findTuple(int xid, datatuple::key_t key, size_t keySize); + + datatuple * findTuple_first(int xid, datatuple::key_t key, size_t keySize); + + void insertTuple(struct datatuple &tuple); + + + //other class functions + recordid allocTable(int xid); + + void flushTable(); + + DataPage* insertTuple(int xid, struct datatuple &tuple, recordid &dpstate,logtree *ltree); + + datatuple * findTuple(int xid, datatuple::key_t key, size_t keySize, logtree *ltree); + + inline recordid & get_table_rec(){return table_rec;} + + inline logtree * get_tree_c2(){return tree_c2;} + inline logtree * get_tree_c1(){return tree_c1;} + + inline void set_tree_c1(logtree *t){tree_c1=t;} + inline void set_tree_c2(logtree *t){tree_c2=t;} + + typedef std::set rbtree_t; + typedef rbtree_t* rbtree_ptr_t; + inline rbtree_ptr_t get_tree_c0(){return tree_c0;} + + void set_tree_c0(rbtree_ptr_t newtree){tree_c0 = newtree;} + + inline recordid & get_dpstate1(){return tbl_header.c1_dp_state;} + inline recordid & get_dpstate2(){return tbl_header.c2_dp_state;} + + int get_fixed_page_count(){return fixed_page_count;} + void set_fixed_page_count(int count){fixed_page_count = count;} + + void setMergeData(logtable_mergedata * mdata) { this->mergedata = mdata;} + logtable_mergedata* getMergeData(){return mergedata;} + 
+ inline tuplemerger * gettuplemerger(){return tmerger;} + +public: + + struct table_header { + recordid c2_root; //tree root record --> points to the root of the b-tree + recordid c2_state; //tree state --> describes the regions used by the index tree + recordid c2_dp_state; //data pages state --> regions used by the data pages + recordid c1_root; + recordid c1_state; + recordid c1_dp_state; + //epoch_t beginning; + //epoch_t end; + + }; + + const static RegionAllocConf_t DATAPAGE_REGION_ALLOC_STATIC_INITIALIZER; + + logtable_mergedata * mergedata; + +private: + + + +private: + recordid table_rec; + struct table_header tbl_header; + + logtree *tree_c2; //big tree + logtree *tree_c1; //small tree + rbtree_ptr_t tree_c0; // in-mem red black tree + + + int tsize; //number of tuples + int64_t tree_bytes; //number of bytes + + + //DATA PAGE SETTINGS + int fixed_page_count;//number of pages in a datapage + +// logtable_mergedata * mergedata; + + tuplemerger *tmerger; +}; + + +typedef struct logtreeIterator_s { + Page * p; + recordid current; + indexnode_rec *t; + int justOnePage; +} logtreeIterator_s; + + +class logtreeIterator +{ + +public: + static lladdIterator_t* open(int xid, recordid root); + static lladdIterator_t* openAt(int xid, recordid root, const byte* key); + static int next(int xid, lladdIterator_t *it); + //static lladdIterator_t *copy(int xid, lladdIterator_t* i); + static void close(int xid, lladdIterator_t *it); + + + static inline int key (int xid, lladdIterator_t *it, byte **key) + { + logtreeIterator_s * impl = (logtreeIterator_s*)it->impl; + *key = (byte*)(impl->t+1); + return (int) impl->current.size - sizeof(indexnode_rec); + } + + + static inline int value(int xid, lladdIterator_t *it, byte **value) + { + logtreeIterator_s * impl = (logtreeIterator_s*)it->impl; + *value = (byte*)&(impl->t->ptr); + return sizeof(impl->t->ptr); + } + + static inline void tupleDone(int xid, void *it) { } + static inline void releaseLock(int xid, void *it) { } + +}; 
+ + +#endif diff --git a/merger.cpp b/merger.cpp new file mode 100644 index 0000000..bcdced0 --- /dev/null +++ b/merger.cpp @@ -0,0 +1,836 @@ + +#include +#include "merger.h" +#include "logiterators.cpp" +#include "datapage.cpp" +//pageid_t merge_scheduler::C0_MEM_SIZE = 1000 * 1000 * 1000; + +//template <> struct merger_args; +//template <> struct merger_args; +inline DataPage* +insertTuple(int xid, DataPage *dp, datatuple &t, + logtable *ltable, + logtree * ltree, + recordid & dpstate, + int64_t &dpages, int64_t &npages); + +int merge_scheduler::addlogtable(logtable *ltable) +{ + + struct logtable_mergedata * mdata = new logtable_mergedata; + + // initialize merge data + mdata->header_lock = initlock(); + mdata->rbtree_mut = new pthread_mutex_t; + pthread_mutex_init(mdata->rbtree_mut,0); + mdata->old_c0 = new rbtree_ptr_t; + *mdata->old_c0 = 0; + + mdata->input_needed = new bool(false); + + mdata->input_ready_cond = new pthread_cond_t; + pthread_cond_init(mdata->input_ready_cond,0); + + mdata->input_needed_cond = new pthread_cond_t; + pthread_cond_init(mdata->input_needed_cond,0); + + mdata->input_size = new int64_t(100); + + mdata->diskmerge_args = new merger_args; + mdata->memmerge_args = new merger_args; + + mergedata.push_back(std::make_pair(ltable, mdata)); + return mergedata.size()-1; + +} + +merge_scheduler::~merge_scheduler() +{ + for(int i=0; iheader_lock); + delete mdata->rbtree_mut; + delete mdata->old_c0; + delete mdata->input_needed; + delete mdata->input_ready_cond; + delete mdata->input_needed_cond; + delete mdata->input_size; + + //delete the merge thread structure variables + delete (recordid*) mdata->memmerge_args->pageAllocState; + delete (recordid*) mdata->memmerge_args->oldAllocState; + delete mdata->memmerge_args->still_open; + + delete (recordid*) mdata->diskmerge_args->pageAllocState; + delete (recordid*) mdata->diskmerge_args->oldAllocState; + + pthread_cond_destroy(mdata->diskmerge_args->in_block_needed_cond); + delete 
mdata->diskmerge_args->in_block_needed_cond; + delete mdata->diskmerge_args->in_block_needed; + + pthread_cond_destroy(mdata->diskmerge_args->out_block_needed_cond); + delete mdata->diskmerge_args->out_block_needed_cond; + delete mdata->diskmerge_args->out_block_needed; + + pthread_cond_destroy(mdata->diskmerge_args->in_block_ready_cond); + delete mdata->diskmerge_args->in_block_ready_cond; + pthread_cond_destroy(mdata->diskmerge_args->out_block_ready_cond); + delete mdata->diskmerge_args->out_block_ready_cond; + + delete mdata->diskmerge_args->my_tree_size; + + delete mdata->diskmerge_args; + delete mdata->memmerge_args; + + + } + mergedata.clear(); + +} + +void merge_scheduler::shutdown() +{ + //signal shutdown + for(int i=0; iflushTable(); + + pthread_mutex_lock(mdata->rbtree_mut); + *(mdata->memmerge_args->still_open)=false; + pthread_cond_signal(mdata->input_ready_cond); + + //*(mdata->diskmerge_args->still_open)=false;//same pointer so no need + + pthread_mutex_unlock(mdata->rbtree_mut); + + } + + for(int i=0; imemmerge_thread,0); + pthread_join(mdata->diskmerge_thread,0); + } + + +} + +void merge_scheduler::startlogtable(int index) +{ + logtable * ltable = mergedata[index].first; + struct logtable_mergedata *mdata = mergedata[index].second; + + pthread_cond_t * block1_needed_cond = new pthread_cond_t; + pthread_cond_init(block1_needed_cond,0); + pthread_cond_t * block2_needed_cond = new pthread_cond_t; + pthread_cond_init(block2_needed_cond,0); + + pthread_cond_t * block1_ready_cond = new pthread_cond_t; + pthread_cond_init(block1_ready_cond,0); + pthread_cond_t * block2_ready_cond = new pthread_cond_t; + pthread_cond_init(block2_ready_cond,0); + + bool *block1_needed = new bool(false); + bool *block2_needed = new bool(false); + bool *system_running = new bool(true); + + //wait to merge the next block until we have merged block FUDGE times. 
+ static const int FUDGE = 1; + static double R = MIN_R; + int64_t * block1_size = new int64_t; + *block1_size = FUDGE * ((int)R) * (*(mdata->input_size)); + + //initialize rb-tree + ltable->set_tree_c0(new rbtree_t); + + //disk merger args + recordid * ridp = new recordid; + *ridp = ltable->get_tree_c2()->get_tree_state(); //h.bigTreeAllocState; + recordid * oldridp = new recordid; + *oldridp = NULLRID; + + logtree ** block1_scratch = new logtree*; + *block1_scratch=0; + + //recordid * allocer_scratch = new recordid; + RegionAllocConf_t *allocer_scratch = new RegionAllocConf_t; + + + struct merger_args diskmerge_args= { + ltable, + 1, //worker id + logtree::alloc_region_rid, //pageAlloc + ridp, // pageAllocState + oldridp, // oldAllocState + mdata->rbtree_mut, //block_ready_mutex + block1_needed_cond, //in_block_needed_cond + block1_needed, //in_block_needed + block2_needed_cond, //out_block_needed_cond + block2_needed, //out_block_needed + block1_ready_cond, //in_block_ready_cond + block2_ready_cond, //out_block_ready_cond + system_running, //still_open i.e. system running + block1_size, //mytree_size ? + 0, //out_tree_size, biggest component computes its size directly. 
+ 0, //max_tree_size No max size for biggest component + &R, //r_i + block1_scratch, //in-tree + allocer_scratch, //in_tree_allocer + 0, //out_tree + 0, //out_tree_allocer + new treeIterator::treeIteratorHandle(ltable->get_tree_c2()->get_root_rec()), // my_tree + ltable->get_table_rec() //tree + }; + + *mdata->diskmerge_args = diskmerge_args; + + DEBUG("Tree C2 is %lld\n", (long long)ltable->get_tree_c2()->get_root_rec().page); + + + //memory merger args + ridp = new recordid; + *ridp = ltable->get_tree_c1()->get_tree_state(); + oldridp = new recordid; + *oldridp = NULLRID; + + DEBUG("Tree C1 is %lld\n", (long long)ltable->get_tree_c1()->get_root_rec().page); + + struct merger_args memmerge_args = + { + ltable, + 2, + logtree::alloc_region_rid, //pageAlloc + ridp, // pageAllocState + oldridp, // oldAllocState + mdata->rbtree_mut, //block_ready_mutex + mdata->input_needed_cond, + mdata->input_needed, + block1_needed_cond, + block1_needed, + mdata->input_ready_cond, + block1_ready_cond, + system_running, + mdata->input_size, + block1_size, + (int64_t)(R * R * MAX_C0_SIZE), + &R, + mdata->old_c0, + 0, + block1_scratch, + allocer_scratch, + new treeIterator::treeIteratorHandle(ltable->get_tree_c1()->get_root_rec()), + ltable->get_table_rec() //tree + }; + + *mdata->memmerge_args = memmerge_args; + + void * (*diskmerger)(void*) = diskMergeThread; + void * (*memmerger)(void*) = memMergeThread; + + pthread_create(&mdata->diskmerge_thread, 0, diskmerger, mdata->diskmerge_args); + pthread_create(&mdata->memmerge_thread, 0, memmerger, mdata->memmerge_args); + +} + +//TODO: flush the data pages +// deallocate/free their region +// create new data region for new data pages +void* memMergeThread(void*arg) +{ + + int xid;// = Tbegin(); + + merger_args * a = (merger_args*)(arg); + assert(a->my_tree->r_.size != -1); + + logtable * ltable = a->ltable; + + int merge_count =0; +// pthread_mutex_lock(a->block_ready_mut); + + while(true) + { + 
writelock(ltable->mergedata->header_lock,0); + int done = 0; + // get a new input for merge + while(!*(a->in_tree)) + { + pthread_mutex_lock(a->block_ready_mut); + *a->in_block_needed = true; + //pthread_cond_signal(a->in_block_needed_cond); + pthread_cond_broadcast(a->in_block_needed_cond); + + if(!*(a->still_open)){ + done = 1; + pthread_mutex_unlock(a->block_ready_mut); + break; + } + + printf("mmt:\twaiting for block ready cond\n"); + unlock(ltable->mergedata->header_lock); + + pthread_cond_wait(a->in_block_ready_cond, a->block_ready_mut); + pthread_mutex_unlock(a->block_ready_mut); + + writelock(ltable->mergedata->header_lock,0); + printf("mmt:\tblock ready\n"); + + } + *a->in_block_needed = false; + + if(done==1) + { + pthread_mutex_lock(a->block_ready_mut); + pthread_cond_signal(a->out_block_ready_cond); + pthread_mutex_unlock(a->block_ready_mut); + unlock(ltable->mergedata->header_lock); + break; + } + + if((*a->in_tree)->size()==0) //input empty, this can only happen during shutdown + { + delete *a->in_tree; + *a->in_tree = 0; + unlock(ltable->mergedata->header_lock); + continue; + } + + uint64_t insertedTuples=0; + int64_t mergedPages=0; + + assert(a->my_tree->r_.size != -1); + + //create the iterators + treeIterator *itrA = new treeIterator(a->my_tree->r_); + memTreeIterator *itrB = + new memTreeIterator(*a->in_tree); + memTreeIterator *itrBend = itrB->end(); + + //Tcommit(xid); + xid = Tbegin(); + + //create a new tree + logtree * scratch_tree = new logtree; + recordid scratch_root = scratch_tree->create(xid); + + //save the old dp state values + RegionAllocConf_t olddp_state; + Tread(xid, ltable->get_dpstate1(), &olddp_state); + //reinitialize the dp state + Tset(xid, ltable->get_dpstate1(), &logtable::DATAPAGE_REGION_ALLOC_STATIC_INITIALIZER); + + //pthread_mutex_unlock(a->block_ready_mut); + unlock(ltable->mergedata->header_lock); + + //: do the merge + printf("mmt:\tMerging:\n"); + + int64_t npages = 0; + mergedPages = merge_iterators(xid, itrA, 
itrB, ltable, scratch_tree, npages);
+
+        delete itrA;
+        delete itrB;
+        delete itrBend;
+
+        //force write the new region to disk
+        recordid scratch_alloc_state = scratch_tree->get_tree_state();
+        //TlsmForce(xid,scratch_root,logtree::force_region_rid, &scratch_alloc_state);
+        logtree::force_region_rid(xid, &scratch_alloc_state);
+        //force write the new datapages
+        // NOTE(review): '<able' below is mojibake for '&ltable' ('&lt' was
+        // eaten by HTML-entity stripping during import); restore before use.
+        DataPage::force_region_rid(xid, <able->get_dpstate1());
+
+        //writes complete
+        //now automically replace the old c1 with new c1
+        //pthread_mutex_lock(a->block_ready_mut);
+
+        writelock(ltable->mergedata->header_lock,0);
+        merge_count++;
+        *a->my_tree_size = mergedPages;
+        printf("mmt:\tmerge_count %d #pages written %lld\n", merge_count, npages);
+
+        delete ltable->get_tree_c1();
+        ltable->set_tree_c1(scratch_tree);
+
+        logtable::table_header h;
+        void * oldAllocState = a->pageAllocState;
+        Tread(xid, a->tree, &h);
+
+        h.c1_root = scratch_root;
+        h.c1_state = scratch_alloc_state;
+        //note we already updated the dpstate before the merge
+        printf("mmt:\tUpdated C1's position on disk to %lld\n",scratch_root.page);
+        Tset(xid, a->tree, &h);
+
+        //Tcommit(xid);
+        //xid = Tbegin();
+
+        // free old my_tree here -- only after the header points at the new
+        // tree, so a crash cannot leave the header referencing freed pages.
+        //TODO: check
+        logtree::free_region_rid(xid, a->my_tree->r_, logtree::dealloc_region_rid, oldAllocState);
+
+
+        //TlsmFree(xid,a->my_tree->r_,logtree::dealloc_region_rid,oldAllocState);
+        //TODO: check
+        //free the old data pages
+        DataPage::dealloc_region_rid(xid, &olddp_state);
+
+        Tcommit(xid);
+        //xid = Tbegin();
+
+
+        //TODO: this is simplistic for now
+        //signal the other merger if necessary
+        // If the merged C1 exceeds R * |C0| (or the absolute cap), push it
+        // down to the C2 merger; otherwise keep it as the new C1.
+        double target_R = *(a->r_i);
+        double new_c1_size = npages * PAGE_SIZE;
+        assert(target_R >= MIN_R);
+        if( (new_c1_size / MAX_C0_SIZE > target_R) ||
+            (a->max_size && new_c1_size > a->max_size ) )
+        {
+            printf("mmt:\tsignaling C2 for merge\n");
+            printf("mmt:\tnew_c1_size %.2f\tMAX_C0_SIZE %lld\ta->max_size %lld\t targetr %.2f \n", new_c1_size,
+                   MAX_C0_SIZE, a->max_size, target_R);
+
+            // XXX need to
// report backpressure here!   (tail of the "XXX need to" comment above)
+            // Backpressure: wait until the C2 merger has consumed the
+            // previous out_tree before publishing this one.
+            while(*a->out_tree) {
+                pthread_mutex_lock(a->block_ready_mut);
+                unlock(ltable->mergedata->header_lock);
+
+                pthread_cond_wait(a->out_block_needed_cond, a->block_ready_mut);
+                pthread_mutex_unlock(a->block_ready_mut);
+                writelock(ltable->mergedata->header_lock,0);
+            }
+
+
+            *a->out_tree = scratch_tree;
+            xid = Tbegin();
+            Tread(xid, ltable->get_dpstate1(), a->out_tree_allocer);
+
+            pthread_cond_signal(a->out_block_ready_cond);
+
+            // C1 was handed off; start over with a brand-new empty C1.
+            logtree *empty_tree = new logtree;
+            empty_tree->create(xid);
+
+            *(recordid*)(a->pageAllocState) = empty_tree->get_tree_state();
+
+            a->my_tree->r_ = empty_tree->get_root_rec();
+
+            ltable->set_tree_c1(empty_tree);
+
+            logtable::table_header h;
+            Tread(xid, a->tree, &h);
+            h.c1_root = empty_tree->get_root_rec(); //update root
+            h.c1_state = empty_tree->get_tree_state(); //update index alloc state
+            printf("mmt:\tUpdated C1's position on disk to %lld\n",empty_tree->get_root_rec().page);
+            Tset(xid, a->tree, &h);
+            //update datapage alloc state
+            Tset(xid, ltable->get_dpstate1(), &logtable::DATAPAGE_REGION_ALLOC_STATIC_INITIALIZER);
+
+            Tcommit(xid);
+            //xid = Tbegin();
+
+        }
+        else //not signaling the C2 for merge yet
+        {
+            printf("mmt:\tnot signaling C2 for merge\n");
+            *(recordid*)a->pageAllocState = scratch_alloc_state;
+            a->my_tree->r_ = scratch_root;
+        }
+
+        rbtree_ptr_t deltree = *a->in_tree;
+        *a->in_tree = 0;
+
+
+        //Tcommit(xid);
+        unlock(ltable->mergedata->header_lock);
+
+        //TODO: get the freeing outside of the lock
+        //// ----------- Free in_tree
+        // Each set element owns one malloc'd buffer whose first field is
+        // keylen (see tuplemerger.cpp), so free(keylen) releases the tuple.
+        for(rbtree_t::iterator delitr=deltree->begin();
+            delitr != deltree->end(); delitr++)
+            free((*delitr).keylen);
+
+        delete deltree;
+        //deltree = 0;
+
+
+        /*
+        for(rbtree_t::iterator delitr=(*a->in_tree)->begin();
+            delitr != (*a->in_tree)->end(); delitr++)
+            free((*delitr).keylen);
+
+        delete *a->in_tree;
+        *a->in_tree = 0;
+        */
+    }
+
+    //pthread_mutex_unlock(a->block_ready_mut);
+
+    return 0;
+
+}
+
+// Background thread: merges a handed-off C1 tree (*a->in_tree, published by
+// memMergeThread) into the on-disk C2 tree, then recomputes the target size
+// ratio R.  Uses the same wait/handshake protocol as memMergeThread.
+void *diskMergeThread(void*arg)
+{
+    int xid;// =
// Tbegin();   (rest of the commented-out initializer on the previous line)
+
+    merger_args * a = (merger_args*)(arg);
+    assert(a->my_tree->r_.size != -1);
+
+    logtable * ltable = a->ltable;
+
+    int merge_count =0;
+    //pthread_mutex_lock(a->block_ready_mut);
+
+    while(true)
+    {
+        writelock(ltable->mergedata->header_lock,0);
+        int done = 0;
+        // get a new input for merge
+        while(!*(a->in_tree))
+        {
+            pthread_mutex_lock(a->block_ready_mut);
+            *a->in_block_needed = true;
+            pthread_cond_signal(a->in_block_needed_cond);
+
+            if(!*(a->still_open)){ // shutdown requested
+                done = 1;
+                pthread_mutex_unlock(a->block_ready_mut);
+                break;
+            }
+
+            printf("dmt:\twaiting for block ready cond\n");
+            unlock(ltable->mergedata->header_lock);
+
+            pthread_cond_wait(a->in_block_ready_cond, a->block_ready_mut);
+            pthread_mutex_unlock(a->block_ready_mut);
+
+            printf("dmt:\tblock ready\n");
+            writelock(ltable->mergedata->header_lock,0);
+        }
+        *a->in_block_needed = false;
+        if(done==1)
+        {
+            // NOTE(review): unlike memMergeThread, this signal is sent
+            // without holding block_ready_mut -- confirm that is intended.
+            pthread_cond_signal(a->out_block_ready_cond);
+            unlock(ltable->mergedata->header_lock);
+            break;
+        }
+
+
+        uint64_t insertedTuples=0; // NOTE(review): never updated or read below
+        int64_t mergedPages=0;
+
+        assert(a->my_tree->r_.size != -1);
+
+        //create the iterators
+        treeIterator *itrA = new treeIterator(a->my_tree->r_);
+        treeIterator *itrB =
+            new treeIterator((*a->in_tree)->get_root_rec());
+
+        //Tcommit(xid);
+        xid = Tbegin();
+
+        //create a new tree
+        logtree * scratch_tree = new logtree;
+        recordid scratch_root = scratch_tree->create(xid);
+
+        //save the old dp state values
+        RegionAllocConf_t olddp_state;
+        Tread(xid, ltable->get_dpstate2(), &olddp_state);
+        //reinitialize the dp state
+        //TODO: maybe you want larger regions for the second tree?
+        Tset(xid, ltable->get_dpstate2(), &logtable::DATAPAGE_REGION_ALLOC_STATIC_INITIALIZER);
+
+        //pthread_mutex_unlock(a->block_ready_mut);
+        unlock(ltable->mergedata->header_lock);
+
+
+        //do the merge
+        printf("dmt:\tMerging:\n");
+
+        int64_t npages = 0;
+        mergedPages = merge_iterators(xid, itrA, itrB, ltable, scratch_tree, npages);
+
+        delete itrA;
+        delete itrB;
+
+        //force write the new region to disk
+        recordid scratch_alloc_state = scratch_tree->get_tree_state();
+        //TODO:
+        //TlsmForce(xid,scratch_root,logtree::force_region_rid, &scratch_alloc_state);
+        logtree::force_region_rid(xid, &scratch_alloc_state);
+        //force write the new datapages
+        // NOTE(review): '<able' below is mojibake for '&ltable' ('&lt' was
+        // eaten by HTML-entity stripping during import); restore before use.
+        DataPage::force_region_rid(xid, <able->get_dpstate2());
+
+
+        //writes complete
+        //now automically replace the old c2 with new c2
+        //pthread_mutex_lock(a->block_ready_mut);
+        writelock(ltable->mergedata->header_lock,0);
+
+        merge_count++;
+        *a->my_tree_size = mergedPages;
+        //update the current optimal R value
+        // R = sqrt(|C2| / |C0|) in pages, floored at MIN_R -- the classic
+        // LSM-tree size-ratio rule.
+        *(a->r_i) = std::max(MIN_R, sqrt( (npages * 1.0) / (MAX_C0_SIZE/PAGE_SIZE) ) );
+
+        printf("dmt:\tmerge_count %d\t#written pages: %lld\n optimal r %.2f", merge_count, npages, *(a->r_i));
+
+        delete ltable->get_tree_c2();
+        ltable->set_tree_c2(scratch_tree);
+
+        logtable::table_header h;
+        void * oldAllocState = a->pageAllocState;
+        Tread(xid, a->tree, &h);
+
+        h.c2_root = scratch_root;
+        h.c2_state = scratch_alloc_state;
+        //note we already updated the dpstate before the merge
+        printf("dmt:\tUpdated C2's position on disk to %lld\n",scratch_root.page);
+        Tset(xid, a->tree, &h);
+
+
+
+        // free old my_tree here
+        //TODO: check
+        logtree::free_region_rid(xid, a->my_tree->r_, logtree::dealloc_region_rid, oldAllocState);
+        //TlsmFree(xid,a->my_tree->r_,logtree::dealloc_region_rid,oldAllocState);
+
+        //TODO: check
+        //free the old data pages
+        DataPage::dealloc_region_rid(xid, &olddp_state);
+
+
+
+        *(recordid*)a->pageAllocState = scratch_alloc_state;
+        a->my_tree->r_ = scratch_root;
+
+        //// ----------- Free in_tree
+        //TODO: check
+        logtree::free_region_rid(xid, (*a->in_tree)->get_root_rec(),
+                                 logtree::dealloc_region_rid,
+                                 &((*a->in_tree)->get_tree_state()));
+        // NOTE(review): if get_tree_state() returns by value, the '&(...)'
+        // above takes the address of a temporary -- confirm it returns a
+        // reference before relying on this.
+        //TlsmFree(xid,a->my_tree->r_,logtree::dealloc_region_rid,oldAllocState);
+
+        //TODO: check
+        //free the old data pages
+        DataPage::dealloc_region_rid(xid, a->in_tree_allocer);//TODO:
+
+        Tcommit(xid);
+
+        //xid = Tbegin();
+        //Tcommit(xid);
+        delete *a->in_tree;
+        *a->in_tree = 0;
+
+        unlock(ltable->mergedata->header_lock);
+
+    }
+
+    //pthread_mutex_unlock(a->block_ready_mut);
+
+    return 0;
+
+
+}
+
+// Two-way merge of a C1 tree iterator (itrA) with an in-memory C0 iterator
+// (itrB) into scratch_tree, in key order.  Equal keys are combined via the
+// table's tuplemerger.  Deletes are NOT dropped here: tombstones must
+// survive into C1 so they can still cancel older versions living in C2.
+// Returns the number of DataPages started; npages accumulates the page
+// counts of the DataPages that were finished along the way.
+int64_t merge_iterators(int xid,
+                        treeIterator *itrA,
+                        memTreeIterator * itrB,
+                        logtable *ltable,
+                        logtree *scratch_tree,
+                        int64_t &npages )
+{
+    int64_t dpages = 0;
+    //int npages = 0;
+    int64_t ntuples = 0;
+    DataPage *dp = 0;
+
+    memTreeIterator *itrBend = itrB->end();
+    datatuple *t1 = itrA->getnext();
+
+    while(*itrB != *itrBend)
+    {
+        datatuple t2 = **itrB;
+        DEBUG("tuple\t%lld: keylen %d datalen %d\n", ntuples, *t2.keylen,*t2.datalen );
+
+        // emit every C1 tuple that sorts before the current C0 tuple
+        while(t1 != 0 && datatuple::compare(t1->key, t2.key) < 0) // t1 is less than t2
+        {
+            //insert t1
+            dp = insertTuple(xid, dp, *t1, ltable, scratch_tree, ltable->get_dpstate1(),
+                             dpages, npages);
+
+            free(t1->keylen);
+            free(t1);
+            ntuples++;
+            //advance itrA
+            t1 = itrA->getnext();
+        }
+
+        if(t1 != 0 && datatuple::compare(t1->key, t2.key) == 0)
+        {
+            datatuple *mtuple = ltable->gettuplemerger()->merge(t1,&t2);
+            //insert merged tuple
+            dp = insertTuple(xid, dp, *mtuple, ltable, scratch_tree, ltable->get_dpstate1(),
+                             dpages, npages);
+            free(t1->keylen);
+            free(t1);
+            t1 = itrA->getnext(); //advance itrA
+            free(mtuple->keylen);
+            free(mtuple);
+        }
+        else
+        {
+            //insert t2
+            dp = insertTuple(xid, dp, t2, ltable, scratch_tree, ltable->get_dpstate1(),
+                             dpages, npages);
+            //free(t2.keylen); //cannot free here it may still be read through a lookup
+        }
+
+        ntuples++;
+        ++(*itrB);
+    }
+
+    // drain whatever is left of C1
+    while(t1 != 0) // t1 is less than t2
+    {
+        dp = insertTuple(xid, dp, *t1, ltable,
scratch_tree, ltable->get_dpstate1(),
+                         dpages, npages);
+
+        free(t1->keylen);
+        free(t1);
+        ntuples++;
+        //advance itrA
+        t1 = itrA->getnext();
+    }
+
+
+    delete itrBend;
+    // NOTE(review): the final DataPage is deleted without adding its
+    // get_page_count() to npages, so npages undercounts by one page chain --
+    // confirm this is acceptable to the callers that size C1/C2 from it.
+    if(dp!=NULL)
+        delete dp;
+    DEBUG("dpages: %d\tnpages: %d\tntuples: %d\n", dpages, npages, ntuples);
+    fflush(stdout);
+
+
+    return dpages;
+
+}
+
+
+// Disk-to-disk variant: merges the C2 iterator (itrA) with a handed-off C1
+// iterator (itrB) into scratch_tree.  Unlike the in-memory variant above,
+// tuples produced by merging equal keys ARE dropped when they are deletes
+// (C2 is the last component, so nothing older remains for the tombstone to
+// cancel).
+int64_t merge_iterators(int xid,
+                        treeIterator *itrA, //iterator on c2
+                        treeIterator *itrB, //iterator on c1
+                        logtable *ltable,
+                        logtree *scratch_tree,
+                        int64_t &npages)
+{
+    int64_t dpages = 0;
+    //int npages = 0;
+    int64_t ntuples = 0;
+    DataPage *dp = 0;
+
+    datatuple *t1 = itrA->getnext();
+    datatuple *t2 = 0;
+
+    while( (t2=itrB->getnext()) != 0)
+    {
+        DEBUG("tuple\t%lld: keylen %d datalen %d\n",
+              ntuples, *(t2->keylen),*(t2->datalen) );
+
+        // emit every C2 tuple that sorts before the current C1 tuple
+        while(t1 != 0 && datatuple::compare(t1->key, t2->key) < 0) // t1 is less than t2
+        {
+            //insert t1
+            dp = insertTuple(xid, dp, *t1, ltable, scratch_tree,
+                             ltable->get_dpstate2(),
+                             dpages, npages);
+
+            free(t1->keylen);
+            free(t1);
+            ntuples++;
+            //advance itrA
+            t1 = itrA->getnext();
+        }
+
+        if(t1 != 0 && datatuple::compare(t1->key, t2->key) == 0)
+        {
+            datatuple *mtuple = ltable->gettuplemerger()->merge(t1,t2);
+
+            //insert merged tuple, drop deletes
+            if(!mtuple->isDelete())
+                dp = insertTuple(xid, dp, *mtuple, ltable, scratch_tree, ltable->get_dpstate2(),
+                                 dpages, npages);
+
+            free(t1->keylen);
+            free(t1);
+            t1 = itrA->getnext(); //advance itrA
+            free(mtuple->keylen);
+            free(mtuple);
+        }
+        else
+        {
+            //insert t2
+            // NOTE(review): a C1 tuple with no C2 counterpart is written
+            // even if it is a delete -- confirm lone tombstones are meant
+            // to survive into C2.
+            dp = insertTuple(xid, dp, *t2, ltable, scratch_tree, ltable->get_dpstate2(),
+                             dpages, npages);
+        }
+
+        free(t2->keylen);
+        free(t2);
+        ntuples++;
+    }
+
+    // drain whatever is left of C2
+    while(t1 != 0)
+    {
+        dp = insertTuple(xid, dp, *t1, ltable, scratch_tree, ltable->get_dpstate2(),
+                         dpages, npages);
+
+        free(t1->keylen);
+        free(t1);
+        ntuples++;
+        //advance itrA
+        t1 = itrA->getnext();
+    }
+
+    if(dp!=NULL)
+        delete dp;
+    DEBUG("dpages: %d\tnpages: %d\tntuples: %d\n", dpages, npages, ntuples);
+    fflush(stdout);
+
+    return dpages;
+
+}
+
+
+
+// Append tuple t to the current DataPage dp, starting a new page chain when
+// dp is absent or full.  Bumps dpages for every DataPage started and adds
+// each finished chain's page count to npages.  Returns the DataPage to keep
+// appending to (caller owns it).
+inline DataPage*
+insertTuple(int xid, DataPage *dp, datatuple &t,
+            logtable *ltable,
+            logtree * ltree,
+            recordid & dpstate,
+            int64_t &dpages, int64_t &npages)
+{
+    if(dp==0)
+    {
+        dp = ltable->insertTuple(xid, t, dpstate, ltree);
+        dpages++;
+    }
+    else if(!dp->append(xid, t)) // current page chain is full
+    {
+        npages += dp->get_page_count();
+        delete dp;
+        dp = ltable->insertTuple(xid, t, dpstate, ltree);
+        dpages++;
+    }
+
+    return dp;
+}
+
+
+
+
diff --git a/merger.h b/merger.h
new file mode 100644
index 0000000..def1859
--- /dev/null
+++ b/merger.h
@@ -0,0 +1,127 @@
+#ifndef _MERGER_H_
+#define _MERGER_H_
+
+// NOTE(review): the two system includes below lost their '<...>' header
+// names during import (HTML-entity stripping); likely <set> and <vector>,
+// given the std::set / std::vector uses below -- restore before compiling.
+#include
+#include
+
+#include "logstore.h"
+#include "logiterators.h"
+
+// NOTE(review): std::set's element/comparator template arguments were also
+// stripped here; the element type is datatuple (merger.cpp iterates the set
+// and calls free((*it).keylen)).
+typedef std::set rbtree_t;
+typedef rbtree_t* rbtree_ptr_t;
+
+//TODO: 400 bytes overhead per tuple, this is nuts, check if this is true...
+static const int RB_TREE_OVERHEAD = 400;
+static const int64_t MAX_C0_SIZE = 800 *1024*1024; //max size of c0
+static const double MIN_R = 3.0;
+//T is either logtree or red-black tree
+// Shared argument bundle for one merge thread; filled in by
+// merge_scheduler::startlogtable via aggregate initialization, so field
+// order here is part of the contract.
+// NOTE(review): the template parameter list ('<class T>' -- T is used for
+// in_tree below) was stripped by the import.
+template
+struct merger_args
+{
+    logtable * ltable;
+    int worker_id;
+
+    //page allocation information
+    pageid_t(*pageAlloc)(int,void*);
+    void *pageAllocState;
+    void *oldAllocState;
+
+    pthread_mutex_t * block_ready_mut;
+
+    pthread_cond_t * in_block_needed_cond;
+    bool * in_block_needed;
+
+    pthread_cond_t * out_block_needed_cond;
+    bool * out_block_needed;
+
+    pthread_cond_t * in_block_ready_cond;
+    pthread_cond_t * out_block_ready_cond;
+
+    bool * still_open;
+
+    int64_t * my_tree_size;
+    int64_t * out_tree_size;
+    int64_t max_size; //pageid_t
+    double * r_i;
+
+    T ** in_tree;
+    void * in_tree_allocer;
+
+    logtree ** out_tree;
+    void * out_tree_allocer;
+
+    treeIterator::treeIteratorHandle *my_tree;
+
+    recordid tree;
+};
+
+
+
+// Per-table bookkeeping shared between the scheduler and the merge threads.
+struct logtable_mergedata
+{
+    //merge threads
+    pthread_t diskmerge_thread;
+    pthread_t memmerge_thread;
+
+    rwl *header_lock;
+
+    pthread_mutex_t * rbtree_mut;
+    rbtree_ptr_t *old_c0; //in-mem red black tree being merged / to
// ...be merged   (tail of the 'old_c0' comment on the previous line)
+
+    bool *input_needed; // memmerge-input needed
+
+    pthread_cond_t * input_ready_cond;
+    pthread_cond_t * input_needed_cond;
+    int64_t * input_size;
+
+    //merge args 1
+    // NOTE(review): merger_args' template argument lists were stripped by
+    // the import here as well.
+    struct merger_args *diskmerge_args;
+    //merge args 2
+    struct merger_args *memmerge_args;
+
+};
+
+
+// Owns the merge threads for a set of logtables and the shared state used
+// to hand components between them.
+class merge_scheduler
+{
+    // NOTE(review): std::vector's element type (apparently a pair of
+    // logtable* and logtable_mergedata*, per getMergeData below) was
+    // stripped by the import.
+    std::vector > mergedata;
+
+public:
+    //static pageid_t C0_MEM_SIZE;
+    ~merge_scheduler();
+
+    int addlogtable(logtable * ltable);
+    void startlogtable(int index);
+
+    struct logtable_mergedata *getMergeData(int index){return mergedata[index].second;}
+
+    void shutdown();
+
+
+
+};
+
+
+void* memMergeThread(void* arg);
+
+//merges and returns the number of data pages used
+int64_t merge_iterators(int xid,
+                        treeIterator *itrA,
+                        memTreeIterator * itrB,
+                        logtable *ltable,
+                        logtree *scratch_tree,
+                        int64_t &npages);
+
+
+int64_t merge_iterators(int xid,
+                        treeIterator *itrA,
+                        treeIterator *itrB,
+                        logtable *ltable,
+                        logtree *scratch_tree,
+                        int64_t &npages);
+
+
+void* diskMergeThread(void* arg);
+
+
+#endif
diff --git a/tuplemerger.cpp b/tuplemerger.cpp
new file mode 100644
index 0000000..0adbf22
--- /dev/null
+++ b/tuplemerger.cpp
@@ -0,0 +1,84 @@
+#include "tuplemerger.h"
+#include "logstore.h"
+
+// Combine two versions of the same key.  If either side is a delete the
+// result is a copy of t2; otherwise the configured merge function decides.
+datatuple* tuplemerger::merge(datatuple *t1, datatuple *t2)
+{
+    assert(!t1->isDelete() || !t2->isDelete()); //both cannot be delete
+
+    datatuple *t;
+
+    if(t1->isDelete()) //delete - t2
+    {
+        t = datatuple::from_bytes(t2->to_bytes());
+    }
+    else if(t2->isDelete())
+    {
+        // NOTE(review): this branch also copies t2 (the delete), keeping the
+        // tombstone alive -- presumably intentional if t2 is the newer
+        // version, but confirm it is not a copy/paste of the branch above.
+        t = datatuple::from_bytes(t2->to_bytes());
+    }
+    else //neither is a delete
+    {
+        t = (*merge_fp)(t1,t2);
+    }
+
+    return t;
+
+}
+
+/**
+ * appends the data in t2 to data from t1
+ *
+ * deletes are handled by the tuplemerger::merge function
+ * so here neither t1 nor t2 is a delete datatuple
+ **/
+datatuple* append_merger(datatuple *t1, datatuple *t2)
+{
+    static const size_t isize = sizeof(uint32_t);
+    struct datatuple *t = (datatuple*) malloc(sizeof(datatuple));
+
+    byte *arr =
(byte*)malloc(t1->byte_length() + *t2->datalen);
+
+    // Layout of arr: [keylen][datalen][key bytes][t1 data][t2 data].
+    // The datatuple header and arr are two separate allocations; callers
+    // free both (free(t->keylen) releases arr, free(t) the header).
+    t->keylen = (uint32_t*) arr;
+    *(t->keylen) = *(t1->keylen);
+
+    t->datalen = (uint32_t*) (arr+isize);
+    *(t->datalen) = *(t1->datalen) + *(t2->datalen);
+
+    t->key = (datatuple::key_t) (arr+isize+isize);
+    memcpy((byte*)t->key, (byte*)t1->key, *(t1->keylen));
+
+    t->data = (datatuple::data_t) (arr+isize+isize+ *(t1->keylen));
+    memcpy((byte*)t->data, (byte*)t1->data, *(t1->datalen));
+    memcpy(((byte*)t->data) + *(t1->datalen), (byte*)t2->data, *(t2->datalen));
+
+    return t;
+
+}
+
+/**
+ * replaces the data with data from t2
+ *
+ * deletes are handled by the tuplemerger::merge function
+ * so here neither t1 nor t2 is a delete datatuple
+ **/
+datatuple* replace_merger(datatuple *t1, datatuple *t2)
+{
+    static const size_t isize = sizeof(uint32_t);
+    struct datatuple *t = (datatuple*) malloc(sizeof(datatuple));
+
+    // Layout of arr: [keylen][datalen][key bytes][t2 data].  t1 is unused:
+    // last-writer-wins, assuming both tuples carry the same key.
+    byte *arr = (byte*)malloc(t2->byte_length());
+
+    t->keylen = (uint32_t*) arr;
+    *(t->keylen) = *(t2->keylen);
+
+    t->datalen = (uint32_t*) (arr+isize);
+    *(t->datalen) = *(t2->datalen);
+
+    t->key = (datatuple::key_t) (arr+isize+isize);
+    memcpy((byte*)t->key, (byte*)t2->key, *(t2->keylen));
+
+    t->data = (datatuple::data_t) (arr+isize+isize+ *(t2->keylen));
+    memcpy((byte*)t->data, (byte*)t2->data, *(t2->datalen));
+
+    return t;
+
+}
diff --git a/tuplemerger.h b/tuplemerger.h
new file mode 100644
index 0000000..b8314ba
--- /dev/null
+++ b/tuplemerger.h
@@ -0,0 +1,34 @@
+#ifndef _TUPLE_MERGER_H_
+#define _TUPLE_MERGER_H_
+
+struct datatuple;
+
+// Policy function that combines two non-delete versions of the same key.
+typedef datatuple* (*merge_fn_t) (datatuple*, datatuple *);
+
+// Concatenates t1's data with t2's data (same key).
+datatuple* append_merger(datatuple *t1, datatuple *t2);
+
+// Keeps only t2's data (last-writer-wins).
+datatuple* replace_merger(datatuple *t1, datatuple *t2);
+
+
+// Applies delete semantics, then delegates the data/data case to the
+// configured merge_fn_t.
+class tuplemerger
+{
+
+public:
+
+    tuplemerger(merge_fn_t merge_fp)
+    {
+        this->merge_fp = merge_fp;
+    }
+
+
+    datatuple* merge(datatuple *t1, datatuple *t2);
+
+private:
+
+    merge_fn_t merge_fp;
+
+};
+
+
+
+#endif