commit d016498f8d621fa897e0f1442d6f49981d902854 Author: sears Date: Sat Jan 23 02:13:59 2010 +0000 initial import; removed cruft from mert's tarball, tweaked make's clean targets git-svn-id: svn+ssh://svn.corp.yahoo.com/yahoo/yrl/labs/pnuts/code/logstore@520 8dad8b1f-cf64-0410-95b6-bcf113ffbcfe diff --git a/FwCode.h b/FwCode.h new file mode 100644 index 0000000..5af3d06 --- /dev/null +++ b/FwCode.h @@ -0,0 +1,165 @@ +/* Copyright (C) 2008 Yahoo! Inc. All Rights Reserved. */ + +#ifndef __FW_CODE__H +#define __FW_CODE__H + +#include <string> + +/** + * Global framework response codes. + */ +class FwCode { + public: + + typedef int ResponseCode; + + static const std::string unknownCodeStr; + + /** + * The convention here is to keep related codes grouped together, so + * that it is easier to find all existing codes for a particular + * module. Each section is given a range of 50 codes, so that adding + * a new code to an existing section won't invalidate all of the codes + * following it in the enum (causing binary incompatibility). + */ + + //----------- Generic section ------------- + static const ResponseCode FwOk = 0; //!< All successes + static const ResponseCode FwError = 1; //!< General error code + + static const ResponseCode FwCrit = 2; //!< General critical error. could be originated by low level library to indicate some nasty error has occurred. 
+ + static const ResponseCode MdbmOpenFailed = 3; //!< Any kind of mdbm open failure + static const ResponseCode MdbmOperationFailed = 4; //!< Any store/fetch/lock from mdbm failed + static const ResponseCode NoMem = 5; //!< Out Of Memory + static const ResponseCode InvalidParam = 6; //!< Invalid parameter + static const ResponseCode NotFound = 7; //!< Failed to find the specified info; usually returned by access methods + static const ResponseCode InvalidState = 8; //!< Invalid state + static const ResponseCode ConnReset = 9; //!< connection reset + static const ResponseCode Timeout = 10; //!< operation timed out + static const ResponseCode InvalidData = 11; //!< buffer data is invalid + static const ResponseCode BufTooSmall = 12; //!< Buffer size is smaller than required + static const ResponseCode MalformedRequest = 13; //!< Request data (like the URI) is malformed + static const ResponseCode RequestTooLarge = 14; //!< Request data (like the body) is too big + static const ResponseCode ConvertToDhtDataFailed = 15; //!< Failed to convert json string to DHT::Data + static const ResponseCode ConvertFromDhtDataFailed = 16; //!< Failed to convert DHT::Data to json string + static const ResponseCode BadHexString = 17; //!< Failed to parse a hex string + static const ResponseCode ShmemCorrupted = 18; //!< A shared mem corruption has been detected. + static const ResponseCode ParseError = 19; //!< Generic parsing problem + /// If mdbm unlock fails, most of the time we want to shut off the + /// system automatically, without letting the caller know that we did + /// so. On specific instances where the caller is the FaultHandler, or + /// Oversight Fault counter (there may be other examples), we don't want + /// to do this because we want to avoid cross-dependency. 
+ static const ResponseCode MdbmUnlockFailed = 20; + + //----------- Config section ------------- + // Config + static const ResponseCode ConfigFailure = 50; //!< Failure to find or parse a config entry + + //----------- UChar section ------------- + // UCharUtils + static const ResponseCode UcnvOpenFailed = 100; //!< Failed to open ucnv converter for utf-8 + static const ResponseCode DataNotUtf8 = 101; //!< Data is not in utf-8 format + static const ResponseCode ConvertToUCharFailed = 102; //!< Failed to convert utf-8 string to UChar string + static const ResponseCode CompileRegExFailed = 103; //!< Failed to compile the regular expression + + //----------- Yca section ------------- + // YcaClient + static const ResponseCode YcaOpenFailed = 150; //!< Failed to open the yca database + static const ResponseCode YcaCertInvalid = 151; //!< Validation of presented cert failed + static const ResponseCode YcaCertNotFound = 152; //!< certificate for the requested appID was not found + + //----------- Broker section ------------- + static const ResponseCode BrokerClientOpenFailed = 200; //!< Failed to connect to broker + static const ResponseCode UncertainPublish = 201; //!< Publish was uncertain - unknown if it happened + static const ResponseCode PublishFailed = 202; //!< Publish failed (for certain :)) + static const ResponseCode SubscribeFailed = 203; //!< Failed to subscribe to a topic + static const ResponseCode NoSubscriptionFound = 204; //!< Operation on a sub failed because we (locally) + // don't know about it + static const ResponseCode RegisterFailed = 205; //!< Failed to register handler for subscription + static const ResponseCode UnsubscribeFailed = 206; //!< Failed to unsubscribe from sub + static const ResponseCode ListTopicsFailed = 207; //!< Failed to list subscribed topics + static const ResponseCode ConsumeFailed = 208; //!< Failed to consume messages for a topic + static const ResponseCode TopicInvalid = 209; //!< Topic is invalid (was usurped or ymb 
'lost' it) + static const ResponseCode NoMessageDelivered = 210; //!< Call to deliver() found no messages ready + static const ResponseCode ConsumeFailedBadTopic = 211; //!< The topic is bad - our handle is bad, + // or it got usurped + static const ResponseCode ConsumeFailedBadHandle = 212; //!< Our ymb handle is bad - not usable anymore + static const ResponseCode ConsumeFailedConnectionError = 213; //!< a recoverable connection error + static const ResponseCode ConsumeFailedServerBusy = 214; //!< ymb server is having a temporary issue, + // not a failure per se + // second argument to messageProcessed() + static const ResponseCode ConsumeMessage = 215; //!< consume this message + static const ResponseCode ConsumeAndUnsubscribe = 216; //!< end this channel + // Internal to ymb implementation + static const ResponseCode YmbSubscribeTempFailure = 217; //!< A failure that might be resolved on a retry + static const ResponseCode YmbSubscribeTimedout = 218; //!< A timeout failure + static const ResponseCode YmbSubscriptionExists = 219; //!< Attempt to create a sub that already exists + static const ResponseCode NoSuchSubscription = 220; //!< Attempt to attach to a sub that does not exist + static const ResponseCode AttachNoSuchSubscription = 221; //!< Specific to attach, no subscription to attach to (not necessarily an error) + static const ResponseCode BrokerInitFailed = 222; //!< Config or allocation failed + static const ResponseCode BrokerConnectionLost = 223; //!< Lost connection to broker + static const ResponseCode BrokerFatalError = 224; //!< Generally shared mem corruption + + + //----------- Daemon section ------------- + // Daemon + static const ResponseCode NoImpl = 250; //!< No op + static const ResponseCode Restart = 251; //!< Exit the daemon so that it is restarted right away. + // request that the daemon do a soft restart + static const ResponseCode Exit = 252; //!< Exit the daemon so that it is NOT restarted right away. 
A monitoring process may restart the entire system later. + static const ResponseCode StopDelivery = 253; //!< Stop delivery on the topic, returned by Broker handlers only. + static const ResponseCode RetryDelivery = 254; //!< Stop delivery on the topic but retry after some time, returned by Broker handlers only. + + //----------- Lock section ------------- + // LockManager + //ALL these lock errors are handled in SuFaultHandler.cc + //Any addition to these error codes requires update to the SuFaultHandler + static const ResponseCode LockSyserr = 301; //!< System error during lock/unlock op + static const ResponseCode LockInconsis = 302; //!< Inconsistency detected in LockManager. + static const ResponseCode LockNested = 303; //!< Nested locking of same key not allowed. + static const ResponseCode LockNosuchpid = 304; //!< This pid does not hold the lock. + static const ResponseCode LockUnavail = 305; //!< Out of locks (none available) + static const ResponseCode LockInitfail = 306; //!< Initialization failure of the lock subsystem + static const ResponseCode LockInvalidarg = 307; //!< Invalid arguments to lock subsystem. 
+ + //----------- Message section ------------- + //Message and Message serialization + static const ResponseCode SerializeFailed = 350; //!< Message Serialization Failed + static const ResponseCode DeserializeFailed = 351; //!< Message Deserialization failed + static const ResponseCode NoResponseCodeInMessage = 352; + + //----------- Transport Errors ------------- + static const ResponseCode TransportSendError = 400; //!< Curl error in communicating with other server + static const ResponseCode TransportSetHeaderFailed = 401; //!< Error in setting header in curl request + static const ResponseCode TransportCurlInitError = 402; //!< Error initializing curl handle -- should be curl specific + static const ResponseCode TransportUncertain = 403; //!< Send came back uncertain (timeout, usually) + static const ResponseCode TransportInvalidResponseBody = 404; //!< Send came back unparsable body + + //----------- Apache/Web section ------------- + static const ResponseCode EndOfBody = 450; //!< Normal end of incoming request body + static const ResponseCode BodyReadFailed = 451; //!< Failed reading incoming request body + static const ResponseCode BodyWriteFailed = 452; //!< Failed writing outgoing request body + static const ResponseCode EncryptionFailed = 453; //!< Failed to encrypt body or header + static const ResponseCode DecryptionFailed = 454; //!< Failed to decrypt body or header + + /** + * Give back a basic, generic string description of the response code. + * + * @param rc The response code to convert. + * @return The string describing it. + */ + static std::string toString(ResponseCode rc); + +}; + +/* For customized vim control + * Local variables: + * tab-width: 4 + * c-basic-offset: 4 + * End: + * vim600: sw=4:ts=4:et + * vim<600: sw=4:ts=4:et + */ +#endif diff --git a/LogUtils.cc b/LogUtils.cc new file mode 100644 index 0000000..3dea981 --- /dev/null +++ b/LogUtils.cc @@ -0,0 +1,77 @@ +/*! 
\file LogUtils.cc + * \brief This file has the helper functions for log4cpp; + * + * Copyright (c) 2008 Yahoo, Inc. + * All rights reserved. + */ +#include +#include + +#include "LogUtils.h" + +using namespace log4cpp; +using namespace std; + +// hacked link to actioncontext +std::string s_trackPathLog; + +LogMethod:: +LogMethod(log4cpp::Category& log, log4cpp::Priority::Value priority, + const char *function) : + log_(log), priority_(priority), function_(function) +{ + if(log_.isPriorityEnabled(priority_)) { + log_.getStream(priority_) << "Entering: " << function_; + } +} + + +LogMethod:: +~LogMethod() +{ + if(log_.isPriorityEnabled(priority_)) { + log_.getStream(priority_) << "Exiting: " << function_; + } +} + +// Protects against multiple calls (won't try to re-init) and gives +// back the same answer the original call got: -1 = not attempted yet, 0 = success, 1 = failure. +static int log4cppInitResult = -1; + +bool +initLog4cpp(const string &confFile) +{ + + if (log4cppInitResult != -1) { + return log4cppInitResult == 0; + } + + log4cppInitResult = 0; // Assume success. + try { + PropertyConfigurator::configure(confFile); + } catch (const log4cpp::ConfigureFailure &e) { + cerr << "log4cpp configuration failure while loading '" << + confFile << "' : " << e.what() << endl; + log4cppInitResult = 1; + } catch (const std::exception &e) { + cerr << "exception caught while configuring log4cpp via '" << + confFile << "': " << e.what() << endl; + log4cppInitResult = 1; + } catch (...) { + cerr << "unknown exception while configuring log4cpp via '" << + confFile << "'." << endl; + log4cppInitResult = 1; + } + + return log4cppInitResult == 0; +} + +/* + * For customized vim control + * Local variables: + * tab-width: 4 + * c-basic-offset: 4 + * End: + * vim600: sw=4:ts=4:et + * vim<600: sw=4:ts=4:et + */ diff --git a/LogUtils.h b/LogUtils.h new file mode 100644 index 0000000..73c0af6 --- /dev/null +++ b/LogUtils.h @@ -0,0 +1,130 @@ +/* Copyright (C) 2007 Yahoo! Inc. All Rights Reserved. 
*/ + +#ifndef LOG_UTIL_H +#define LOG_UTIL_H + +#include +#include "StringUtils.h" + +/** + * Quick and dirty link between LogUtils and ActionContext without having to + * resolve cross-inclusion issues, or force all components to start including + * ActionContext if they don't already. + */ +extern std::string s_trackPathLog; + +// These macros cannot be protected by braces because of the trailing stream +// arguments that get appended. Care must be taken not to use them inside if/else +// blocks that do not use curly braces. +// I.e., the following will give unexpected results: +// if(foo) +// DHT_DEBUG_STREAM() << "heyheyhey"; +// else +// blah(); +// The 'else' will end up applying to the 'if' within the debug macro. +// Regardless of this, our standards say to always use curly brackets +// on every block anyway, no matter what. DHT_TRACE_PRIORITY is parenthesized so that it expands as a single value inside any expression. + +#define DHT_DEBUG_STREAM() if(log.isDebugEnabled()) log.debugStream() << __FUNCTION__ << "():" << __LINE__ << ":" +#define DHT_INFO_STREAM() if(log.isInfoEnabled()) log.infoStream() << __FUNCTION__ << "():" << __LINE__ << ":" +#define DHT_INFO_WITH_STACK_STREAM() if(log.isInfoEnabled()) log.infoStream() << __FUNCTION__ << "():" << __LINE__ << ":" << s_trackPathLog +#define DHT_WARN_STREAM() if(log.isWarnEnabled()) log.warnStream() << __FUNCTION__ << "():" << __LINE__ << ":" << s_trackPathLog +#define DHT_ERROR_STREAM() if(log.isErrorEnabled()) log.errorStream() << __FUNCTION__ << "():" << __LINE__ << ":" << s_trackPathLog +#define DHT_CRIT_STREAM() if(log.isCritEnabled()) log.critStream() << __FUNCTION__ << "():" << __LINE__ << ":" << s_trackPathLog +#define DHT_TRACE_PRIORITY (log4cpp::Priority::DEBUG + 50) +#define DHT_TRACE_STREAM() if (log.isPriorityEnabled(DHT_TRACE_PRIORITY)) log.getStream(DHT_TRACE_PRIORITY) << __FUNCTION__ << "():" << __LINE__ << ":" + +// Sadly, sometimes 'log' is reserved by someone else so the code needs to +// use a different name for log. In that case, it can be passed in to these. 
+#define DHT_DEBUG_STREAML(x_log_hdl_x) if((x_log_hdl_x).isDebugEnabled()) (x_log_hdl_x).debugStream() << __FUNCTION__ << "():" << __LINE__ << ":" +#define DHT_INFO_STREAML(x_log_hdl_x) if((x_log_hdl_x).isInfoEnabled()) (x_log_hdl_x).infoStream() << __FUNCTION__ << "():" << __LINE__ << ":" +#define DHT_INFO_WITH_STACK_STREAML(x_log_hdl_x) if((x_log_hdl_x).isInfoEnabled()) (x_log_hdl_x).infoStream() << __FUNCTION__ << "():" << __LINE__ << ":" << s_trackPathLog +#define DHT_WARN_STREAML(x_log_hdl_x) if((x_log_hdl_x).isWarnEnabled()) (x_log_hdl_x).warnStream() << __FUNCTION__ << "():" << __LINE__ << ":" << s_trackPathLog +#define DHT_ERROR_STREAML(x_log_hdl_x) if((x_log_hdl_x).isErrorEnabled()) (x_log_hdl_x).errorStream() << __FUNCTION__ << "():" << __LINE__ << ":" << s_trackPathLog +#define DHT_CRIT_STREAML(x_log_hdl_x) if((x_log_hdl_x).isCritEnabled()) (x_log_hdl_x).critStream() << __FUNCTION__ << "():" << __LINE__ << ":" << s_trackPathLog +#define DHT_TRACE_STREAML(x_log_hdl_x) if ((x_log_hdl_x).isPriorityEnabled(DHT_TRACE_PRIORITY)) (x_log_hdl_x).getStream(DHT_TRACE_PRIORITY) << __FUNCTION__ << "():" << __LINE__ << ":" + +//Macros to use when a function returns on error without writing any log message +// or error translation +#define RETURN_IF_NOT_OK(x_call_x) \ +{ \ + FwCode::ResponseCode rcx___ = (x_call_x); \ + if(rcx___ != FwCode::FwOk) { \ + return rcx___; \ + } \ +} + +#define RETURN_THIS_IF_NOT_OK(x_othercode_x, x_call_x) \ +{ \ + FwCode::ResponseCode rcx___ = (x_call_x); \ + if(rcx___ != FwCode::FwOk) { \ + return (x_othercode_x); \ + } \ +} + +/// Caution! Only use in checks for 'impossible' code conditions. 
Regular errors +/// should be handled regularly +#define BAD_CODE_ABORT() \ + { \ + std::string x_msg_x("Bad code at " __FILE__ ":"); \ + x_msg_x.append(StringUtils::toString(__LINE__)); \ + throw std::runtime_error(x_msg_x); \ + } + +#define BAD_CODE_IF_NOT_OK(x_call_x) \ + do {\ + if((x_call_x) != FwCode::FwOk) { \ + BAD_CODE_ABORT(); \ + } \ + } while(0) + +/* + * Above macros are meant to be used by all components. + */ + +/** + * Class that allows for method entry/exit logging with a single declaration. + * Logs at the priority passed to the constructor. + */ +class LogMethod +{ + public: + LogMethod(log4cpp::Category& log, log4cpp::Priority::Value priority, + const char *function); + virtual ~LogMethod(); + + private: + log4cpp::Category& log_; + log4cpp::Priority::Value priority_; + const char *function_; +}; + +// convenience macros to use the above class +#define LOG_METHOD() LogMethod log_method_entry_exit(log, log4cpp::Priority::DEBUG, __FUNCTION__) +#define TRACE_METHOD() LogMethod log_method_entry_exit(log, DHT_TRACE_PRIORITY, __FUNCTION__) + +/** Initialize log4cpp config file. + * This function needs to be called once for each executable. Multiple + * initializations will return the result of the first initialization (IOW, + * an executable can be initialized with exactly one config file). Errors + * encountered by this function are printed onto cerr. See log4cpp + * documentation for what happens when PropertyConfigurator::configure() + * fails. + * \param confFile is the path name of the log4cpp config file. + * Depending on the machine that the executable is running in, the path + * will be different. + * \return true if the initialization succeeds, false if it fails. 
+ */ +bool initLog4cpp(const std::string & confFile); + +#endif + +/* + * For customized vim control + * Local variables: + * tab-width: 4 + * c-basic-offset: 4 + * End: + * vim600: sw=4:ts=4:et + * vim<600: sw=4:ts=4:et + */ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..7fcb172 --- /dev/null +++ b/Makefile @@ -0,0 +1,74 @@ +STASIS_DIR=../stasis + +LIB=$(STASIS_DIR)/build/src/stasis \ + -L/home/y/lib +INCLUDE=-I$(STASIS_DIR)/src/ -I$(STASIS_DIR) -I./ \ + -I/home/y/include + +LIBLIST=-lpthread \ + -lstasis \ + -lm +# -licui18n \ +# -licuuc \ +# -licudata \ +# -licuio \ +# -llog4cpp_y \ +# -lthoth + +FLAGS=-pg -g -O1 +#FLAGS=-O3 + +HFILES=logserver.h logstore.h logiterators.h datapage.h merger.h tuplemerger.h datatuple.h +CFILES=logserver.cpp logstore.cpp logiterators.cpp datapage.cpp merger.cpp tuplemerger.cpp + + +# STASIS_DIR=../stasis +# LD_LIBRARY_PATH=$STASIS_DIR/build/src/stasis +# LD_LIBRARY_PATH=$STASIS_DIR/build/src/stasis ./hello + + +logstore: check_gen.cpp $(HFILES) $(CFILES) + g++ -o $@ $^ -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS) + +test: dp_check lt_check ltable_check merger_check rb_check \ + lmerger_check tmerger_check server_check tcpclient_check + +lt_check: check_logtree.cpp $(HFILES) $(CFILES) + g++ -o $@ $^ -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS) + +dp_check: check_datapage.cpp $(HFILES) $(CFILES) + g++ -o $@ $^ -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS) + +ltable_check: check_logtable.cpp $(HFILES) $(CFILES) + g++ -o $@ $^ -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS) + +merger_check: check_merge.cpp $(HFILES) $(CFILES) + g++ -o $@ $^ -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS) + +lmerger_check: check_mergelarge.cpp $(HFILES) $(CFILES) + g++ -o $@ $^ -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS) + +tmerger_check: check_mergetuple.cpp $(HFILES) $(CFILES) + g++ -o $@ $^ -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS) + +rb_check: check_rbtree.cpp $(HFILES) $(CFILES) + g++ -o $@ $^ -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS) + +server_check: 
check_server.cpp $(HFILES) $(CFILES) + g++ -o $@ $^ -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS) + +tcpclient_check: check_tcpclient.cpp $(HFILES) $(CFILES) + g++ -o $@ $^ -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS) + + +hello : hello.cpp UCharUtils.cc LogUtils.cc + g++ -o $@ $^ -L$(LIB) $(INCLUDE) $(LIBLIST) $(FLAGS) + +clean: + rm -f logstore server_check hello lt_check merger_check lmerger_check rb_check \ + dp_check ltable_check tmerger_check rose tcpclient_check +veryclean: clean + rm -f *~ gmon.out prof.res + + + diff --git a/NOTES b/NOTES new file mode 100644 index 0000000..cc6008a --- /dev/null +++ b/NOTES @@ -0,0 +1,152 @@ +###################################################################################### +constants.h +###################################################################################### + +#define PAGE_SIZE 4096 +#define BLOB_THRESHOLD_SIZE (PAGE_SIZE-30) + +SLOT TYPES + +#define INVALID_SLOT (-1) +/** This constant is used as a placeholder to mark slot locations that contain blobs. + @see slotted.c, indirect.c, blobManager.c */ +#define BLOB_SLOT (-2) +#define NORMAL_SLOT (-3) +#define SLOT_TYPE_END (-4) + +###################################################################################### +allocationPolicy.h +###################################################################################### + +struct allocationPolicy { + struct LH_ENTRY(table) * xidAlloced; + struct LH_ENTRY(table) * xidDealloced; + struct RB_ENTRY(tree) * availablePages; + struct LH_ENTRY(table) * pageOwners; + struct LH_ENTRY(table) * allPages; +}; + +typedef struct allocationPolicy stasis_allocation_policy_t; + +typedef struct availablePage { + int freespace; + pageid_t pageid; + int lockCount; // Number of active transactions that have alloced or dealloced from this page. 
+} availablePage; + +availablePage * stasis_allocation_policy_pick_suitable_page(stasis_allocation_policy_t * ap, int xid, int freespace); + +//////////////////////////////////////////////////////////////////////////////////// + +==15277== Thread 4: +==15277== Invalid free() / delete / delete[] +==15277== at 0x401BEFA: free (vg_replace_malloc.c:235) +==15277== by 0x4FD60FB: free_mem (in /lib/tls/libc-2.3.4.so) +==15277== by 0x4FD5B21: __libc_freeres (in /lib/tls/libc-2.3.4.so) +==15277== by 0x4017336: _vgw_freeres (vg_preloaded.c:62) +==15277== by 0x4030B25: pthread_cond_wait@@GLIBC_2.3.2 (in /lib/tls/libpthread-2.3.4.so) +==15277== by 0x402E370: start_thread (in /lib/tls/libpthread-2.3.4.so) +==15277== by 0x4F96FFD: clone (in /lib/tls/libc-2.3.4.so) +==15277== Address 0x4EC66B8 is not stack'd, malloc'd or (recently) free'd +==15277== +==15277== ERROR SUMMARY: 1 errors from 1 contexts (suppressed: 40 from 1) +==15277== malloc/free: in use at exit: 8,540,389 bytes in 912 blocks. +==15277== malloc/free: 1,815,016 allocs, 1,814,105 frees, 1,121,769,405 bytes allocated. +==15277== For counts of detected errors, rerun with: -v +==15277== searching for pointers to 912 not-freed blocks. +==15277== checked 43,383,184 bytes. 
+==15277== +==15277== Thread 1: +==15277== +==15277== 4,883,561 (32 direct, 4,883,529 indirect) bytes in 1 blocks are definitely lost in loss record 16 of 46 +==15277== at 0x401B790: operator new(unsigned) (vg_replace_malloc.c:164) +==15277== by 0x8052C01: __gnu_cxx::new_allocator >::allocate(unsigned, void const*) (new_allocator.h:81) +==15277== by 0x8052B79: std::_Rb_tree, datatuple, std::allocator >::_M_get_node() (stl_tree.h:356) +==15277== by 0x8052ACC: std::_Rb_tree, datatuple, std::allocator >::_M_create_node(datatuple const&) (stl_tree.h:365) +==15277== by 0x8052978: std::_Rb_tree, datatuple, std::allocator >::_M_insert(std::_Rb_tree_node_base*, std::_Rb_tree_node_base*, datatuple const&) (stl_tree.h:783) +==15277== by 0x805270C: std::_Rb_tree, datatuple, std::allocator >::insert_unique(datatuple const&) (stl_tree.h:881) +==15277== by 0x8052332: std::set >::insert(datatuple const&) (stl_set.h:314) +==15277== by 0x8050077: logtable::insertTuple(datatuple&) (logstore.cpp:1030) +==15277== by 0x804A641: insertProbeIter(int) (check_merge.cpp:160) +==15277== by 0x804AB9B: main (check_merge.cpp:235) +==15277== +==15277== +==15277== 336 (28 direct, 308 indirect) bytes in 1 blocks are definitely lost in loss record 17 of 46 +==15277== at 0x401B405: malloc (vg_replace_malloc.c:149) +==15277== by 0x404D906: stasis_dirty_page_table_init (dirtyPageTable.c:133) +==15277== by 0x404BFA5: Tinit (transactional2.c:66) +==15277== by 0x804A2AE: insertProbeIter(int) (check_merge.cpp:97) +==15277== by 0x804AB9B: main (check_merge.cpp:235) +==15277== +==15277== +==15277== 40 bytes in 1 blocks are definitely lost in loss record 20 of 46 +==15277== at 0x401B790: operator new(unsigned) (vg_replace_malloc.c:164) +==15277== by 0x8053025: merge_scheduler::addlogtable(logtable*) (merger.cpp:20) +==15277== by 0x804A33E: insertProbeIter(int) (check_merge.cpp:113) +==15277== by 0x804AB9B: main (check_merge.cpp:235) +==15277== +==15277== +==15277== 80 bytes in 10 blocks are definitely lost 
in loss record 32 of 46 +==15277== at 0x401B405: malloc (vg_replace_malloc.c:149) +==15277== by 0x804D75E: logtree::create(int) (logstore.cpp:169) +==15277== by 0x8053BD5: memMergeThread(void*) (merger.cpp:236) +==15277== by 0x402E370: start_thread (in /lib/tls/libpthread-2.3.4.so) +==15277== by 0x4F96FFD: clone (in /lib/tls/libc-2.3.4.so) +==15277== +==15277== +==15277== 4,792 (432 direct, 4,360 indirect) bytes in 18 blocks are definitely lost in loss record 40 of 46 +==15277== at 0x401B790: operator new(unsigned) (vg_replace_malloc.c:164) +==15277== by 0x80501C5: logtable::insertTuple(int, datatuple&, recordid&, logtree*) (logstore.cpp:1064) +==15277== by 0x8054FA7: insertTuple(int, DataPage*, datatuple&, logtable*, logtree*, recordid&, int&, int&) (merger.cpp:643) +==15277== by 0x8054AFF: merge_iterators(int, treeIterator*, memTreeIterator >, datatuple>*, logtable*, logtree*, int&) (merger.cpp:534) +==15277== by 0x8053C8F: memMergeThread(void*) (merger.cpp:251) +==15277== by 0x402E370: start_thread (in /lib/tls/libpthread-2.3.4.so) +==15277== by 0x4F96FFD: clone (in /lib/tls/libc-2.3.4.so) +==15277== +==15277== +==15277== 576 bytes in 4 blocks are possibly lost in loss record 41 of 46 +==15277== at 0x401C6BF: calloc (vg_replace_malloc.c:279) +==15277== by 0x400E71A: _dl_allocate_tls (in /lib/ld-2.3.4.so) +==15277== by 0x402E91E: pthread_create@@GLIBC_2.1 (in /lib/tls/libpthread-2.3.4.so) +==15277== by 0x80538FF: merge_scheduler::startlogtable(int) (merger.cpp:184) +==15277== by 0x804A37E: insertProbeIter(int) (check_merge.cpp:116) +==15277== by 0x804AB9B: main (check_merge.cpp:235) +==15277== +==15277== +==15277== 3,175 bytes in 1 blocks are possibly lost in loss record 42 of 46 +==15277== at 0x401B405: malloc (vg_replace_malloc.c:149) +==15277== by 0x8051BC7: DataPage::readbytes(int, int, int, unsigned char**) (datapage.cpp:235) +==15277== by 0x8051F7F: DataPage::RecordIterator::getnext(int) (datapage.cpp:442) +==15277== by 0x80512E0: DataPage::recordRead(int, 
unsigned char*, unsigned, datatuple**) (datapage.cpp:206) +==15277== by 0x8050449: logtable::findTuple(int, unsigned char*, unsigned, logtree*) (logstore.cpp:1104) +==15277== by 0x804FF48: logtable::findTuple(int, unsigned char*, unsigned) (logstore.cpp:979) +==15277== by 0x804A8D3: insertProbeIter(int) (check_merge.cpp:198) +==15277== by 0x804AB9B: main (check_merge.cpp:235) +==15277== +==15277== +==15277== 173,599 bytes in 2 blocks are possibly lost in loss record 43 of 46 +==15277== at 0x401B405: malloc (vg_replace_malloc.c:149) +==15277== by 0x804FFD0: logtable::insertTuple(datatuple&) (logstore.cpp:1014) +==15277== by 0x804A641: insertProbeIter(int) (check_merge.cpp:160) +==15277== by 0x804AB9B: main (check_merge.cpp:235) +==15277== +==15277== +==15277== 2,281,057 bytes in 681 blocks are definitely lost in loss record 45 of 46 +==15277== at 0x401B405: malloc (vg_replace_malloc.c:149) +==15277== by 0x8051BC7: DataPage::readbytes(int, int, int, unsigned char**) (datapage.cpp:235) +==15277== by 0x8051F7F: DataPage::RecordIterator::getnext(int) (datapage.cpp:442) +==15277== by 0x80512E0: DataPage::recordRead(int, unsigned char*, unsigned, datatuple**) (datapage.cpp:206) +==15277== by 0x8050449: logtable::findTuple(int, unsigned char*, unsigned, logtree*) (logstore.cpp:1104) +==15277== by 0x804FF81: logtable::findTuple(int, unsigned char*, unsigned) (logstore.cpp:990) +==15277== by 0x804A8D3: insertProbeIter(int) (check_merge.cpp:198) +==15277== by 0x804AB9B: main (check_merge.cpp:235) +==15277== +==15277== LEAK SUMMARY: +==15277== definitely lost: 2,281,669 bytes in 712 blocks. +==15277== indirectly lost: 4,888,197 bytes in 150 blocks. +==15277== possibly lost: 177,350 bytes in 7 blocks. +==15277== still reachable: 1,193,173 bytes in 43 blocks. +==15277== suppressed: 0 bytes in 0 blocks. +==15277== Reachable blocks (those to which a pointer was found) are not shown. 
+==15277== To see them, rerun with: --show-reachable=yes +Killed diff --git a/StringUtils.h b/StringUtils.h new file mode 100644 index 0000000..d098b76 --- /dev/null +++ b/StringUtils.h @@ -0,0 +1,345 @@ +/* $Id: StringUtils.h,v 1.17 2009/03/25 20:32:51 dlomax Exp $ */ +/* Copyright (C) 2008 Yahoo! Inc. All Rights Reserved. */ + +#ifndef __STRING_UTIL_H +#define __STRING_UTIL_H +#include +#include +#include +#include "FwCode.h" + +/** + * Container for static string manipulation utilities. + */ +class StringUtils +{ + public: + + /** + * Our replacement for yax_getroot(). Allows our code to have a different + * root than components we use or link with. Is nice for unit testing. + * @return Copy of the value in a std::string + */ + static std::string getDhtRoot(); + + /** + * Parse a tablet name into left and right limits. + * @return true if parsing successful, false if incorrect format + */ + static bool parseTabletName(const std::string& tablet, std::string& leftLimit, + std::string& rightLimit); + + /** + * Construct a tablet name from left and right limits. + */ + static void buildTabletName(const std::string& leftLimit, + const std::string& rightLimit, + std::string& tablet); + + /** + * General purpose method to assemble a full path name, using + * getDhtRoot() so that + * the root will be configurable. DO NOT supply "/home/y" in path1. + */ + static std::string makePath(const std::string& path1 = "", + const std::string& path2 = "", + const std::string& path3 = "", + const std::string& path4 = "", + const std::string& path5 = "", + const std::string& path6 = ""); + + /** + * Append additional paths to an existing one - does not prepend ROOT. + */ + static void appendPath(std::string& base_path, const std::string& path2 = "", + const std::string& path3 = "", + const std::string& path4 = ""); + + /** + * Construct a topic name from a table/tablet. 
+ * + * @return the topic name + */ + static std::string buildTopicName(const std::string& table, + const std::string& tablet); + + /** + * Construct a topic name from a table/tablet. + * @param topic Is filled with the topic name. + */ + static void buildTopicName(const std::string& table, + const std::string& tablet, + std::string &topic); + + /** + * Parses topic into table and tablet portions. + * + * @param table Filled with the table name. + * @param tablet Filled with the tablet name. + * @param true if the parsing succeeded, false if not. + */ + static bool parseTopicName(const std::string& topic, + std::string& table, + std::string &tablet); + + /** + * Only for use in log statements - this is slow. Produce a printable + * string where binary (<32) characters are hex encoded, but all others + * are left alone. + * + * @param str string to encode + * @param len length of string + * @return encoded string. + */ + static std::string toPrintable(const char *str, size_t len); + + /** + * Convert a formatted hex string back into its original + * 64-bit value + * + * @param value the hex-encoded string + * @param out the value + * @return FwCode::FwOk on success, FwCode::BadHexString on parse failure + */ + static FwCode::ResponseCode + convertHexStringToUI64(const std::string& value, uint64_t& out); + + /** + * Convert a formatted hex string back into its original + * 32-bit value + * + * @param value the hex-encoded string + * @param out the value + * @return FwCode::FwOk on success, FwCode::BadHexString on parse failure + */ + static FwCode::ResponseCode + convertHexStringToUI32(const std::string& value, uint32_t& out); + + /** + * Standard means for formatting a 0x prefixed hex string from a + * 64-bit unsigned value. Will produce upper-case letters. Will + * pad with zeros at the beginning to fill out 16 hex chars. 
+ * + * @param the value to format + * @return the formatted value, like "0xDEADBEEF00000000" + */ + static std::string convertUI64ToHexString( uint64_t val ); + + /** + * Standard means for formatting a 0x prefixed hex string from a + * 32-bit unsigned value. Will produce upper-case letters. Will + * pad with zeros at the beginning to fill out 8 hex chars. + * + * @param the value to format + * @return the formatted value, like "0xDEADBEEF" + */ + static std::string convertUI32ToHexString( unsigned int val ); + + /** + * Standard means for formatting a small hex string from a + * 32-bit unsigned value. The "0x" will NOT be included. + * Will produce upper-case letters. Will NOT pad with zeros + * at the beginning. + * + * @param the value to format + * @return the formatted value, like "DEADBEEF" + */ + static std::string convertUI32ToMinimalHexString( unsigned int val ); + + /** + * Assemble the fields of ENCRYPTED_BODY_HEADER and encrypt it for + * sending to the remote side. + * @param result is the out parameter having the resulting string. + * @param encKeyName is the name of the key in keydb whose value will be + * used as the encryption key + * @param bodyEncVersion is the version of the encryption scheme used to + * encrypt the body (not the encryption scheme of this header itself). + * @param expireTime is the time (in usecs) after which the request + * should not be processed by the receiver of this header. + */ + static FwCode::ResponseCode makeEncryptedBodyHdr(std::string & result, + const char *encKeyName, uint32_t bodyEncVersion, uint64_t expireTime); + + /** + * Parse the incoming ENCRYPTED_BODY_HEADER, decrypting it, and + * separating the fields in it. + * @param inval is the incoming encrypted string. 
+ * @param encKeyName is the name of the key in keydb whose value will be + * used as the decryption key + * @param bodyEncVersion is the version of the encryption scheme to be + * used to * decrypt the body (not for the decryption of this header + * itself). + * @param expireTime is the time (in usecs) after which the response + * should not be processed by the receiver of this header. + */ + static FwCode::ResponseCode parseEncryptedBodyHdr(const std::string & inval, + const char *encKeyName, uint32_t & bodyEncVersion, uint64_t & expireTime); + + /** + * Get the hash for an un-normalized record name. + * + * @param unnormalizedRecordName a raw record name from user input + * @param (output) hashResult the hex string of the hash value. + * @return FwCode::FwOk on success, else an error relating to normalization + */ + static FwCode::ResponseCode normalizeAndHashRecordName + ( const std::string& unnormalizedRecordName, + std::string & hashResult /* out */ ); + + /** + * Get the hash for a normalized record name. + * + * @param recordName the record name. MUST be previously normalized. + * @return hashResult the uint32_t of the hash value. + */ + static uint32_t hashRecordName(const std::string& recordName); + + /** + * Get the hash for a normalized record name. + * + * @param recordName the record name. MUST be previously normalized. + * @param (output) hashResult the hex string of the hash value. + */ + static void hashRecordName( const std::string& recordName, + std::string & hashResult /* out */ ); + /** + * Get the hash for a normalized record name in string and int form + * + * @param recordName the record name. MUST be previously normalized. + * @param (output) hashResult the hex string of the hash value. + * @param (output) hexNum numerical value of hash + */ + static void hashRecordName( const std::string& recordName, + std::string & hashResult /* out */, + uint32_t& hexNum); + + /** + * Method to hash a string using crc32. 
+ * + * @param buf data to hash + * @param len length of buf + * @return hash value + */ + static uint32_t crcHash(const char * buf, uint32_t len); + + /** + * util function to convert any type to a string + */ + template static inline std::string toString(T item); + + /** + * convert string to any type of value + * @param strValue string value to parse + * @param value(out) value to read from strValue + * @return FwCode::FwOk on success + * FwCode::FwError on failure (error is *not* logged) + */ + template static inline + FwCode::ResponseCode fromString(const std::string& strValue, + T& value); + + /** + * convert a hexadecimal number to string representation + * of fixed width ( 2 * sizeof(T) ) + * @param value number to convert to string + * @return string representation of value + */ + template static inline + std::string numberToHexString(T value); + + /** + * convert a hexadecimal number to minimal string representation + * @param value number to convert to string + * @return string representation of value + */ + template static inline + std::string numberToMinimalHexString(T value); + + /** + * convert a hexadecimal string to a number + * @param strvalue input string to read from + * @param value(out) output number + * @return FwCode::FwOk on successful conversion + * FwCode::FwError on failure to convert strvalue + * to number + */ + template static inline + FwCode::ResponseCode hexStringToNumber(const std::string& strvalue, + T& value); + + + static const std::string EMPTY_STRING; +}; + +template +std::string StringUtils:: +toString(T item) +{ + std::ostringstream buf; + buf << item; + return buf.str(); +} + +template +FwCode::ResponseCode StringUtils:: +fromString(const std::string& strValue, + T& value) +{ + std::istringstream buf(strValue); + buf >> value; + if(buf.fail()|| + (strValue.length() != buf.tellg() )) + { + return FwCode::FwError; + } + return FwCode::FwOk; +} + +template +std::string StringUtils:: +numberToHexString(T value) +{ + 
std::ostringstream buf; + buf << "0x" << std::hex + << std::setw(sizeof(T) * 2) << std::setfill('0') + << std::uppercase << value; + return buf.str(); + +} + +template +std::string StringUtils:: +numberToMinimalHexString(T value) +{ + std::ostringstream buf; + buf << std::hex << std::uppercase << value; + return buf.str(); + +} + +template +FwCode::ResponseCode StringUtils:: +hexStringToNumber(const std::string& strvalue, + T& value) +{ + std::istringstream buf(strvalue); + buf >> std::hex >> value; + if(buf.fail() || + (strvalue.length() != buf.tellg() )) + { + return FwCode::FwError; + } + return FwCode::FwOk; + +} + +/* + * For customized vim control + * Local variables: + * tab-width: 4 + * c-basic-offset: 4 + * End: + * vim600: sw=4:ts=4:et + * vim<600: sw=4:ts=4:et + */ +#endif diff --git a/UCharUtils.cc b/UCharUtils.cc new file mode 100644 index 0000000..2133034 --- /dev/null +++ b/UCharUtils.cc @@ -0,0 +1,326 @@ +/* $Id: UCharUtils.cc,v 1.16 2009/03/03 20:19:18 dlomax Exp $ */ +/* Copyright (C) 2008 Yahoo! Inc. All Rights Reserved. */ + +//#include +#include "UCharUtils.h" +#include +#include "LogUtils.h" +//#include "ActionContext.h" +#include +#include +#include // To make sure we have UTF-8 + +static log4cpp::Category &log = + log4cpp::Category::getInstance("dht.framework." 
__FILE__); + + +UCharUtilsImpl *UCharUtils::instance_ = NULL; + +UCharUtilsImpl:: +UCharUtilsImpl() : uconv_(NULL) { + LOG_METHOD(); + + ucBuffLen = 0; + ucBuff = NULL; + + ucNormBuffLen = 0; + ucNormBuff = NULL; + + charBuffLen = 0; + charBuff = NULL; +} + +FwCode::ResponseCode UCharUtilsImpl:: +init() +{ + UErrorCode erc = U_ZERO_ERROR; + + uconv_ = ucnv_open("utf-8", &erc); + if (uconv_ == NULL) { + DHT_ERROR_STREAM() << "EC:UNICODE:Problem geting utf-8 converter, erc:" << erc + << ", " << u_errorName(erc); + return FwCode::UcnvOpenFailed; + } + return FwCode::FwOk; +} + +UCharUtilsImpl:: +~UCharUtilsImpl() { + reset(); + if (uconv_ != NULL) { + ucnv_close(uconv_); + uconv_ = NULL; + } +} + +void UCharUtilsImpl:: +reset() { + LOG_METHOD(); + + if (ucBuff != NULL) { + delete[] ucBuff; + ucBuffLen = 0; + ucBuff = NULL; + } + if (ucNormBuff != NULL) { + delete[] ucNormBuff; + ucNormBuffLen = 0; + ucNormBuff = NULL; + } + if (charBuff != NULL) { + delete[] charBuff; + charBuffLen = 0; + charBuff = NULL; + } +} + +/** + * Small wrapper to hide multi-line thoth api inside single-line call. + */ +bool UCharUtils:: +isUTF8(const std::string& value) +{ + size_t pos = 0; + thoth_result result = thoth_validate_utf8(value.c_str(), value.length(), + &pos); + + if(result != UTF8_VALID) { + std::cerr + //RESPONSE_DEBUG_STREAM(FwCode::DataNotUtf8) + << "value (" << value << ") is not UTF-8. thoth_result:" << result + << ", position=" << pos; + return false; + } + return true; +} + +/** + * Small wrapper to hide multi-line thoth api inside single-line call. + */ +bool UCharUtils:: +isUTF8(const char * value, size_t value_len) +{ + size_t pos = 0; + thoth_result result = thoth_validate_utf8(value, value_len, &pos); + + if(result != UTF8_VALID) { + //RESPONSE_DEBUG_STREAM(FwCode::DataNotUtf8) + std::cerr + << "value (" << std::string(value, value_len) + << ") is not UTF-8. 
thoth_result:" << result + << ", position=" << pos; + return false; + } + return true; +} + +// Convert an input string (expected to be UTF-8) into unicode UChars +// The result of the conversion will be sitting in our ucBuff area. +FwCode::ResponseCode UCharUtilsImpl:: +convert(const std::string &input, int32_t &len) +{ + LOG_METHOD(); + + //UTF-8 validation + if(!UCharUtils::isUTF8(input)) { + return FwCode::DataNotUtf8; + } + + int size = input.length() * 2; + + // Check if we already have a big enough buffer + if (ucBuffLen < size) { + // Nope, first check if we need to release what we've been using + if (ucBuff) { + delete[] ucBuff; + } + ucBuffLen = size; + ucBuff = new UChar[ucBuffLen]; + } + + UErrorCode erc = U_ZERO_ERROR; + len = ucnv_toUChars(uconv_, + ucBuff, + ucBuffLen, + input.data(), + input.length(), &erc); + + if (U_FAILURE(erc)) { + //RESPONSE_ERROR_STREAM(FwCode::ConvertToUCharFailed) + std::cerr + << "EC:UNICODE:error:" << erc + << ", " << u_errorName(erc) + << " from converting input:'" << input << "'"; + len = 0; + return FwCode::ConvertToUCharFailed; + } + return FwCode::FwOk; +} + +// Normalize an input string. Note that all three internal buffers will +// be used by this operation, but by the time we finish, we'll be done +// with them. +FwCode::ResponseCode UCharUtilsImpl:: +normalize(const std::string &input, std::string &result /* out */) +{ + LOG_METHOD(); + + // convert our UTF-8 into UChar + int32_t inLen = 0; + FwCode::ResponseCode rc = convert(input, inLen); + + if (rc != FwCode::FwOk) { + result.erase(); + return rc; + } + + // Do a quick check if the input is already normalized so that + // we can duck out early + UErrorCode status = U_ZERO_ERROR; + if (unorm_quickCheck(ucBuff, inLen, + UNORM_NFC, &status) == UNORM_YES) { + DHT_DEBUG_STREAM() << "already normalized input:" << input; + result = input; + return FwCode::FwOk; + } + + // Check if we have enough space for the normalized result. 
+ // We'll make the output space twice as big as the input (although + // it's more likely that the normalized result will be shorter + // as it combines characters. E.g. 'A' 'put an accent on the previous' + int32_t newSize = inLen * 2; + if (newSize > ucNormBuffLen) { + DHT_DEBUG_STREAM() << "newSize:" << newSize + << " ucNormBuffLen:" << ucNormBuffLen; + if (ucNormBuff) { + delete[] ucNormBuff; + } + ucNormBuffLen = newSize; + ucNormBuff = new UChar[ucNormBuffLen]; + } + + // Do the actual normalization + status = U_ZERO_ERROR; + int32_t normLen = unorm_normalize(ucBuff, inLen, + UNORM_NFC, 0, + ucNormBuff, + ucNormBuffLen, + &status); + if(U_FAILURE(status)) { + //RESPONSE_ERROR_STREAM(FwCode::FwError) + std::cerr + << "EC:UNICODE:error:" << status << ", " << u_errorName(status) + <<" in unorm_normalize, inLen:" << inLen + << " ucNormBuffLen:" << ucNormBuffLen; + return FwCode::FwError; + } + + // Make sure we have some space to convert back to UTF-8 + int32_t resultLen = normLen * 4; + if (resultLen > charBuffLen) { + DHT_DEBUG_STREAM() << "resultLen:" << resultLen + << " charBuffLen:" << charBuffLen; + if (charBuff) { + delete[] charBuff; + charBuff= NULL; + } + charBuffLen = resultLen; + charBuff = new char[charBuffLen]; + } + + DHT_DEBUG_STREAM() <<"calling ucnv_fromUChars, normLen:" << normLen; + + // Go from UChar array to UTF-8 + int32_t actualLen = ucnv_fromUChars(uconv_, + charBuff, charBuffLen, + ucNormBuff, normLen, + &status); + if(U_FAILURE(status)) { + //RESPONSE_ERROR_STREAM(FwCode::FwError) + std::cerr + << "EC:UNICODE:error:" << status << ", " << u_errorName(status) + << " in ucnv_fromUChars charBuffLen:" << charBuffLen + << " normLen:" << normLen; + return FwCode::FwError; + } + + // Smack our UTF-8 characters into the result string + result.assign(charBuff, actualLen); + DHT_DEBUG_STREAM() << "leaving actualLen:" << actualLen + << " result:" << result; + return FwCode::FwOk; +} + + +FwCode::ResponseCode UCharUtils:: +init() +{ + if (instance_ 
== NULL) { + instance_ = new UCharUtilsImpl(); + return instance_->init(); + } + return FwCode::FwOk; // already initialized +} + +void UCharUtils:: +close() +{ + if(instance_ != NULL) { + delete instance_; + instance_ = NULL; + } +} + +// Given an input string, return a unicode UChar array. Note that the +// return value is a pointer to our internal buffer. +UChar * UCharUtils:: +getUChar(const std::string &input, int32_t& len) { + LOG_METHOD(); + + // do the conversion...somehow need 2x input len for utf8 to utf16 + if(instance_->convert(input, len) != FwCode::FwOk) { + len = 0; + return NULL; + } + + return instance_->ucBuff; +} + +FwCode::ResponseCode UCharUtils:: +normalize(const std::string &input, std::string &result) { + LOG_METHOD(); + return(instance_->normalize(input, result)); +} + + +FwCode::ResponseCode UCharUtils:: +parseRegExpPattern(const std::string &pattern, + URegularExpression * & result /* out */) +{ + UParseError perr; + UErrorCode erc = U_ZERO_ERROR; + int32_t ureglen = 0; + + // Do not delete uregexp, it's a static reusable buffer inside UCharUtils + UChar *uregexp = UCharUtils::getUChar(pattern, ureglen); + if (uregexp == NULL) { + //RESPONSE_ERROR_STREAM(FwCode::ConvertToUCharFailed) + std::cerr + << "EC:UNICODE|IMPOSSIBLE:Unable to convert pattern to unicode: " << pattern; + return FwCode::ConvertToUCharFailed; + } + + URegularExpression *regexp= uregex_open(uregexp, ureglen, 0, + &perr, + &erc); + if(erc != U_ZERO_ERROR) { + //RESPONSE_DEBUG_STREAM(FwCode::CompileRegExFailed) + std::cerr + << "Compiling regex failed at: " << perr.offset + << "; re=" << pattern; + return FwCode::CompileRegExFailed; + } + + result = regexp; + return FwCode::FwOk; +} diff --git a/UCharUtils.h b/UCharUtils.h new file mode 100644 index 0000000..4f751be --- /dev/null +++ b/UCharUtils.h @@ -0,0 +1,139 @@ +/* Copyright (C) 2008 Yahoo! Inc. All Rights Reserved. 
*/ + +#ifndef UCHAR_UTILS_H +#define UCHAR_UTILS_H + +#include +#include +#include "FwCode.h" +#include + +// Forward declaration +class UCharUtilsImpl; + +/** + * Some handy utilities for working with unicode characters. Yes, these + * could have just been some regular routines instead of static methods + * in a class, but doing it this way gives us some containment of what + * other static tidbits might be necessary (like reusable buffer space). + * which are all hidden within the UCharUtilsImpl class. + * + * This is a singleton - do not use in a threaded program. + */ +class UCharUtils { + private: + + /** + * Our pointer to all sorts of goodness. + */ + static UCharUtilsImpl *instance_; + public: + + /** + * Initialize the utilities. Primarily opens the utf-8 converter. + * Calling this is required prior to using the converter. + * + * @return FwCode::FwOk on success, FwCode::UcnvOpenFailed on + * failure. + */ + static FwCode::ResponseCode init(); + + /** + * Release all resources. init() must be called again + * in order to use again. + */ + static void close(); + + /** + * Small wrapper to hide multi-line thoth api inside single-line call. + * + * @param value string to be tested for utf-8-ness + * @return true if it is utf-8, false if not + */ + static bool isUTF8(const std::string& value); + + /** + * Small wrapper to hide multi-line thoth api inside single-line call. + * + * @param value char string to be tested for utf-8-ness + * @param value_len length of value + * @return true if it is utf-8, false if not + */ + static bool isUTF8(const char * value, size_t value_len); + + /** + * Convert utf-8 strings into UChar strings. Note that the + * result is an internal reusable buffer so the caller should + * *not* release it. + * @param input utf-8 string to convert + * @param len set to length of output string + * @return NULL if anything bad happens, otherwise an allocated UChar * + * the caller must *NEVER* free this pointer. 
+ */ + static UChar * getUChar(const std::string &input, int32_t& len); + + /** + * Do a NFC normalization so that different yet equivalent strings + * will have a single representation. See + * http://www.unicode.org/unicode/reports/tr15/ + * for more information. + * @param input A UTF-8 string that we want to normalize + * @param result (output) the normalized UTF-8 string + * @return FwCode::FwOk on success, + * FwCode::FwError on conversion failure, + * FwCode::InvalidData if input was not utf-8 + */ + static FwCode::ResponseCode normalize(const std::string &input, + std::string &result); + + /** + * Compile a regular expression in a unicode-friendly way. + * + * @param pattern the regexp pattern to compile. Assumed to + * be utf-8. + * @param result (output) Set to point to the compiled regexp. + * Must be released by the caller via uregex_close() when + * finished with it. + * @return FwCode::FwOk if compilation succeeded, + * FwCode::CompileRegExFailed or FwCode::ConvertToUCharFailed + * on failure. + */ + static FwCode::ResponseCode parseRegExpPattern + (const std::string &pattern, + URegularExpression * & result /* out */); + +}; + +/** + * Bug 2574599 - Impl exposed for use by multiple threads; singleton not + * appropriate for multi-threaded program. 
+ */ +class UCharUtilsImpl +{ +private: + UConverter *uconv_; + +public: + UCharUtilsImpl(); + ~UCharUtilsImpl(); + + FwCode::ResponseCode init(); + void reset(); + FwCode::ResponseCode convert(const std::string &input, int32_t &len); + + FwCode::ResponseCode normalize(const std::string &nput, std::string &result); + + // Buffer used to convert from UTF-* into UChar + int32_t ucBuffLen; + UChar *ucBuff; + + // Buffer used for UChar normalization output + int32_t ucNormBuffLen; + UChar *ucNormBuff; + + // Buffer used to convert UChars back to UTF-8 + int32_t charBuffLen; + char *charBuff; +}; + +#endif // _DHT_UCHAR_UTILS_ diff --git a/adriana-lima.awk b/adriana-lima.awk new file mode 100755 index 0000000..4454496 --- /dev/null +++ b/adriana-lima.awk @@ -0,0 +1,130 @@ +#! /usr/bin/awk -f + +BEGIN{ + + READ_SLA = 500; + WRITE_SLA = 750; + + readcnt = 0; + writecnt = 0; + + wlat_tot = 0; + wlat_max = 0; + wlat_sqtot = 0; + wlat_slafail = 0; + + DIST_BUCKET_LENGTH = 100; + DIST_BUCKET_COUNT = 20; + for(i=1; i<=DIST_BUCKET_COUNT; i++) + { + rlat_dist[i] = 0; + wlat_dist[i] = 0; + } + + + rlat_tot = 0; + rlat_max = 0; + rlat_sqtot = 0; + rlat_slafail = 0; + + printf("READ SLA:\t%d\n", READ_SLA); + printf("WRITE SLA:\t%d\n", WRITE_SLA); + printf("\n"); + +} + +/INFO - doRead()/ { readcnt = readcnt + 1; + + split(substr($0, match($0, "latency:")+ length("latency:")+1), tmp_arr, " "); + #printf("%d\n", strtonum(tmp_arr[1])); + + lat_val = strtonum(tmp_arr[1]); + + dist_index = int(lat_val / DIST_BUCKET_LENGTH) + 1; + if(dist_index > DIST_BUCKET_COUNT) + dist_index = DIST_BUCKET_COUNT; + rlat_dist[dist_index]++; + + rlat_tot = rlat_tot + lat_val; + + rlat_sqtot = rlat_sqtot + lat_val*lat_val; + + if(lat_val > rlat_max) + rlat_max = lat_val; + + if(lat_val > READ_SLA) + rlat_slafail = rlat_slafail + 1; + +} + + +/INFO - doInsert()/ { writecnt = writecnt + 1; + + split(substr($0, match($0, "latency:")+ length("latency:")+1), tmp_arr, " "); + + lat_val = tmp_arr[1]; + + 
if(index(tmp_arr[1], ",")!= 0) + lat_val = substr(tmp_arr[1],1,index(tmp_arr[1],",")-1); + + #printf("%d\n", strtonum(lat_val)); + lat_val = strtonum(lat_val); + + dist_index = int(lat_val / DIST_BUCKET_LENGTH) + 1; + if(dist_index > DIST_BUCKET_COUNT) + dist_index = DIST_BUCKET_COUNT; + wlat_dist[dist_index]++; + + wlat_tot = wlat_tot + lat_val; + + wlat_sqtot = wlat_sqtot + lat_val*lat_val; + + if(lat_val > wlat_max) + wlat_max = lat_val; + + if(lat_val > WRITE_SLA) + wlat_slafail = wlat_slafail + 1; + + +} + + +END{ + + printf("R/W ratio:\t%.2f\n", strtonum(readcnt) / strtonum(writecnt)); + + printf("\n"); + + printf("#reads:\t%d\n",readcnt); + if(strtonum(readcnt) != 0) + { + printf("avg read latency:\t%.2f\n", (rlat_tot / readcnt)); + printf("var read latency:\t%.2f\n", (rlat_sqtot/readcnt) - (rlat_tot/readcnt)*(rlat_tot/readcnt)); + printf("max read latency:\t%.2f\n", rlat_max); + printf("read SLA fail:\t%d\n", rlat_slafail); + + printf("\nREAD LATENCY DISTRIBUTION\n"); + for(i=1; i +#include +#include +#include +#include "logstore.h" +#include "datapage.cpp" + +#include +#include +#include +#include +#include +#include + +#undef begin +#undef end + +template class DataPage; + +bool mycmp(const std::string & k1,const std::string & k2) +{ + //for char* ending with \0 + return strcmp(k1.c_str(),k2.c_str()) < 0; + + //for int32_t + //printf("%d\t%d\n",(*((int32_t*)k1)) ,(*((int32_t*)k2))); + //return (*((int32_t*)k1)) <= (*((int32_t*)k2)); +} + +//must be given a sorted array +void removeduplicates(std::vector &arr) +{ + + for(int i=arr.size()-1; i>0; i--) + { + if(! 
(mycmp(arr[i], arr[i-1]) || mycmp(arr[i-1], arr[i]))) + arr.erase(arr.begin()+i); + + } + +} + +void preprandstr(int count, std::vector &arr, int avg_len=50, bool duplicates_allowed=false) +{ + + for ( int j=0; jnextPage == a->endOfRegion) { + if(a->regionList.size == -1) { + //DEBUG("nextPage: %lld\n", a->nextPage); + a->regionList = TarrayListAlloc(xid, 1, 4, sizeof(pageid_t)); + DEBUG("regionList.page: %lld\n", a->regionList.page); + DEBUG("regionList.slot: %d\n", a->regionList.slot); + DEBUG("regionList.size: %lld\n", a->regionList.size); + + a->regionCount = 0; + } + DEBUG("{%lld <- alloc region arraylist}\n", a->regionList.page); + TarrayListExtend(xid,a->regionList,1); + a->regionList.slot = a->regionCount; + DEBUG("region lst slot %d\n",a->regionList.slot); + a->regionCount++; + DEBUG("region count %lld\n",a->regionCount); + a->nextPage = TregionAlloc(xid, a->regionSize,12); + DEBUG("next page %lld\n",a->nextPage); + a->endOfRegion = a->nextPage + a->regionSize; + Tset(xid,a->regionList,&a->nextPage); + DEBUG("next page %lld\n",a->nextPage); + } + + DEBUG("%lld ?= %lld\n", a->nextPage,a->endOfRegion); + pageid_t ret = a->nextPage; + // Ensure the page is in buffer cache without accessing disk (this + // sets it to clean and all zeros if the page is not in cache). + // Hopefully, future reads will get a cache hit, and avoid going to + // disk. 
+ + Page * p = loadUninitializedPage(xid, ret); + releasePage(p); + DEBUG("ret %lld\n",ret); + (a->nextPage)++; + return ret; + +} + + +pageid_t alloc_region_rid(int xid, void * ridp) { + recordid rid = *(recordid*)ridp; + RegionAllocConf_t conf; + Tread(xid,rid,&conf); + pageid_t ret = alloc_region(xid,&conf); + DEBUG("{%lld <- alloc region extend}\n", conf.regionList.page); + + Tset(xid,rid,&conf); + return ret; +} + + +void insertProbeIter(int NUM_ENTRIES) +{ + srand(1000); + unlink("storefile.txt"); + unlink("logfile.txt"); + + sync(); + + bufferManagerNonBlockingSlowHandleType = IO_HANDLE_PFILE; + + Tinit(); + + int xid = Tbegin(); + + std::vector data_arr; + std::vector key_arr; + preprandstr(NUM_ENTRIES, data_arr, 5*4096, true); + preprandstr(NUM_ENTRIES+200, key_arr, 50, true);//well i can handle upto 200 + + std::sort(key_arr.begin(), key_arr.end(), &mycmp); + + removeduplicates(key_arr); + if(key_arr.size() > NUM_ENTRIES) + key_arr.erase(key_arr.begin()+NUM_ENTRIES, key_arr.end()); + + NUM_ENTRIES=key_arr.size(); + + if(data_arr.size() > NUM_ENTRIES) + data_arr.erase(data_arr.begin()+NUM_ENTRIES, data_arr.end()); + + //for(int i = 0; i < NUM_ENTRIES; i++) + //{ + // printf("%s\t", arr[i].c_str()); + // int keylen = arr[i].length()+1; + // printf("%d\n", keylen); + //} + + + + recordid alloc_state = Talloc(xid,sizeof(RegionAllocConf_t)); + + Tset(xid,alloc_state, &logtree::REGION_ALLOC_STATIC_INITIALIZER); + + + + + + + printf("Stage 1: Writing %d keys\n", NUM_ENTRIES); + + int pcount = 10; + int dpages = 0; + DataPage *dp=0; + int64_t datasize = 0; + std::vector dsp; + for(int i = 0; i < NUM_ENTRIES; i++) + { + //prepare the key + datatuple newtuple; + uint32_t keylen = key_arr[i].length()+1; + newtuple.keylen = &keylen; + + newtuple.key = (datatuple::key_t) malloc(keylen); + for(int j=0; jappend(xid, newtuple)) + { + dpages++; + if(dp) + delete dp; + + dp = new DataPage(xid, pcount, &DataPage::dp_alloc_region_rid, &alloc_state ); + + if(!dp->append(xid, 
newtuple)) + { + delete dp; + dp = new DataPage(xid, pcount, &DataPage::dp_alloc_region_rid, &alloc_state ); + assert(dp->append(xid, newtuple)); + } + + dsp.push_back(dp->get_start_pid()); + } + + + } + + printf("Total data set length: %d\n", datasize); + printf("Storage utilization: %.2f\n", (datasize+.0) / (PAGE_SIZE * pcount * dpages)); + printf("Number of datapages: %d\n", dpages); + printf("Writes complete.\n"); + + Tcommit(xid); + xid = Tbegin(); + + + printf("Stage 2: Reading %d tuples\n", NUM_ENTRIES); + + + int tuplenum = 0; + for(int i = 0; i < dpages ; i++) + { + DataPage dp(xid, dsp[i]); + DataPage::RecordIterator itr = dp.begin(); + datatuple *dt=0; + while( (dt=itr.getnext(xid)) != NULL) + { + assert(*(dt->keylen) == key_arr[tuplenum].length()+1); + assert(*(dt->datalen) == data_arr[tuplenum].length()+1); + tuplenum++; + free(dt->keylen); + free(dt); + dt = 0; + } + + } + + printf("Reads completed.\n"); +/* + + int64_t count = 0; + lladdIterator_t * it = logtreeIterator::open(xid, tree); + + while(logtreeIterator::next(xid, it)) { + byte * key; + byte **key_ptr = &key; + int keysize = logtreeIterator::key(xid, it, (byte**)key_ptr); + + pageid_t *value; + pageid_t **value_ptr = &value; + int valsize = lsmTreeIterator_value(xid, it, (byte**)value_ptr); + //printf("keylen %d key %s\n", keysize, (char*)(key)) ; + assert(valsize == sizeof(pageid_t)); + assert(!mycmp(std::string((char*)key), arr[count]) && !mycmp(arr[count],std::string((char*)key))); + assert(keysize == arr[count].length()+1); + count++; + } + assert(count == NUM_ENTRIES); + + logtreeIterator::close(xid, it); + + + */ + + + Tcommit(xid); + Tdeinit(); +} + + +/** @test + */ +int main() +{ + insertProbeIter(10000); + + + + return 0; +} + diff --git a/check_gen.cpp b/check_gen.cpp new file mode 100644 index 0000000..100d9d0 --- /dev/null +++ b/check_gen.cpp @@ -0,0 +1,39 @@ + + +#include "logstore.h" + +int main(int argc, char **argv) +{ + unlink("storefile.txt"); + unlink("logfile.txt"); + + 
sync(); + + // PAGELAYOUT::initPageLayout(); + + bufferManagerNonBlockingSlowHandleType = IO_HANDLE_PFILE; + + Tinit(); + + int xid = Tbegin(); + + logtable ltable; + + recordid table_root = ltable.allocTable(xid); + + Tcommit(xid); + + //ltable.startTable(); + +// lsmTableHandle* h = TlsmTableStart(lsmTable, INVALID_COL); + + xid = Tbegin(); + logtreeIterator::open(xid,ltable.get_tree_c2()->get_root_rec() ); + Tcommit(xid); + + + Tdeinit(); + + + +} diff --git a/check_logtable.cpp b/check_logtable.cpp new file mode 100644 index 0000000..5d01500 --- /dev/null +++ b/check_logtable.cpp @@ -0,0 +1,276 @@ + +#include +#include +#include +#include +#include "logstore.h" +#include "datapage.cpp" +#include "logiterators.cpp" +#include +#include +#include +#include +#include +#include + +#undef begin +#undef end + + + +//template class DataPage; +template class treeIterator; + +bool mycmp(const std::string & k1,const std::string & k2) +{ + //for char* ending with \0 + return strcmp(k1.c_str(),k2.c_str()) < 0; + + //for int32_t + //printf("%d\t%d\n",(*((int32_t*)k1)) ,(*((int32_t*)k2))); + //return (*((int32_t*)k1)) <= (*((int32_t*)k2)); +} + +//must be given a sorted array +void removeduplicates(std::vector &arr) +{ + + for(int i=arr.size()-1; i>0; i--) + { + if(! 
(mycmp(arr[i], arr[i-1]) || mycmp(arr[i-1], arr[i]))) + arr.erase(arr.begin()+i); + + } + +} + +void preprandstr(int count, std::vector &arr, int avg_len=50, bool duplicates_allowed=false) +{ + + for ( int j=0; jget_root_rec(); + + + std::vector data_arr; + std::vector key_arr; + preprandstr(NUM_ENTRIES, data_arr, 5*4096, true); + preprandstr(NUM_ENTRIES+200, key_arr, 50, true);//well i can handle upto 200 + + std::sort(key_arr.begin(), key_arr.end(), &mycmp); + + removeduplicates(key_arr); + if(key_arr.size() > NUM_ENTRIES) + key_arr.erase(key_arr.begin()+NUM_ENTRIES, key_arr.end()); + + NUM_ENTRIES=key_arr.size(); + + if(data_arr.size() > NUM_ENTRIES) + data_arr.erase(data_arr.begin()+NUM_ENTRIES, data_arr.end()); + + + + + printf("Stage 1: Writing %d keys\n", NUM_ENTRIES); + + + int dpages = 0; + int npages = 0; + DataPage *dp=0; + int64_t datasize = 0; + std::vector dsp; + for(int i = 0; i < NUM_ENTRIES; i++) + { + //prepare the key + datatuple newtuple; + uint32_t keylen = key_arr[i].length()+1; + newtuple.keylen = &keylen; + + newtuple.key = (datatuple::key_t) malloc(keylen); + for(int j=0; jget_start_pid()); + } + else + { + if(!dp->append(xid, newtuple)) + { + npages += dp->get_page_count(); + delete dp; + dp = ltable.insertTuple(xid, newtuple, ltable.get_dpstate1(), lt); + dpages++; + dsp.push_back(dp->get_start_pid()); + } + } + + free(newtuple.key); + free(newtuple.data); + + + } + + printf("\nTREE STRUCTURE\n"); + lt->print_tree(xid); + + printf("Total data set length: %d\n", datasize); + printf("Storage utilization: %.2f\n", (datasize+.0) / (PAGE_SIZE * npages)); + printf("Number of datapages: %d\n", dpages); + printf("Writes complete.\n"); + + Tcommit(xid); + xid = Tbegin(); + + + + + + printf("Stage 2: Sequentially reading %d tuples\n", NUM_ENTRIES); + + + int tuplenum = 0; + treeIterator tree_itr(tree_root); + + + datatuple *dt=0; + while( (dt=tree_itr.getnext()) != NULL) + { + assert(*(dt->keylen) == key_arr[tuplenum].length()+1); + 
assert(*(dt->datalen) == data_arr[tuplenum].length()+1); + tuplenum++; + free(dt->keylen); + free(dt); + dt = 0; + } + + assert(tuplenum == key_arr.size()); + + printf("Sequential Reads completed.\n"); + + int rrsize=key_arr.size() / 3; + printf("Stage 3: Randomly reading %d tuples by key\n", rrsize); + + for(int i=0; ikeylen) == key_arr[ri].length()+1); + assert(*(dt->datalen) == data_arr[ri].length()+1); + free(dt->keylen); + free(dt); + dt = 0; + } + + printf("Random Reads completed.\n"); + Tcommit(xid); + Tdeinit(); + +} + +/** @test + */ +int main() +{ + insertProbeIter(15000); + + + + return 0; +} + diff --git a/check_logtree.cpp b/check_logtree.cpp new file mode 100644 index 0000000..6e4a3c1 --- /dev/null +++ b/check_logtree.cpp @@ -0,0 +1,331 @@ + +#include +#include +#include +#include +#include "logstore.h" + +#include +#include +#include +#include +#include +#include + +#define LOG_NAME "check_logTree.log" +#define NUM_ENTRIES_A 10000 +#define NUM_ENTRIES_B 10 +#define NUM_ENTRIES_C 0 + +#define OFFSET (NUM_ENTRIES * 10) + +#undef begin +#undef end + +bool mycmp(const std::string & k1,const std::string & k2) +{ + //for char* ending with \0 + return strcmp(k1.c_str(),k2.c_str()) < 0; + + //for int32_t + //printf("%d\t%d\n",(*((int32_t*)k1)) ,(*((int32_t*)k2))); + //return (*((int32_t*)k1)) <= (*((int32_t*)k2)); +} + +void preprandstr(int count, std::vector &arr) +{ + + for ( int j=0; jget_root_rec(); + + long oldpagenum = -1; + + std::vector arr; + preprandstr(NUM_ENTRIES, arr); + std::sort(arr.begin(), arr.end(), &mycmp); + + //for(int i = 0; i < NUM_ENTRIES; i++) + //{ + // printf("%s\t", arr[i].c_str()); + // int keylen = arr[i].length()+1; + // printf("%d\n", keylen); + //} + + + printf("Stage 1: Writing %d keys\n", NUM_ENTRIES); + + + for(int i = 0; i < NUM_ENTRIES; i++) + { + int keylen = arr[i].length()+1; + byte *currkey = (byte*)malloc(keylen); + for(int j=0; jget_tree_state(); + RegionAllocConf_t alloc_conf; + Tread(xid,rid,&alloc_conf); + + 
logtree::appendPage(xid, tree, lt->lastLeaf, currkey, keylen, lt->alloc_region, &alloc_conf, i + OFFSET); + + //DEBUG("{%lld <- alloc region extend}\n", conf.regionList.page); + // XXX get rid of Tset by storing next page in memory, and losing it + // on crash. + Tset(xid,rid,&alloc_conf); + + + pagenum = logtree::findPage(xid, tree, currkey,keylen); + oldpagenum = pagenum; + //printf("pagenum:%d\n", pagenum); + assert(pagenum == i + OFFSET); + free(currkey); + + + } + + printf("Writes complete."); + + tree = lt->get_root_rec(); + Tcommit(xid); + xid = Tbegin(); + + printf("\nTREE STRUCTURE\n"); + lt->print_tree(xid); + + printf("Stage 2: Looking up %d keys\n", NUM_ENTRIES); + + for(int i = 0; i < NUM_ENTRIES; i++) { + int keylen = arr[i].length()+1; + byte *currkey = (byte*)malloc(keylen); + for(int j=0; jget_root_rec(); + + long oldpagenum = -1; + + for(int32_t i = 0; i < NUM_ENTRIES; i++) { + int keylen = sizeof(int32_t); + byte *currkey = (byte*)malloc(keylen); + memcpy(currkey, (byte*)(&i), keylen); + //currkey[]='\0'; + + printf("\n#########\ni=%d\nkey:\t%d\nkeylen:%d\n",i,*((int32_t*)currkey),keylen); + long pagenum = logtree::findPage(xid, tree, currkey, keylen); + printf("pagenum:%d\n", pagenum); + assert(pagenum == -1 || pagenum == oldpagenum || oldpagenum == -1); + printf("TlsmAppendPage %d\n",i); + + recordid rid = lt->get_tree_state(); + RegionAllocConf_t alloc_conf; + Tread(xid,rid,&alloc_conf); + + logtree::appendPage(xid, tree, lt->lastLeaf, currkey, keylen, lt->alloc_region, &alloc_conf, i + OFFSET); + + //DEBUG("{%lld <- alloc region extend}\n", conf.regionList.page); + // XXX get rid of Tset by storing next page in memory, and losing it + // on crash. 
+ Tset(xid,rid,&alloc_conf); + + + pagenum = logtree::findPage(xid, tree, currkey,keylen); + oldpagenum = pagenum; + printf("pagenum:%d\n", pagenum); + assert(pagenum == i + OFFSET); + free(currkey); + } + + printf("Writes complete."); + + tree = lt->get_root_rec(); + Tcommit(xid); + xid = Tbegin(); + + printf("\nTREE STRUCTURE\n"); + lt->print_tree(xid); + + for(int32_t i = 1; i < NUM_ENTRIES; i++) { + int keylen = sizeof(int32_t); + byte *currkey = (byte*)malloc(keylen); + memcpy(currkey, (byte*)(&i), keylen); + + printf("\n#########\ni=%d\nkey:\t%d\nkeylen:%d\n",i,*((int32_t*)currkey),keylen); + long pagenum = logtree::findPage(xid, tree, currkey, keylen); + printf("pagenum:%d\n", pagenum); + assert(pagenum == i + OFFSET); + free(currkey); + } + + /* + int64_t count = 0; + + lladdIterator_t * it = lsmTreeIterator_open(xid, tree); + + while(lsmTreeIterator_next(xid, it)) { + lsmkey_t * key; + lsmkey_t **key_ptr = &key; + int size = lsmTreeIterator_key(xid, it, (byte**)key_ptr); + assert(size == sizeof(lsmkey_t)); + long *value; + long **value_ptr = &value; + size = lsmTreeIterator_value(xid, it, (byte**)value_ptr); + assert(size == sizeof(pageid_t)); + assert(*key + OFFSET == *value); + assert(*key == count); + count++; + } + assert(count == NUM_ENTRIES); + + lsmTreeIterator_close(xid, it); + + */ + Tcommit(xid); + Tdeinit(); +} + +/** @test + */ +int main() +{ + insertProbeIter_str(NUM_ENTRIES_A); + //insertProbeIter_int(NUM_ENTRIES_A); + + + + return 0; +} + + diff --git a/check_merge.cpp b/check_merge.cpp new file mode 100644 index 0000000..79a6bee --- /dev/null +++ b/check_merge.cpp @@ -0,0 +1,246 @@ +#include +#include +#include +#include +#include "logstore.h" +#include "datapage.cpp" +#include "logiterators.cpp" +#include "merger.h" +#include +#include +#include +#include +#include +#include + +#undef begin +#undef end + + + + +bool mycmp(const std::string & k1,const std::string & k2) +{ + //for char* ending with \0 + return strcmp(k1.c_str(),k2.c_str()) < 
0; + + //for int32_t + //printf("%d\t%d\n",(*((int32_t*)k1)) ,(*((int32_t*)k2))); + //return (*((int32_t*)k1)) <= (*((int32_t*)k2)); +} + +//must be given a sorted array +void removeduplicates(std::vector *arr) +{ + + for(int i=arr->size()-1; i>0; i--) + { + if(! (mycmp((*arr)[i], (*arr)[i-1]) || mycmp((*arr)[i-1], (*arr)[i]))) + arr->erase(arr->begin()+i); + + } + +} + +void preprandstr(int count, std::vector *arr, int avg_len=50) +{ + + for ( int j=0; jpush_back(str); + + } + +} + +void insertProbeIter(int NUM_ENTRIES) +{ + srand(1000); + unlink("storefile.txt"); + unlink("logfile.txt"); + + sync(); + + //data generation + std::vector * data_arr = new std::vector; + std::vector * key_arr = new std::vector; + + preprandstr(NUM_ENTRIES, data_arr, 10*8192); + preprandstr(NUM_ENTRIES+200, key_arr, 100); + + std::sort(key_arr->begin(), key_arr->end(), &mycmp); + + removeduplicates(key_arr); + if(key_arr->size() > NUM_ENTRIES) + key_arr->erase(key_arr->begin()+NUM_ENTRIES, key_arr->end()); + + NUM_ENTRIES=key_arr->size(); + + if(data_arr->size() > NUM_ENTRIES) + data_arr->erase(data_arr->begin()+NUM_ENTRIES, data_arr->end()); + + + bufferManagerNonBlockingSlowHandleType = IO_HANDLE_PFILE; + + Tinit(); + + int xid = Tbegin(); + + merge_scheduler mscheduler; + logtable ltable; + + int pcount = 5; + ltable.set_fixed_page_count(pcount); + + recordid table_root = ltable.allocTable(xid); + + Tcommit(xid); + + xid = Tbegin(); + + int lindex = mscheduler.addlogtable(<able); + ltable.setMergeData(mscheduler.getMergeData(lindex)); + + mscheduler.startlogtable(lindex); + + printf("Stage 1: Writing %d keys\n", NUM_ENTRIES); + + struct timeval start_tv, stop_tv, ti_st, ti_end; + double insert_time = 0; + int dpages = 0; + int npages = 0; + DataPage *dp=0; + int64_t datasize = 0; + std::vector dsp; + gettimeofday(&start_tv,0); + for(int i = 0; i < NUM_ENTRIES; i++) + { + //prepare the key + datatuple newtuple; + uint32_t keylen = (*key_arr)[i].length()+1; + newtuple.keylen = 
&keylen; + + newtuple.key = (datatuple::key_t) malloc(keylen); + memcpy((byte*)newtuple.key, (*key_arr)[i].c_str(), keylen); + //for(int j=0; jprint_tree(xid); + printf("datasize: %d\n", datasize); + //sleep(20); + + Tcommit(xid); + xid = Tbegin(); + + + printf("Stage 2: Looking up %d keys:\n", NUM_ENTRIES); + + int found_tuples=0; + for(int i=NUM_ENTRIES-1; i>=0; i--) + { + int ri = i; + //printf("key index%d\n", i); + fflush(stdout); + + //get the key + uint32_t keylen = (*key_arr)[ri].length()+1; + datatuple::key_t rkey = (datatuple::key_t) malloc(keylen); + memcpy((byte*)rkey, (*key_arr)[ri].c_str(), keylen); + //for(int j=0; jkeylen) == (*key_arr)[ri].length()+1); + assert(*(dt->datalen) == (*data_arr)[ri].length()+1); + free(dt->keylen); + free(dt); + } + dt = 0; + free(rkey); + } + printf("found %d\n", found_tuples); + + key_arr->clear(); + data_arr->clear(); + delete key_arr; + delete data_arr; + + mscheduler.shutdown(); + printf("merge threads finished.\n"); + gettimeofday(&stop_tv,0); + printf("run time: %6.1f\n", (tv_to_double(stop_tv) - tv_to_double(start_tv))); + + + + Tcommit(xid); + Tdeinit(); + + +} + + + +/** @test + */ +int main() +{ + insertProbeIter(5000); + + + + return 0; +} + diff --git a/check_mergelarge.cpp b/check_mergelarge.cpp new file mode 100644 index 0000000..692b360 --- /dev/null +++ b/check_mergelarge.cpp @@ -0,0 +1,264 @@ +#include +#include +#include +#include +#include "logstore.h" +#include "datapage.cpp" +#include "logiterators.cpp" +#include "merger.h" +#include +#include +#include +#include +#include +#include + +#undef begin +#undef end + + + + +bool mycmp(const std::string & k1,const std::string & k2) +{ + //for char* ending with \0 + return strcmp(k1.c_str(),k2.c_str()) < 0; + + //for int32_t + //printf("%d\t%d\n",(*((int32_t*)k1)) ,(*((int32_t*)k2))); + //return (*((int32_t*)k1)) <= (*((int32_t*)k2)); +} + +//must be given a sorted array +void removeduplicates(std::vector *arr) +{ + + for(int i=arr->size()-1; i>0; i--) + 
{ + if(! (mycmp((*arr)[i], (*arr)[i-1]) || mycmp((*arr)[i-1], (*arr)[i]))) + arr->erase(arr->begin()+i); + + } + +} + +void getnextdata(std::string &data, int avg_len) +{ + int str_len = (rand()%(avg_len*2)) + 3; + + data = std::string(str_len, rand()%10+48); + /* + char *rc = (char*)malloc(str_len); + + for(int i=0; i *arr, int avg_len=50) +{ + + for ( int j=0; jpush_back(str); + + } + +} + +void insertProbeIter(int NUM_ENTRIES) +{ + srand(1000); + unlink("storefile.txt"); + unlink("logfile.txt"); + + sync(); + + //data generation +// std::vector * data_arr = new std::vector; + std::vector * key_arr = new std::vector; + +// preprandstr(NUM_ENTRIES, data_arr, 10*8192); + preprandstr(NUM_ENTRIES+200, key_arr, 100); + + std::sort(key_arr->begin(), key_arr->end(), &mycmp); + + removeduplicates(key_arr); + if(key_arr->size() > NUM_ENTRIES) + key_arr->erase(key_arr->begin()+NUM_ENTRIES, key_arr->end()); + + NUM_ENTRIES=key_arr->size(); + + bufferManagerNonBlockingSlowHandleType = IO_HANDLE_PFILE; + + Tinit(); + + int xid = Tbegin(); + + merge_scheduler mscheduler; + logtable ltable; + + int pcount = 100; + ltable.set_fixed_page_count(pcount); + + recordid table_root = ltable.allocTable(xid); + + Tcommit(xid); + + //xid = Tbegin(); + + int lindex = mscheduler.addlogtable(<able); + ltable.setMergeData(mscheduler.getMergeData(lindex)); + + mscheduler.startlogtable(lindex); + + printf("Stage 1: Writing %d keys\n", NUM_ENTRIES); + + struct timeval start_tv, stop_tv, ti_st, ti_end; + double insert_time = 0; + int dpages = 0; + int npages = 0; + DataPage *dp=0; + int64_t datasize = 0; + std::vector dsp; + gettimeofday(&start_tv,0); + for(int i = 0; i < NUM_ENTRIES; i++) + { + //prepare the key + datatuple newtuple; + uint32_t keylen = (*key_arr)[i].length()+1; + newtuple.keylen = &keylen; + + newtuple.key = (datatuple::key_t) malloc(keylen); + memcpy((byte*)newtuple.key, (*key_arr)[i].c_str(), keylen); + //for(int j=0; jprint_tree(xid); + printf("datasize: %lld\n", datasize); 
+ //sleep(20); + + /* + //Tcommit(xid); + xid = Tbegin(); + + + printf("Stage 2: Looking up %d keys:\n", NUM_ENTRIES); + + int found_tuples=0; + for(int i=NUM_ENTRIES-1; i>=0; i--) + { + int ri = i; + //printf("key index%d\n", i); + fflush(stdout); + + //get the key + uint32_t keylen = (*key_arr)[ri].length()+1; + datatuple::key_t rkey = (datatuple::key_t) malloc(keylen); + memcpy((byte*)rkey, (*key_arr)[ri].c_str(), keylen); + //for(int j=0; jkeylen) == (*key_arr)[ri].length()+1); + //assert(*(dt->datalen) == (*data_arr)[ri].length()+1); + free(dt->keylen); + free(dt); + } + dt = 0; + free(rkey); + } + printf("found %d\n", found_tuples); + + key_arr->clear(); + //data_arr->clear(); + delete key_arr; + //delete data_arr; + */ + + mscheduler.shutdown(); + printf("merge threads finished.\n"); + gettimeofday(&stop_tv,0); + printf("run time: %6.1f\n", (tv_to_double(stop_tv) - tv_to_double(start_tv))); + + //Tcommit(xid); + + Tdeinit(); + + +} + + + +/** @test + */ +int main() +{ + insertProbeIter(25000); + + + + return 0; +} + diff --git a/check_mergetuple.cpp b/check_mergetuple.cpp new file mode 100644 index 0000000..914515a --- /dev/null +++ b/check_mergetuple.cpp @@ -0,0 +1,409 @@ +#include +#include +#include +#include +#include "logstore.h" +#include "datapage.cpp" +#include "logiterators.cpp" +#include "merger.h" +#include +#include +#include +#include +#include +#include + +#undef begin +#undef end + + + + +bool mycmp(const std::string & k1,const std::string & k2) +{ + //for char* ending with \0 + return strcmp(k1.c_str(),k2.c_str()) < 0; + + //for int32_t + //printf("%d\t%d\n",(*((int32_t*)k1)) ,(*((int32_t*)k2))); + //return (*((int32_t*)k1)) <= (*((int32_t*)k2)); +} + +//must be given a sorted array +void removeduplicates(std::vector *arr) +{ + + for(int i=arr->size()-1; i>0; i--) + { + if(! 
(mycmp((*arr)[i], (*arr)[i-1]) || mycmp((*arr)[i-1], (*arr)[i]))) + arr->erase(arr->begin()+i); + + } + +} + +void getnextdata(std::string &data, int avg_len) +{ + int str_len = (rand()%(avg_len*2)) + 3; + + data = std::string(str_len, rand()%10+48); + /* + char *rc = (char*)malloc(str_len); + + for(int i=0; i *arr, int avg_len=50) +{ + + for ( int j=0; jpush_back(str); + + } + +} + +void insertProbeIter(int NUM_ENTRIES) +{ + srand(1000); + //unlink("storefile.txt"); + //unlink("logfile.txt"); + + sync(); + double delete_freq = .05; + double update_freq = .15; + + //data generation + typedef std::vector key_v_t; + const static int max_partition_size = 100000; + int KEY_LEN = 100; + std::vector *key_v_list = new std::vector; + int list_size = NUM_ENTRIES / max_partition_size + 1; + for(int i =0; ibegin(), key_arr->end(), &mycmp); + key_v_list->push_back(key_arr); + printf("size partition %d is %d\n", i+1, key_arr->size()); + } + + + + key_v_t * key_arr = new key_v_t; + + std::vector iters; + for(int i=0; ibegin())); + } + + int lc = 0; + while(true) + { + int list_index = -1; + for(int i=0; iend()) + continue; + + if(list_index == -1 || mycmp(**iters[i], **iters[list_index])) + list_index = i; + } + + if(list_index == -1) + break; + + if(key_arr->size() == 0 || mycmp(key_arr->back(), **iters[list_index])) + key_arr->push_back(**iters[list_index]); + + (*iters[list_index])++; + lc++; + if(lc % max_partition_size == 0) + printf("%d/%d completed.\n", lc, NUM_ENTRIES); + } + + for(int i=0; iclear(); + delete (*key_v_list)[i]; + delete iters[i]; + } + key_v_list->clear(); + delete key_v_list; + +// preprandstr(NUM_ENTRIES, data_arr, 10*8192); + + printf("key arr size: %d\n", key_arr->size()); + + //removeduplicates(key_arr); + if(key_arr->size() > NUM_ENTRIES) + key_arr->erase(key_arr->begin()+NUM_ENTRIES, key_arr->end()); + + NUM_ENTRIES=key_arr->size(); + + bufferManagerNonBlockingSlowHandleType = IO_HANDLE_PFILE; + + Tinit(); + + int xid = Tbegin(); + + 
merge_scheduler mscheduler; + logtable ltable; + + int pcount = 40; + ltable.set_fixed_page_count(pcount); + + recordid table_root = ltable.allocTable(xid); + + Tcommit(xid); + + xid = Tbegin(); + + int lindex = mscheduler.addlogtable(<able); + ltable.setMergeData(mscheduler.getMergeData(lindex)); + + mscheduler.startlogtable(lindex); + + printf("Stage 1: Writing %d keys\n", NUM_ENTRIES); + + struct timeval start_tv, stop_tv, ti_st, ti_end; + double insert_time = 0; + int dpages = 0; + int npages = 0; + int delcount = 0, upcount = 0; + DataPage *dp=0; + int64_t datasize = 0; + std::vector dsp; + std::vector del_list; + gettimeofday(&start_tv,0); + for(int i = 0; i < NUM_ENTRIES; i++) + { + //prepare the key + datatuple newtuple; + uint32_t keylen = (*key_arr)[i].length()+1; + newtuple.keylen = &keylen; + + newtuple.key = (datatuple::key_t) malloc(keylen); + memcpy((byte*)newtuple.key, (*key_arr)[i].c_str(), keylen); + //for(int j=0; j= 0 && std::find(del_list.begin(), del_list.end(), del_index) == del_list.end()) + { + delcount++; + datatuple deltuple; + keylen = (*key_arr)[del_index].length()+1; + deltuple.keylen = &keylen; + + deltuple.key = (datatuple::key_t) malloc(keylen); + memcpy((byte*)deltuple.key, (*key_arr)[del_index].c_str(), keylen); + + deltuple.datalen = &datalen; + deltuple.setDelete(); + + gettimeofday(&ti_st,0); + ltable.insertTuple(deltuple); + gettimeofday(&ti_end,0); + insert_time += tv_to_double(ti_end) - tv_to_double(ti_st); + + free(deltuple.key); + + del_list.push_back(del_index); + + } + } + else if(rval < delete_freq + update_freq) //update a record + { + int up_index = i - (rand()%50); //update one of the last inserted 50 elements + if(up_index >= 0 && std::find(del_list.begin(), del_list.end(), up_index) == del_list.end()) + {//only update non-deleted elements + upcount++; + datatuple uptuple; + keylen = (*key_arr)[up_index].length()+1; + uptuple.keylen = &keylen; + + uptuple.key = (datatuple::key_t) malloc(keylen); + 
memcpy((byte*)uptuple.key, (*key_arr)[up_index].c_str(), keylen); + + getnextdata(ditem, 512); + datalen = ditem.length()+1; + uptuple.datalen = &datalen; + uptuple.data = (datatuple::data_t) malloc(datalen); + memcpy((byte*)uptuple.data, ditem.c_str(), datalen); + + gettimeofday(&ti_st,0); + ltable.insertTuple(uptuple); + gettimeofday(&ti_end,0); + insert_time += tv_to_double(ti_end) - tv_to_double(ti_st); + + free(uptuple.key); + free(uptuple.data); + + } + + } + + } + gettimeofday(&stop_tv,0); + printf("insert time: %6.1f\n", insert_time); + printf("insert time: %6.1f\n", (tv_to_double(stop_tv) - tv_to_double(start_tv))); + printf("#deletions: %d\n#updates: %d\n", delcount, upcount); + + printf("\nTREE STRUCTURE\n"); + //ltable.get_tree_c1()->print_tree(xid); + printf("datasize: %lld\n", datasize); + //sleep(20); + + Tcommit(xid); + xid = Tbegin(); + + + + printf("Stage 2: Looking up %d keys:\n", NUM_ENTRIES); + + int found_tuples=0; + for(int i=NUM_ENTRIES-1; i>=0; i--) + { + int ri = i; + //printf("key index%d\n", i); + fflush(stdout); + + //get the key + uint32_t keylen = (*key_arr)[ri].length()+1; + datatuple::key_t rkey = (datatuple::key_t) malloc(keylen); + memcpy((byte*)rkey, (*key_arr)[ri].c_str(), keylen); + //for(int j=0; jisDelete()); + found_tuples++; + assert(*(dt->keylen) == (*key_arr)[ri].length()+1); + //assert(*(dt->datalen) == (*data_arr)[ri].length()+1); + free(dt->keylen); + free(dt); + } + else + { + if(dt!=0) + { + assert(*(dt->keylen) == (*key_arr)[ri].length()+1); + assert(dt->isDelete()); + free(dt->keylen); + free(dt); + } + } + dt = 0; + free(rkey); + } + printf("found %d\n", found_tuples); + + + + + + key_arr->clear(); + //data_arr->clear(); + delete key_arr; + //delete data_arr; + + mscheduler.shutdown(); + printf("merge threads finished.\n"); + gettimeofday(&stop_tv,0); + printf("run time: %6.1f\n", (tv_to_double(stop_tv) - tv_to_double(start_tv))); + + + + Tcommit(xid); + Tdeinit(); + + +} + + + +/** @test + */ +int main() +{ + 
//insertProbeIter(25000); + insertProbeIter(400000); + /* + insertProbeIter(5000); + insertProbeIter(2500); + insertProbeIter(1000); + insertProbeIter(500); + insertProbeIter(1000); + insertProbeIter(100); + insertProbeIter(10); + */ + + return 0; +} + diff --git a/check_rbtree.cpp b/check_rbtree.cpp new file mode 100644 index 0000000..af17780 --- /dev/null +++ b/check_rbtree.cpp @@ -0,0 +1,214 @@ +#include +#include +#include +#include +#include "logstore.h" +#include "datapage.cpp" +#include "logiterators.cpp" +#include "merger.h" +#include +#include +#include +#include +#include +#include + +#undef begin +#undef end + + + + +bool mycmp(const std::string & k1,const std::string & k2) +{ + //for char* ending with \0 + return strcmp(k1.c_str(),k2.c_str()) < 0; + + //for int32_t + //printf("%d\t%d\n",(*((int32_t*)k1)) ,(*((int32_t*)k2))); + //return (*((int32_t*)k1)) <= (*((int32_t*)k2)); +} + +//must be given a sorted array +void removeduplicates(std::vector &arr) +{ + + for(int i=arr.size()-1; i>0; i--) + { + if(! 
(mycmp(arr[i], arr[i-1]) || mycmp(arr[i-1], arr[i]))) + arr.erase(arr.begin()+i); + + } + +} + +void preprandstr(int count, std::vector &arr, int avg_len=50, bool duplicates_allowed=false) +{ + + for ( int j=0; j data_arr; + std::vector key_arr; + preprandstr(NUM_ENTRIES, data_arr, 10*8192, true); + preprandstr(NUM_ENTRIES+200, key_arr, 100, true); + + std::sort(key_arr.begin(), key_arr.end(), &mycmp); + + removeduplicates(key_arr); + if(key_arr.size() > NUM_ENTRIES) + key_arr.erase(key_arr.begin()+NUM_ENTRIES, key_arr.end()); + + NUM_ENTRIES=key_arr.size(); + + if(data_arr.size() > NUM_ENTRIES) + data_arr.erase(data_arr.begin()+NUM_ENTRIES, data_arr.end()); + + std::set rbtree; + int64_t datasize = 0; + std::vector dsp; + for(int i = 0; i < NUM_ENTRIES; i++) + { + //prepare the key + datatuple newtuple; + uint32_t keylen = key_arr[i].length()+1; + newtuple.keylen = (uint32_t*)malloc(sizeof(uint32_t)); + *newtuple.keylen = keylen; + + newtuple.key = (datatuple::key_t) malloc(keylen); + for(int j=0; jprint_tree(xid); + printf("datasize: %d\n", datasize); + + printf("Stage 2: Looking up %d keys:\n", NUM_ENTRIES); + + int found_tuples=0; + for(int i=NUM_ENTRIES-1; i>=0; i--) + { + int ri = i; + + //get the key + uint32_t keylen = key_arr[ri].length()+1; + datatuple::key_t rkey = (datatuple::key_t) malloc(keylen); + for(int j=0; jkeylen) == key_arr[ri].length()+1); + assert(*(ret_tuple->datalen) == data_arr[ri].length()+1); + free(barr); + free(ret_tuple); + } + else + { + printf("Not in scratch_tree\n"); + } + + free(search_tuple.keylen); + free(rkey); + } + printf("found %d\n", found_tuples); +} + + + +/** @test + */ +int main() +{ + insertProbeIter(250); + + + + return 0; +} + diff --git a/check_server.cpp b/check_server.cpp new file mode 100644 index 0000000..60af0cf --- /dev/null +++ b/check_server.cpp @@ -0,0 +1,107 @@ +#include +#include +#include +#include +#include "logstore.h" +#include "datapage.cpp" +#include "logiterators.cpp" +#include "merger.h" 
+#include +#include +#include +#include +#include +#include + +#include + +#undef begin +#undef end + +logserver *lserver=0; +merge_scheduler *mscheduler=0; + +void terminate (int param) +{ + printf ("Stopping server...\n"); + lserver->stopserver(); + delete lserver; + + printf("Stopping merge threads...\n"); + mscheduler->shutdown(); + delete mscheduler; + + printf("Deinitializing stasis...\n"); + fflush(stdout); + Tdeinit(); + + exit(0); +} + +void insertProbeIter(int NUM_ENTRIES) +{ + //signal handling + void (*prev_fn)(int); + + prev_fn = signal (SIGINT,terminate); + //if (prev_fn==SIG_IGN) + //signal (SIGTERM,SIG_IGN); + + + sync(); + + bufferManagerNonBlockingSlowHandleType = IO_HANDLE_PFILE; + + Tinit(); + + int xid = Tbegin(); + + mscheduler = new merge_scheduler; + logtable ltable; + + + + int pcount = 40; + ltable.set_fixed_page_count(pcount); + + recordid table_root = ltable.allocTable(xid); + + Tcommit(xid); + + int lindex = mscheduler->addlogtable(<able); + ltable.setMergeData(mscheduler->getMergeData(lindex)); + + mscheduler->startlogtable(lindex); + + + lserver = new logserver(10, 32432); + + lserver->startserver(<able); + + +// Tdeinit(); + + +} + + + +/** @test + */ +int main() +{ + //insertProbeIter(25000); + insertProbeIter(10000); + /* + insertProbeIter(5000); + insertProbeIter(2500); + insertProbeIter(1000); + insertProbeIter(500); + insertProbeIter(1000); + insertProbeIter(100); + insertProbeIter(10); + */ + + return 0; +} + diff --git a/check_tcpclient.cpp b/check_tcpclient.cpp new file mode 100644 index 0000000..a505e52 --- /dev/null +++ b/check_tcpclient.cpp @@ -0,0 +1,415 @@ +#include +#include +#include +#include +#include "logstore.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#undef begin +#undef end + + + + +bool mycmp(const std::string & k1,const std::string & k2) +{ + //for char* ending with \0 + return strcmp(k1.c_str(),k2.c_str()) < 0; + + //for int32_t + 
//printf("%d\t%d\n",(*((int32_t*)k1)) ,(*((int32_t*)k2))); + //return (*((int32_t*)k1)) <= (*((int32_t*)k2)); +} + +//must be given a sorted array +void removeduplicates(std::vector *arr) +{ + + for(int i=arr->size()-1; i>0; i--) + { + if(! (mycmp((*arr)[i], (*arr)[i-1]) || mycmp((*arr)[i-1], (*arr)[i]))) + arr->erase(arr->begin()+i); + + } + +} + +void getnextdata(std::string &data, int avg_len) +{ + int str_len = (rand()%(avg_len*2)) + 3; + + data = std::string(str_len, rand()%10+48); + /* + char *rc = (char*)malloc(str_len); + + for(int i=0; i *arr, int avg_len=50) +{ + + for ( int j=0; jpush_back(str); + + } + +} + +inline void readfromsocket(int sockd, byte *buf, int count) +{ + + int n = 0; + while( n < count ) + { + n += read( sockd, buf + n, count - n); + } + +} + +inline void writetosocket(int sockd, byte *buf, int count) +{ + int n = 0; + while( n < count ) + { + n += write( sockd, buf + n, count - n); + } +} + +datatuple * sendTuple(std::string & servername, int serverport, uint8_t opcode, datatuple &tuple) +{ + struct sockaddr_in serveraddr; + struct hostent *server; + + int sockfd = socket(AF_INET, SOCK_STREAM, 0); + + if (sockfd < 0) + { + printf("ERROR opening socket.\n"); + return 0; + } + + server = gethostbyname(servername.c_str()); + if (server == NULL) { + fprintf(stderr,"ERROR, no such host as %s\n", servername.c_str()); + exit(0); + } + + /* build the server's Internet address */ + bzero((char *) &serveraddr, sizeof(serveraddr)); + serveraddr.sin_family = AF_INET; + bcopy((char *)server->h_addr, + (char *)&serveraddr.sin_addr.s_addr, server->h_length); + serveraddr.sin_port = htons(serverport); + + /* connect: create a connection with the server */ + if (connect(sockfd, (sockaddr*) &serveraddr, sizeof(serveraddr)) < 0) + { + printf("ERROR connecting\n"); + return 0; + } + + + //send the opcode + int n = write(sockfd, (byte*) &opcode, sizeof(uint8_t)); + assert(n == sizeof(uint8_t)); + + //send the tuple + n = write(sockfd, (byte*) 
tuple.keylen, sizeof(uint32_t)); + assert( n == sizeof(uint32_t)); + + n = write(sockfd, (byte*) tuple.datalen, sizeof(uint32_t)); + assert( n == sizeof(uint32_t)); + + writetosocket(sockfd, (byte*) tuple.key, *tuple.keylen); + if(!tuple.isDelete() && *tuple.datalen != 0) + writetosocket(sockfd, (byte*) tuple.data, *tuple.datalen); + + //read the reply code + uint8_t rcode; + n = read(sockfd, (byte*) &rcode, sizeof(uint8_t)); + + if(rcode == logserver::OP_SENDING_TUPLE) + { + datatuple *rcvdtuple = (datatuple*)malloc(sizeof(datatuple)); + //read the keylen + rcvdtuple->keylen = (uint32_t*) malloc(sizeof(uint32_t)); + n = read(sockfd, (byte*) rcvdtuple->keylen, sizeof(uint32_t)); + assert(n == sizeof(uint32_t)); + //read the datalen + rcvdtuple->datalen = (uint32_t*) malloc(sizeof(uint32_t)); + n = read(sockfd, (byte*) rcvdtuple->datalen, sizeof(uint32_t)); + assert(n == sizeof(uint32_t)); + //read key + rcvdtuple->key = (byte*) malloc(*rcvdtuple->keylen); + readfromsocket(sockfd, (byte*) rcvdtuple->key, *rcvdtuple->keylen); + if(!rcvdtuple->isDelete()) + { + //read key + rcvdtuple->data = (byte*) malloc(*rcvdtuple->datalen); + readfromsocket(sockfd, (byte*) rcvdtuple->data, *rcvdtuple->datalen); + } + + close(sockfd); + return rcvdtuple; + } + else + assert(rcode == logserver::OP_SUCCESS); + + close(sockfd); + return 0; +} + + +void insertProbeIter(int NUM_ENTRIES) +{ + srand(1000); + std::string servername = "sherpa4"; + int serverport = 32432; + + double delete_freq = .05; + double update_freq = .15; + + //data generation + typedef std::vector key_v_t; + const static int max_partition_size = 100000; + int KEY_LEN = 100; + std::vector *key_v_list = new std::vector; + int list_size = NUM_ENTRIES / max_partition_size + 1; + for(int i =0; ibegin(), key_arr->end(), &mycmp); + key_v_list->push_back(key_arr); + printf("size partition %d is %d\n", i+1, key_arr->size()); + } + + + + key_v_t * key_arr = new key_v_t; + + std::vector iters; + for(int i=0; ibegin())); + } + + 
int lc = 0; + while(true) + { + int list_index = -1; + for(int i=0; iend()) + continue; + + if(list_index == -1 || mycmp(**iters[i], **iters[list_index])) + list_index = i; + } + + if(list_index == -1) + break; + + if(key_arr->size() == 0 || mycmp(key_arr->back(), **iters[list_index])) + key_arr->push_back(**iters[list_index]); + + (*iters[list_index])++; + lc++; + if(lc % max_partition_size == 0) + printf("%d/%d completed.\n", lc, NUM_ENTRIES); + } + + for(int i=0; iclear(); + delete (*key_v_list)[i]; + delete iters[i]; + } + key_v_list->clear(); + delete key_v_list; + +// preprandstr(NUM_ENTRIES, data_arr, 10*8192); + + printf("key arr size: %d\n", key_arr->size()); + + //removeduplicates(key_arr); + if(key_arr->size() > NUM_ENTRIES) + key_arr->erase(key_arr->begin()+NUM_ENTRIES, key_arr->end()); + + NUM_ENTRIES=key_arr->size(); + + printf("Stage 1: Writing %d keys\n", NUM_ENTRIES); + + struct timeval start_tv, stop_tv, ti_st, ti_end; + double insert_time = 0; + int dpages = 0; + int npages = 0; + int delcount = 0, upcount = 0; + int64_t datasize = 0; + std::vector dsp; + std::vector del_list; + gettimeofday(&start_tv,0); + for(int i = 0; i < NUM_ENTRIES; i++) + { + //prepare the key + datatuple newtuple; + uint32_t keylen = (*key_arr)[i].length()+1; + newtuple.keylen = &keylen; + + newtuple.key = (datatuple::key_t) malloc(keylen); + memcpy((byte*)newtuple.key, (*key_arr)[i].c_str(), keylen); + + //prepare the data + std::string ditem; + getnextdata(ditem, 8192); + uint32_t datalen = ditem.length()+1; + newtuple.datalen = &datalen; + newtuple.data = (datatuple::data_t) malloc(datalen); + memcpy((byte*)newtuple.data, ditem.c_str(), datalen); + + /* + printf("key: \t, keylen: %u\ndata: datalen: %u\n", + //newtuple.key, + *newtuple.keylen, + //newtuple.data, + *newtuple.datalen); + */ + + datasize += newtuple.byte_length(); + + gettimeofday(&ti_st,0); + + //send the data + sendTuple(servername, serverport, logserver::OP_INSERT, newtuple); + + 
gettimeofday(&ti_end,0); + insert_time += tv_to_double(ti_end) - tv_to_double(ti_st); + + free(newtuple.key); + free(newtuple.data); + + if(i % 10000 == 0 && i > 0) + printf("%d / %d inserted.\n", i, NUM_ENTRIES); + + } + gettimeofday(&stop_tv,0); + printf("insert time: %6.1f\n", insert_time); + printf("insert time: %6.1f\n", (tv_to_double(stop_tv) - tv_to_double(start_tv))); + printf("#deletions: %d\n#updates: %d\n", delcount, upcount); + + + + printf("Stage 2: Looking up %d keys:\n", NUM_ENTRIES); + + int found_tuples=0; + for(int i=NUM_ENTRIES-1; i>=0; i--) + { + int ri = i; + //printf("key index%d\n", i); + fflush(stdout); + + //get the key + uint32_t keylen = (*key_arr)[ri].length()+1; + datatuple searchtuple; + searchtuple.keylen = (uint32_t*)malloc(2*sizeof(uint32_t) + keylen); + *searchtuple.keylen = keylen; + + searchtuple.datalen = searchtuple.keylen + 1; + *searchtuple.datalen = 0; + + searchtuple.key = (datatuple::key_t)(searchtuple.keylen + 2); + memcpy((byte*)searchtuple.key, (*key_arr)[ri].c_str(), keylen); + + //find the key with the given tuple + datatuple *dt = sendTuple(servername, serverport, logserver::OP_FIND, + searchtuple); + + assert(dt!=0); + assert(!dt->isDelete()); + found_tuples++; + assert(*(dt->keylen) == (*key_arr)[ri].length()+1); + + //free dt + free(dt->keylen); + free(dt->datalen); + free(dt->key); + free(dt->data); + free(dt); + + dt = 0; + + free(searchtuple.keylen); + + } + printf("found %d\n", found_tuples); + + + + + + key_arr->clear(); + //data_arr->clear(); + delete key_arr; + //delete data_arr; + + gettimeofday(&stop_tv,0); + printf("run time: %6.1f\n", (tv_to_double(stop_tv) - tv_to_double(start_tv))); + +} + + + +/** @test + */ +int main() +{ + //insertProbeIter(25000); + insertProbeIter(100000); + /* + insertProbeIter(5000); + insertProbeIter(2500); + insertProbeIter(1000); + insertProbeIter(500); + insertProbeIter(1000); + insertProbeIter(100); + insertProbeIter(10); + */ + + return 0; +} + diff --git a/cmds.txt 
b/cmds.txt new file mode 100644 index 0000000..5b24608 --- /dev/null +++ b/cmds.txt @@ -0,0 +1,9 @@ + dd if=/dev/zero of=storefile.txt bs=1M count=20000 + + +/dhtRecOpsGenerator -d clientType=LogStoreClient host=sherpa4 numOps=10ls existingStartKey=100 existingEndKey=1000 insertRatio=1.0 + + + + +dhtRecOpsGeneratorWrapper startClientID=1 endClientID=4 -d clientType=LogStoreClient host=sherpa4.corp.re1.yahoo.com numOps=5000000 existingStartKey=100 existingEndKey=10000000 insertRatio=1.0 readRatio=0 numClients=3 diff --git a/datapage.cpp b/datapage.cpp new file mode 100644 index 0000000..b931e10 --- /dev/null +++ b/datapage.cpp @@ -0,0 +1,507 @@ + +#include "logstore.h" +#include "datapage.h" + +template +const int32_t DataPage::HEADER_SIZE = sizeof(int32_t); + +template +DataPage::DataPage(int xid, pageid_t pid): + alloc_region(0), + alloc_state(0), + fix_pcount(-1) +{ + assert(pid!=0); + + pcount = readPageCount(xid, pid); + + pidarr = (pageid_t *) malloc(sizeof(pageid_t) * pcount); + + for(int i=0; i +DataPage::DataPage(int xid, int fix_pcount, pageid_t (*alloc_region)(int, void*), void * alloc_state) +{ + assert(fix_pcount >= 1); + byte_offset = -1; + + this->fix_pcount = fix_pcount; + + if(alloc_region != 0) + this->alloc_region = alloc_region; + if(alloc_state != 0) + this->alloc_state = alloc_state; + + initialize(xid); +} + +template +DataPage::~DataPage() +{ + if(pidarr) + free(pidarr); +} + + +template +void DataPage::initialize(int xid) +{ + //initializes to an empty datapage + //alloc a new page + pageid_t pid = alloc_region(xid, alloc_state); + + //load the first page + //Page *p = loadPage(xid, pid); + Page *p = loadPageOfType(xid, pid, SEGMENT_PAGE); + writelock(p->rwlatch,0); + + //initialize header + + //set number of pages to 1 + int32_t * numpages_ptr = (int32_t*)stasis_page_byte_ptr_from_start(p, 0); + *numpages_ptr = 1; + + //write 0 to first data size + int32_t * size_ptr = (int32_t*)stasis_page_byte_ptr_from_start(p, HEADER_SIZE); + *size_ptr = 
0; + + //set the page dirty + stasis_dirty_page_table_set_dirty((stasis_dirty_page_table_t*)stasis_runtime_dirty_page_table(), p); + + //release the page + unlock(p->rwlatch); + releasePage(p); + + //set the class variables + byte_offset = HEADER_SIZE; + pcount = 1; + pidarr = (pageid_t *) malloc(fix_pcount * sizeof(pageid_t)); + pidarr[0] = pid; + +} + +template +inline bool DataPage::append(int xid, TUPLE const & dat) +{ + assert(byte_offset >= HEADER_SIZE); + assert(fix_pcount >= 1); + + //check if there is enough space (for the data length + data) + int32_t blen = dat.byte_length() + sizeof(int32_t); + if(PAGE_SIZE * fix_pcount - byte_offset < blen) + { + //check if the record is too large + // and if so do we wanna accomodate here by going over the fix_pcount + if(PAGE_SIZE * fix_pcount - HEADER_SIZE < blen && //record is larger than datapage + PAGE_SIZE * fix_pcount - HEADER_SIZE > 2 * byte_offset)//accept if i am less than half full + { + //nothing + } + else + { + //printf("page has %d bytes left, we needed %d. 
(byte_offset %d)\n", + //PAGE_SIZE * fix_pcount - byte_offset, blen, byte_offset); + return false; //not enough mana, return + } + } + + //write the length of the data + int32_t dsize = blen - sizeof(int32_t); + + if(!writebytes(xid, sizeof(int32_t), (byte*)(&dsize))) + return false; + byte_offset += sizeof(int32_t); + + //write the data + byte * barr = dat.to_bytes(); + if(!writebytes(xid, dsize, barr)) //if write fails, undo the previous write + { + byte_offset -= sizeof(int32_t); + free(barr); + //write 0 for the next tuple size, if there is enough space in this page + if(PAGE_SIZE - (byte_offset % PAGE_SIZE) >= sizeof(int32_t)) + { + dsize = 0; + writebytes(xid, sizeof(int32_t), (byte*)(&dsize));//this will succeed, since there is enough space on the page + } + return false; + } + free(barr); + byte_offset += dsize; + + //write 0 for the next tuple size, if there is enough space in this page + if(PAGE_SIZE - (byte_offset % PAGE_SIZE) >= sizeof(int32_t)) + { + dsize = 0; + writebytes(xid, sizeof(int32_t), (byte*)(&dsize));//this will succeed, since there is enough space on the page + } + + return true; +} + +template +bool DataPage::writebytes(int xid, int count, byte *data) +{ + + int32_t bytes_copied = 0; + while(bytes_copied < count) + { + //load the page to copy into + int pindex = (byte_offset + bytes_copied) / PAGE_SIZE; + if(pindex == pcount) //then this page must be allocated + { + pageid_t newid = alloc_region(xid, alloc_state); + //check continuity + if(pidarr[pindex-1] != newid - 1)//so we started a new region and that is not right after the prev region in the file + { + return false;//we cant store this + } + + //check whether we need to extend the pidarr, add fix_pcount many pageid_t slots + if(pindex >= fix_pcount && (pindex % fix_pcount==0)) + { + pidarr = (pageid_t*)realloc(pidarr, (pindex + fix_pcount)*sizeof(pageid_t)); + } + pidarr[pindex] = newid; + pcount++; + incrementPageCount(xid, pidarr[0]); + } + //Page *p = loadPage(xid, 
pidarr[pindex]); + Page *p = loadPageOfType(xid, pidarr[pindex], SEGMENT_PAGE); + writelock(p->rwlatch,0); + + //copy the portion of bytes we can copy in this page + int32_t page_offset = (byte_offset+bytes_copied) % PAGE_SIZE; + int32_t copy_len = ( (PAGE_SIZE - page_offset < count - bytes_copied ) ? PAGE_SIZE - page_offset: count - bytes_copied); + + byte * pb_ptr = stasis_page_byte_ptr_from_start(p, page_offset); + memcpy(pb_ptr, data+bytes_copied ,copy_len); + + //release the page + stasis_dirty_page_table_set_dirty((stasis_dirty_page_table_t*)stasis_runtime_dirty_page_table(), p); + unlock(p->rwlatch); + releasePage(p); + + //update the copied bytes_count + bytes_copied += copy_len; + + + } + + assert(bytes_copied == count); + return true; +} + +template +bool DataPage::recordRead(int xid, typename TUPLE::key_t key, size_t keySize, TUPLE ** buf) +{ + RecordIterator itr(this); + + int match = -1; + while((*buf=itr.getnext(xid)) != 0) + { + match = TUPLE::compare((*buf)->get_key(), key); + + if(match<0) //keep searching + { + free((*buf)->keylen); + free(*buf); + *buf=0; + } + else if(match==0) //found + { + return true; + } + else // match > 0, then does not exist + { + free((*buf)->keylen); + free(*buf); + *buf = 0; + break; + } + } + + return false; +} + +template +void DataPage::readbytes(int xid, int32_t offset, int count, byte **data) +{ + + if(*data==NULL) + *data = (byte*)malloc(count); + + int32_t bytes_copied = 0; + while(bytes_copied < count) + { + //load the page to copy from + int pindex = (offset + bytes_copied) / PAGE_SIZE; + + //Page *p = loadPage(xid, pidarr[pindex]); + Page *p = loadPageOfType(xid, pidarr[pindex], SEGMENT_PAGE); + readlock(p->rwlatch,0); + + //copy the portion of bytes we can copy from this page + int32_t page_offset = (offset+bytes_copied) % PAGE_SIZE; + int32_t copy_len = ( (PAGE_SIZE - page_offset < count - bytes_copied ) ? 
PAGE_SIZE - page_offset : count - bytes_copied); + + byte * pb_ptr = stasis_page_byte_ptr_from_start(p, page_offset); + memcpy((*data)+bytes_copied, pb_ptr, copy_len); + + //release the page + unlock(p->rwlatch); + releasePage(p); + + //update the copied bytes_count + bytes_copied += copy_len; + } + + assert(bytes_copied == count); +} + + +template +inline int DataPage::readPageCount(int xid, pageid_t pid) +{ + + //Page *p = loadPage(xid, pid); + Page *p = loadPageOfType(xid, pid, SEGMENT_PAGE); + readlock(p->rwlatch,0); + + int32_t numpages = *((int32_t*)stasis_page_byte_ptr_from_start(p, 0)); + + unlock(p->rwlatch); + releasePage(p); + + return numpages; +} + +template +inline void DataPage::incrementPageCount(int xid, pageid_t pid, int add) +{ + //Page *p = loadPage(xid, pid); + Page *p = loadPageOfType(xid, pid, SEGMENT_PAGE); + writelock(p->rwlatch,0); + + int32_t *numpages_ptr = (int32_t*)stasis_page_byte_ptr_from_start(p, 0); + + *numpages_ptr = *numpages_ptr + add; + + stasis_dirty_page_table_set_dirty((stasis_dirty_page_table_t*)stasis_runtime_dirty_page_table(), p); + + unlock(p->rwlatch); + releasePage(p); + + + +} + + +template +inline uint16_t DataPage::recordCount(int xid) +{ + + return 0; +} + +template +pageid_t DataPage::dp_alloc_region(int xid, void *conf) +{ + RegionAllocConf_t* a = (RegionAllocConf_t*)conf; + + + if(a->nextPage == a->endOfRegion) { + if(a->regionList.size == -1) { + //DEBUG("nextPage: %lld\n", a->nextPage); + a->regionList = TarrayListAlloc(xid, 1, 4, sizeof(pageid_t)); + DEBUG("regionList.page: %lld\n", a->regionList.page); + DEBUG("regionList.slot: %d\n", a->regionList.slot); + DEBUG("regionList.size: %lld\n", a->regionList.size); + + a->regionCount = 0; + } + DEBUG("{%lld <- alloc region arraylist}\n", a->regionList.page); + TarrayListExtend(xid,a->regionList,1); + a->regionList.slot = a->regionCount; + DEBUG("region lst slot %d\n",a->regionList.slot); + a->regionCount++; + DEBUG("region count %lld\n",a->regionCount); + 
a->nextPage = TregionAlloc(xid, a->regionSize,12); + DEBUG("next page %lld\n",a->nextPage); + a->endOfRegion = a->nextPage + a->regionSize; + Tset(xid,a->regionList,&a->nextPage); + DEBUG("next page %lld\n",a->nextPage); + } + + DEBUG("%lld ?= %lld\n", a->nextPage,a->endOfRegion); + pageid_t ret = a->nextPage; + // Ensure the page is in buffer cache without accessing disk (this + // sets it to clean and all zeros if the page is not in cache). + // Hopefully, future reads will get a cache hit, and avoid going to + // disk. + + Page * p = loadUninitializedPage(xid, ret); + //writelock(p->rwlatch,0); + p->pageType = SEGMENT_PAGE; + //unlock(p->rwlatch); + releasePage(p); + DEBUG("ret %lld\n",ret); + (a->nextPage)++; + return ret; + +} + +template +pageid_t DataPage::dp_alloc_region_rid(int xid, void * ridp) { + recordid rid = *(recordid*)ridp; + RegionAllocConf_t conf; + Tread(xid,rid,&conf); + pageid_t ret = dp_alloc_region(xid,&conf); + //DEBUG("{%lld <- alloc region extend}\n", conf.regionList.page); + // XXX get rid of Tset by storing next page in memory, and losing it + // on crash. 
+ Tset(xid,rid,&conf); + return ret; +} + +template +void DataPage::dealloc_region_rid(int xid, void *conf) +{ + RegionAllocConf_t a = *((RegionAllocConf_t*)conf); + DEBUG("{%lld <- dealloc region arraylist}\n", a.regionList.page); + + for(int i = 0; i < a.regionCount; i++) { + a.regionList.slot = i; + pageid_t pid; + Tread(xid,a.regionList,&pid); + TregionDealloc(xid,pid); + } +} + +template +void DataPage::force_region_rid(int xid, void *conf) +{ + recordid rid = *(recordid*)conf; + RegionAllocConf_t a; + Tread(xid,rid,&a); + + for(int i = 0; i < a.regionCount; i++) + { + a.regionList.slot = i; + pageid_t pid; + Tread(xid,a.regionList,&pid); + stasis_dirty_page_table_flush_range((stasis_dirty_page_table_t*)stasis_runtime_dirty_page_table(), pid, pid+a.regionSize); + forcePageRange(pid, pid+a.regionSize); + } +} + + +/////////////////////////////////////////////////////////////// +//RECORD ITERATOR +/////////////////////////////////////////////////////////////// + + +template +TUPLE* DataPage::RecordIterator::getnext(int xid) +{ + + + int pindex = offset / PAGE_SIZE; + + if(pindex == dp->pcount)//past end + return 0; + if(pindex == dp->pcount - 1 && (PAGE_SIZE - (offset % PAGE_SIZE) < sizeof(int32_t))) + return 0; + + //Page *p = loadPage(xid, dp->pidarr[pindex]); + Page *p = loadPageOfType(xid, dp->pidarr[pindex], SEGMENT_PAGE); + readlock(p->rwlatch,0); + + int32_t *dsize_ptr; + if(PAGE_SIZE - (offset % PAGE_SIZE) < sizeof(int32_t)) //int spread in two pages + { + dsize_ptr = 0; + dp->readbytes(xid, offset, sizeof(int32_t), (byte**)(&dsize_ptr)); + } + else //int in a single page + dsize_ptr = (int32_t*)stasis_page_byte_ptr_from_start(p, offset % PAGE_SIZE); + + offset += sizeof(int32_t); + + if(*dsize_ptr == 0) //no more keys + { + unlock(p->rwlatch); + releasePage(p); + return 0; + } + + byte* tb=0; + dp->readbytes(xid, offset, *dsize_ptr, &tb); + + TUPLE *tup = TUPLE::from_bytes(tb); + + offset += *dsize_ptr; + + unlock(p->rwlatch); + releasePage(p); + + 
return tup; +} + + + +template +void DataPage::RecordIterator::advance(int xid, int count) +{ + + int pindex = -1; + Page *p = 0; + + for(int i=0; irwlatch); + releasePage(p); + } + + pindex = offset / PAGE_SIZE; + + if(pindex == dp->pcount)//past end + return; + + //p = loadPage(xid, dp->pidarr[pindex]); + p = loadPageOfType(xid, dp->pidarr[pindex], SEGMENT_PAGE); + readlock(p->rwlatch,0); + } + + if(pindex == dp->pcount - 1 && (PAGE_SIZE - (offset % PAGE_SIZE) < sizeof(int32_t))) + return; + + int32_t *dsize_ptr=0; + if(PAGE_SIZE - (offset % PAGE_SIZE) < sizeof(int32_t)) //int spread in two pages + dp->readbytes(xid, offset, sizeof(int32_t), (byte**)(&dsize_ptr)); + else //int in a single page + dsize_ptr = (int32_t*)stasis_page_byte_ptr_from_start(p, offset % PAGE_SIZE); + + offset += sizeof(int32_t); + + if(*dsize_ptr == 0) //no more keys + { + unlock(p->rwlatch); + releasePage(p); + return; + } + + offset += *dsize_ptr; + + } + +} diff --git a/datapage.h b/datapage.h new file mode 100644 index 0000000..f26f454 --- /dev/null +++ b/datapage.h @@ -0,0 +1,110 @@ +#ifndef _SIMPLE_DATA_PAGE_H_ +#define _SIMPLE_DATA_PAGE_H_ + +#include + +#include +#include + + + +template +class DataPage +{ +public: + + class RecordIterator + { + public: + RecordIterator(DataPage *dp) + { + offset = HEADER_SIZE; + this->dp = dp; + } + + RecordIterator(const RecordIterator &rhs) + { + this->offset = rhs.offset; + this->dp = rhs.dp; + } + + void operator=(const RecordIterator &rhs) + { + this->offset = rhs.offset; + this->dp = rhs.dp; + } + + + //returns the next tuple and also advances the iterator + TUPLE *getnext(int xid); + + //advance the iterator by count tuples, i.e. 
skip over count tuples + void advance(int xid, int count=1); + + + int32_t offset ; + DataPage *dp; + + + }; + + +public: + + //to be used when reading an existing data page from disk + DataPage( int xid, pageid_t pid ); + + //to be used to create new data pages + DataPage( int xid, int fix_pcount, pageid_t (*alloc_region)(int, void*), void * alloc_state); + + ~DataPage(); + + inline bool append(int xid, TUPLE const & dat); + bool recordRead(int xid, typename TUPLE::key_t key, size_t keySize, TUPLE ** buf); + + inline uint16_t recordCount(int xid); + + + RecordIterator begin(){return RecordIterator(this);} + + pageid_t get_start_pid(){return pidarr[0];} + int get_page_count(){return pcount;} + + static pageid_t dp_alloc_region(int xid, void *conf); + + static pageid_t dp_alloc_region_rid(int xid, void * ridp); + + static void dealloc_region_rid(int xid, void* conf); + + static void force_region_rid(int xid, void *conf); + +public: + +private: + + void initialize(int xid); + + //reads the page count information from the first page + int readPageCount(int xid, pageid_t pid); + void incrementPageCount(int xid, pageid_t pid, int add=1); + + bool writebytes(int xid, int count, byte *data); + inline void readbytes(int xid, int32_t offset, int count, byte **data=0); + +private: + int fix_pcount; //number of pages in a standard data page + int pcount; + pageid_t *pidarr; + int32_t byte_offset;//points to the next free byte + + + //page alloc function + pageid_t (*alloc_region)(int, void*); + void *alloc_state; + + static const int32_t HEADER_SIZE; + + +}; + +#endif diff --git a/datatuple.h b/datatuple.h new file mode 100644 index 0000000..0e1e4ce --- /dev/null +++ b/datatuple.h @@ -0,0 +1,147 @@ +#ifndef _DATATUPLE_H_ +#define _DATATUPLE_H_ + + +typedef unsigned char uchar; + +#include + +//#define byte unsigned char +typedef unsigned char byte; +#include + +//#include +//#include +//#include + +typedef struct datatuple +{ + typedef uchar* key_t; + typedef uchar* data_t; + 
uint32_t *keylen; //key length should be size of string + 1 for \n + uint32_t *datalen; + key_t key; + data_t data; + + //this is used by the stl set + bool operator() (const datatuple& lhs, const datatuple& rhs) const + { + //std::basic_string s1(lhs.key); + //std::basic_string s2(rhs.key); + return strcmp((char*)lhs.key,(char*)rhs.key) < 0; + //return (*((int32_t*)lhs.key)) <= (*((int32_t*)rhs.key)); + } + + /** + * return -1 if k1 < k2 + * 0 if k1 == k2 + * 1 of k1 > k2 + **/ + static int compare(const key_t k1,const key_t k2) + { + //for char* ending with \0 + return strcmp((char*)k1,(char*)k2); + + //for int32_t + //printf("%d\t%d\n",(*((int32_t*)k1)) ,(*((int32_t*)k2))); + //return (*((int32_t*)k1)) <= (*((int32_t*)k2)); + } + + void setDelete() + { + *datalen = UINT_MAX; + } + + inline bool isDelete() const + { + return *datalen == UINT_MAX; + } + + static std::string key_to_str(const byte* k) + { + //for strings + return std::string((char*)k); + //for int + /* + std::ostringstream ostr; + ostr << *((int32_t*)k); + return ostr.str(); + */ + } + + //returns the length of the byte array representation + int32_t byte_length() const{ + static const size_t isize = sizeof(uint32_t); + if(isDelete()) + return isize + *keylen + isize; + else + return isize + *keylen + isize + (*datalen); + } + + //format: key length _ data length _ key _ data + byte * to_bytes() const { + static const size_t isize = sizeof(uint32_t); + byte * ret; + if(!isDelete()) + ret = (byte*) malloc(isize + *keylen + isize + *datalen); + else + ret = (byte*) malloc(isize + *keylen + isize); + + memcpy(ret, (byte*)(keylen), isize); + memcpy(ret+isize, (byte*)(datalen), isize); + memcpy(ret+isize+isize, key, *keylen); + if(!isDelete()) + memcpy(ret+isize+isize+*keylen, data, *datalen); + return ret; + } + + //does not copy the data again + //just sets the pointers in the datatuple to + //right positions in the given arr + + static datatuple* from_bytes(const byte * arr) + { + static const size_t 
isize = sizeof(uint32_t); + datatuple *dt = (datatuple*) malloc(sizeof(datatuple)); + + dt->keylen = (uint32_t*) arr; + dt->datalen = (uint32_t*) (arr+isize); + dt->key = (key_t) (arr+isize+isize); + if(!dt->isDelete()) + dt->data = (data_t) (arr+isize+isize+ *(dt->keylen)); + else + dt->data = 0; + + return dt; + } + /* + static datatuple form_tuple(const byte * arr) + { + static const size_t isize = sizeof(uint32_t); + datatuple dt; + + dt.keylen = (uint32_t*) arr; + dt.datalen = (uint32_t*) (arr+isize); + dt.key = (key_t) (arr+isize+isize); + if(!dt.isDelete()) + dt.data = (data_t) (arr+isize+isize+ *(dt.keylen)); + else + dt.data = 0; + + return dt; + } + */ + + byte * get_key() { return (byte*) key; } + byte * get_data() { return (byte*) data; } + + //releases only the tuple + static void release(datatuple *dt) + { + free(dt); + } + +} datatuple; + + +#endif diff --git a/hello.cpp b/hello.cpp new file mode 100644 index 0000000..118fccb --- /dev/null +++ b/hello.cpp @@ -0,0 +1,48 @@ + +#include +#include +#include +#include + +typedef unsigned char uchar; +typedef struct datatuple +{ + + typedef byte* key_t; + typedef byte* data_t; + uint32_t keylen; + uint32_t datalen; + key_t key; + data_t data; + + +}; + +int main(int argc, char** argv) { + +bool * m1 = new bool(false); +std::cout << *m1 << std::endl; + + datatuple t; + std::cout << "size of datatuple:\t" << sizeof(datatuple) << std::endl; + + t.key = (datatuple::key_t) malloc(10); + const char * str = "12345678"; + strcpy((char*)t.key, (str)); + + t.keylen = strlen((char*)t.key); + + t.data = (datatuple::data_t) malloc(10); + const char * str2 = "1234567"; + strcpy((char*)t.data, (str2)); + + t.datalen = strlen((char*)t.data); + + std::cout << "size of datatuple:\t" << sizeof(datatuple) << std::endl; + std::cout << "keylen:\t" << t.keylen << + "\tdatalen:\t" << t.datalen << + "\t" << t.key << + "\t" << t.data << + std::endl; + +} diff --git a/logiterators.cpp b/logiterators.cpp new file mode 100644 index 
0000000..80a079b --- /dev/null +++ b/logiterators.cpp @@ -0,0 +1,200 @@ + +#include "logstore.h" +//#include "datapage.cpp" +#include "logiterators.h" + + + + +//template +/* +template <> +const byte* toByteArray, datatuple>( + memTreeIterator, datatuple> * const t) +{ + return (*(t->it_)).to_bytes(); +} +*/ + + +///////////////////////////////////////////////////////////////////// +// tree iterator implementation +///////////////////////////////////////////////////////////////////// + +template +treeIterator::treeIterator(recordid tree) : + tree_(tree), + lsmIterator_(logtreeIterator::open(-1,tree)), + curr_tuple(0) +{ + init_helper(); +} + +template +treeIterator::treeIterator(recordid tree, TUPLE& key) : + tree_(tree), + //scratch_(), + lsmIterator_(logtreeIterator::openAt(-1,tree,key.get_key()))//toByteArray())), + //slot_(0) +{ + init_helper(); + + /* + treeIterator * end = this->end(); + for(;*this != *end && **this < key; ++(*this)) + { + DEBUG("treeIterator was not at the given TUPLE"); + } + delete end; + */ + +} + +template +treeIterator::~treeIterator() +{ + if(lsmIterator_) + logtreeIterator::close(-1, lsmIterator_); + + if(curr_tuple != NULL) + free(curr_tuple); + + if(curr_page!=NULL) + { + delete curr_page; + curr_page = 0; + } + + +} + +template +void treeIterator::init_helper() +{ + if(!lsmIterator_) + { + printf("treeIterator:\t__error__ init_helper():\tnull lsmIterator_"); + curr_page = 0; + dp_itr = 0; + } + else + { + if(logtreeIterator::next(-1, lsmIterator_) == 0) + { + //printf("treeIterator:\t__error__ init_helper():\tlogtreeIteratr::next returned 0." 
); + curr_page = 0; + dp_itr = 0; + } + else + { + pageid_t * pid_tmp; + pageid_t ** hack = &pid_tmp; + logtreeIterator::value(-1,lsmIterator_,(byte**)hack); + + curr_pageid = *pid_tmp; + curr_page = new DataPage(-1, curr_pageid); + dp_itr = new DPITR_T(curr_page->begin()); + } + + } +} + +template +TUPLE * treeIterator::getnext() +{ + assert(this->lsmIterator_); + + if(dp_itr == 0) + return 0; + + TUPLE* readTuple = dp_itr->getnext(-1); + + + if(!readTuple) + { + delete dp_itr; + dp_itr = 0; + delete curr_page; + curr_page = 0; + + if(logtreeIterator::next(-1,lsmIterator_)) + { + pageid_t *pid_tmp; + + pageid_t **hack = &pid_tmp; + logtreeIterator::value(-1,lsmIterator_,(byte**)hack); + curr_pageid = *pid_tmp; + curr_page = new DataPage(-1, curr_pageid); + dp_itr = new DPITR_T(curr_page->begin()); + + + readTuple = dp_itr->getnext(-1); + assert(readTuple); + } + else + { + // TODO: what is this? + //past end of iterator! "end" should contain the pageid of the + // last leaf, and 1+ numslots on that page. 
+ //abort(); + } + } + + return curr_tuple=readTuple; +} + + + +/* +template +treeIterator::treeIterator(treeIteratorHandle* tree, TUPLE& key) : + tree_(tree?tree->r_:NULLRID), + scratch_(), + lsmIterator_(logtreeIterator::openAt(-1,tree?tree->r_:NULLRID,key.get_key())),//toByteArray())), + slot_(0) +{ + init_helper(); + if(lsmIterator_) { + treeIterator * end = this->end(); + for(;*this != *end && **this < key; ++(*this)) { } + delete end; + } else { + this->slot_ = 0; + this->pageid_ = 0; + } +} + +template +treeIterator::treeIterator(recordid tree, TUPLE &scratch) : + tree_(tree), + scratch_(scratch), + lsmIterator_(logtreeIterator::open(-1,tree)), + slot_(0) +{ + init_helper(); +} + +template +treeIterator::treeIterator(treeIteratorHandle* tree) : + tree_(tree?tree->r_:NULLRID), + scratch_(), + lsmIterator_(logtreeIterator::open(-1,tree?tree->r_:NULLRID)), + slot_(0) +{ + init_helper(); +} + +template +treeIterator::treeIterator(treeIterator& t) : + tree_(t.tree_), + scratch_(t.scratch_), + lsmIterator_(t.lsmIterator_?logtreeIterator::copy(-1,t.lsmIterator_):0), + slot_(t.slot_), + pageid_(t.pageid_), + p_((Page*)((t.p_)?loadPage(-1,t.p_->id):0)) + //currentPage_((PAGELAYOUT*)((p_)?p_->impl:0)) +{ + if(p_) + readlock(p_->rwlatch,0); +} +*/ diff --git a/logiterators.h b/logiterators.h new file mode 100644 index 0000000..8d61867 --- /dev/null +++ b/logiterators.h @@ -0,0 +1,173 @@ +#ifndef _LOG_ITERATORS_H_ +#define _LOG_ITERATORS_H_ + +#include +#include + +#undef begin +#undef end + +template class memTreeIterator; + +template +const byte* toByteArray(memTreeIterator * const t); + +template +class DataPage; + +////////////////////////////////////////////////////////////// +// memTreeIterator +///////////////////////////////////////////////////////////// + +template +class memTreeIterator{ + +private: + typedef typename MEMTREE::const_iterator MTITER; + +public: + memTreeIterator( MEMTREE *s ) + { + it_ = s->begin(); + itend_ = s->end(); + } + + + 
memTreeIterator( MTITER& it, MTITER& itend ) + { + it_ = it; + itend_ = itend; + } + + explicit memTreeIterator(memTreeIterator &i) + { + it_ = i.it_; + itend_ = i.itend_; + } + + const TUPLE& operator* () + { + return *it_; + } + + void seekEnd() + { + it_ = itend_; + } + + + memTreeIterator * end() + { + return new memTreeIterator(itend_,itend_); + } + + inline bool operator==(const memTreeIterator &o) const { + return it_ == o.it_; + } + inline bool operator!=(const memTreeIterator &o) const { + return !(*this == o); + } + inline void operator++() { + ++it_; + } + inline void operator--() { + --it_; + } + + inline int operator-(memTreeIterator &i) { + return it_ - i.it_; + } + + inline void operator=(memTreeIterator const &i) + { + it_ = i.it_; + itend_ = i.itend_; + } + +public: + typedef MEMTREE* handle; + +private: + + MTITER it_; + MTITER itend_; + + friend const byte* toByteArray(memTreeIterator * const t); + +}; + +template +const byte* toByteArray(memTreeIterator * const t) +{ + return (*(t->it_)).to_bytes();//toByteArray(); +} + +///////////////////////////////////////////////////////////////// + +/** + Scans through an LSM tree's leaf pages, each tuple in the tree, in + order. This iterator is designed for maximum forward scan + performance, and does not support all STL operations. 
+**/ +template +class treeIterator +{ + + public: + // typedef recordid handle; + class treeIteratorHandle + { + public: + treeIteratorHandle() : r_(NULLRID) {} + treeIteratorHandle(const recordid r) : r_(r) {} + + treeIteratorHandle * operator=(const recordid &r) { + r_ = r; + return this; + } + + recordid r_; + }; + + typedef treeIteratorHandle* handle; + + explicit treeIterator(recordid tree); + + explicit treeIterator(recordid tree,TUPLE &key); + + //explicit treeIterator(treeIteratorHandle* tree, TUPLE& key); + + //explicit treeIterator(treeIteratorHandle* tree); + + //explicit treeIterator(treeIterator& t); + + ~treeIterator(); + + TUPLE * getnext(); + + //void advance(int count=1); + +private: + inline void init_helper(); + + explicit treeIterator() { abort(); } + void operator=(treeIterator & t) { abort(); } + int operator-(treeIterator & t) { abort(); } + +private: + recordid tree_; //root of the tree + + lladdIterator_t * lsmIterator_; //logtree iterator + + pageid_t curr_pageid; //current page id + DataPage *curr_page; //current page + typedef typename DataPage::RecordIterator DPITR_T; + DPITR_T *dp_itr; + TUPLE *curr_tuple; //current tuple +}; + + + + +#endif + diff --git a/logserver.cpp b/logserver.cpp new file mode 100644 index 0000000..3f9eb54 --- /dev/null +++ b/logserver.cpp @@ -0,0 +1,649 @@ + + + +#include "logserver.h" +#include "datatuple.h" + +#include "logstore.h" + +#include +#include +#include +#include +#include +#include +#include + +#undef begin +#undef end +#undef try + + +//server codes +uint8_t logserver::OP_SUCCESS = 1; +uint8_t logserver::OP_FAIL = 2; +uint8_t logserver::OP_SENDING_TUPLE = 3; + +//client codes +uint8_t logserver::OP_FIND = 4; +uint8_t logserver::OP_INSERT = 5; + +uint8_t logserver::OP_DONE = 6; + +uint8_t logserver::OP_INVALID = 32; + +void *serverLoop(void *args); + +void logserver::startserver(logtable *ltable) +{ + sys_alive = true; + this->ltable = ltable; + + selcond = new pthread_cond_t; + 
pthread_cond_init(selcond, 0); + + //initialize threads + for(int i=0; ith_handle = new pthread_t; + struct pthread_data *worker_data = new pthread_data; + worker_th->data = worker_data; + + worker_data->idleth_queue = &idleth_queue; + worker_data->ready_queue = &ready_queue; + worker_data->work_queue = &work_queue; + + worker_data->qlock = qlock; + + worker_data->selcond = selcond; + + worker_data->th_cond = new pthread_cond_t; + pthread_cond_init(worker_data->th_cond,0); + + worker_data->th_mut = new pthread_mutex_t; + pthread_mutex_init(worker_data->th_mut,0); + + worker_data->workitem = new int; + *(worker_data->workitem) = -1; + + //worker_data->table_lock = lsmlock; + + worker_data->ltable = ltable; + + worker_data->sys_alive = &sys_alive; + + pthread_create(worker_th->th_handle, 0, thread_work_fn, worker_th); + + idleth_queue.push(*worker_th); + + + } + + + + //start server socket + sdata = new serverth_data; + sdata->server_socket = &serversocket; + sdata->server_port = server_port; + sdata->idleth_queue = &idleth_queue; + sdata->ready_queue = &ready_queue; + sdata->selcond = selcond; + sdata->qlock = qlock; + + pthread_create(&server_thread, 0, serverLoop, sdata); + + //start monitoring loop + eventLoop(); + +} + +void logserver::stopserver() +{ + //close the server socket + //stops receiving data on the server socket + shutdown(serversocket, 0); + + //wait for all threads to be idle + while(idleth_queue.size() != nthreads) + sleep(1); + + #ifdef STATS_ENABLED + printf("\n\nSTATISTICS\n"); + std::map num_reqsc; + std::map work_timec; + #endif + + //set the system running flag to false + sys_alive = false; + for(int i=0; idata->th_mut); + pthread_cond_signal(idle_th->data->th_cond); + pthread_mutex_unlock(idle_th->data->th_mut); + //wait for it to join + pthread_join(*(idle_th->th_handle), 0); + //free the thread variables + pthread_cond_destroy(idle_th->data->th_cond); + + #ifdef STATS_ENABLED + if(i == 0) + { + tot_threadwork_time = 0; + num_reqs = 0; + } 
+ + tot_threadwork_time += idle_th->data->work_time; + num_reqs += idle_th->data->num_reqs; + + printf("thread %d: work_time %.3f\t#calls %d\tavg req process time:\t%.3f\n", + i, + idle_th->data->work_time, + idle_th->data->num_reqs, + (( idle_th->data->num_reqs == 0 ) ? 0 : idle_th->data->work_time / idle_th->data->num_reqs) + ); + + for(std::map::const_iterator itr = idle_th->data->num_reqsc.begin(); + itr != idle_th->data->num_reqsc.end(); itr++) + { + std::string ckey = (*itr).first; + printf("\t%s\t%d\t%.3f\t%.3f\n", ckey.c_str(), (*itr).second, idle_th->data->work_timec[ckey], + idle_th->data->work_timec[ckey] / (*itr).second); + + if(num_reqsc.find(ckey) == num_reqsc.end()){ + num_reqsc[ckey] = 0; + work_timec[ckey] = 0; + } + num_reqsc[ckey] += (*itr).second; + work_timec[ckey] += idle_th->data->work_timec[ckey]; + } + #endif + + delete idle_th->data->th_cond; + delete idle_th->data->th_mut; + delete idle_th->data->workitem; + delete idle_th->data; + delete idle_th->th_handle; + } + + th_list.clear(); + + #ifdef STATS_ENABLED + + printf("\n\nAggregated Stats:\n"); + for(std::map::const_iterator itr = num_reqsc.begin(); + itr != num_reqsc.end(); itr++) + { + std::string ckey = (*itr).first; + printf("\t%s\t%d\t%.3f\t%.3f\n", ckey.c_str(), (*itr).second, work_timec[ckey], + work_timec[ckey] / (*itr).second); + } + + tot_time = (stop_tv.tv_sec - start_tv.tv_sec) * 1000 + + (stop_tv.tv_usec / 1000 - start_tv.tv_usec / 1000); + + printf("\ntot time:\t%f\n",tot_time); + printf("tot work time:\t%f\n", tot_threadwork_time); + printf("load avg:\t%f\n", tot_threadwork_time / tot_time); + + printf("tot num reqs\t%d\n", num_reqs); + if(num_reqs!= 0) + { + printf("tot work time / num reqs:\t%.3f\n", tot_threadwork_time / num_reqs); + printf("tot time / num reqs:\t%.3f\n", tot_time / num_reqs ); + } + #endif + + //close(serversocket); + + return; +} + +void logserver::eventLoop() +{ + + fd_set readfs; + std::vector sel_list; + + int maxfd; + + struct timeval Timeout; + 
struct timespec ts; + + while(true) + { + //clear readset + FD_ZERO(&readfs); + maxfd = -1; + + ts.tv_nsec = 250000; //nanosec + ts.tv_sec = 0; + + //Timeout.tv_usec = 250; /* microseconds */ + //Timeout.tv_sec = 0; /* seconds */ + + //update select set + pthread_mutex_lock(qlock); + + //while(ready_queue.size() == 0) + if(sel_list.size() == 0) + { + while(ready_queue.size() == 0) + pthread_cond_wait(selcond, qlock); + //pthread_cond_timedwait(selcond, qlock, &ts); + //printf("awoke\n"); + } + + //new connections + processed conns are in ready_queue + //add them to select list + while(ready_queue.size() > 0) + { + sel_list.push_back(ready_queue.front()); + ready_queue.pop(); + } + pthread_mutex_unlock(qlock); + + //ready select set + for(std::vector::const_iterator itr=sel_list.begin(); + itr != sel_list.end(); itr++) + { + if(maxfd < *itr) + maxfd = *itr; + FD_SET(*itr, &readfs); + } + + //select events + int sel_res = select(maxfd+1, &readfs, NULL, NULL, NULL);// &Timeout); + //printf("sel_res %d %d\n", sel_res, errno); + //fflush(stdout); + //job assignment to threads + //printf("sel_list size:\t%d ready_cnt\t%d\n", sel_list.size(), sel_res); + + #ifdef STATS_ENABLED + if(num_selcalls == 0) + gettimeofday(&start_tv, 0); + + num_selevents += sel_res; + num_selcalls++; + #endif + + pthread_mutex_lock(qlock); + for(int i=0; i 0) //assign the job to an indle thread + { + pthread_item idle_th = idleth_queue.front(); + idleth_queue.pop(); + + //wake up the thread to do work + pthread_mutex_lock(idle_th.data->th_mut); + //set the job of the idle thread + *(idle_th.data->workitem) = currsock; + pthread_cond_signal(idle_th.data->th_cond); + pthread_mutex_unlock(idle_th.data->th_mut); + //printf("%d:\tconn %d assigned.\n", i, currsock); + } + else + { + //insert the given element to the work queue + work_queue.push(currsock); + //printf("work queue size:\t%d\n", work_queue.size()); + } + +// pthread_mutex_unlock(qlock); + + //remove from the sel_list + 
sel_list.erase(sel_list.begin()+i); + i--; + } + } + + pthread_mutex_unlock(qlock); + + #ifdef STATS_ENABLED + gettimeofday(&stop_tv, 0); + #endif + + } + +} + +void *serverLoop(void *args) +{ + + serverth_data *sdata = (serverth_data*)args; + + int sockfd; //socket descriptor + struct sockaddr_in serv_addr; + struct sockaddr_in cli_addr; + int newsockfd; //newly created + socklen_t clilen = sizeof(cli_addr); + + + //open a socket + sockfd = socket(AF_INET, SOCK_STREAM, 0); + if (sockfd < 0) + { + printf("ERROR opening socket\n"); + return 0; + } + + bzero((char *) &serv_addr, sizeof(serv_addr)); + serv_addr.sin_family = AF_INET; + serv_addr.sin_addr.s_addr = htonl(INADDR_ANY); + serv_addr.sin_port = htons(sdata->server_port); + + if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) + { + printf("ERROR on binding.\n"); + return 0; + } + + //start listening on the server socket + //second arg is the max number of coonections waiting in queue + if(listen(sockfd,SOMAXCONN)==-1) + { + printf("ERROR on listen.\n"); + return 0; + } + + printf("LSM Server listenning...\n"); + + *(sdata->server_socket) = sockfd; + int flag, result; + while(true) + { + newsockfd = accept(sockfd, (struct sockaddr *) &cli_addr, &clilen); + if (newsockfd < 0) + { + printf("ERROR on accept.\n"); + return 0; // we probably want to continue instead of return here (when not debugging) + } + + flag = 1; + result = setsockopt(newsockfd, /* socket affected */ + IPPROTO_TCP, /* set option at TCP level */ + TCP_NODELAY, /* name of option */ + (char *) &flag, /* the cast is historical + cruft */ + sizeof(int)); /* length of option value */ + if (result < 0) + { + printf("ERROR on setting socket option TCP_NODELAY.\n"); + return 0; + } + + char clientip[20]; + inet_ntop(AF_INET, (void*) &(cli_addr.sin_addr), clientip, 20); + printf("Connection from:\t%s\n", clientip); + + //printf("Number of idle threads %d\n", idleth_queue.size()); + + pthread_mutex_lock(sdata->qlock); + + //insert 
the given element to the ready queue + sdata->ready_queue->push(newsockfd); + + if(sdata->ready_queue->size() == 1) //signal the event loop + pthread_cond_signal(sdata->selcond); + + pthread_mutex_unlock(sdata->qlock); + + } + + +} + +inline void readfromsocket(int sockd, byte *buf, int count) +{ + + int n = 0; + while( n < count ) + { + n += read( sockd, buf + n, count - n); + } + +} + +inline void writetosocket(int sockd, byte *buf, int count) +{ + int n = 0; + while( n < count ) + { + n += write( sockd, buf + n, count - n); + } +} + + + + + +void * thread_work_fn( void * args) +{ + pthread_item * item = (pthread_item *) args; + + pthread_mutex_lock(item->data->th_mut); + while(true) + { + while(*(item->data->workitem) == -1) + { + if(!*(item->data->sys_alive)) + break; + pthread_cond_wait(item->data->th_cond, item->data->th_mut); //wait for job + } + + + #ifdef STATS_ENABLED + gettimeofday(& (item->data->start_tv), 0); + std::ostringstream ostr; + ostr << *(item->data->workitem) << "_"; + #endif + + if(!*(item->data->sys_alive)) + { + //printf("thread quitted.\n"); + break; + } + + //step 1: read the opcode + uint8_t opcode; + ssize_t n = read(*(item->data->workitem), &opcode, sizeof(uint8_t)); + assert( n == sizeof(uint8_t)); + assert( opcode < logserver::OP_INVALID ); + + if( opcode == logserver::OP_DONE ) //close the conn on failure + { + pthread_mutex_lock(item->data->qlock); + printf("client done. conn closed. 
(%d, %d, %d, %d)\n", + n, errno, *(item->data->workitem), item->data->work_queue->size()); + close(*(item->data->workitem)); + + if(item->data->work_queue->size() > 0) + { + int new_work = item->data->work_queue->front(); + item->data->work_queue->pop(); + //printf("work queue size:\t%d\n", item->data->work_queue->size()); + *(item->data->workitem) = new_work; + } + else + { + //set work to -1 + *(item->data->workitem) = -1; + //add self to idle queue + item->data->idleth_queue->push(*item); + } + + pthread_mutex_unlock(item->data->qlock); + continue; + } + + + //step 2: read the tuple from client + datatuple tuple; + tuple.keylen = (uint32_t*)malloc(sizeof(uint32_t)); + tuple.datalen = (uint32_t*)malloc(sizeof(uint32_t)); + + //read the key length + n = read(*(item->data->workitem), tuple.keylen, sizeof(uint32_t)); + assert( n == sizeof(uint32_t)); + //read the data length + n = read(*(item->data->workitem), tuple.datalen, sizeof(uint32_t)); + assert( n == sizeof(uint32_t)); + + //read the key + tuple.key = (byte*) malloc(*tuple.keylen); + readfromsocket(*(item->data->workitem), (byte*) tuple.key, *tuple.keylen); + //read the data + if(!tuple.isDelete() && opcode != logserver::OP_FIND) + { + tuple.data = (byte*) malloc(*tuple.datalen); + readfromsocket(*(item->data->workitem), (byte*) tuple.data, *tuple.datalen); + } + else + tuple.data = 0; + + //step 3: process the tuple + //pthread_mutex_lock(item->data->table_lock); + //readlock(item->data->table_lock,0); + + if(opcode == logserver::OP_INSERT) + { + //insert/update/delete + item->data->ltable->insertTuple(tuple); + //unlock the lsmlock + //pthread_mutex_unlock(item->data->table_lock); + //unlock(item->data->table_lock); + //step 4: send response + uint8_t rcode = logserver::OP_SUCCESS; + n = write(*(item->data->workitem), &rcode, sizeof(uint8_t)); + assert(n == sizeof(uint8_t)); + + } + else if(opcode == logserver::OP_FIND) + { + //find the tuple + datatuple *dt = item->data->ltable->findTuple(-1, tuple.key, 
*tuple.keylen); + //unlock the lsmlock + //pthread_mutex_unlock(item->data->table_lock); + //unlock(item->data->table_lock); + + #ifdef STATS_ENABLED + + if(dt == 0) + printf("key not found:\t%s\n", datatuple::key_to_str(tuple.key).c_str()); + else if( *dt->datalen != 1024) + printf("data len for\t%s:\t%d\n", datatuple::key_to_str(tuple.key).c_str(), + *dt->datalen); + + if(datatuple::compare(tuple.key, dt->key) != 0) + printf("key not equal:\t%s\t%s\n", datatuple::key_to_str(tuple.key).c_str(), + datatuple::key_to_str(dt->key).c_str()); + + #endif + + if(dt == 0) //tuple deleted + { + dt = (datatuple*) malloc(sizeof(datatuple)); + dt->keylen = (uint32_t*) malloc(2*sizeof(uint32_t) + *tuple.keylen); + *dt->keylen = *tuple.keylen; + dt->datalen = dt->keylen + 1; + dt->key = (datatuple::key_t) (dt->datalen+1); + memcpy((byte*) dt->key, (byte*) tuple.key, *tuple.keylen); + dt->setDelete(); + } + + //send the reply code + uint8_t rcode = logserver::OP_SENDING_TUPLE; + n = write(*(item->data->workitem), &rcode, sizeof(uint8_t)); + assert(n == sizeof(uint8_t)); + + //send the tuple + writetosocket(*(item->data->workitem), (byte*) dt->keylen, dt->byte_length()); + + //free datatuple + free(dt->keylen); + free(dt); + } + + //close the socket + //close(*(item->data->workitem)); + + //free the tuple + free(tuple.keylen); + free(tuple.datalen); + free(tuple.key); + free(tuple.data); + + //printf("socket %d: work completed.", *(item->data->workitem)); + + pthread_mutex_lock(item->data->qlock); + + //add conn desc to ready queue + item->data->ready_queue->push(*(item->data->workitem)); + //printf("ready queue size: %d sock(%d)\n", item->data->ready_queue->size(), *(item->data->workitem)); + if(item->data->ready_queue->size() == 1) //signal the event loop + pthread_cond_signal(item->data->selcond); + + //printf("work complete, added to ready queue %d (size %d)\n", *(item->data->workitem), + // item->data->ready_queue->size()); + + if(item->data->work_queue->size() > 0) + { + int 
new_work = item->data->work_queue->front(); + item->data->work_queue->pop(); + //printf("work queue size:\t%d\n", item->data->work_queue->size()); + *(item->data->workitem) = new_work; + } + else + { + //set work to -1 + *(item->data->workitem) = -1; + //add self to idle queue + item->data->idleth_queue->push(*item); + } + + pthread_mutex_unlock(item->data->qlock); + + #ifdef STATS_ENABLED + if( item->data->num_reqs == 0 ) + item->data->work_time = 0; + gettimeofday(& (item->data->stop_tv), 0); + (item->data->num_reqs)++; + //item->data->work_time += tv_to_double(item->data->stop_tv) - tv_to_double(item->data->start_tv); + item->data->work_time += (item->data->stop_tv.tv_sec - item->data->start_tv.tv_sec) * 1000 + + (item->data->stop_tv.tv_usec / 1000 - item->data->start_tv.tv_usec / 1000); + + int iopcode = opcode; + ostr << iopcode; + std::string clientkey = ostr.str(); + if(item->data->num_reqsc.find(clientkey) == item->data->num_reqsc.end()) + { + item->data->num_reqsc[clientkey]=0; + item->data->work_timec[clientkey]=0; + } + + item->data->num_reqsc[clientkey]++; + item->data->work_timec[clientkey] += (item->data->stop_tv.tv_sec - item->data->start_tv.tv_sec) * 1000 + + (item->data->stop_tv.tv_usec / 1000 - item->data->start_tv.tv_usec / 1000);; + #endif + + + } + pthread_mutex_unlock(item->data->th_mut); + + +} + + diff --git a/logserver.h b/logserver.h new file mode 100644 index 0000000..dd9888a --- /dev/null +++ b/logserver.h @@ -0,0 +1,197 @@ +#ifndef _LOGSERVER_H_ +#define _LOGSERVER_H_ + + +#include +#include + +//#include "logstore.h" + +#include "datatuple.h" + + + +#include +#include + +#undef begin +#undef try +#undef end + +#define STATS_ENABLED 1 + +#ifdef STATS_ENABLED +#include +#include +#include +#endif + +class logtable; + + + +struct pthread_item; + +struct pthread_data { + std::queue *idleth_queue; + std::queue *ready_queue; + std::queue *work_queue; + pthread_mutex_t * qlock; + + pthread_cond_t *selcond; + + pthread_cond_t * th_cond; + 
pthread_mutex_t * th_mut; + + int *workitem; //id of the socket to work + + //pthread_mutex_t * table_lock; + //rwl *table_lock; + logtable *ltable; + bool *sys_alive; + + #ifdef STATS_ENABLED + int num_reqs; + struct timeval start_tv, stop_tv; + double work_time; + std::map num_reqsc; + std::map work_timec; + #endif + +}; + +struct pthread_item{ + pthread_t * th_handle; + pthread_data *data; +}; + + +//struct work_item +//{ +// int sockd; //socket id +// datatuple in_tuple; //request +// datatuple out_tuple; //response +//}; + +struct serverth_data +{ + int *server_socket; + int server_port; + std::queue *idleth_queue; + std::queue *ready_queue; + + pthread_cond_t *selcond; + + pthread_mutex_t *qlock; + + + +}; + +void * thread_work_fn( void *); + +class logserver +{ +public: + //server codes + static uint8_t OP_SUCCESS; + static uint8_t OP_FAIL; + static uint8_t OP_SENDING_TUPLE; + + //client codes + static uint8_t OP_FIND; + static uint8_t OP_INSERT; + + static uint8_t OP_DONE; + + static uint8_t OP_INVALID; + +public: + logserver(int nthreads, int server_port){ + this->nthreads = nthreads; + this->server_port = server_port; + //lsmlock = new pthread_mutex_t; + //pthread_mutex_init(lsmlock,0); + + //lsmlock = initlock(); + + qlock = new pthread_mutex_t; + pthread_mutex_init(qlock,0); + + ltable = 0; + + #ifdef STATS_ENABLED + num_selevents = 0; + num_selcalls = 0; + #endif + + + } + + ~logserver() + { + //delete lsmlock; + //deletelock(lsmlock); + delete qlock; + } + + void startserver(logtable *ltable); + + void stopserver(); + + +public: + +private: + + //main loop of server + //accept connections, assign jobs to threads + //void dispatchLoop(); + + void eventLoop(); + + +private: + + int server_port; + + int nthreads; + + bool sys_alive; + + int serversocket; //server socket file descriptor + + //ccqueue conn_queue; //list of active connections (socket list) + + //ccqueue idleth_queue; //list of idle threads + + std::queue ready_queue; //connections to go 
inside select + std::queue work_queue; //connections to be processed by worker threads + std::queue idleth_queue; + pthread_mutex_t *qlock; + + pthread_t server_thread; + serverth_data *sdata; + pthread_cond_t *selcond; //server loop cond + + std::vector th_list; // list of threads + + //rwl *lsmlock; //lock for using lsm table + + logtable *ltable; + + + #ifdef STATS_ENABLED + int num_reqs; + int num_selevents; + int num_selcalls; + struct timeval start_tv, stop_tv; + double tot_threadwork_time; + double tot_time; + #endif + + +}; + + +#endif diff --git a/logserver_pers.cpp b/logserver_pers.cpp new file mode 100644 index 0000000..4c7f2bb --- /dev/null +++ b/logserver_pers.cpp @@ -0,0 +1,519 @@ + + + +#include "logserver.h" +#include "datatuple.h" + +#include "logstore.h" + +#include +#include +#include +#include +#include +#include +#include + +#undef begin +#undef end +#undef try + + +//server codes +uint8_t logserver::OP_SUCCESS = 1; +uint8_t logserver::OP_FAIL = 2; +uint8_t logserver::OP_SENDING_TUPLE = 3; + +//client codes +uint8_t logserver::OP_FIND = 4; +uint8_t logserver::OP_INSERT = 5; + +uint8_t logserver::OP_DONE = 6; + +uint8_t logserver::OP_INVALID = 32; + +void *serverLoop(void *args); + +void logserver::startserver(logtable *ltable) +{ + sys_alive = true; + this->ltable = ltable; + + selcond = new pthread_cond_t; + pthread_cond_init(selcond, 0); + + //initialize threads + for(int i=0; ith_handle = new pthread_t; + struct pthread_data *worker_data = new pthread_data; + worker_th->data = worker_data; + + worker_data->idleth_queue = &idleth_queue; + worker_data->ready_queue = &ready_queue; + worker_data->work_queue = &work_queue; + + worker_data->qlock = qlock; + + worker_data->selcond = selcond; + + worker_data->th_cond = new pthread_cond_t; + pthread_cond_init(worker_data->th_cond,0); + + worker_data->th_mut = new pthread_mutex_t; + pthread_mutex_init(worker_data->th_mut,0); + + worker_data->workitem = new int; + *(worker_data->workitem) = -1; + + 
worker_data->table_lock = lsmlock; + + worker_data->ltable = ltable; + + worker_data->sys_alive = &sys_alive; + + pthread_create(worker_th->th_handle, 0, thread_work_fn, worker_th); + + idleth_queue.push(*worker_th); + + + } + + + + //start server socket + sdata = new serverth_data; + sdata->server_socket = &serversocket; + sdata->server_port = server_port; + sdata->idleth_queue = &idleth_queue; + sdata->ready_queue = &ready_queue; + sdata->selcond = selcond; + sdata->qlock = qlock; + + pthread_create(&server_thread, 0, serverLoop, sdata); + + //start monitoring loop + eventLoop(); + +} + +void logserver::stopserver() +{ + //close the server socket + //stops receiving data on the server socket + shutdown(serversocket, 0); + + //wait for all threads to be idle + while(idleth_queue.size() != nthreads) + sleep(1); + + //set the system running flag to false + sys_alive = false; + for(int i=0; idata->th_mut); + pthread_cond_signal(idle_th->data->th_cond); + pthread_mutex_unlock(idle_th->data->th_mut); + //wait for it to join + pthread_join(*(idle_th->th_handle), 0); + //free the thread variables + pthread_cond_destroy(idle_th->data->th_cond); + delete idle_th->data->th_cond; + delete idle_th->data->th_mut; + delete idle_th->data->workitem; + delete idle_th->data; + delete idle_th->th_handle; + } + + th_list.clear(); + + //close(serversocket); + + return; +} + +void logserver::eventLoop() +{ + + fd_set readfs; + std::vector sel_list; + + int maxfd; + + struct timeval Timeout; + struct timespec ts; + + while(true) + { + //clear readset + FD_ZERO(&readfs); + maxfd = -1; + + ts.tv_nsec = 250000; //nanosec + ts.tv_sec = 0; + + //Timeout.tv_usec = 250; /* microseconds */ + //Timeout.tv_sec = 0; /* seconds */ + + //update select set + pthread_mutex_lock(qlock); + + while(ready_queue.size() == 0) + { + pthread_cond_wait(selcond, qlock); + //pthread_cond_timedwait(selcond, qlock, &ts); + //printf("awoke\n"); + } + + //new connections + processed conns are in ready_queue + //add 
them to select list + while(ready_queue.size() > 0) + { + sel_list.push_back(ready_queue.front()); + ready_queue.pop(); + } + pthread_mutex_unlock(qlock); + + //ready select set + for(std::vector::const_iterator itr=sel_list.begin(); + itr != sel_list.end(); itr++) + { + if(maxfd < *itr) + maxfd = *itr; + FD_SET(*itr, &readfs); + } + + //select events + int sel_res = select(maxfd+1, &readfs, NULL, NULL, NULL);// &Timeout); + //printf("sel_res %d %d\n", sel_res, errno); + //fflush(stdout); + //job assignment to threads + + for(int i=0; i 0) //assign the job to an indle thread + { + pthread_item idle_th = idleth_queue.front(); + idleth_queue.pop(); + + //wake up the thread to do work + pthread_mutex_lock(idle_th.data->th_mut); + //set the job of the idle thread + *(idle_th.data->workitem) = currsock; + pthread_cond_signal(idle_th.data->th_cond); + pthread_mutex_unlock(idle_th.data->th_mut); + } + else + { + //insert the given element to the work queue + work_queue.push(currsock); + printf("work queue size:\t%d\n", work_queue.size()); + } + + //remove from the sel_list + sel_list.erase(sel_list.begin()+i); + i--; + + pthread_mutex_unlock(qlock); + + } + } + } + +} + +void *serverLoop(void *args) +{ + + serverth_data *sdata = (serverth_data*)args; + + int sockfd; //socket descriptor + struct sockaddr_in serv_addr; + struct sockaddr_in cli_addr; + int newsockfd; //newly created + socklen_t clilen = sizeof(cli_addr); + + + //open a socket + sockfd = socket(AF_INET, SOCK_STREAM, 0); + if (sockfd < 0) + { + printf("ERROR opening socket\n"); + return 0; + } + + bzero((char *) &serv_addr, sizeof(serv_addr)); + serv_addr.sin_family = AF_INET; + serv_addr.sin_addr.s_addr = htonl(INADDR_ANY); + serv_addr.sin_port = htons(sdata->server_port); + + if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) + { + printf("ERROR on binding.\n"); + return 0; + } + + //start listening on the server socket + //second arg is the max number of coonections waiting in queue + 
if(listen(sockfd,SOMAXCONN)==-1) + { + printf("ERROR on listen.\n"); + return 0; + } + + printf("LSM Server listenning...\n"); + + *(sdata->server_socket) = sockfd; + int flag, result; + while(true) + { + newsockfd = accept(sockfd, (struct sockaddr *) &cli_addr, &clilen); + if (newsockfd < 0) + { + printf("ERROR on accept.\n"); + return 0; // we probably want to continue instead of return here (when not debugging) + } + + flag = 1; + result = setsockopt(newsockfd, /* socket affected */ + IPPROTO_TCP, /* set option at TCP level */ + TCP_NODELAY, /* name of option */ + (char *) &flag, /* the cast is historical + cruft */ + sizeof(int)); /* length of option value */ + if (result < 0) + { + printf("ERROR on setting socket option TCP_NODELAY.\n"); + return 0; + } + + char clientip[20]; + inet_ntop(AF_INET, (void*) &(cli_addr.sin_addr), clientip, 20); + printf("Connection from:\t%s\n", clientip); + + //printf("Number of idle threads %d\n", idleth_queue.size()); + + pthread_mutex_lock(sdata->qlock); + + //insert the given element to the ready queue + sdata->ready_queue->push(newsockfd); + + if(sdata->ready_queue->size() == 1) //signal the event loop + pthread_cond_signal(sdata->selcond); + + pthread_mutex_unlock(sdata->qlock); + + } + + +} + +inline void readfromsocket(int sockd, byte *buf, int count) +{ + + int n = 0; + while( n < count ) + { + n += read( sockd, buf + n, count - n); + } + +} + +inline void writetosocket(int sockd, byte *buf, int count) +{ + int n = 0; + while( n < count ) + { + n += write( sockd, buf + n, count - n); + } +} + + + + + +void * thread_work_fn( void * args) +{ + pthread_item * item = (pthread_item *) args; + + pthread_mutex_lock(item->data->th_mut); + while(true) + { + while(*(item->data->workitem) == -1) + { + if(!*(item->data->sys_alive)) + break; + pthread_cond_wait(item->data->th_cond, item->data->th_mut); //wait for job + } + + + if(!*(item->data->sys_alive)) + { + //printf("thread quitted.\n"); + break; + } + + //step 1: read the 
opcode + uint8_t opcode; + ssize_t n = read(*(item->data->workitem), &opcode, sizeof(uint8_t)); + assert( n == sizeof(uint8_t)); + assert( opcode < logserver::OP_INVALID ); + + if( opcode == logserver::OP_DONE ) //close the conn on failure + { + pthread_mutex_lock(item->data->qlock); + printf("client done. conn closed. (%d, %d, %d, %d)\n", + n, errno, *(item->data->workitem), item->data->work_queue->size()); + close(*(item->data->workitem)); + + if(item->data->work_queue->size() > 0) + { + int new_work = item->data->work_queue->front(); + item->data->work_queue->pop(); + printf("work queue size:\t%d\n", item->data->work_queue->size()); + *(item->data->workitem) = new_work; + } + else + { + //set work to -1 + *(item->data->workitem) = -1; + //add self to idle queue + item->data->idleth_queue->push(*item); + } + + pthread_cond_signal(item->data->selcond); + + pthread_mutex_unlock(item->data->qlock); + continue; + } + + + //step 2: read the tuple from client + datatuple tuple; + tuple.keylen = (uint32_t*)malloc(sizeof(uint32_t)); + tuple.datalen = (uint32_t*)malloc(sizeof(uint32_t)); + + //read the key length + n = read(*(item->data->workitem), tuple.keylen, sizeof(uint32_t)); + assert( n == sizeof(uint32_t)); + //read the data length + n = read(*(item->data->workitem), tuple.datalen, sizeof(uint32_t)); + assert( n == sizeof(uint32_t)); + + //read the key + tuple.key = (byte*) malloc(*tuple.keylen); + readfromsocket(*(item->data->workitem), (byte*) tuple.key, *tuple.keylen); + //read the data + if(!tuple.isDelete() && opcode != logserver::OP_FIND) + { + tuple.data = (byte*) malloc(*tuple.datalen); + readfromsocket(*(item->data->workitem), (byte*) tuple.data, *tuple.datalen); + } + else + tuple.data = 0; + + //step 3: process the tuple + //pthread_mutex_lock(item->data->table_lock); + //readlock(item->data->table_lock,0); + + if(opcode == logserver::OP_INSERT) + { + //insert/update/delete + item->data->ltable->insertTuple(tuple); + //unlock the lsmlock + 
//pthread_mutex_unlock(item->data->table_lock); + //unlock(item->data->table_lock); + //step 4: send response + uint8_t rcode = logserver::OP_SUCCESS; + n = write(*(item->data->workitem), &rcode, sizeof(uint8_t)); + assert(n == sizeof(uint8_t)); + + } + else if(opcode == logserver::OP_FIND) + { + //find the tuple + datatuple *dt = item->data->ltable->findTuple(-1, tuple.key, *tuple.keylen); + //unlock the lsmlock + //pthread_mutex_unlock(item->data->table_lock); + //unlock(item->data->table_lock); + + if(dt == 0) //tuple deleted + { + dt = (datatuple*) malloc(sizeof(datatuple)); + dt->keylen = (uint32_t*) malloc(2*sizeof(uint32_t) + *tuple.keylen); + *dt->keylen = *tuple.keylen; + dt->datalen = dt->keylen + 1; + dt->key = (datatuple::key_t) (dt->datalen+1); + memcpy((byte*) dt->key, (byte*) tuple.key, *tuple.keylen); + dt->setDelete(); + } + + //send the reply code + uint8_t rcode = logserver::OP_SENDING_TUPLE; + n = write(*(item->data->workitem), &rcode, sizeof(uint8_t)); + assert(n == sizeof(uint8_t)); + + //send the tuple + writetosocket(*(item->data->workitem), (byte*) dt->keylen, dt->byte_length()); + + //free datatuple + free(dt->keylen); + free(dt); + } + + //close the socket + //close(*(item->data->workitem)); + + //free the tuple + free(tuple.keylen); + free(tuple.datalen); + free(tuple.key); + free(tuple.data); + + //printf("socket %d: work completed.\n", *(item->data->workitem)); + + pthread_mutex_lock(item->data->qlock); + + //add conn desc to ready queue + item->data->ready_queue->push(*(item->data->workitem)); + //printf("ready queue size: %d sock(%d)\n", item->data->ready_queue->size(), *(item->data->workitem)); + if(item->data->ready_queue->size() == 1) //signal the event loop + pthread_cond_signal(item->data->selcond); + + if(item->data->work_queue->size() > 0) + { + int new_work = item->data->work_queue->front(); + item->data->work_queue->pop(); + printf("work queue size:\t%d\n", item->data->work_queue->size()); + *(item->data->workitem) = 
new_work; + } + else + { + //set work to -1 + *(item->data->workitem) = -1; + //add self to idle queue + item->data->idleth_queue->push(*item); + } + + pthread_mutex_unlock(item->data->qlock); + + } + pthread_mutex_unlock(item->data->th_mut); + + +} + + diff --git a/logserver_pers.h b/logserver_pers.h new file mode 100644 index 0000000..94a10b7 --- /dev/null +++ b/logserver_pers.h @@ -0,0 +1,163 @@ +#ifndef _LOGSERVER_H_ +#define _LOGSERVER_H_ + + +#include +#include + +//#include "logstore.h" + +#include "datatuple.h" + + + +#include +#include + +#undef begin +#undef try +#undef end + +class logtable; + + + +struct pthread_item; + +struct pthread_data { + std::queue *idleth_queue; + std::queue *ready_queue; + std::queue *work_queue; + pthread_mutex_t * qlock; + + pthread_cond_t *selcond; + + pthread_cond_t * th_cond; + pthread_mutex_t * th_mut; + + int *workitem; //id of the socket to work + + //pthread_mutex_t * table_lock; + rwl *table_lock; + logtable *ltable; + bool *sys_alive; +}; + +struct pthread_item{ + pthread_t * th_handle; + pthread_data *data; +}; + + +//struct work_item +//{ +// int sockd; //socket id +// datatuple in_tuple; //request +// datatuple out_tuple; //response +//}; + +struct serverth_data +{ + int *server_socket; + int server_port; + std::queue *idleth_queue; + std::queue *ready_queue; + + pthread_cond_t *selcond; + + pthread_mutex_t *qlock; + + + +}; + +void * thread_work_fn( void *); + +class logserver +{ +public: + //server codes + static uint8_t OP_SUCCESS; + static uint8_t OP_FAIL; + static uint8_t OP_SENDING_TUPLE; + + //client codes + static uint8_t OP_FIND; + static uint8_t OP_INSERT; + + static uint8_t OP_DONE; + + static uint8_t OP_INVALID; + +public: + logserver(int nthreads, int server_port){ + this->nthreads = nthreads; + this->server_port = server_port; + //lsmlock = new pthread_mutex_t; + //pthread_mutex_init(lsmlock,0); + + lsmlock = initlock(); + + qlock = new pthread_mutex_t; + pthread_mutex_init(qlock,0); + + ltable = 0; 
+ + } + + ~logserver() + { + //delete lsmlock; + deletelock(lsmlock); + delete qlock; + } + + void startserver(logtable *ltable); + + void stopserver(); + + +public: + +private: + + //main loop of server + //accept connections, assign jobs to threads + //void dispatchLoop(); + + void eventLoop(); + + +private: + + int server_port; + + int nthreads; + + bool sys_alive; + + int serversocket; //server socket file descriptor + + //ccqueue conn_queue; //list of active connections (socket list) + + //ccqueue idleth_queue; //list of idle threads + + std::queue ready_queue; //connections to go inside select + std::queue work_queue; //connections to be processed by worker threads + std::queue idleth_queue; + pthread_mutex_t *qlock; + + pthread_t server_thread; + serverth_data *sdata; + pthread_cond_t *selcond; //server loop cond + + std::vector th_list; // list of threads + + rwl *lsmlock; //lock for using lsm table + + logtable *ltable; + +}; + + +#endif diff --git a/logserver_simple.cpp b/logserver_simple.cpp new file mode 100644 index 0000000..56f9ceb --- /dev/null +++ b/logserver_simple.cpp @@ -0,0 +1,409 @@ + + + +#include "logserver.h" +#include "datatuple.h" + +#include "logstore.h" + +#include +#include +#include +#include +#include + +#undef begin +#undef end +#undef try + + +//server codes +uint8_t logserver::OP_SUCCESS = 1; +uint8_t logserver::OP_FAIL = 2; +uint8_t logserver::OP_SENDING_TUPLE = 3; + +//client codes +uint8_t logserver::OP_FIND = 4; +uint8_t logserver::OP_INSERT = 5; + +uint8_t logserver::OP_INVALID = 32; + + +void logserver::startserver(logtable *ltable) +{ + sys_alive = true; + this->ltable = ltable; + //initialize threads + for(int i=0; ith_handle = new pthread_t; + struct pthread_data *worker_data = new pthread_data; + worker_th->data = worker_data; + + worker_data->idleth_queue = &idleth_queue; + + worker_data->conn_queue = &conn_queue; + + worker_data->qlock = qlock; + + worker_data->th_cond = new pthread_cond_t; + 
pthread_cond_init(worker_data->th_cond,0); + + worker_data->th_mut = new pthread_mutex_t; + pthread_mutex_init(worker_data->th_mut,0); + + worker_data->workitem = new int; + *(worker_data->workitem) = -1; + + worker_data->table_lock = lsmlock; + + worker_data->ltable = ltable; + + worker_data->sys_alive = &sys_alive; + + pthread_create(worker_th->th_handle, 0, thread_work_fn, worker_th); + + idleth_queue.push(*worker_th); + + + } + + dispatchLoop(); + +} + +void logserver::stopserver() +{ + //close the server socket + //stops receiving data on the server socket + shutdown(serversocket, 0); + + //wait for all threads to be idle + while(idleth_queue.size() != nthreads) + sleep(1); + + //set the system running flag to false + sys_alive = false; + for(int i=0; idata->th_mut); + pthread_cond_signal(idle_th->data->th_cond); + pthread_mutex_unlock(idle_th->data->th_mut); + //wait for it to join + pthread_join(*(idle_th->th_handle), 0); + //free the thread variables + pthread_cond_destroy(idle_th->data->th_cond); + delete idle_th->data->th_cond; + delete idle_th->data->th_mut; + delete idle_th->data->workitem; + delete idle_th->data; + delete idle_th->th_handle; + } + + th_list.clear(); + + return; +} + +void logserver::dispatchLoop() +{ + + int sockfd; //socket descriptor + struct sockaddr_in serv_addr; + struct sockaddr_in cli_addr; + int newsockfd; //newly created + socklen_t clilen = sizeof(cli_addr); + + + //open a socket + sockfd = socket(AF_INET, SOCK_STREAM, 0); + if (sockfd < 0) + { + printf("ERROR opening socket\n"); + return; + } + + bzero((char *) &serv_addr, sizeof(serv_addr)); + serv_addr.sin_family = AF_INET; + serv_addr.sin_addr.s_addr = htonl(INADDR_ANY); + serv_addr.sin_port = htons(server_port); + + if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) + { + printf("ERROR on binding.\n"); + return; + } + + //start listening on the server socket + //second arg is the max number of coonections waiting in queue + 
if(listen(sockfd,SOMAXCONN)==-1) + { + printf("ERROR on listen.\n"); + return; + } + + printf("LSM Server listenning...\n"); + + serversocket = sockfd; + int flag, result; + while(true) + { + newsockfd = accept(sockfd, (struct sockaddr *) &cli_addr, &clilen); + if (newsockfd < 0) + { + printf("ERROR on accept.\n"); + return; // we probably want to continue instead of return here (when not debugging) + } + + flag = 1; + result = setsockopt(newsockfd, /* socket affected */ + IPPROTO_TCP, /* set option at TCP level */ + TCP_NODELAY, /* name of option */ + (char *) &flag, /* the cast is historical + cruft */ + sizeof(int)); /* length of option value */ + if (result < 0) + { + printf("ERROR on setting socket option TCP_NODELAY.\n"); + return; + } + + char clientip[20]; + inet_ntop(AF_INET, (void*) &(cli_addr.sin_addr), clientip, 20); + //printf("Connection from:\t%s\n", clientip); + + //printf("Number of idle threads %d\n", idleth_queue.size()); + + pthread_mutex_lock(qlock); + + if(idleth_queue.size() > 0) + { + pthread_item idle_th = idleth_queue.front(); + idleth_queue.pop(); + + //wake up the thread to do work + pthread_mutex_lock(idle_th.data->th_mut); + //set the job of the idle thread + *(idle_th.data->workitem) = newsockfd; + pthread_cond_signal(idle_th.data->th_cond); + pthread_mutex_unlock(idle_th.data->th_mut); + } + else + { + //insert the given element to the queue + conn_queue.push(newsockfd); + //printf("Number of queued connections:\t%d\n", conn_queue.size()); + } + + pthread_mutex_unlock(qlock); + + /* + try + { + + pthread_item idle_th = idleth_queue.pop(); + //wake up the thread to do work + pthread_mutex_lock(idle_th.data->th_mut); + //set the job of the idle thread + *(idle_th.data->workitem) = newsockfd; + pthread_cond_signal(idle_th.data->th_cond); + pthread_mutex_unlock(idle_th.data->th_mut); + + } + catch(int empty_exception) + { + //insert the given element to the queue + conn_queue.push(newsockfd); + //printf("Number of queued 
connections:\t%d\n", conn_queue.size()); + } + */ + } + + +} + +inline void readfromsocket(int sockd, byte *buf, int count) +{ + + int n = 0; + while( n < count ) + { + n += read( sockd, buf + n, count - n); + } + +} + +inline void writetosocket(int sockd, byte *buf, int count) +{ + int n = 0; + while( n < count ) + { + n += write( sockd, buf + n, count - n); + } +} + + + + + +void * thread_work_fn( void * args) +{ + pthread_item * item = (pthread_item *) args; + + pthread_mutex_lock(item->data->th_mut); + while(true) + { + while(*(item->data->workitem) == -1) + { + if(!*(item->data->sys_alive)) + break; + pthread_cond_wait(item->data->th_cond, item->data->th_mut); //wait for job + } + + + if(!*(item->data->sys_alive)) + { + //printf("thread quitted.\n"); + break; + } + + //step 1: read the opcode + uint8_t opcode; + ssize_t n = read(*(item->data->workitem), &opcode, sizeof(uint8_t)); + assert( n == sizeof(uint8_t)); + assert( opcode < logserver::OP_INVALID ); + + //step 2: read the tuple from client + datatuple tuple; + tuple.keylen = (uint32_t*)malloc(sizeof(uint32_t)); + tuple.datalen = (uint32_t*)malloc(sizeof(uint32_t)); + + //read the key length + n = read(*(item->data->workitem), tuple.keylen, sizeof(uint32_t)); + assert( n == sizeof(uint32_t)); + //read the data length + n = read(*(item->data->workitem), tuple.datalen, sizeof(uint32_t)); + assert( n == sizeof(uint32_t)); + + //read the key + tuple.key = (byte*) malloc(*tuple.keylen); + readfromsocket(*(item->data->workitem), (byte*) tuple.key, *tuple.keylen); + //read the data + if(!tuple.isDelete() && opcode != logserver::OP_FIND) + { + tuple.data = (byte*) malloc(*tuple.datalen); + readfromsocket(*(item->data->workitem), (byte*) tuple.data, *tuple.datalen); + } + else + tuple.data = 0; + + //step 3: process the tuple + //pthread_mutex_lock(item->data->table_lock); + //readlock(item->data->table_lock,0); + + if(opcode == logserver::OP_INSERT) + { + //insert/update/delete + 
item->data->ltable->insertTuple(tuple); + //unlock the lsmlock + //pthread_mutex_unlock(item->data->table_lock); + //unlock(item->data->table_lock); + //step 4: send response + uint8_t rcode = logserver::OP_SUCCESS; + n = write(*(item->data->workitem), &rcode, sizeof(uint8_t)); + assert(n == sizeof(uint8_t)); + + } + else if(opcode == logserver::OP_FIND) + { + //find the tuple + datatuple *dt = item->data->ltable->findTuple(-1, tuple.key, *tuple.keylen); + //unlock the lsmlock + //pthread_mutex_unlock(item->data->table_lock); + //unlock(item->data->table_lock); + + if(dt == 0) //tuple deleted + { + dt = (datatuple*) malloc(sizeof(datatuple)); + dt->keylen = (uint32_t*) malloc(2*sizeof(uint32_t) + *tuple.keylen); + *dt->keylen = *tuple.keylen; + dt->datalen = dt->keylen + 1; + dt->key = (datatuple::key_t) (dt->datalen+1); + memcpy((byte*) dt->key, (byte*) tuple.key, *tuple.keylen); + dt->setDelete(); + } + + //send the reply code + uint8_t rcode = logserver::OP_SENDING_TUPLE; + n = write(*(item->data->workitem), &rcode, sizeof(uint8_t)); + assert(n == sizeof(uint8_t)); + + //send the tuple + writetosocket(*(item->data->workitem), (byte*) dt->keylen, dt->byte_length()); + + //free datatuple + free(dt->keylen); + free(dt); + } + + //close the socket + close(*(item->data->workitem)); + + //free the tuple + free(tuple.keylen); + free(tuple.datalen); + free(tuple.key); + free(tuple.data); + + //printf("socket %d: work completed.\n", *(item->data->workitem)); + + pthread_mutex_lock(item->data->qlock); + + if(item->data->conn_queue->size() > 0) + { + int new_work = item->data->conn_queue->front(); + item->data->conn_queue->pop(); + *(item->data->workitem) = new_work; + } + else + { + //set work to -1 + *(item->data->workitem) = -1; + //add self to idle queue + item->data->idleth_queue->push(*item); + } + + pthread_mutex_unlock(item->data->qlock); + + /* + //check if there is new work this thread can do + try + { + int new_work = item->data->conn_queue->pop(); + 
*(item->data->workitem) = new_work; //set new work + //printf("socket %d: new work found.\n", *(item->data->workitem)); + } + catch(int empty_exception) + { + //printf("socket %d: no new work found.\n", *(item->data->workitem)); + //set work to -1 + *(item->data->workitem) = -1; + //add self to idle queue + item->data->idleth_queue->push(*item); + + } + */ + + } + pthread_mutex_unlock(item->data->th_mut); + + +} + + diff --git a/logserver_simple.h b/logserver_simple.h new file mode 100644 index 0000000..48fbea6 --- /dev/null +++ b/logserver_simple.h @@ -0,0 +1,198 @@ +#ifndef _LOGSERVER_H_ +#define _LOGSERVER_H_ + + +#include +#include + +//#include "logstore.h" + +#include "datatuple.h" + + + +#include +#include + +#undef begin +#undef try +#undef end + +class logtable; + +template +class ccqueue +{ +public: + ccqueue() + { + qmut = new pthread_mutex_t; + pthread_mutex_init(qmut,0); + } + + int size() + { + pthread_mutex_lock(qmut); + int qsize = m_queue.size(); + pthread_mutex_unlock(qmut); + return qsize; + } + + //inserts a copy of the given element to the queue + void push(const T &item) + { + pthread_mutex_lock(qmut); + m_queue.push(item); + pthread_mutex_unlock(qmut); + return; + } + + //returns a copy of the next element + //deletes the copy in the queue + //throws an exception with -1 on empty queue + T pop() throw (int) + { + pthread_mutex_lock(qmut); + + if(m_queue.size() > 0) + { + T item = m_queue.front(); + m_queue.pop(); + pthread_mutex_unlock(qmut); + return item; + } + + + pthread_mutex_unlock(qmut); + throw(-1); + + + } + + + + ~ccqueue() + { + delete qmut; + } + +private: + + std::queue m_queue; + + pthread_mutex_t *qmut; + +}; + +struct pthread_item; + +struct pthread_data { + std::queue *idleth_queue; + std::queue *conn_queue; + pthread_mutex_t * qlock; + + pthread_cond_t * th_cond; + pthread_mutex_t * th_mut; + + int *workitem; //id of the socket to work + + //pthread_mutex_t * table_lock; + rwl *table_lock; + logtable *ltable; + bool 
*sys_alive; +}; + +struct pthread_item{ + pthread_t * th_handle; + pthread_data *data; +}; + +struct work_item +{ + int sockd; //socket id + datatuple in_tuple; //request + datatuple out_tuple; //response +}; + + +void * thread_work_fn( void *); + +class logserver +{ +public: + //server codes + static uint8_t OP_SUCCESS; + static uint8_t OP_FAIL; + static uint8_t OP_SENDING_TUPLE; + + //client codes + static uint8_t OP_FIND; + static uint8_t OP_INSERT; + + static uint8_t OP_INVALID; + +public: + logserver(int nthreads, int server_port){ + this->nthreads = nthreads; + this->server_port = server_port; + //lsmlock = new pthread_mutex_t; + //pthread_mutex_init(lsmlock,0); + + lsmlock = initlock(); + + qlock = new pthread_mutex_t; + pthread_mutex_init(qlock,0); + + ltable = 0; + + } + + ~logserver() + { + //delete lsmlock; + deletelock(lsmlock); + delete qlock; + } + + void startserver(logtable *ltable); + + void stopserver(); + + +public: + +private: + + //main loop of server + //accept connections, assign jobs to threads + void dispatchLoop(); + + +private: + + int server_port; + + int nthreads; + + bool sys_alive; + + int serversocket; //server socket file descriptor + + //ccqueue conn_queue; //list of active connections (socket list) + + //ccqueue idleth_queue; //list of idle threads + + std::queue conn_queue; + std::queue idleth_queue; + pthread_mutex_t *qlock; + + std::vector th_list; // list of threads + + rwl *lsmlock; //lock for using lsm table + + logtable *ltable; + +}; + + +#endif diff --git a/logstore.cpp b/logstore.cpp new file mode 100644 index 0000000..08d28b7 --- /dev/null +++ b/logstore.cpp @@ -0,0 +1,1606 @@ + + + +#include +#include +#include +#include + + +#include "merger.h" +#include "logstore.h" +#include "logiterators.h" + + +#include "datapage.cpp" + + +#include + +///////////////////////////////////////////////////////////////// +// LOGTREE implementation +///////////////////////////////////////////////////////////////// + +const 
RegionAllocConf_t logtree::REGION_ALLOC_STATIC_INITIALIZER = { {0,0,-1}, 0, -1, -1, 1000 }; +const RegionAllocConf_t +logtable::DATAPAGE_REGION_ALLOC_STATIC_INITIALIZER = { {0,0,-1}, 0, -1, -1, 50000 }; + +#undef DEBUG +#define DEBUG(...) \ + +//printf(__VA_ARGS__); fflush(NULL) + +#define LOGTREE_ROOT_PAGE SLOTTED_PAGE + +//LSM_ROOT_PAGE + +const int64_t logtree::DEPTH = 0; //in root this is the slot num where the DEPTH (of tree) is stored +const int64_t logtree::COMPARATOR = 1; //in root this is the slot num where the COMPARATOR id is stored +const int64_t logtree::FIRST_SLOT = 2; //this is the first unused slot in all index pages +const size_t logtree::root_rec_size = sizeof(int64_t); +const int64_t logtree::PREV_LEAF = 0; //pointer to prev leaf page +const int64_t logtree::NEXT_LEAF = 1; //pointer to next leaf page + + + +logtree::logtree() +{ + +} + +void logtree::free_region_rid(int xid, recordid tree, + logtree_page_deallocator_t dealloc, void *allocator_state) +{ + // Tdealloc(xid,tree); + dealloc(xid,allocator_state); + // XXX fishy shouldn't caller do this? 
+ Tdealloc(xid, *(recordid*)allocator_state); +} + + +void logtree::dealloc_region_rid(int xid, void *conf) +{ + recordid rid = *(recordid*)conf; + RegionAllocConf_t a; + Tread(xid,rid,&a); + DEBUG("{%lld <- dealloc region arraylist}\n", a.regionList.page); + + for(int i = 0; i < a.regionCount; i++) { + a.regionList.slot = i; + pageid_t pid; + Tread(xid,a.regionList,&pid); + TregionDealloc(xid,pid); + } +} + + +void logtree::force_region_rid(int xid, void *conf) +{ + recordid rid = *(recordid*)conf; + RegionAllocConf_t a; + Tread(xid,rid,&a); + + for(int i = 0; i < a.regionCount; i++) + { + a.regionList.slot = i; + pageid_t pid; + Tread(xid,a.regionList,&pid); + stasis_dirty_page_table_flush_range((stasis_dirty_page_table_t*)stasis_runtime_dirty_page_table(), pid, pid+a.regionSize); + forcePageRange(pid, pid+a.regionSize); + } +} + + +pageid_t logtree::alloc_region(int xid, void *conf) +{ + RegionAllocConf_t* a = (RegionAllocConf_t*)conf; + + + if(a->nextPage == a->endOfRegion) { + if(a->regionList.size == -1) { + //DEBUG("nextPage: %lld\n", a->nextPage); + a->regionList = TarrayListAlloc(xid, 1, 4, sizeof(pageid_t)); + DEBUG("regionList.page: %lld\n", a->regionList.page); + DEBUG("regionList.slot: %d\n", a->regionList.slot); + DEBUG("regionList.size: %lld\n", a->regionList.size); + + a->regionCount = 0; + } + DEBUG("{%lld <- alloc region arraylist}\n", a->regionList.page); + TarrayListExtend(xid,a->regionList,1); + a->regionList.slot = a->regionCount; + DEBUG("region lst slot %d\n",a->regionList.slot); + a->regionCount++; + DEBUG("region count %lld\n",a->regionCount); + a->nextPage = TregionAlloc(xid, a->regionSize,12); + DEBUG("next page %lld\n",a->nextPage); + a->endOfRegion = a->nextPage + a->regionSize; + Tset(xid,a->regionList,&a->nextPage); + DEBUG("next page %lld\n",a->nextPage); + } + + DEBUG("%lld ?= %lld\n", a->nextPage,a->endOfRegion); + pageid_t ret = a->nextPage; + // Ensure the page is in buffer cache without accessing disk (this + // sets it to 
clean and all zeros if the page is not in cache). + // Hopefully, future reads will get a cache hit, and avoid going to + // disk. + + Page * p = loadUninitializedPage(xid, ret); + releasePage(p); + DEBUG("ret %lld\n",ret); + (a->nextPage)++; + return ret; + +} + +pageid_t logtree::alloc_region_rid(int xid, void * ridp) { + recordid rid = *(recordid*)ridp; + RegionAllocConf_t conf; + Tread(xid,rid,&conf); + pageid_t ret = alloc_region(xid,&conf); + //DEBUG("{%lld <- alloc region extend}\n", conf.regionList.page); + // XXX get rid of Tset by storing next page in memory, and losing it + // on crash. + Tset(xid,rid,&conf); + return ret; +} + + + +recordid logtree::create(int xid) +{ + + tree_state = Talloc(xid,sizeof(RegionAllocConf_t)); + + //int ptype = TpageGetType(xid, tree_state.page); + //DEBUG("page type %d\n", ptype); //returns a slotted page + + Tset(xid,tree_state, ®ION_ALLOC_STATIC_INITIALIZER); + + pageid_t root = alloc_region_rid(xid, &tree_state); + DEBUG("Root = %lld\n", root); + recordid ret = { root, 0, 0 }; + + Page *p = loadPage(xid, ret.page); + writelock(p->rwlatch,0); + + stasis_page_slotted_initialize_page(p); + + //*stasis_page_type_ptr(p) = SLOTTED_PAGE; //LOGTREE_ROOT_PAGE; + + //logtree_state *state = (logtree_state*) ( malloc(sizeof(logtree_state))); + //state->lastLeaf = -1; + + //p->impl = state; + lastLeaf = -1; + + //initialize root node + recordid tmp = stasis_record_alloc_begin(xid, p, root_rec_size); + stasis_record_alloc_done(xid,p,tmp); + + assert(tmp.page == ret.page + && tmp.slot == DEPTH + && tmp.size == root_rec_size); + + writeRecord(xid, p, tmp, (byte*)&DEPTH, root_rec_size); + + tmp = stasis_record_alloc_begin(xid, p, root_rec_size); + stasis_record_alloc_done(xid,p,tmp); + + assert(tmp.page == ret.page + && tmp.slot == COMPARATOR + && tmp.size == root_rec_size); + + writeRecord(xid, p, tmp, (byte*)&COMPARATOR, root_rec_size); + + + unlock(p->rwlatch); + releasePage(p); + + root_rec = ret; + + return ret; +} + + +/** + * 
TODO: what happens if there is already such a record with a different size?
 * I guess this should never happen in rose, but what if?
 **/
// Copies datalen bytes from data into the record rid on page p and writes
// an LSN of 0 to the page.
void logtree::writeRecord(int xid, Page *p, recordid &rid,
                          const byte *data, size_t datalen)
{
    byte *byte_arr = stasis_record_write_begin(xid, p, rid);
    memcpy(byte_arr, data, datalen); //TODO: stasis write call
    stasis_record_write_done(xid, p, rid, byte_arr);
    stasis_page_lsn_write(xid, p, 0); // XXX need real LSN?

}

// Writes an index-node record into rid on page p: an indexnode_rec header
// whose ptr field is set to ptr, immediately followed by keylen key bytes.
void logtree::writeNodeRecord(int xid, Page * p, recordid & rid,
                              const byte *key, size_t keylen, pageid_t ptr)
{
    DEBUG("writenoderecord:\tp->id\t%lld\tkey:\t%s\tkeylen: %d\tval_page\t%lld\n",
          p->id, datatuple::key_to_str(key).c_str(), keylen, ptr);
    indexnode_rec *nr = (indexnode_rec*)stasis_record_write_begin(xid, p, rid);
    nr->ptr = ptr;
    memcpy(nr+1, key, keylen); // the key is stored right after the header
    stasis_record_write_done(xid, p, rid, (byte*)nr);
    stasis_page_lsn_write(xid, p, 0); // XXX need real LSN?
}

// Same as writeRecord(xid, p, rid, ...) above, but the record is named by
// its slot number on page p; the recordid is built from (p->id, slot, datalen).
void logtree::writeRecord(int xid, Page *p, slotid_t slot,
                          const byte *data, size_t datalen)
{
    recordid rid;
    rid.page = p->id;
    rid.slot = slot;
    rid.size = datalen;
    byte *byte_arr = stasis_record_write_begin(xid, p, rid);
    memcpy(byte_arr, data, datalen); //TODO: stasis write call
    stasis_record_write_done(xid, p, rid, byte_arr);
    stasis_page_lsn_write(xid, p, 0); // XXX need real LSN?

}

// Returns the pointer obtained from stasis_record_read_begin() for rid on
// page p.  No copy is made and stasis_record_read_done() is not called here.
const byte* logtree::readRecord(int xid, Page * p, recordid &rid)
{
    //byte *ret = (byte*)malloc(rid.size);
    //const byte *nr = stasis_record_read_begin(xid,p,rid);
    //memcpy(ret, nr, rid.size);
    //stasis_record_read_done(xid,p,rid,nr);

    const byte *nr = stasis_record_read_begin(xid,p,rid);
    return nr;

    //DEBUG("reading {%lld, %d, %d}\n",
    //      p->id, rid.slot, rid.size );

    //return ret;
}

// Slot-number variant of readRecord(): builds the recordid from
// (p->id, slot, size) and returns the stasis_record_read_begin() pointer.
// Callers in this file pass 0 for size.
const byte* logtree::readRecord(int xid, Page * p, slotid_t slot, int64_t size)
{
    recordid rid;
    rid.page = p->id;
    rid.slot = slot;
    rid.size = size;
    //byte *ret = (byte*)malloc(rid.size);
    //stasis_record_read(xid,p,rid,ret);
    //return ret;
    const byte *nr = stasis_record_read_begin(xid,p,rid);
    return nr;
//    return readRecord(xid, p, rid);

}

// Returns the length, as reported by stasis_record_length_read(), of the
// record stored in the given slot of page p.
int32_t logtree::readRecordLength(int xid, Page *p, slotid_t slot)
{
    recordid rec = {p->id, slot, 0};
    int32_t reclen = stasis_record_length_read(xid, p, rec);
    return reclen;
}

// Initialises p as a slotted page and allocates two indexnode_rec-sized
// records on it.  On a fresh page these are expected to land in slots 0 and
// 1, which other functions in this file address as PREV_LEAF / NEXT_LEAF, so
// that the first real entry goes to FIRST_SLOT.
void logtree::initializeNodePage(int xid, Page *p)
{
    stasis_page_slotted_initialize_page(p);
    recordid reserved1 = stasis_record_alloc_begin(xid, p, sizeof(indexnode_rec));
    stasis_record_alloc_done(xid, p, reserved1);
    recordid reserved2 = stasis_record_alloc_begin(xid, p, sizeof(indexnode_rec));
    stasis_record_alloc_done(xid, p, reserved2);
}


// Appends the entry (key -> val_page) to the tree rooted at tree.page.
//
//   rmLeafID         in/out: id of the right-most leaf; -1 means "unknown",
//                    in which case it is looked up with findLastLeaf().  On
//                    return it names the page that received the new entry
//                    when a new path had to be built.
//   allocator,
//   allocator_state  used to obtain new pages when the tree has to grow.
//
// Returns the recordid of the newly written index entry.
//
// The root page is write-latched for the whole call.  The entry is first
// tried on the right-most leaf; if that page is full the insert goes through
// appendInternalNode(), and if the whole tree is full the root's entries are
// pushed down into a new child, the depth stored in the root is incremented
// and the insert is retried.
recordid logtree::appendPage(int xid, recordid tree, pageid_t & rmLeafID,
                             const byte *key, size_t keySize,
                             lsm_page_allocator_t allocator, void *allocator_state,
                             long val_page)
{
    Page *p = loadPage(xid, tree.page);
    writelock(p->rwlatch, 0);
    //logtree_state *s = (logtree_state*)p->impl;

    tree.slot = 0;
    //tree.size = sizeof(lsmTreeNodeRecord)+keySize;

    // the DEPTH slot of the root holds the tree depth as an int64_t
    const indexnode_rec *nr = (const indexnode_rec*)readRecord(xid, p , DEPTH, 0);
    int64_t depth = *((int64_t*)nr);

    if(rmLeafID == -1) {
        rmLeafID = findLastLeaf(xid, p, depth);
    }

    Page *lastLeaf;

    // the root is already latched; only latch the leaf if it is another page
    if(rmLeafID != tree.page)
    {
        lastLeaf= loadPage(xid, rmLeafID);
        writelock(lastLeaf->rwlatch, 0);
    } else
        lastLeaf = p;


    recordid ret = stasis_record_alloc_begin(xid, lastLeaf,
                                             sizeof(indexnode_rec)+keySize);

    if(ret.size == INVALID_SLOT)
    {
        // no room on the right-most leaf
        if(lastLeaf->id != p->id)
        {
            assert(rmLeafID != tree.page);
            unlock(lastLeaf->rwlatch);
            releasePage(lastLeaf); // don't need that page anymore...
            lastLeaf = 0;
        }
        // traverse down the root of the tree.

        tree.slot = 0;

        assert(tree.page == p->id);

        ret = appendInternalNode(xid, p, depth, key, keySize, val_page,
                                 rmLeafID == tree.page ? -1 : rmLeafID,
                                 allocator, allocator_state);

        if(ret.size == INVALID_SLOT)
        {
            DEBUG("Need to split root; depth = %d\n", depth);

            pageid_t child = allocator(xid, allocator_state);
            Page *lc = loadPage(xid, child);
            writelock(lc->rwlatch,0);

            initializeNodePage(xid, lc);

            //creates a copy of the root page records in the
            //newly allocated child page
            for(int i = FIRST_SLOT; i < *stasis_page_slotted_numslots_ptr(p); i++)
            {
                //read the record from the root page
                const indexnode_rec *nr = (const indexnode_rec*)readRecord(xid,p,i,0);
                int reclen = readRecordLength(xid, p, i);

                recordid cnext = stasis_record_alloc_begin(xid, lc,reclen);

                // slot numbers must line up between root and child
                assert(i == cnext.slot);
                assert(cnext.size != INVALID_SLOT);

                stasis_record_alloc_done(xid, lc, cnext);

                writeRecord(xid,lc,i,(byte*)(nr),reclen);
            }

            // deallocate old entries, and update pointer on parent node.
            // NOTE: stasis_record_free call goes to slottedFree in slotted.c
            // this function only reduces the numslots when you call it
            // with the last slot. so thats why i go backwards here.
            for(int i = *stasis_page_slotted_numslots_ptr(p)-1; i>FIRST_SLOT; i--)
            {
                // (nr is read here but not used by this loop)
                const indexnode_rec *nr = (const indexnode_rec*)readRecord(xid,p,i,0);
                int reclen = readRecordLength(xid, p, i);
                recordid tmp_rec= {p->id, i, reclen};
                stasis_record_free(xid, p, tmp_rec);
            }

            //TODO: could change with stasis_slotted_page_initialize(...);
            // reinsert first.

            recordid pFirstSlot = { p->id, FIRST_SLOT, readRecordLength(xid, p, FIRST_SLOT)};

            // only the entry in FIRST_SLOT is left on the root
            assert(*stasis_page_slotted_numslots_ptr(p) == FIRST_SLOT+1);

            indexnode_rec *nr
                = (indexnode_rec*)stasis_record_write_begin(xid, p, pFirstSlot);

            // don't overwrite key...
            nr->ptr = child;
            stasis_record_write_done(xid,p,pFirstSlot,(byte*)nr);
            stasis_page_lsn_write(xid, p, 0); // XXX need real LSN?

            if(!depth) {
                // the root used to be the only leaf: the new child is now
                // the right-most leaf and has no neighbours yet
                rmLeafID = lc->id;
                pageid_t tmpid = -1;
                writeRecord(xid,lc,PREV_LEAF,(byte*)(&tmpid), root_rec_size);
                writeRecord(xid,lc,NEXT_LEAF,(byte*)(&tmpid), root_rec_size);
            }

            unlock(lc->rwlatch);
            releasePage(lc);

            //update the depth info at the root
            depth ++;
            writeRecord(xid,p,DEPTH,(byte*)(&depth), root_rec_size);

            assert(tree.page == p->id);
            ret = appendInternalNode(xid, p, depth, key, keySize, val_page,
                                     rmLeafID == tree.page ? -1 : rmLeafID,
                                     allocator, allocator_state);

            assert(ret.size != INVALID_SLOT);

        }
        else {
            DEBUG("Appended new internal node tree depth = %lld key = %s\n",
                  depth, datatuple::key_to_str(key).c_str());
        }

        rmLeafID = ret.page;
        DEBUG("lastleaf is %lld\n", rmLeafID);


    }
    else
    {
        // write the new value to an existing page
        DEBUG("Writing %s\t%d to existing page# %lld\n", datatuple::key_to_str(key).c_str(),
              val_page, lastLeaf->id);

        stasis_record_alloc_done(xid, lastLeaf, ret);

        logtree::writeNodeRecord(xid, lastLeaf, ret, key, keySize, val_page);

        if(lastLeaf->id != p->id) {
            assert(rmLeafID != tree.page);
            unlock(lastLeaf->rwlatch);
            releasePage(lastLeaf);
        }
    }

    unlock(p->rwlatch);
    releasePage(p);

    return ret;
}

/* adding pages:

   1) Try to append value to lsmTreeState->lastLeaf

   2) If that fails, traverse down from the root of the tree, splitting pages
      while traversing back up.
+ + 3) Split is done by adding new page at end of row (no key + redistribution), except at the root, where root contents are + pushed into the first page of the next row, and a new path from root to + leaf is created starting with the root's immediate second child. + +*/ + +recordid logtree::appendInternalNode(int xid, Page *p, + int64_t depth, + const byte *key, size_t key_len, + pageid_t val_page, pageid_t lastLeaf, + logtree_page_allocator_t allocator, + void *allocator_state) +{ +// assert(*stasis_page_type_ptr(p) == LOGTREE_ROOT_PAGE || +// *stasis_page_type_ptr(p) == SLOTTED_PAGE); + assert(p->pageType == LOGTREE_ROOT_PAGE || + p->pageType == SLOTTED_PAGE); + + DEBUG("appendInternalNode\tdepth %lldkeylen%d\tnumslots %d\n", depth, key_len, *stasis_page_slotted_numslots_ptr(p)); + + if(!depth) + { + // leaf node. + recordid ret = stasis_record_alloc_begin(xid, p, sizeof(indexnode_rec)+key_len); + if(ret.size != INVALID_SLOT) { + stasis_record_alloc_done(xid, p, ret); + writeNodeRecord(xid,p,ret,key,key_len,val_page); + } + return ret; + } + else + { + // recurse + int slot = *stasis_page_slotted_numslots_ptr(p)-1;//*recordcount_ptr(p)-1; + + assert(slot >= FIRST_SLOT); // there should be no empty nodes + const indexnode_rec *nr = (const indexnode_rec*)readRecord(xid, p, slot, 0); + pageid_t child_id = nr->ptr; + nr = 0; + recordid ret; + { + Page *child_page = loadPage(xid, child_id); + writelock(child_page->rwlatch,0); + ret = appendInternalNode(xid, child_page, depth-1, key, key_len, + val_page, lastLeaf, allocator, allocator_state); + + unlock(child_page->rwlatch); + releasePage(child_page); + } + + if(ret.size == INVALID_SLOT) // subtree is full; split + { + ret = stasis_record_alloc_begin(xid, p, sizeof(indexnode_rec)+key_len); + DEBUG("keylen %d\tnumslots %d for page id %lld ret.size %lld prv rec len %d\n", + key_len, + *stasis_page_slotted_numslots_ptr(p), + p->id, + ret.size, + readRecordLength(xid, p, slot)); + if(ret.size != INVALID_SLOT) + { + 
stasis_record_alloc_done(xid, p, ret); + ret = buildPathToLeaf(xid, ret, p, depth, key, key_len, val_page, + lastLeaf, allocator, allocator_state); + + DEBUG("split tree rooted at %lld, wrote value to {%d %d %lld}\n", + p->id, ret.page, ret.slot, ret.size); + } else { + // ret is NULLRID; this is the root of a full tree. Return + // NULLRID to the caller. + } + } else { + // we inserted the value in to a subtree rooted here. + } + return ret; + } +} + +recordid logtree::buildPathToLeaf(int xid, recordid root, Page *root_p, + int64_t depth, const byte *key, size_t key_len, + pageid_t val_page, pageid_t lastLeaf, + logtree_page_allocator_t allocator, + void *allocator_state) +{ + + // root is the recordid on the root page that should point to the + // new subtree. + assert(depth); + DEBUG("buildPathToLeaf(depth=%lld) (lastleaf=%lld) called\n",depth, lastLeaf); + + pageid_t child = allocator(xid,allocator_state); + DEBUG("new child = %lld internal? %lld\n", child, depth-1); + + Page *child_p = loadPage(xid, child); + writelock(child_p->rwlatch,0); + initializeNodePage(xid, child_p); + + recordid ret; + + if(depth-1) { + // recurse: the page we just allocated is not a leaf. 
+ recordid child_rec = stasis_record_alloc_begin(xid, child_p, sizeof(indexnode_rec)+key_len); + assert(child_rec.size != INVALID_SLOT); + stasis_record_alloc_done(xid, child_p, child_rec); + + ret = buildPathToLeaf(xid, child_rec, child_p, depth-1, key, key_len, + val_page,lastLeaf, allocator, allocator_state); + + unlock(child_p->rwlatch); + releasePage(child_p); + + } else { + // set leaf + + // backward link.//these writes do not need alloc_begin as it is done in page initialization + writeRecord(xid, child_p, PREV_LEAF, (byte*)(&lastLeaf), root_rec_size); + //writeNodeRecord(xid,child_p,PREV_LEAF,dummy,key_len,lastLeaf); + + // forward link (initialize to -1) + + pageid_t tmp_pid = -1; + writeRecord(xid, child_p, NEXT_LEAF, (byte*)(&tmp_pid), root_rec_size); + //writeNodeRecord(xid,child_p,NEXT_LEAF,dummy,key_len,-1); + + recordid leaf_rec = stasis_record_alloc_begin(xid, child_p, + sizeof(indexnode_rec)+key_len); + + assert(leaf_rec.slot == FIRST_SLOT); + + stasis_record_alloc_done(xid, child_p, leaf_rec); + writeNodeRecord(xid,child_p,leaf_rec,key,key_len,val_page); + + ret = leaf_rec; + + unlock(child_p->rwlatch); + releasePage(child_p); + if(lastLeaf != -1) + { + // install forward link in previous page + Page *lastLeafP = loadPage(xid, lastLeaf); + writelock(lastLeafP->rwlatch,0); + writeRecord(xid,lastLeafP,NEXT_LEAF,(byte*)(&child),root_rec_size); + unlock(lastLeafP->rwlatch); + releasePage(lastLeafP); + } + + DEBUG("%lld <-> %lld\n", lastLeaf, child); + } + + writeNodeRecord(xid, root_p, root, key, key_len, child); + + return ret; + +} + + + +/** + * Traverse from the root of the page to the right most leaf (the one + * with the higest base key value). 
+ **/ +pageid_t logtree::findLastLeaf(int xid, Page *root, int64_t depth) +{ + if(!depth) + { + DEBUG("Found last leaf = %lld\n", root->id); + return root->id; + } + else + { + const indexnode_rec *nr = (indexnode_rec*) readRecord(xid, root, + (*stasis_page_slotted_numslots_ptr(root))-1, 0); + pageid_t ret; + + Page *p = loadPage(xid, nr->ptr); + readlock(p->rwlatch,0); + ret = findLastLeaf(xid,p,depth-1); + unlock(p->rwlatch); + releasePage(p); + + return ret; + } +} + + +/** + * Traverse from the root of the tree to the left most (lowest valued + * key) leaf. + */ +pageid_t logtree::findFirstLeaf(int xid, Page *root, int64_t depth) +{ + if(!depth) //if depth is 0, then returns the id of the page + return root->id; + else + { + const indexnode_rec *nr = (indexnode_rec*)readRecord(xid,root,FIRST_SLOT,0); + Page *p = loadPage(xid, nr->ptr); + readlock(p->rwlatch,0); + pageid_t ret = findFirstLeaf(xid,p,depth-1); + unlock(p->rwlatch); + releasePage(p); + return ret; + } +} + + +pageid_t logtree::findPage(int xid, recordid tree, const byte *key, size_t keySize) +{ + Page *p = loadPage(xid, tree.page); + readlock(p->rwlatch,0); + + const indexnode_rec *depth_nr = (const indexnode_rec*)readRecord(xid, p , DEPTH, 0); + + int64_t depth = *((int64_t*)depth_nr); + + recordid rid = lookup(xid, p, depth, key, keySize); + pageid_t ret = lookupLeafPageFromRid(xid,rid);//,keySize); + unlock(p->rwlatch); + releasePage(p); + + return ret; + +} + +pageid_t logtree::lookupLeafPageFromRid(int xid, recordid rid) +{ + pageid_t pid = -1; + if(rid.page != NULLRID.page || rid.slot != NULLRID.slot) + { + Page * p2 = loadPage(xid, rid.page); + readlock(p2->rwlatch,0); + pid = ((const indexnode_rec*)(readRecord(xid,p2,rid.slot,0)))->ptr; + unlock(p2->rwlatch); + releasePage(p2); + } + return pid; +} + + +recordid logtree::lookup(int xid, + Page *node, + int64_t depth, + const byte *key, size_t keySize ) +{ + //DEBUG("lookup: pid %lld\t depth %lld\n", node->id, depth); + 
if(*stasis_page_slotted_numslots_ptr(node) == FIRST_SLOT) + return NULLRID; + + assert(*stasis_page_slotted_numslots_ptr(node) > FIRST_SLOT); + + int match = FIRST_SLOT; + + // don't need to compare w/ first item in tree. + const indexnode_rec * rec = (indexnode_rec*)readRecord(xid,node,FIRST_SLOT,0); //TODO: why read it then? + + for(int i = FIRST_SLOT+1; i < *stasis_page_slotted_numslots_ptr(node); i++) + { + rec = (const indexnode_rec*)readRecord(xid,node,i,0); + int cmpval = datatuple::compare((datatuple::key_t) (rec+1),(datatuple::key_t) key); + if(cmpval>0) //changed it from > + break; + match = i; + } + + + if(depth) + { + pageid_t child_id = ((const indexnode_rec*)readRecord(xid,node,match,0))->ptr; + Page* child_page = loadPage(xid, child_id); + readlock(child_page->rwlatch,0); + recordid ret = lookup(xid,child_page,depth-1,key,0); + unlock(child_page->rwlatch); + releasePage(child_page); + return ret; + } + else + { + recordid ret = {node->id, match, keySize}; + return ret; + } +} + + +void logtree::print_tree(int xid) +{ + Page *p = loadPage(xid, root_rec.page); + readlock(p->rwlatch,0); + + const indexnode_rec *depth_nr = (const indexnode_rec*)readRecord(xid, p , DEPTH, 0); + + int64_t depth = *((int64_t*)depth_nr); + + print_tree(xid, root_rec.page, depth); + + unlock(p->rwlatch); + releasePage(p); + +} + +void logtree::print_tree(int xid, pageid_t pid, int64_t depth) +{ + + Page *node = loadPage(xid, pid); + readlock(node->rwlatch,0); + + //const indexnode_rec *depth_nr = (const indexnode_rec*)readRecord(xid, p , DEPTH, 0); + + printf("page_id:%lld\tnum_slots:%d\t\n", node->id, *stasis_page_slotted_numslots_ptr(node)); + + if(*stasis_page_slotted_numslots_ptr(node) == FIRST_SLOT) + return; + + assert(*stasis_page_slotted_numslots_ptr(node) > FIRST_SLOT); + + if(depth) + { + printf("\tnot_leaf\n"); + + for(int i = FIRST_SLOT; i < *stasis_page_slotted_numslots_ptr(node); i++) + { + const indexnode_rec *nr = (const 
indexnode_rec*)readRecord(xid,node,i,0); + printf("\tchild_page_id:%lld\tkey:%s\n", nr->ptr, + datatuple::key_to_str((byte*)(nr+1)).c_str()); + + } + + for(int i = FIRST_SLOT; i < *stasis_page_slotted_numslots_ptr(node); i++) + { + const indexnode_rec *nr = (const indexnode_rec*)readRecord(xid,node,i,0); + print_tree(xid, nr->ptr, depth-1); + + } + + } + else + { + printf("\tis_leaf\t\n"); + const indexnode_rec *nr = (const indexnode_rec*)readRecord(xid,node,FIRST_SLOT,0); + printf("\tdata_page_id:%lld\tkey:%s\n", nr->ptr, + datatuple::key_to_str((byte*)(nr+1)).c_str()); + printf("\t...\n"); + nr = (const indexnode_rec*)readRecord(xid,node,(*stasis_page_slotted_numslots_ptr(node))-1,0); + printf("\tdata_page_id:%lld\tkey:%s\n", nr->ptr, + datatuple::key_to_str((byte*)(nr+1)).c_str()); + + + } + + + unlock(node->rwlatch); + releasePage(node); + + +} + +///////////////////////////////////////////////////////////////// +// LOG TABLE IMPLEMENTATION +///////////////////////////////////////////////////////////////// + +template class DataPage; + + +logtable::logtable() +{ + + tree_c0 = NULL; + tree_c1 = NULL; + tree_c2 = NULL; +// rbtree_mut = NULL; + this->mergedata = 0; + fixed_page_count = -1; + //tmerger = new tuplemerger(&append_merger); + tmerger = new tuplemerger(&replace_merger); + + tsize = 0; + tree_bytes = 0; + + +} + +logtable::~logtable() +{ + if(tree_c1 != NULL) + delete tree_c1; + if(tree_c2 != NULL) + delete tree_c2; + + if(tree_c0 != NULL) + { + for(rbtree_t::iterator delitr=tree_c0->begin(); + delitr != tree_c0->end(); delitr++) + free((*delitr).keylen); + + delete tree_c0; + } + + delete tmerger; + + /* + if(rbtree_mut) + delete rbtree_mut; + if(tree_c0) + delete tree_c0; + if(input_needed) + delete input_needed; + */ +} + +recordid logtable::allocTable(int xid) +{ + + table_rec = Talloc(xid, sizeof(tbl_header)); + + //create the big tree + tree_c2 = new logtree(); + tree_c2->create(xid); + + tbl_header.c2_dp_state = Talloc(xid, 
sizeof(RegionAllocConf_t)); + Tset(xid, tbl_header.c2_dp_state, &DATAPAGE_REGION_ALLOC_STATIC_INITIALIZER); + + + //create the small tree + tree_c1 = new logtree(); + tree_c1->create(xid); + tbl_header.c1_dp_state = Talloc(xid, sizeof(RegionAllocConf_t)); + Tset(xid, tbl_header.c1_dp_state, &DATAPAGE_REGION_ALLOC_STATIC_INITIALIZER); + + tbl_header.c2_root = tree_c2->get_root_rec(); + tbl_header.c2_state = tree_c2->get_tree_state(); + tbl_header.c1_root = tree_c1->get_root_rec(); + tbl_header.c1_state = tree_c1->get_tree_state(); + + Tset(xid, table_rec, &tbl_header); + + return table_rec; +} + +void logtable::flushTable() +{ + struct timeval start_tv, stop_tv; + double start, stop; + + static double last_start; + static bool first = 1; + static int merge_count = 0; + + gettimeofday(&start_tv,0); + start = tv_to_double(start_tv); + + + writelock(mergedata->header_lock,0); + pthread_mutex_lock(mergedata->rbtree_mut); + + int expmcount = merge_count; + + + //this is for waiting the previous merger of the mem-tree + //hopefullly this wont happen + printf("prv merge not complete\n"); + + + while(*mergedata->old_c0) { + unlock(mergedata->header_lock); +// pthread_mutex_lock(mergedata->rbtree_mut); + if(tree_bytes >= MAX_C0_SIZE) + pthread_cond_wait(mergedata->input_needed_cond, mergedata->rbtree_mut); + else + { + pthread_mutex_unlock(mergedata->rbtree_mut); + return; + } + + + pthread_mutex_unlock(mergedata->rbtree_mut); + + writelock(mergedata->header_lock,0); + pthread_mutex_lock(mergedata->rbtree_mut); + + if(expmcount != merge_count) + { + unlock(mergedata->header_lock); + pthread_mutex_unlock(mergedata->rbtree_mut); + return; + } + + } + + printf("prv merge complete\n"); + + gettimeofday(&stop_tv,0); + stop = tv_to_double(stop_tv); + + //rbtree_ptr *tmp_ptr = new rbtree_ptr_t; //(typeof(h->scratch_tree)*) malloc(sizeof(void*)); + //*tmp_ptr = tree_c0; + *(mergedata->old_c0) = tree_c0; + +// pthread_mutex_lock(mergedata->rbtree_mut); + 
pthread_cond_signal(mergedata->input_ready_cond); +// pthread_mutex_unlock(mergedata->rbtree_mut); + + merge_count ++; + tree_c0 = new rbtree_t; + tsize = 0; + tree_bytes = 0; + + pthread_mutex_unlock(mergedata->rbtree_mut); + unlock(mergedata->header_lock); + if(first) + { + printf("flush waited %f sec\n", stop-start); + first = 0; + } + else + { + printf("flush waited %f sec (worked %f)\n", + stop-start, start-last_start); + } + last_start = stop; + +} + +datatuple * logtable::findTuple(int xid, datatuple::key_t key, size_t keySize) +{ + //prepare a search tuple + datatuple search_tuple; + search_tuple.keylen = (uint32_t*)malloc(sizeof(uint32_t)); + *(search_tuple.keylen) = keySize; + search_tuple.key = key; + + readlock(mergedata->header_lock,0); + pthread_mutex_lock(mergedata->rbtree_mut); + + datatuple *ret_tuple=0; + + //step 1: look in tree_c0 + rbtree_t::iterator rbitr = tree_c0->find(search_tuple); + if(rbitr != tree_c0->end()) + { + DEBUG("tree_c0 size %d\n", tree_c0->size()); + datatuple tuple = *rbitr; + byte *barr = (byte*)malloc(tuple.byte_length()); + memcpy(barr, (byte*)tuple.keylen, tuple.byte_length()); + ret_tuple = datatuple::from_bytes(barr); + } + + bool done = false; + //step: 2 look into first in tree if exists (a first level merge going on) + if(*(mergedata->old_c0) != 0) + { + DEBUG("old mem tree not null %d\n", (*(mergedata->old_c0))->size()); + rbitr = (*(mergedata->old_c0))->find(search_tuple); + if(rbitr != (*(mergedata->old_c0))->end()) + { + datatuple tuple = *rbitr; + + if(tuple.isDelete()) //tuple deleted + done = true; //return ret_tuple + else if(ret_tuple != 0) //merge the two + { + datatuple *mtuple = tmerger->merge(&tuple, ret_tuple); //merge the two + free(ret_tuple->keylen); //free tuple from current tree + free(ret_tuple); + ret_tuple = mtuple; //set return tuple to merge result + } + else //key first found in old mem tree + { + byte *barr = (byte*)malloc(tuple.byte_length()); + memcpy(barr, (byte*)tuple.keylen, 
tuple.byte_length()); + ret_tuple = datatuple::from_bytes(barr); + } + //we cannot free tuple from old-tree 'cos it is not a copy + } + } + + //release the memtree lock + pthread_mutex_unlock(mergedata->rbtree_mut); + + //step 3: check c1 + if(!done) + { + datatuple *tuple_c1 = findTuple(xid, key, keySize, tree_c1); + if(tuple_c1 != NULL) + { + bool use_copy = false; + if(tuple_c1->isDelete()) //tuple deleted + done = true; + else if(ret_tuple != 0) //merge the two + { + datatuple *mtuple = tmerger->merge(tuple_c1, ret_tuple); //merge the two + free(ret_tuple->keylen); //free tuple from before + free(ret_tuple); + ret_tuple = mtuple; //set return tuple to merge result + } + else //found for the first time + { + use_copy = true; + ret_tuple = tuple_c1; + //byte *barr = (byte*)malloc(tuple_c1->byte_length()); + //memcpy(barr, (byte*)tuple_c1->keylen, tuple_c1->byte_length()); + //ret_tuple = datatuple::from_bytes(barr); + } + + if(!use_copy) + { + free(tuple_c1->keylen); //free tuple from tree c1 + free(tuple_c1); + } + } + } + + //step 4: check old c1 if exists + if(!done && *(mergedata->diskmerge_args->in_tree) != 0) + { + DEBUG("old c1 tree not null\n"); + datatuple *tuple_oc1 = findTuple(xid, key, keySize, + (logtree*)( *(mergedata->diskmerge_args->in_tree))); + + if(tuple_oc1 != NULL) + { + bool use_copy = false; + if(tuple_oc1->isDelete()) + done = true; + else if(ret_tuple != 0) //merge the two + { + datatuple *mtuple = tmerger->merge(tuple_oc1, ret_tuple); //merge the two + free(ret_tuple->keylen); //free tuple from before + free(ret_tuple); + ret_tuple = mtuple; //set return tuple to merge result + } + else //found for the first time + { + use_copy = true; + ret_tuple = tuple_oc1; + //byte *barr = (byte*)malloc(tuple_oc1->byte_length()); + //memcpy(barr, (byte*)tuple_oc1->keylen, tuple_oc1->byte_length()); + //ret_tuple = datatuple::from_bytes(barr); + } + + if(!use_copy) + { + free(tuple_oc1->keylen); //free tuple from tree old c1 + free(tuple_oc1); + } + } 
+ } + + //step 5: check c2 + if(!done) + { + DEBUG("Not in old first disk tree\n"); + datatuple *tuple_c2 = findTuple(xid, key, keySize, tree_c2); + + if(tuple_c2 != NULL) + { + bool use_copy = false; + if(tuple_c2->isDelete()) + done = true; + else if(ret_tuple != 0) + { + datatuple *mtuple = tmerger->merge(tuple_c2, ret_tuple); //merge the two + free(ret_tuple->keylen); //free tuple from before + free(ret_tuple); + ret_tuple = mtuple; //set return tuple to merge result + } + else //found for the first time + { + use_copy = true; + ret_tuple = tuple_c2; + //byte *barr = (byte*)malloc(tuple_c2->byte_length()); + //memcpy(barr, (byte*)tuple_c2->keylen, tuple_c2->byte_length()); + //ret_tuple = datatuple::from_bytes(barr); + } + + if(!use_copy) + { + free(tuple_c2->keylen); //free tuple from tree c2 + free(tuple_c2); + } + } + } + + //pthread_mutex_unlock(mergedata->rbtree_mut); + unlock(mergedata->header_lock); + free(search_tuple.keylen); + + return ret_tuple; + +} + +/* + * returns the first record found with the matching key + * (not to be used together with diffs) + **/ +datatuple * logtable::findTuple_first(int xid, datatuple::key_t key, size_t keySize) +{ + //prepare a search tuple + datatuple search_tuple; + search_tuple.keylen = (uint32_t*)malloc(sizeof(uint32_t)); + *(search_tuple.keylen) = keySize; + search_tuple.key = key; + + pthread_mutex_lock(mergedata->rbtree_mut); + + datatuple *ret_tuple=0; + //step 1: look in tree_c0 + + rbtree_t::iterator rbitr = tree_c0->find(search_tuple); + if(rbitr != tree_c0->end()) + { + DEBUG("tree_c0 size %d\n", tree_c0->size()); + datatuple tuple = *rbitr; + byte *barr = (byte*)malloc(tuple.byte_length()); + memcpy(barr, (byte*)tuple.keylen, tuple.byte_length()); + ret_tuple = datatuple::from_bytes(barr); + + } + else + { + DEBUG("Not in mem tree %d\n", tree_c0->size()); + //step: 2 look into first in tree if exists (a first level merge going on) + if(*(mergedata->old_c0) != 0) + { + DEBUG("old mem tree not null %d\n", 
(*(mergedata->old_c0))->size()); + rbitr = (*(mergedata->old_c0))->find(search_tuple); + if(rbitr != (*(mergedata->old_c0))->end()) + { + datatuple tuple = *rbitr; + byte *barr = (byte*)malloc(tuple.byte_length()); + memcpy(barr, (byte*)tuple.keylen, tuple.byte_length()); + ret_tuple = datatuple::from_bytes(barr); + } + } + + if(ret_tuple == 0) + { + DEBUG("Not in old mem tree\n"); + + //step 3: check c1 + ret_tuple = findTuple(xid, key, keySize, tree_c1); + } + + if(ret_tuple == 0) + { + DEBUG("Not in first disk tree\n"); + + //step 4: check old c1 if exists + if( *(mergedata->diskmerge_args->in_tree) != 0) + { + DEBUG("old c1 tree not null\n"); + ret_tuple = findTuple(xid, key, keySize, + (logtree*)( *(mergedata->diskmerge_args->in_tree))); + } + + } + + if(ret_tuple == 0) + { + DEBUG("Not in old first disk tree\n"); + + //step 5: check c2 + ret_tuple = findTuple(xid, key, keySize, tree_c2); + } + } + + + + + pthread_mutex_unlock(mergedata->rbtree_mut); + free(search_tuple.keylen); + + return ret_tuple; + +} + +void logtable::insertTuple(struct datatuple &tuple) +{ + //static int count = LATCH_INTERVAL; + //static int tsize = 0; //number of tuples + //static int64_t tree_bytes = 0; //number of bytes + static const size_t isize = sizeof(uint32_t); + + //lock the red-black tree + readlock(mergedata->header_lock,0); + pthread_mutex_lock(mergedata->rbtree_mut); + //find the previous tuple with same key in the memtree if exists + rbtree_t::iterator rbitr = tree_c0->find(tuple); + if(rbitr != tree_c0->end()) + { + datatuple pre_t = *rbitr; + //do the merging + datatuple *new_t = tmerger->merge(&pre_t, &tuple); + tree_c0->erase(pre_t); //remove the previous tuple + + tree_c0->insert( *new_t); //insert the new tuple + + //update the tree size (+ new_t size - pre_t size) + tree_bytes += (new_t->byte_length() - pre_t.byte_length()); + + free(pre_t.keylen); //free the previous tuple + free(new_t); // frees the malloc(sizeof(datatuple)) coming from merge + } + else //no 
tuple with same key exists in mem-tree + { + + //create a copy + datatuple t; + byte *arr = (byte*) malloc(tuple.byte_length()); + + t.keylen = (uint32_t*) arr; + *t.keylen = *tuple.keylen; + t.datalen = (uint32_t*) (arr+isize); + *t.datalen = *tuple.datalen; + t.key = (datatuple::key_t) (arr+isize+isize); + memcpy((byte*)t.key, (byte*)tuple.key, *t.keylen); + if(!tuple.isDelete()) + { + t.data = (datatuple::data_t) (arr+isize+isize+ *(t.keylen)); + memcpy((byte*)t.data, (byte*)tuple.data, *t.datalen); + } + else + t.data = 0; + + //insert tuple into the rbtree + tree_c0->insert(t); + tsize++; + tree_bytes += t.byte_length() + RB_TREE_OVERHEAD; + + } + + //flushing logic + /* + bool go = false; + if(tree_bytes >= MAX_C0_SIZE) + { + go = *mergedata->input_needed; + DEBUG("go %d\n", go); + } + */ + + if(tree_bytes >= MAX_C0_SIZE ) + { + DEBUG("tree size before merge %d tuples %lld bytes.\n", tsize, tree_bytes); + pthread_mutex_unlock(mergedata->rbtree_mut); + unlock(mergedata->header_lock); + flushTable(); + + readlock(mergedata->header_lock,0); + pthread_mutex_lock(mergedata->rbtree_mut); + + //tsize = 0; + //tree_bytes = 0; + + } + + //unlock + pthread_mutex_unlock(mergedata->rbtree_mut); + unlock(mergedata->header_lock); + + + DEBUG("tree size %d tuples %lld bytes.\n", tsize, tree_bytes); +} + + +DataPage* logtable::insertTuple(int xid, struct datatuple &tuple, recordid &dpstate, logtree *ltree) +{ + + //create a new data page + + DataPage * dp = 0; + + while(dp==0) + { + dp = new DataPage(xid, fixed_page_count, + &DataPage::dp_alloc_region_rid, + &dpstate ); + + //insert the record into the data page + if(!dp->append(xid, tuple)) + { + delete dp; + dp = 0; + } + } + + + RegionAllocConf_t alloc_conf; + //insert the record key and id of the first page of the datapage to the logtree + Tread(xid,ltree->get_tree_state(), &alloc_conf); + logtree::appendPage(xid, ltree->get_root_rec(), ltree->lastLeaf, + tuple.get_key(), + *tuple.keylen, + ltree->alloc_region, + 
&alloc_conf, + dp->get_start_pid() + ); + Tset(xid,ltree->get_tree_state(),&alloc_conf); + + + //return the datapage + return dp; +} + +datatuple * logtable::findTuple(int xid, datatuple::key_t key, size_t keySize, logtree *ltree) +{ + datatuple * tup=0; + + //find the datapage + pageid_t pid = ltree->findPage(xid, ltree->get_root_rec(), (byte*)key, keySize); + + if(pid!=-1) + { + DataPage * dp = new DataPage(xid, pid); + dp->recordRead(xid, key, keySize, &tup); + delete dp; + } + return tup; +} + + +///////////////////////////////////////////////// +//logtreeIterator implementation +///////////////////////////////////////////////// + +lladdIterator_t* logtreeIterator::open(int xid, recordid root) +{ + if(root.page == 0 && root.slot == 0 && root.size == -1) + return 0; + + Page *p = loadPage(xid,root.page); + readlock(p->rwlatch,0); + + //size_t keySize = getKeySize(xid,p); + DEBUG("ROOT_REC_SIZE %d\n", logtree::root_rec_size); + const byte * nr = logtree::readRecord(xid,p, + logtree::DEPTH, + logtree::root_rec_size); + int64_t depth = *((int64_t*)nr); + DEBUG("DEPTH = %lld\n", depth); + + pageid_t leafid = logtree::findFirstLeaf(xid, p, depth); + if(leafid != root.page) + { + unlock(p->rwlatch); + releasePage(p); + p = loadPage(xid,leafid); + readlock(p->rwlatch,0); + assert(depth != 0); + } + else + assert(depth == 0); + + + logtreeIterator_s *impl = (logtreeIterator_s*)malloc(sizeof(logtreeIterator_s)); + impl->p = p; + { + recordid rid = { p->id, 1, 0};//keySize }; //TODO: why does this start from 1? 
+ impl->current = rid; + } + //DEBUG("keysize = %d, slot = %d\n", keySize, impl->current.slot); + impl->t = 0; + impl->justOnePage = (depth == 0); + + lladdIterator_t *it = (lladdIterator_t*) malloc(sizeof(lladdIterator_t)); + it->type = -1; // XXX LSM_TREE_ITERATOR; + it->impl = impl; + return it; +} + +lladdIterator_t* logtreeIterator::openAt(int xid, recordid root, const byte* key) +{ + if(root.page == NULLRID.page && root.slot == NULLRID.slot) + return 0; + + Page *p = loadPage(xid,root.page); + readlock(p->rwlatch,0); + //size_t keySize = getKeySize(xid,p); + //assert(keySize); + const byte *nr = logtree::readRecord(xid,p,logtree::DEPTH, logtree::root_rec_size); + //const byte *cmp_nr = logtree::readRecord(xid, p , logtree::COMPARATOR, logtree::root_rec_size); + + int64_t depth = *((int64_t*)nr); + + recordid lsm_entry_rid = logtree::lookup(xid,p,depth,key,0);//keySize,comparators[cmp_nr->ptr]); + + if(lsm_entry_rid.page == NULLRID.page && lsm_entry_rid.slot == NULLRID.slot) { + unlock(p->rwlatch); + return 0; + } + assert(lsm_entry_rid.size != INVALID_SLOT); + + if(root.page != lsm_entry_rid.page) + { + unlock(p->rwlatch); + releasePage(p); + p = loadPage(xid,lsm_entry_rid.page); + readlock(p->rwlatch,0); + } + + logtreeIterator_s *impl = (logtreeIterator_s*) malloc(sizeof(logtreeIterator_s)); + impl->p = p; + + impl->current.page = lsm_entry_rid.page; + impl->current.slot = lsm_entry_rid.slot - 1; // slot before thing of interest + impl->current.size = lsm_entry_rid.size; + + impl->t = 0; // must be zero so free() doesn't croak. 
+ impl->justOnePage = (depth==0); + + lladdIterator_t *it = (lladdIterator_t*) malloc(sizeof(lladdIterator_t)); + it->type = -1; // XXX LSM_TREE_ITERATOR + it->impl = impl; + return it; +} + +/** + * move to the next page + **/ +int logtreeIterator::next(int xid, lladdIterator_t *it) +{ + logtreeIterator_s *impl = (logtreeIterator_s*) it->impl; + + impl->current = stasis_record_next(xid, impl->p, impl->current); + + if(impl->current.size == INVALID_SLOT) + { + + const indexnode_rec next_rec = *(const indexnode_rec*)logtree::readRecord(xid,impl->p, + logtree::NEXT_LEAF, + 0); + unlock(impl->p->rwlatch); + releasePage(impl->p); + + DEBUG("done with page %lld next = %lld\n", impl->p->id, next_rec.ptr); + + + if(next_rec.ptr != -1 && ! impl->justOnePage) + { + impl->p = loadPage(xid, next_rec.ptr); + readlock(impl->p->rwlatch,0); + impl->current.page = next_rec.ptr; + impl->current.slot = 2; + impl->current.size = stasis_record_length_read(xid, impl->p, impl->current); //keySize; + } else { + impl->p = 0; + impl->current.size = INVALID_SLOT; + } + + } + else + { + /* + assert(impl->current.size == keySize + sizeof(lsmTreeNodeRecord)); + impl->current.size = keySize; + */ + } + + + if(impl->current.size != INVALID_SLOT) + { + //size_t sz = sizeof(*impl->t) + impl->current.size; + if(impl->t != NULL) + free(impl->t); + + impl->t = (indexnode_rec*)malloc(impl->current.size); + memcpy(impl->t, logtree::readRecord(xid,impl->p,impl->current), impl->current.size); + + return 1; + } + else + { + if(impl->t != NULL) + free(impl->t); + impl->t = 0; + return 0; + } + +} + +/* +lladdIterator_t *logtreeIterator::copy(int xid, lladdIterator_t* i) +{ + logtreeIterator_s *it = (logtreeIterator_s*) i->impl; + logtreeIterator_s *mine = (logtreeIterator_s*) malloc(sizeof(logtreeIterator_s)); + + if(it->p) + { + mine->p = loadPage(xid, it->p->id); + readlock(mine->p->rwlatch,0); + } + else + mine->p = 0; + + memcpy(&mine->current, &it->current,sizeof(recordid)); + + if(it->t) + { + 
mine->t = (datatuple*)malloc(sizeof(*it->t)); //TODO: DATA IS NOT COPIED, MIGHT BE WRONG + //mine->t = malloc(sizeof(*it->t) + it->current.size); + memcpy(mine->t, it->t, sizeof(*it->t));// + it->current.size); + } + else + mine->t = 0; + + mine->justOnePage = it->justOnePage; + lladdIterator_t * ret = (lladdIterator_t*)malloc(sizeof(lladdIterator_t)); + ret->type = -1; // XXX LSM_TREE_ITERATOR + ret->impl = mine; + return ret; +} +*/ + +void logtreeIterator::close(int xid, lladdIterator_t *it) +{ + logtreeIterator_s *impl = (logtreeIterator_s*)it->impl; + if(impl->p) + { + unlock(impl->p->rwlatch); + releasePage(impl->p); + } + if(impl->t) + { + free(impl->t); + } + free(impl); + free(it); +} + + +///////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////// + + + + +double tv_to_double(struct timeval tv) +{ + return static_cast(tv.tv_sec) + + (static_cast(tv.tv_usec) / 1000000.0); +} + + +/////////////////////////////////////////////////////////////////// + diff --git a/logstore.h b/logstore.h new file mode 100644 index 0000000..5230a67 --- /dev/null +++ b/logstore.h @@ -0,0 +1,302 @@ +#ifndef _LOGSTORE_H_ +#define _LOGSTORE_H_ + +#undef end +#undef begin + +#include +#include +#include +#include +#include +#include + +#include "logserver.h" + +#include +#include +#include + +#include + + + +#include + +#include +#include +#include +#include +#include +#include + + +#include "datapage.h" +#include "tuplemerger.h" +#include "datatuple.h" + + +double tv_to_double(struct timeval tv); + + +struct logtable_mergedata; + + + +typedef struct RegionAllocConf_t +{ + recordid regionList; + pageid_t regionCount; + pageid_t nextPage; + pageid_t endOfRegion; + pageid_t regionSize; +} RegionAllocConf_t; + + +//struct logtree_state { +// pageid_t lastLeaf; +//}; + + +struct indexnode_rec { + pageid_t ptr; +}; + +typedef pageid_t(*logtree_page_allocator_t)(int, void *); +typedef 
void(*logtree_page_deallocator_t)(int, void *); + + +class logtree{ +public: + logtree(); + + recordid create(int xid); + + void print_tree(int xid); + + static pageid_t alloc_region(int xid, void *conf); + static pageid_t alloc_region_rid(int xid, void * ridp); + static void force_region_rid(int xid, void *conf); + static void dealloc_region_rid(int xid, void *conf); + static void free_region_rid(int xid, recordid tree, + logtree_page_deallocator_t dealloc, + void *allocator_state); + + static void writeNodeRecord(int xid, Page *p, recordid &rid, + const byte *key, size_t keylen, pageid_t ptr); + + static void writeRecord(int xid, Page *p, recordid &rid, + const byte *data, size_t datalen); + + static void writeRecord(int xid, Page *p, slotid_t slot, + const byte *data, size_t datalen); + + static const byte* readRecord(int xid, Page * p, recordid &rid); + static const byte* readRecord(int xid, Page * p, slotid_t slot, int64_t size); + + static int32_t readRecordLength(int xid, Page *p, slotid_t slot); + + //return the left-most leaf, these are not data pages, although referred to as leaf + static pageid_t findFirstLeaf(int xid, Page *root, int64_t depth); + //return the right-most leaf + static pageid_t findLastLeaf(int xid, Page *root, int64_t depth) ; + + //reads the given record and returns the page id stored in it + static pageid_t lookupLeafPageFromRid(int xid, recordid rid); + + //returns a record that stores the pageid where the given key should be in, i.e. 
if it exists + static recordid lookup(int xid, Page *node, int64_t depth, const byte *key, + size_t keySize); + + //returns the id of the data page that could contain the given key + static pageid_t findPage(int xid, recordid tree, const byte *key, size_t keySize); + + + //appends a leaf page, val_page is the id of the leaf page + //rmLeafID --> rightmost leaf id + static recordid appendPage(int xid, recordid tree, pageid_t & rmLeafID, + const byte *key,size_t keySize, + logtree_page_allocator_t allocator, void *allocator_state, + long val_page); + + static recordid appendInternalNode(int xid, Page *p, + int64_t depth, + const byte *key, size_t key_len, + pageid_t val_page, pageid_t lastLeaf, + logtree_page_allocator_t allocator, + void *allocator_state); + + static recordid buildPathToLeaf(int xid, recordid root, Page *root_p, + int64_t depth, const byte *key, size_t key_len, + pageid_t val_page, pageid_t lastLeaf, + logtree_page_allocator_t allocator, + void *allocator_state); + + + + /** + Initialize a page for use as an internal node of the tree. 
+ */ + inline static void initializeNodePage(int xid, Page *p); + + recordid &get_tree_state(){return tree_state;} + recordid &get_root_rec(){return root_rec;} + +public: + + const static RegionAllocConf_t REGION_ALLOC_STATIC_INITIALIZER; + const static int64_t DEPTH; + const static int64_t COMPARATOR; + const static int64_t FIRST_SLOT; + const static size_t root_rec_size; + const static int64_t PREV_LEAF; + const static int64_t NEXT_LEAF; + + pageid_t lastLeaf; +private: + + void print_tree(int xid, pageid_t pid, int64_t depth); + +private: + recordid tree_state; + recordid root_rec; + + + + +}; + + +class logtable +{ +public: + logtable(); + ~logtable(); + + //user access functions + datatuple * findTuple(int xid, datatuple::key_t key, size_t keySize); + + datatuple * findTuple_first(int xid, datatuple::key_t key, size_t keySize); + + void insertTuple(struct datatuple &tuple); + + + //other class functions + recordid allocTable(int xid); + + void flushTable(); + + DataPage* insertTuple(int xid, struct datatuple &tuple, recordid &dpstate,logtree *ltree); + + datatuple * findTuple(int xid, datatuple::key_t key, size_t keySize, logtree *ltree); + + inline recordid & get_table_rec(){return table_rec;} + + inline logtree * get_tree_c2(){return tree_c2;} + inline logtree * get_tree_c1(){return tree_c1;} + + inline void set_tree_c1(logtree *t){tree_c1=t;} + inline void set_tree_c2(logtree *t){tree_c2=t;} + + typedef std::set rbtree_t; + typedef rbtree_t* rbtree_ptr_t; + inline rbtree_ptr_t get_tree_c0(){return tree_c0;} + + void set_tree_c0(rbtree_ptr_t newtree){tree_c0 = newtree;} + + inline recordid & get_dpstate1(){return tbl_header.c1_dp_state;} + inline recordid & get_dpstate2(){return tbl_header.c2_dp_state;} + + int get_fixed_page_count(){return fixed_page_count;} + void set_fixed_page_count(int count){fixed_page_count = count;} + + void setMergeData(logtable_mergedata * mdata) { this->mergedata = mdata;} + logtable_mergedata* getMergeData(){return mergedata;} + 
+ inline tuplemerger * gettuplemerger(){return tmerger;} + +public: + + struct table_header { + recordid c2_root; //tree root record --> points to the root of the b-tree + recordid c2_state; //tree state --> describes the regions used by the index tree + recordid c2_dp_state; //data pages state --> regions used by the data pages + recordid c1_root; + recordid c1_state; + recordid c1_dp_state; + //epoch_t beginning; + //epoch_t end; + + }; + + const static RegionAllocConf_t DATAPAGE_REGION_ALLOC_STATIC_INITIALIZER; + + logtable_mergedata * mergedata; + +private: + + + +private: + recordid table_rec; + struct table_header tbl_header; + + logtree *tree_c2; //big tree + logtree *tree_c1; //small tree + rbtree_ptr_t tree_c0; // in-mem red black tree + + + int tsize; //number of tuples + int64_t tree_bytes; //number of bytes + + + //DATA PAGE SETTINGS + int fixed_page_count;//number of pages in a datapage + +// logtable_mergedata * mergedata; + + tuplemerger *tmerger; +}; + + +typedef struct logtreeIterator_s { + Page * p; + recordid current; + indexnode_rec *t; + int justOnePage; +} logtreeIterator_s; + + +class logtreeIterator +{ + +public: + static lladdIterator_t* open(int xid, recordid root); + static lladdIterator_t* openAt(int xid, recordid root, const byte* key); + static int next(int xid, lladdIterator_t *it); + //static lladdIterator_t *copy(int xid, lladdIterator_t* i); + static void close(int xid, lladdIterator_t *it); + + + static inline int key (int xid, lladdIterator_t *it, byte **key) + { + logtreeIterator_s * impl = (logtreeIterator_s*)it->impl; + *key = (byte*)(impl->t+1); + return (int) impl->current.size - sizeof(indexnode_rec); + } + + + static inline int value(int xid, lladdIterator_t *it, byte **value) + { + logtreeIterator_s * impl = (logtreeIterator_s*)it->impl; + *value = (byte*)&(impl->t->ptr); + return sizeof(impl->t->ptr); + } + + static inline void tupleDone(int xid, void *it) { } + static inline void releaseLock(int xid, void *it) { } + +}; 
+ + +#endif diff --git a/merger.cpp b/merger.cpp new file mode 100644 index 0000000..bcdced0 --- /dev/null +++ b/merger.cpp @@ -0,0 +1,836 @@ + +#include +#include "merger.h" +#include "logiterators.cpp" +#include "datapage.cpp" +//pageid_t merge_scheduler::C0_MEM_SIZE = 1000 * 1000 * 1000; + +//template <> struct merger_args; +//template <> struct merger_args; +inline DataPage* +insertTuple(int xid, DataPage *dp, datatuple &t, + logtable *ltable, + logtree * ltree, + recordid & dpstate, + int64_t &dpages, int64_t &npages); + +int merge_scheduler::addlogtable(logtable *ltable) +{ + + struct logtable_mergedata * mdata = new logtable_mergedata; + + // initialize merge data + mdata->header_lock = initlock(); + mdata->rbtree_mut = new pthread_mutex_t; + pthread_mutex_init(mdata->rbtree_mut,0); + mdata->old_c0 = new rbtree_ptr_t; + *mdata->old_c0 = 0; + + mdata->input_needed = new bool(false); + + mdata->input_ready_cond = new pthread_cond_t; + pthread_cond_init(mdata->input_ready_cond,0); + + mdata->input_needed_cond = new pthread_cond_t; + pthread_cond_init(mdata->input_needed_cond,0); + + mdata->input_size = new int64_t(100); + + mdata->diskmerge_args = new merger_args; + mdata->memmerge_args = new merger_args; + + mergedata.push_back(std::make_pair(ltable, mdata)); + return mergedata.size()-1; + +} + +merge_scheduler::~merge_scheduler() +{ + for(int i=0; iheader_lock); + delete mdata->rbtree_mut; + delete mdata->old_c0; + delete mdata->input_needed; + delete mdata->input_ready_cond; + delete mdata->input_needed_cond; + delete mdata->input_size; + + //delete the merge thread structure variables + delete (recordid*) mdata->memmerge_args->pageAllocState; + delete (recordid*) mdata->memmerge_args->oldAllocState; + delete mdata->memmerge_args->still_open; + + delete (recordid*) mdata->diskmerge_args->pageAllocState; + delete (recordid*) mdata->diskmerge_args->oldAllocState; + + pthread_cond_destroy(mdata->diskmerge_args->in_block_needed_cond); + delete 
mdata->diskmerge_args->in_block_needed_cond; + delete mdata->diskmerge_args->in_block_needed; + + pthread_cond_destroy(mdata->diskmerge_args->out_block_needed_cond); + delete mdata->diskmerge_args->out_block_needed_cond; + delete mdata->diskmerge_args->out_block_needed; + + pthread_cond_destroy(mdata->diskmerge_args->in_block_ready_cond); + delete mdata->diskmerge_args->in_block_ready_cond; + pthread_cond_destroy(mdata->diskmerge_args->out_block_ready_cond); + delete mdata->diskmerge_args->out_block_ready_cond; + + delete mdata->diskmerge_args->my_tree_size; + + delete mdata->diskmerge_args; + delete mdata->memmerge_args; + + + } + mergedata.clear(); + +} + +void merge_scheduler::shutdown() +{ + //signal shutdown + for(int i=0; iflushTable(); + + pthread_mutex_lock(mdata->rbtree_mut); + *(mdata->memmerge_args->still_open)=false; + pthread_cond_signal(mdata->input_ready_cond); + + //*(mdata->diskmerge_args->still_open)=false;//same pointer so no need + + pthread_mutex_unlock(mdata->rbtree_mut); + + } + + for(int i=0; imemmerge_thread,0); + pthread_join(mdata->diskmerge_thread,0); + } + + +} + +void merge_scheduler::startlogtable(int index) +{ + logtable * ltable = mergedata[index].first; + struct logtable_mergedata *mdata = mergedata[index].second; + + pthread_cond_t * block1_needed_cond = new pthread_cond_t; + pthread_cond_init(block1_needed_cond,0); + pthread_cond_t * block2_needed_cond = new pthread_cond_t; + pthread_cond_init(block2_needed_cond,0); + + pthread_cond_t * block1_ready_cond = new pthread_cond_t; + pthread_cond_init(block1_ready_cond,0); + pthread_cond_t * block2_ready_cond = new pthread_cond_t; + pthread_cond_init(block2_ready_cond,0); + + bool *block1_needed = new bool(false); + bool *block2_needed = new bool(false); + bool *system_running = new bool(true); + + //wait to merge the next block until we have merged block FUDGE times. 
+ static const int FUDGE = 1; + static double R = MIN_R; + int64_t * block1_size = new int64_t; + *block1_size = FUDGE * ((int)R) * (*(mdata->input_size)); + + //initialize rb-tree + ltable->set_tree_c0(new rbtree_t); + + //disk merger args + recordid * ridp = new recordid; + *ridp = ltable->get_tree_c2()->get_tree_state(); //h.bigTreeAllocState; + recordid * oldridp = new recordid; + *oldridp = NULLRID; + + logtree ** block1_scratch = new logtree*; + *block1_scratch=0; + + //recordid * allocer_scratch = new recordid; + RegionAllocConf_t *allocer_scratch = new RegionAllocConf_t; + + + struct merger_args diskmerge_args= { + ltable, + 1, //worker id + logtree::alloc_region_rid, //pageAlloc + ridp, // pageAllocState + oldridp, // oldAllocState + mdata->rbtree_mut, //block_ready_mutex + block1_needed_cond, //in_block_needed_cond + block1_needed, //in_block_needed + block2_needed_cond, //out_block_needed_cond + block2_needed, //out_block_needed + block1_ready_cond, //in_block_ready_cond + block2_ready_cond, //out_block_ready_cond + system_running, //still_open i.e. system running + block1_size, //mytree_size ? + 0, //out_tree_size, biggest component computes its size directly. 
+        0,                 //max_tree_size No max size for biggest component
+        &R,                //r_i
+        block1_scratch,    //in-tree
+        allocer_scratch,   //in_tree_allocer
+        0,                 //out_tree
+        0,                 //out_tree_allocer
+        new treeIterator::treeIteratorHandle(ltable->get_tree_c2()->get_root_rec()), // my_tree
+        ltable->get_table_rec() //tree
+    };
+
+    *mdata->diskmerge_args = diskmerge_args;
+
+    DEBUG("Tree C2 is %lld\n", (long long)ltable->get_tree_c2()->get_root_rec().page);
+
+
+    //memory merger args
+    ridp = new recordid;
+    *ridp = ltable->get_tree_c1()->get_tree_state();
+    oldridp = new recordid;
+    *oldridp = NULLRID;
+
+    DEBUG("Tree C1 is %lld\n", (long long)ltable->get_tree_c1()->get_root_rec().page);
+
+    struct merger_args<rbtree_t> memmerge_args =
+    {
+        ltable,
+        2,
+        logtree::alloc_region_rid, //pageAlloc
+        ridp, // pageAllocState
+        oldridp, // oldAllocState
+        mdata->rbtree_mut, //block_ready_mutex
+        mdata->input_needed_cond,
+        mdata->input_needed,
+        block1_needed_cond,
+        block1_needed,
+        mdata->input_ready_cond,
+        block1_ready_cond,
+        system_running,
+        mdata->input_size,
+        block1_size,
+        (int64_t)(R * R * MAX_C0_SIZE),
+        &R,
+        mdata->old_c0,
+        0,
+        block1_scratch,
+        allocer_scratch,
+        new treeIterator::treeIteratorHandle(ltable->get_tree_c1()->get_root_rec()),
+        ltable->get_table_rec() //tree
+    };
+
+    *mdata->memmerge_args = memmerge_args;
+
+    void * (*diskmerger)(void*) = diskMergeThread;
+    void * (*memmerger)(void*) = memMergeThread;
+
+    pthread_create(&mdata->diskmerge_thread, 0, diskmerger, mdata->diskmerge_args);
+    pthread_create(&mdata->memmerge_thread, 0, memmerger, mdata->memmerge_args);
+
+}
+
+//TODO: flush the data pages
+// deallocate/free their region
+// create new data region for new data pages
+/**
+ * Thread body that merges in-memory trees into the C1 tree.
+ *
+ * Each pass waits for *a->in_tree to become non-null, merges it with the
+ * current C1 (a->my_tree->r_) into a freshly created logtree, forces the new
+ * pages to disk, points the table header at the new tree and frees the
+ * regions of the old one. If the new C1 exceeds the size target it is handed
+ * to the disk merger through *a->out_tree and C1 is replaced by an empty tree.
+ * The loop ends when no input is pending and *a->still_open is false.
+ *
+ * ltable->mergedata->header_lock is held except while blocked on a condition
+ * variable, while merging/forcing, and while freeing the consumed input.
+ *
+ * @param arg a merger_args<rbtree_t>*
+ * @return always 0
+ */
+void* memMergeThread(void*arg)
+{
+
+    int xid;// = Tbegin();
+
+    merger_args<rbtree_t> * a = static_cast<merger_args<rbtree_t>*>(arg);
+    assert(a->my_tree->r_.size != -1);
+
+    logtable * ltable = a->ltable;
+
+    int merge_count =0;
+//    pthread_mutex_lock(a->block_ready_mut);
+
+    while(true)
+    {
+        writelock(ltable->mergedata->header_lock,0);
+        int done = 0;
+        // get a new input for merge
+        while(!*(a->in_tree))
+        {
+            pthread_mutex_lock(a->block_ready_mut);
+            *a->in_block_needed = true;
+            //pthread_cond_signal(a->in_block_needed_cond);
+            pthread_cond_broadcast(a->in_block_needed_cond);
+
+            if(!*(a->still_open)){
+                done = 1;
+                pthread_mutex_unlock(a->block_ready_mut);
+                break;
+            }
+
+            printf("mmt:\twaiting for block ready cond\n");
+            //do not hold the header lock while blocked
+            unlock(ltable->mergedata->header_lock);
+
+            pthread_cond_wait(a->in_block_ready_cond, a->block_ready_mut);
+            pthread_mutex_unlock(a->block_ready_mut);
+
+            writelock(ltable->mergedata->header_lock,0);
+            printf("mmt:\tblock ready\n");
+
+        }
+        *a->in_block_needed = false;
+
+        if(done==1)
+        {
+            //shutting down: wake whoever waits for our output, then leave
+            pthread_mutex_lock(a->block_ready_mut);
+            pthread_cond_signal(a->out_block_ready_cond);
+            pthread_mutex_unlock(a->block_ready_mut);
+            unlock(ltable->mergedata->header_lock);
+            break;
+        }
+
+        if((*a->in_tree)->size()==0) //input empty, this can only happen during shutdown
+        {
+            delete *a->in_tree;
+            *a->in_tree = 0;
+            unlock(ltable->mergedata->header_lock);
+            continue;
+        }
+
+        int64_t mergedPages=0;
+
+        assert(a->my_tree->r_.size != -1);
+
+        //create the iterators
+        treeIterator *itrA = new treeIterator(a->my_tree->r_);
+        memTreeIterator *itrB =
+            new memTreeIterator(*a->in_tree);
+        memTreeIterator *itrBend = itrB->end();
+
+        //Tcommit(xid);
+        xid = Tbegin();
+
+        //create a new tree
+        logtree * scratch_tree = new logtree;
+        recordid scratch_root = scratch_tree->create(xid);
+
+        //save the old dp state values
+        RegionAllocConf_t olddp_state;
+        Tread(xid, ltable->get_dpstate1(), &olddp_state);
+        //reinitialize the dp state
+        Tset(xid, ltable->get_dpstate1(), &logtable::DATAPAGE_REGION_ALLOC_STATIC_INITIALIZER);
+
+        //pthread_mutex_unlock(a->block_ready_mut);
+        unlock(ltable->mergedata->header_lock);
+
+        //do the merge
+        printf("mmt:\tMerging:\n");
+
+        int64_t npages = 0;
+        mergedPages = merge_iterators(xid, itrA, itrB, ltable, scratch_tree, npages);
+
+        delete itrA;
+        delete itrB;
+        delete itrBend;
+
+        //force write the new region to disk
+        recordid scratch_alloc_state = scratch_tree->get_tree_state();
+        //TlsmForce(xid,scratch_root,logtree::force_region_rid, &scratch_alloc_state);
+        logtree::force_region_rid(xid, &scratch_alloc_state);
+        //force write the new datapages
+        DataPage::force_region_rid(xid, &(ltable->get_dpstate1()));
+
+        //writes complete
+        //now atomically replace the old c1 with new c1
+        //pthread_mutex_lock(a->block_ready_mut);
+
+        writelock(ltable->mergedata->header_lock,0);
+        merge_count++;
+        *a->my_tree_size = mergedPages;
+        printf("mmt:\tmerge_count %d #pages written %lld\n", merge_count, (long long)npages);
+
+        delete ltable->get_tree_c1();
+        ltable->set_tree_c1(scratch_tree);
+
+        logtable::table_header h;
+        void * oldAllocState = a->pageAllocState;
+        Tread(xid, a->tree, &h);
+
+        h.c1_root = scratch_root;
+        h.c1_state = scratch_alloc_state;
+        //note we already updated the dpstate before the merge
+        printf("mmt:\tUpdated C1's position on disk to %lld\n",(long long)scratch_root.page);
+        Tset(xid, a->tree, &h);
+
+        //Tcommit(xid);
+        //xid = Tbegin();
+
+        // free old my_tree here
+        //TODO: check
+        logtree::free_region_rid(xid, a->my_tree->r_, logtree::dealloc_region_rid, oldAllocState);
+
+
+        //TlsmFree(xid,a->my_tree->r_,logtree::dealloc_region_rid,oldAllocState);
+        //TODO: check
+        //free the old data pages
+        DataPage::dealloc_region_rid(xid, &olddp_state);
+
+        Tcommit(xid);
+        //xid = Tbegin();
+
+
+        //TODO: this is simplistic for now
+        //signal the other merger if necessary
+        double target_R = *(a->r_i);
+        double new_c1_size = npages * PAGE_SIZE;
+        assert(target_R >= MIN_R);
+        if( (new_c1_size / MAX_C0_SIZE > target_R) ||
+            (a->max_size && new_c1_size > a->max_size ) )
+        {
+            printf("mmt:\tsignaling C2 for merge\n");
+            printf("mmt:\tnew_c1_size %.2f\tMAX_C0_SIZE %lld\ta->max_size %lld\t targetr %.2f \n", new_c1_size,
+                   (long long)MAX_C0_SIZE, (long long)a->max_size, target_R);
+
+            // XXX need to report backpressure here!
+            //wait until the previous output has been consumed
+            while(*a->out_tree) {
+                pthread_mutex_lock(a->block_ready_mut);
+                unlock(ltable->mergedata->header_lock);
+
+                pthread_cond_wait(a->out_block_needed_cond, a->block_ready_mut);
+                pthread_mutex_unlock(a->block_ready_mut);
+                writelock(ltable->mergedata->header_lock,0);
+            }
+
+
+            //hand the new C1 over and record its datapage allocation state
+            *a->out_tree = scratch_tree;
+            xid = Tbegin();
+            Tread(xid, ltable->get_dpstate1(), a->out_tree_allocer);
+
+            pthread_cond_signal(a->out_block_ready_cond);
+
+
+            //start over with an empty C1
+            logtree *empty_tree = new logtree;
+            empty_tree->create(xid);
+
+            *(recordid*)(a->pageAllocState) = empty_tree->get_tree_state();
+
+            a->my_tree->r_ = empty_tree->get_root_rec();
+
+            ltable->set_tree_c1(empty_tree);
+
+            logtable::table_header empty_h;
+            Tread(xid, a->tree, &empty_h);
+            empty_h.c1_root = empty_tree->get_root_rec(); //update root
+            empty_h.c1_state = empty_tree->get_tree_state(); //update index alloc state
+            printf("mmt:\tUpdated C1's position on disk to %lld\n",(long long)empty_tree->get_root_rec().page);
+            Tset(xid, a->tree, &empty_h);
+            //update datapage alloc state
+            Tset(xid, ltable->get_dpstate1(), &logtable::DATAPAGE_REGION_ALLOC_STATIC_INITIALIZER);
+
+            Tcommit(xid);
+            //xid = Tbegin();
+
+        }
+        else //not signaling the C2 for merge yet
+        {
+            printf("mmt:\tnot signaling C2 for merge\n");
+            *(recordid*)a->pageAllocState = scratch_alloc_state;
+            a->my_tree->r_ = scratch_root;
+        }
+
+        //detach the consumed input so a new one can be published
+        rbtree_ptr_t deltree = *a->in_tree;
+        *a->in_tree = 0;
+
+
+        //Tcommit(xid);
+        unlock(ltable->mergedata->header_lock);
+
+        //TODO: get the freeing outside of the lock
+        //// ----------- Free in_tree
+        //each tuple's buffer starts at keylen (merge_iterators leaves them allocated)
+        for(rbtree_t::iterator delitr=deltree->begin();
+            delitr != deltree->end(); delitr++)
+            free((*delitr).keylen);
+
+        delete deltree;
+        //deltree = 0;
+
+
+        /*
+        for(rbtree_t::iterator delitr=(*a->in_tree)->begin();
+            delitr != (*a->in_tree)->end(); delitr++)
+            free((*delitr).keylen);
+
+        delete *a->in_tree;
+        *a->in_tree = 0;
+        */
+    }
+
+    //pthread_mutex_unlock(a->block_ready_mut);
+
+    return 0;
+
+}
+
+/**
+ * Thread body that merges C1 trees handed over by memMergeThread into C2.
+ *
+ * Each pass waits for *a->in_tree to become non-null, merges it with the
+ * current C2 (a->my_tree->r_) into a freshly created logtree, forces the new
+ * pages to disk, recomputes *a->r_i, points the table header at the new tree,
+ * frees the regions of the old C2 and of the consumed input, and commits.
+ * The loop ends when no input is pending and *a->still_open is false.
+ *
+ * @param arg a merger_args<logtree>*
+ * @return always 0
+ */
+void *diskMergeThread(void*arg)
+{
+    int xid;// = Tbegin();
+
+    merger_args<logtree> * a = static_cast<merger_args<logtree>*>(arg);
+    assert(a->my_tree->r_.size != -1);
+
+    logtable * ltable = a->ltable;
+
+    int merge_count =0;
+    //pthread_mutex_lock(a->block_ready_mut);
+
+    while(true)
+    {
+        writelock(ltable->mergedata->header_lock,0);
+        int done = 0;
+        // get a new input for merge
+        while(!*(a->in_tree))
+        {
+            pthread_mutex_lock(a->block_ready_mut);
+            *a->in_block_needed = true;
+            pthread_cond_signal(a->in_block_needed_cond);
+
+            if(!*(a->still_open)){
+                done = 1;
+                pthread_mutex_unlock(a->block_ready_mut);
+                break;
+            }
+
+            printf("dmt:\twaiting for block ready cond\n");
+            //do not hold the header lock while blocked
+            unlock(ltable->mergedata->header_lock);
+
+            pthread_cond_wait(a->in_block_ready_cond, a->block_ready_mut);
+            pthread_mutex_unlock(a->block_ready_mut);
+
+            printf("dmt:\tblock ready\n");
+            writelock(ltable->mergedata->header_lock,0);
+        }
+        *a->in_block_needed = false;
+        if(done==1)
+        {
+            pthread_cond_signal(a->out_block_ready_cond);
+            unlock(ltable->mergedata->header_lock);
+            break;
+        }
+
+
+        int64_t mergedPages=0;
+
+        assert(a->my_tree->r_.size != -1);
+
+        //create the iterators
+        treeIterator *itrA = new treeIterator(a->my_tree->r_);
+        treeIterator *itrB =
+            new treeIterator((*a->in_tree)->get_root_rec());
+
+        //Tcommit(xid);
+        xid = Tbegin();
+
+        //create a new tree
+        logtree * scratch_tree = new logtree;
+        recordid scratch_root = scratch_tree->create(xid);
+
+        //save the old dp state values
+        RegionAllocConf_t olddp_state;
+        Tread(xid, ltable->get_dpstate2(), &olddp_state);
+        //reinitialize the dp state
+        //TODO: maybe you want larger regions for the second tree?
+        Tset(xid, ltable->get_dpstate2(), &logtable::DATAPAGE_REGION_ALLOC_STATIC_INITIALIZER);
+
+        //pthread_mutex_unlock(a->block_ready_mut);
+        unlock(ltable->mergedata->header_lock);
+
+
+        //do the merge
+        printf("dmt:\tMerging:\n");
+
+        int64_t npages = 0;
+        mergedPages = merge_iterators(xid, itrA, itrB, ltable, scratch_tree, npages);
+
+        delete itrA;
+        delete itrB;
+
+        //force write the new region to disk
+        recordid scratch_alloc_state = scratch_tree->get_tree_state();
+        //TODO:
+        //TlsmForce(xid,scratch_root,logtree::force_region_rid, &scratch_alloc_state);
+        logtree::force_region_rid(xid, &scratch_alloc_state);
+        //force write the new datapages
+        DataPage::force_region_rid(xid, &(ltable->get_dpstate2()));
+
+
+        //writes complete
+        //now atomically replace the old c2 with new c2
+        //pthread_mutex_lock(a->block_ready_mut);
+        writelock(ltable->mergedata->header_lock,0);
+
+        merge_count++;
+        *a->my_tree_size = mergedPages;
+        //update the current optimal R value
+        *(a->r_i) = std::max(MIN_R, sqrt( (npages * 1.0) / (MAX_C0_SIZE/PAGE_SIZE) ) );
+
+        printf("dmt:\tmerge_count %d\t#written pages: %lld\n optimal r %.2f", merge_count, (long long)npages, *(a->r_i));
+
+        delete ltable->get_tree_c2();
+        ltable->set_tree_c2(scratch_tree);
+
+        logtable::table_header h;
+        void * oldAllocState = a->pageAllocState;
+        Tread(xid, a->tree, &h);
+
+        h.c2_root = scratch_root;
+        h.c2_state = scratch_alloc_state;
+        //note we already updated the dpstate before the merge
+        printf("dmt:\tUpdated C2's position on disk to %lld\n",(long long)scratch_root.page);
+        Tset(xid, a->tree, &h);
+
+
+
+        // free old my_tree here
+        //TODO: check
+        logtree::free_region_rid(xid, a->my_tree->r_, logtree::dealloc_region_rid, oldAllocState);
+        //TlsmFree(xid,a->my_tree->r_,logtree::dealloc_region_rid,oldAllocState);
+
+        //TODO: check
+        //free the old data pages
+        DataPage::dealloc_region_rid(xid, &olddp_state);
+
+
+
+        *(recordid*)a->pageAllocState = scratch_alloc_state;
+        a->my_tree->r_ = scratch_root;
+
+        //// ----------- Free in_tree
+        //TODO: check
+        logtree::free_region_rid(xid, (*a->in_tree)->get_root_rec(),
+                                 logtree::dealloc_region_rid,
+                                 &((*a->in_tree)->get_tree_state()));
+        //TlsmFree(xid,a->my_tree->r_,logtree::dealloc_region_rid,oldAllocState);
+
+        //TODO: check
+        //free the old data pages
+        DataPage::dealloc_region_rid(xid, a->in_tree_allocer);//TODO:
+
+        Tcommit(xid);
+
+        //xid = Tbegin();
+        //Tcommit(xid);
+        delete *a->in_tree;
+        *a->in_tree = 0;
+
+        unlock(ltable->mergedata->header_lock);
+
+    }
+
+    //pthread_mutex_unlock(a->block_ready_mut);
+
+    return 0;
+
+
+}
+
+/**
+ * Merges an on-disk tree (itrA) with an in-memory tree (itrB) into data pages
+ * allocated through ltable->get_dpstate1() and indexed by scratch_tree.
+ *
+ * Assumes both iterators yield tuples in datatuple::compare key order. Tuples
+ * with equal keys are combined by ltable->gettuplemerger(). Tuples read from
+ * itrA are freed here; tuples from itrB are left allocated.
+ *
+ * @param npages incremented by the page count of every data page that filled
+ *               up during the merge
+ * @return the number of data pages opened
+ */
+int64_t merge_iterators(int xid,
+                        treeIterator *itrA,
+                        memTreeIterator * itrB,
+                        logtable *ltable,
+                        logtree *scratch_tree,
+                        int64_t &npages )
+{
+    int64_t dpages = 0;
+    //int npages = 0;
+    int64_t ntuples = 0;
+    DataPage *dp = 0;
+
+    memTreeIterator *itrBend = itrB->end();
+    datatuple *t1 = itrA->getnext();
+
+    while(*itrB != *itrBend)
+    {
+        datatuple t2 = **itrB;
+        DEBUG("tuple\t%lld: keylen %u datalen %u\n", (long long)ntuples, *t2.keylen,*t2.datalen );
+
+        while(t1 != 0 && datatuple::compare(t1->key, t2.key) < 0) // t1 is less than t2
+        {
+            //insert t1
+            dp = insertTuple(xid, dp, *t1, ltable, scratch_tree, ltable->get_dpstate1(),
+                             dpages, npages);
+
+            free(t1->keylen);
+            free(t1);
+            ntuples++;
+            //advance itrA
+            t1 = itrA->getnext();
+        }
+
+        if(t1 != 0 && datatuple::compare(t1->key, t2.key) == 0)
+        {
+            datatuple *mtuple = ltable->gettuplemerger()->merge(t1,&t2);
+            //insert merged tuple
+            dp = insertTuple(xid, dp, *mtuple, ltable, scratch_tree, ltable->get_dpstate1(),
+                             dpages, npages);
+            free(t1->keylen);
+            free(t1);
+            t1 = itrA->getnext(); //advance itrA
+            free(mtuple->keylen);
+            free(mtuple);
+        }
+        else
+        {
+            //insert t2
+            dp = insertTuple(xid, dp, t2, ltable, scratch_tree, ltable->get_dpstate1(),
+                             dpages, npages);
+            //free(t2.keylen); //cannot free here it may still be read through a lookup
+        }
+
+        ntuples++;
+        ++(*itrB);
+    }
+
+    while(t1 != 0) // itrB is exhausted, copy the rest of itrA
+    {
+        dp = insertTuple(xid, dp, *t1, ltable, scratch_tree, ltable->get_dpstate1(),
+                         dpages, npages);
+
+        free(t1->keylen);
+        free(t1);
+        ntuples++;
+        //advance itrA
+        t1 = itrA->getnext();
+    }
+
+
+    delete itrBend;
+    //NOTE(review): the pages of this last data page are never added to npages
+    if(dp!=NULL)
+        delete dp;
+    DEBUG("dpages: %lld\tnpages: %lld\tntuples: %lld\n",
+          (long long)dpages, (long long)npages, (long long)ntuples);
+    fflush(stdout);
+
+
+    return dpages;
+
+}
+
+
+/**
+ * Merges two on-disk trees into data pages allocated through
+ * ltable->get_dpstate2() and indexed by scratch_tree.
+ *
+ * Assumes both iterators yield tuples in datatuple::compare key order. Tuples
+ * with equal keys are combined by ltable->gettuplemerger(); a combined tuple
+ * that is a delete is not written. Every tuple read from either iterator is
+ * freed here.
+ *
+ * @param npages incremented by the page count of every data page that filled
+ *               up during the merge
+ * @return the number of data pages opened
+ */
+int64_t merge_iterators(int xid,
+                        treeIterator *itrA, //iterator on c2
+                        treeIterator *itrB, //iterator on c1
+                        logtable *ltable,
+                        logtree *scratch_tree,
+                        int64_t &npages)
+{
+    int64_t dpages = 0;
+    //int npages = 0;
+    int64_t ntuples = 0;
+    DataPage *dp = 0;
+
+    datatuple *t1 = itrA->getnext();
+    datatuple *t2 = 0;
+
+    while( (t2=itrB->getnext()) != 0)
+    {
+        DEBUG("tuple\t%lld: keylen %u datalen %u\n",
+              (long long)ntuples, *(t2->keylen),*(t2->datalen) );
+
+        while(t1 != 0 && datatuple::compare(t1->key, t2->key) < 0) // t1 is less than t2
+        {
+            //insert t1
+            dp = insertTuple(xid, dp, *t1, ltable, scratch_tree,
+                             ltable->get_dpstate2(),
+                             dpages, npages);
+
+            free(t1->keylen);
+            free(t1);
+            ntuples++;
+            //advance itrA
+            t1 = itrA->getnext();
+        }
+
+        if(t1 != 0 && datatuple::compare(t1->key, t2->key) == 0)
+        {
+            datatuple *mtuple = ltable->gettuplemerger()->merge(t1,t2);
+
+            //insert merged tuple, drop deletes
+            if(!mtuple->isDelete())
+                dp = insertTuple(xid, dp, *mtuple, ltable, scratch_tree, ltable->get_dpstate2(),
+                                 dpages, npages);
+
+            free(t1->keylen);
+            free(t1);
+            t1 = itrA->getnext(); //advance itrA
+            free(mtuple->keylen);
+            free(mtuple);
+        }
+        else
+        {
+            //insert t2
+            dp = insertTuple(xid, dp, *t2, ltable, scratch_tree, ltable->get_dpstate2(),
+                             dpages, npages);
+        }
+
+        free(t2->keylen);
+        free(t2);
+        ntuples++;
+    }
+
+    while(t1 != 0) // itrB is exhausted, copy the rest of itrA
+    {
+        dp = insertTuple(xid, dp, *t1, ltable, scratch_tree, ltable->get_dpstate2(),
+                         dpages, npages);
+
+        free(t1->keylen);
+        free(t1);
+        ntuples++;
+        //advance itrA
+        t1 = itrA->getnext();
+    }
+
+    //NOTE(review): the pages of this last data page are never added to npages
+    if(dp!=NULL)
+        delete dp;
+    DEBUG("dpages: %lld\tnpages: %lld\tntuples: %lld\n",
+          (long long)dpages, (long long)npages, (long long)ntuples);
+    fflush(stdout);
+
+    return dpages;
+
+}
+
+
+
+/**
+ * Appends t to the current data page, opening a new one when there is none or
+ * when the current one rejects the tuple.
+ *
+ * @param dp      current data page, or 0 if none has been opened yet
+ * @param dpages  incremented each time a new data page is opened
+ * @param npages  incremented by dp->get_page_count() when dp is full and replaced
+ * @return the data page that now holds t (dp itself, or its replacement)
+ */
+inline DataPage*
+insertTuple(int xid, DataPage *dp, datatuple &t,
+            logtable *ltable,
+            logtree * ltree,
+            recordid & dpstate,
+            int64_t &dpages, int64_t &npages)
+{
+    if(dp==0)
+    {
+        dp = ltable->insertTuple(xid, t, dpstate, ltree);
+        dpages++;
+    }
+    else if(!dp->append(xid, t))
+    {
+        //current page is full: account for it, release it, start a new one
+        npages += dp->get_page_count();
+        delete dp;
+        dp = ltable->insertTuple(xid, t, dpstate, ltree);
+        dpages++;
+    }
+
+    return dp;
+}
+
+
+
+
diff --git a/merger.h b/merger.h
new file mode 100644
index 0000000..def1859
--- /dev/null
+++ b/merger.h
@@ -0,0 +1,148 @@
+#ifndef _MERGER_H_
+#define _MERGER_H_
+
+// NOTE(review): the targets of the two original system #include lines and
+// every template argument list in this header were lost when this patch was
+// extracted. They are reconstructed below from how the names are used in
+// merger.cpp; confirm against the repository copy.
+#include <pthread.h>
+#include <stdint.h>
+
+#include <set>
+#include <utility>
+#include <vector>
+
+#include "logstore.h"
+#include "logiterators.h"
+
+// In-memory (C0) component. The element type follows from uses such as
+// "datatuple t2 = **itrB" in merger.cpp.
+// NOTE(review): datatuple as the comparator is an assumption -- confirm.
+typedef std::set<datatuple, datatuple> rbtree_t;
+typedef rbtree_t* rbtree_ptr_t;
+
+//TODO: 400 bytes overhead per tuple, this is nuts, check if this is true...
+static const int RB_TREE_OVERHEAD = 400;
+static const int64_t MAX_C0_SIZE = 800 *1024*1024; //max size of c0
+static const double MIN_R = 3.0; //smallest value the merge threads allow for *r_i
+
+/**
+ * Argument block handed to a merge thread through pthread_create().
+ *
+ * T is the type of the input component: rbtree_t for memMergeThread (merging
+ * the in-memory tree into C1) and logtree for diskMergeThread (merging C1
+ * into C2).
+ */
+template <class T>
+struct merger_args
+{
+    logtable * ltable;
+    int worker_id;
+
+    //page allocation information
+    pageid_t(*pageAlloc)(int,void*);
+    void *pageAllocState;  //points to a recordid; updated after each merge
+    void *oldAllocState;
+
+    pthread_mutex_t * block_ready_mut;  //mutex passed to pthread_cond_wait for the conditions below
+
+    pthread_cond_t * in_block_needed_cond;  //signalled when this thread wants a new input
+    bool * in_block_needed;
+
+    pthread_cond_t * out_block_needed_cond;  //memMergeThread waits on this while *out_tree is non-null
+    bool * out_block_needed;
+
+    pthread_cond_t * in_block_ready_cond;  //waited on until *in_tree is non-null
+    pthread_cond_t * out_block_ready_cond;  //signalled after *out_tree is filled, and on shutdown
+
+    bool * still_open;  //when false, a thread with no pending input exits
+
+    int64_t * my_tree_size;  //set to the data page count returned by merge_iterators
+    int64_t * out_tree_size;
+    int64_t max_size; //pageid_t
+    double * r_i;
+
+    T ** in_tree;
+    void * in_tree_allocer;
+
+    logtree ** out_tree;
+    void * out_tree_allocer;
+
+    treeIterator::treeIteratorHandle *my_tree;  //root of the component this thread rewrites
+
+    recordid tree;  //record holding the logtable::table_header
+};
+
+/**
+ * Per-table merge state: the two merge thread handles, the header lock, the
+ * in-memory input hand-off variables, and the argument blocks of both threads.
+ */
+struct logtable_mergedata
+{
+    //merge threads
+    pthread_t diskmerge_thread;
+    pthread_t memmerge_thread;
+
+    rwl *header_lock;
+
+    pthread_mutex_t * rbtree_mut;
+    rbtree_ptr_t *old_c0; //in-mem red black tree being merged / to be merged
+
+    bool *input_needed; // memmerge-input needed
+
+    pthread_cond_t * input_ready_cond;
+    pthread_cond_t * input_needed_cond;
+    int64_t * input_size;
+
+    //merge args 1
+    struct merger_args<logtree> *diskmerge_args;
+    //merge args 2
+    struct merger_args<rbtree_t> *memmerge_args;
+
+};
+
+/**
+ * Keeps one logtable_mergedata per registered table, retrievable by index
+ * through getMergeData().
+ */
+class merge_scheduler
+{
+    // NOTE(review): the pair's first type is presumed to be logtable* -- confirm.
+    std::vector<std::pair<logtable *, logtable_mergedata*> > mergedata;
+
+public:
+    //static pageid_t C0_MEM_SIZE;
+    ~merge_scheduler();
+
+    int addlogtable(logtable * ltable);
+    void startlogtable(int index);
+
+    struct logtable_mergedata *getMergeData(int index){return mergedata[index].second;}
+
+    void shutdown();
+
+};
+
+//C0 -> C1 merge thread; arg is a merger_args<rbtree_t>*
+void* memMergeThread(void* arg);
+
+//merges and returns the number of data pages used
+int64_t merge_iterators(int xid,
+                        treeIterator *itrA,
+                        memTreeIterator * itrB,
+                        logtable *ltable,
+                        logtree *scratch_tree,
+                        int64_t &npages);
+
+//same, with a treeIterator as second input; a merged tuple that is a delete is not written
+int64_t merge_iterators(int xid,
+                        treeIterator *itrA,
+                        treeIterator *itrB,
+                        logtable *ltable,
+                        logtree *scratch_tree,
+                        int64_t &npages);
+
+//C1 -> C2 merge thread; arg is a merger_args<logtree>*
+void* diskMergeThread(void* arg);
+
+#endif
diff --git a/tuplemerger.cpp b/tuplemerger.cpp
new file mode 100644
index 0000000..0adbf22
--- /dev/null
+++ b/tuplemerger.cpp
@@ -0,0 +1,94 @@
+#include "tuplemerger.h"
+#include "logstore.h"
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+/**
+ * Combines two versions of the same key. In merger.cpp t1 comes from the tree
+ * being rewritten (itrA) and t2 from the incoming input (itrB).
+ *
+ * If either tuple is a delete the result is built from t2's serialized bytes;
+ * otherwise the configured merge function decides. Both may not be deletes.
+ *
+ * @return the resulting tuple; merger.cpp releases it with
+ *         free(t->keylen); free(t);
+ */
+datatuple* tuplemerger::merge(datatuple *t1, datatuple *t2)
+{
+    assert(!t1->isDelete() || !t2->isDelete()); //both cannot be delete
+
+    //a delete on either side is resolved in favour of t2: when t1 is the
+    //delete t2 replaces it, and when t2 is the delete it is kept as is
+    if(t1->isDelete() || t2->isDelete())
+        return datatuple::from_bytes(t2->to_bytes());
+
+    //neither is a delete
+    return (*merge_fp)(t1,t2);
+}
+
+/**
+ * appends the data in t2 to data from t1
+ *
+ * deletes are handled by the tuplemerger::merge function
+ * so here neither t1 nor t2 is a delete datatuple
+ *
+ * The result lives in one malloc'd buffer laid out as
+ * [keylen][datalen][key bytes][data bytes]; t->keylen points at its start.
+ **/
+datatuple* append_merger(datatuple *t1, datatuple *t2)
+{
+    static const size_t isize = sizeof(uint32_t);
+    struct datatuple *t = (datatuple*) malloc(sizeof(datatuple));
+
+    //sized as t1->byte_length() plus t2's data bytes
+    byte *arr = (byte*)malloc(t1->byte_length() + *t2->datalen);
+
+    t->keylen = (uint32_t*) arr;
+    *(t->keylen) = *(t1->keylen);
+
+    t->datalen = (uint32_t*) (arr+isize);
+    *(t->datalen) = *(t1->datalen) + *(t2->datalen);
+
+    t->key = (datatuple::key_t) (arr+isize+isize);
+    memcpy((byte*)t->key, (byte*)t1->key, *(t1->keylen));
+
+    t->data = (datatuple::data_t) (arr+isize+isize+ *(t1->keylen));
+    memcpy((byte*)t->data, (byte*)t1->data, *(t1->datalen));
+    memcpy(((byte*)t->data) + *(t1->datalen), (byte*)t2->data, *(t2->datalen));
+
+    return t;
+}
+
+/**
+ * replaces the data with data from t2
+ *
+ * deletes are handled by the tuplemerger::merge function
+ * so here neither t1 nor t2 is a delete datatuple
+ *
+ * The result copies t2's lengths, key and data into the same single-buffer
+ * layout that append_merger produces.
+ **/
+datatuple* replace_merger(datatuple *t1, datatuple *t2)
+{
+    static const size_t isize = sizeof(uint32_t);
+    struct datatuple *t = (datatuple*) malloc(sizeof(datatuple));
+
+    byte *arr = (byte*)malloc(t2->byte_length());
+
+    t->keylen = (uint32_t*) arr;
+    *(t->keylen) = *(t2->keylen);
+
+    t->datalen = (uint32_t*) (arr+isize);
+    *(t->datalen) = *(t2->datalen);
+
+    t->key = (datatuple::key_t) (arr+isize+isize);
+    memcpy((byte*)t->key, (byte*)t2->key, *(t2->keylen));
+
+    t->data = (datatuple::data_t) (arr+isize+isize+ *(t2->keylen));
+    memcpy((byte*)t->data, (byte*)t2->data, *(t2->datalen));
+
+    return t;
+}
diff --git a/tuplemerger.h b/tuplemerger.h
new file mode 100644
index 0000000..b8314ba
--- /dev/null
+++ b/tuplemerger.h
@@ -0,0 +1,35 @@
+#ifndef _TUPLE_MERGER_H_
+#define _TUPLE_MERGER_H_
+
+struct datatuple;
+
+//combines two non-delete versions of a tuple into a newly allocated one
+typedef datatuple* (*merge_fn_t) (datatuple*, datatuple *);
+
+datatuple* append_merger(datatuple *t1, datatuple *t2);
+
+datatuple* replace_merger(datatuple *t1, datatuple *t2);
+
+/**
+ * Wraps a merge_fn_t and adds the handling of delete tuples (see merge()).
+ */
+class tuplemerger
+{
+
+public:
+
+    tuplemerger(merge_fn_t merge_fp)
+    {
+        this->merge_fp = merge_fp;
+    }
+
+    //returns the merge of t1 and t2; if either is a delete the result is built from t2
+    datatuple* merge(datatuple *t1, datatuple *t2);
+
+private:
+
+    merge_fn_t merge_fp; //applied when neither input is a delete
+
+};
+
+#endif