LSM table is now able to spawn threads, then exit.

This commit is contained in:
Sears Russell 2007-11-02 15:00:47 +00:00
parent 7e2c37534e
commit f101919244
2 changed files with 181 additions and 124 deletions

View file

@ -66,6 +66,12 @@ class treeIterator {
currentPage_ = (PAGELAYOUT*)p_->impl; currentPage_ = (PAGELAYOUT*)p_->impl;
} }
} }
typedef recordid handle;
explicit treeIterator(recordid tree) :
tree_(tree),
scratch_(),
keylen_(ROW::sizeofBytes())
{ }
explicit treeIterator(treeIterator& t) : explicit treeIterator(treeIterator& t) :
tree_(t.tree_), tree_(t.tree_),
scratch_(t.scratch_), scratch_(t.scratch_),
@ -152,7 +158,7 @@ class treeIterator {
void operator=(treeIterator & t) { abort(); } void operator=(treeIterator & t) { abort(); }
int operator-(treeIterator & t) { abort(); } int operator-(treeIterator & t) { abort(); }
recordid tree_; recordid tree_;
ROW & scratch_; ROW scratch_;
int keylen_; int keylen_;
lladdIterator_t * lsmIterator_; lladdIterator_t * lsmIterator_;
slot_index_t slot_; slot_index_t slot_;
@ -381,9 +387,15 @@ class versioningIterator {
information. The rows should be sorted based on value, then sorted by information. The rows should be sorted based on value, then sorted by
version, with the newest value first. version, with the newest value first.
*/ */
template<class STLITER,class ROW> class stlSetIterator { template<class SET,class ROW> class stlSetIterator {
private:
typedef typename SET::iterator STLITER;
public: public:
typedef SET handle;
stlSetIterator( SET& s ) : it_(s.begin()), itend_(s.end()) {}
stlSetIterator( STLITER& it, STLITER& itend ) : it_(it), itend_(itend) {} stlSetIterator( STLITER& it, STLITER& itend ) : it_(it), itend_(itend) {}
explicit stlSetIterator(stlSetIterator &i) : it_(i.it_), itend_(i.itend_){} explicit stlSetIterator(stlSetIterator &i) : it_(i.it_), itend_(i.itend_){}
const ROW& operator* () { return *it_; } const ROW& operator* () { return *it_; }
@ -416,11 +428,11 @@ class versioningIterator {
STLITER it_; STLITER it_;
STLITER itend_; STLITER itend_;
friend const byte* friend const byte*
toByteArray<STLITER,ROW>(stlSetIterator<STLITER,ROW> * const t); toByteArray<SET,ROW>(stlSetIterator<SET,ROW> * const t);
}; };
template <class STLITER,class ROW> template <class SET,class ROW>
inline const byte * toByteArray(stlSetIterator<STLITER,ROW> * const t) { inline const byte * toByteArray(stlSetIterator<SET,ROW> * const t) {
return (*(t->it_)).toByteArray(); return (*(t->it_)).toByteArray();
} }
/** Produce a byte array from the value stored at t's current /** Produce a byte array from the value stored at t's current

View file

@ -20,23 +20,18 @@ namespace rose {
*/ */
template<class PAGELAYOUT> template<class PAGELAYOUT, class ITERA, class ITERB>
struct new_insert_args { struct merge_args {
int comparator_idx;
int rowsize; //typedef int32_t val_t;
// ITER *begin;
// ITER *end;
pageid_t(*pageAlloc)(int,void*); pageid_t(*pageAlloc)(int,void*);
void *pageAllocState; void *pageAllocState;
pthread_mutex_t * block_ready_mut; pthread_mutex_t * block_ready_mut;
pthread_cond_t * block_needed_cond; pthread_cond_t * in_block_needed_cond;
pthread_cond_t * block_ready_cond; pthread_cond_t * out_block_needed_cond;
int max_waiters; pthread_cond_t * in_block_ready_cond;
int wait_count; pthread_cond_t * out_block_ready_cond;
recordid * wait_queue; bool * still_open;
typename PAGELAYOUT::FMT::TUP *scratchA; typename ITERA::handle ** out_tree;
typename PAGELAYOUT::FMT::TUP *scratchB; typename ITERB::handle ** in_tree;
pageid_t mergedPages;
}; };
template <class PAGELAYOUT, class ITER> template <class PAGELAYOUT, class ITER>
@ -101,39 +96,59 @@ namespace rose {
ITERA is an iterator over the data structure that mergeThread creates (an lsm tree iterator). ITERA is an iterator over the data structure that mergeThread creates (an lsm tree iterator).
ITERB is an iterator over the data structures that mergeThread takes as input (lsm tree, or rb tree..) ITERB is an iterator over the data structures that mergeThread takes as input (lsm tree, or rb tree..)
*/ */
template<class PAGELAYOUT, class ITERA, class ITERB> //class PAGELAYOUTX, class ENGINE, class ITERA, class ITERB, template<class PAGELAYOUT, class ITERA, class ITERB>
// class ROW, class TYPE>
void* mergeThread(void* arg) { void* mergeThread(void* arg) {
// The ITER argument of a is unused (we don't look at its begin or end fields...) // The ITER argument of a is unused (we don't look at its begin or end fields...)
//insert_args<PAGELAYOUT,ENGINE,ITERA,ROW>* a = merge_args<PAGELAYOUT, ITERA, ITERB> * a = (merge_args<PAGELAYOUT, ITERA, ITERB>*)arg;
// (insert_args<PAGELAYOUT,ENGINE,ITERA,ROW>*)arg;
new_insert_args<PAGELAYOUT> * a = (new_insert_args<PAGELAYOUT>*)arg;
struct timeval start_tv, wait_tv, stop_tv; struct timeval start_tv, wait_tv, stop_tv;
int merge_count = 0; int merge_count = 0;
int xid = Tbegin();
// Initialize tree with an empty tree.
// XXX hardcodes ITERA's type:
recordid oldtree = TlsmCreate(xid, PAGELAYOUT::cmp_id(),a->pageAlloc,
a->pageAllocState,PAGELAYOUT::FMT::TUP::sizeofBytes());
Tcommit(xid);
// loop around here to produce multiple batches for merge. // loop around here to produce multiple batches for merge.
while(1) { while(1) {
gettimeofday(&start_tv,0); gettimeofday(&start_tv,0);
pthread_mutex_lock(a->block_ready_mut); pthread_mutex_lock(a->block_ready_mut);
while(a->wait_count <2) {
pthread_cond_wait(a->block_ready_cond,a->block_ready_mut); if(!*(a->still_open)) {
pthread_mutex_unlock(a->block_ready_mut);
break;
}
while(!*(a->in_tree)) {
pthread_cond_signal(a->in_block_needed_cond);
pthread_cond_wait(a->in_block_ready_cond,a->block_ready_mut);
} }
gettimeofday(&wait_tv,0); gettimeofday(&wait_tv,0);
recordid * oldTreeA = &a->wait_queue[0]; xid = Tbegin();
recordid * oldTreeB = &a->wait_queue[1];
recordid tree = TlsmCreate(xid, PAGELAYOUT::cmp_id(),a->pageAlloc,
a->pageAllocState,PAGELAYOUT::FMT::TUP::sizeofBytes());
ITERA taBegin(oldtree);
ITERB tbBegin(**a->in_tree);
// XXX keep in_tree handle around so that it can be freed below.
free(*a->in_tree); // frees the copy of the handle, not the tree
*a->in_tree = 0; // free slot for producer
pthread_cond_signal(a->in_block_needed_cond);
pthread_mutex_unlock(a->block_ready_mut); pthread_mutex_unlock(a->block_ready_mut);
recordid tree = TlsmCreate(-1, a->comparator_idx,a->pageAlloc,a->pageAllocState,a->rowsize);
ITERA taBegin(*oldTreeA,*(a->scratchA),a->rowsize);
ITERB tbBegin(*oldTreeB,*(a->scratchB),a->rowsize);
ITERA *taEnd = taBegin.end(); ITERA *taEnd = taBegin.end();
ITERB *tbEnd = tbBegin.end(); ITERB *tbEnd = tbBegin.end();
@ -143,28 +158,43 @@ namespace rose {
mergeIterator<ITERA, ITERB, typename PAGELAYOUT::FMT::TUP> mergeIterator<ITERA, ITERB, typename PAGELAYOUT::FMT::TUP>
mEnd(taBegin, tbBegin, *taEnd, *tbEnd); mEnd(taBegin, tbBegin, *taEnd, *tbEnd);
mEnd.seekEnd(); mEnd.seekEnd();
uint64_t insertedTuples; uint64_t insertedTuples;
pageid_t mergedPages = compressData<PAGELAYOUT,mergeIterator<ITERA,ITERB,typename PAGELAYOUT::FMT::TUP> > pageid_t mergedPages = compressData<PAGELAYOUT,mergeIterator<ITERA,ITERB,typename PAGELAYOUT::FMT::TUP> >
(&mBegin, &mEnd,tree,a->pageAlloc,a->pageAllocState,&insertedTuples); (&mBegin, &mEnd,tree,a->pageAlloc,a->pageAllocState,&insertedTuples);
delete taEnd; delete taEnd;
delete tbEnd; delete tbEnd;
gettimeofday(&stop_tv,0); gettimeofday(&stop_tv,0);
pthread_mutex_lock(a->block_ready_mut);
a->mergedPages = mergedPages;
// TlsmFree(wait_queue[0]) /// XXX Need to implement (de)allocation! // TlsmFree(wait_queue[0]) /// XXX Need to implement (de)allocation!
// TlsmFree(wait_queue[1]) // TlsmFree(wait_queue[1])
memcpy(&a->wait_queue[0],&tree,sizeof(tree)); pthread_mutex_lock(a->block_ready_mut);
for(int i = 1; i + 1 < a->wait_count; i++) {
memcpy(&a->wait_queue[i],&a->wait_queue[i+1],sizeof(tree)); static int threshold_calc = 1000; // XXX REALLY NEED TO FIX THIS!
if(a->out_tree && // is there an upstream merger (note the lack of the * on a->out_tree)?
mergedPages > threshold_calc // do we have enough data to bother it?
) {
while(*a->out_tree) { // we probably don't need the "while..."
pthread_cond_wait(a->out_block_needed_cond, a->block_ready_mut);
}
// XXX C++? Objects? Constructors? Who needs them?
*a->out_tree = (recordid*)malloc(sizeof(tree));
**a->out_tree = tree;
pthread_cond_signal(a->out_block_ready_cond);
// This is a bit wasteful; allocate a new empty tree to merge against.
// We don't want to ever look at the one we just handed upstream...
// We could wait for an in tree to be ready, and then pass it directly
// to compress data (to avoid all those merging comparisons...)
tree = TlsmCreate(xid, PAGELAYOUT::cmp_id(),a->pageAlloc,
a->pageAllocState,PAGELAYOUT::FMT::TUP::sizeofBytes());
} }
a->wait_count--;
pthread_mutex_unlock(a->block_ready_mut); pthread_mutex_unlock(a->block_ready_mut);
merge_count++; merge_count++;
@ -172,83 +202,20 @@ namespace rose {
double wait_elapsed = tv_to_double(wait_tv) - tv_to_double(start_tv); double wait_elapsed = tv_to_double(wait_tv) - tv_to_double(start_tv);
double work_elapsed = tv_to_double(stop_tv) - tv_to_double(wait_tv); double work_elapsed = tv_to_double(stop_tv) - tv_to_double(wait_tv);
double total_elapsed = wait_elapsed + work_elapsed; double total_elapsed = wait_elapsed + work_elapsed;
double ratio = ((double)(insertedTuples * (uint64_t)a->rowsize)) double ratio = ((double)(insertedTuples * (uint64_t)PAGELAYOUT::FMT::TUP::sizeofBytes()))
/ (double)(PAGE_SIZE * mergedPages); / (double)(PAGE_SIZE * mergedPages);
double throughput = ((double)(insertedTuples * (uint64_t)a->rowsize)) double throughput = ((double)(insertedTuples * (uint64_t)PAGELAYOUT::FMT::TUP::sizeofBytes()))
/ (1024.0 * 1024.0 * total_elapsed); / (1024.0 * 1024.0 * total_elapsed);
printf("merge # %-6d: comp ratio: %-9.3f waited %6.1f sec " printf("merge # %-6d: comp ratio: %-9.3f waited %6.1f sec "
"worked %6.1f sec inserts %-12ld (%9.3f mb/s)\n", merge_count, ratio, "worked %6.1f sec inserts %-12ld (%9.3f mb/s)\n", merge_count, ratio,
wait_elapsed, work_elapsed, (unsigned long)insertedTuples, throughput); wait_elapsed, work_elapsed, (unsigned long)insertedTuples, throughput);
pthread_cond_signal(a->block_needed_cond); Tcommit(xid);
} }
return 0; return 0;
} }
/*
template<class PAGELAYOUT, class ITER>
void* insertThread(void* arg) {
new_insert_args<PAGELAYOUT> * a = (new_insert_args<PAGELAYOUT>*)arg;
struct timeval start_tv, start_wait_tv, stop_tv;
int insert_count = 0;
pageid_t lastTreeBlocks = 0;
uint64_t lastTreeInserts = 0;
pageid_t desiredInserts = 0;
// this is a hand-tuned value; it should be set dynamically, not statically
double K = 0.18;
// loop around here to produce multiple batches for merge.
while(1) {
gettimeofday(&start_tv,0);
// XXX this needs to be an iterator over an in-memory tree.
ITER i(*(a->begin));
ITER j(desiredInserts ? *(a->begin) : *(a->end));
if(desiredInserts) {
j += desiredInserts;
}
recordid tree = TlsmCreate(-1, a->comparator_idx,a->rowsize);
lastTreeBlocks =
compressData<PAGELAYOUT,PAGELAYOUT::init_page,ITER>
(&i, &j,1,tree,a->pageAlloc,a->pageAllocState, &lastTreeInserts);
gettimeofday(&start_wait_tv,0);
pthread_mutex_lock(a->block_ready_mut);
while(a->wait_count >= a->max_waiters) {
pthread_cond_wait(a->block_needed_cond,a->block_ready_mut);
}
memcpy(&a->wait_queue[a->wait_count],&tree,sizeof(recordid));
a->wait_count++;
pthread_cond_signal(a->block_ready_cond);
gettimeofday(&stop_tv,0);
double work_elapsed = tv_to_double(start_wait_tv) - tv_to_double(start_tv);
double wait_elapsed = tv_to_double(stop_tv) - tv_to_double(start_wait_tv);
double elapsed = tv_to_double(stop_tv) - tv_to_double(start_tv);
printf("insert# %-6d waited %6.1f sec "
"worked %6.1f sec inserts %-12ld (%9.3f mb/s)\n",
++insert_count,
wait_elapsed,
work_elapsed,
(long int)lastTreeInserts,
(lastTreeInserts*(uint64_t)a->rowsize / (1024.0*1024.0)) / elapsed);
if(a->mergedPages != -1) {
desiredInserts = (pageid_t)(((double)a->mergedPages / K)
* ((double)lastTreeInserts
/ (double)lastTreeBlocks));
}
pthread_mutex_unlock(a->block_ready_mut);
}
return 0;
}
*/
typedef struct { typedef struct {
recordid bigTree; recordid bigTree;
recordid bigTreeAllocState; // this is probably the head of an arraylist of regions used by the tree... recordid bigTreeAllocState; // this is probably the head of an arraylist of regions used by the tree...
@ -258,6 +225,7 @@ namespace rose {
epoch_t end; epoch_t end;
} lsmTableHeader_t; } lsmTableHeader_t;
template<class PAGELAYOUT> template<class PAGELAYOUT>
inline recordid TlsmTableAlloc(int xid) { inline recordid TlsmTableAlloc(int xid) {
@ -281,30 +249,107 @@ namespace rose {
Tset(xid, ret, &h); Tset(xid, ret, &h);
return ret; return ret;
} }
/// XXX start should return a struct that contains these!
pthread_t merge1_thread;
pthread_t merge2_thread;
bool * still_open;
template<class PAGELAYOUT> template<class PAGELAYOUT>
void TlsmTableStart(recordid tree) { void TlsmTableStart(recordid tree) {
/// XXX xid for daemon processes? /// XXX xid for daemon processes?
void * (*merger)(void*) = mergeThread
<PAGELAYOUT,
treeIterator<typename PAGELAYOUT::FMT::TUP, typename PAGELAYOUT::FMT>,
treeIterator<typename PAGELAYOUT::FMT::TUP, typename PAGELAYOUT::FMT> >;
/*mergeThread
<PAGELAYOUT,
treeIterator<typename PAGELAYOUT::FMT::TUP, typename PAGELAYOUT::FMT>,
stlSetIterator<typename std::set<typename PAGELAYOUT::FMT::TUP,
typename PAGELAYOUT::FMT::TUP::stl_cmp>::iterator,
typename PAGELAYOUT::FMT::TUP> >
(0); */
lsmTableHeader_t h; lsmTableHeader_t h;
Tread(-1, tree, &h); Tread(-1, tree, &h);
typedef treeIterator<typename PAGELAYOUT::FMT::TUP,
typename PAGELAYOUT::FMT> LSM_ITER;
typedef stlSetIterator<typename std::set<typename PAGELAYOUT::FMT::TUP,
typename PAGELAYOUT::FMT::TUP::stl_cmp>,
typename PAGELAYOUT::FMT::TUP> RB_ITER;
pthread_mutex_t * block_ready_mut =
(pthread_mutex_t*)malloc(sizeof(pthread_mutex_t));
pthread_cond_t * block0_needed_cond =
(pthread_cond_t*)malloc(sizeof(pthread_cond_t));
pthread_cond_t * block1_needed_cond =
(pthread_cond_t*)malloc(sizeof(pthread_cond_t));
pthread_cond_t * block2_needed_cond =
(pthread_cond_t*)malloc(sizeof(pthread_cond_t));
pthread_cond_t * block0_ready_cond =
(pthread_cond_t*)malloc(sizeof(pthread_cond_t));
pthread_cond_t * block1_ready_cond =
(pthread_cond_t*)malloc(sizeof(pthread_cond_t));
pthread_cond_t * block2_ready_cond =
(pthread_cond_t*)malloc(sizeof(pthread_cond_t));
pthread_mutex_init(block_ready_mut,0);
pthread_cond_init(block0_needed_cond,0);
pthread_cond_init(block1_needed_cond,0);
pthread_cond_init(block2_needed_cond,0);
pthread_cond_init(block0_ready_cond,0);
pthread_cond_init(block1_ready_cond,0);
pthread_cond_init(block2_ready_cond,0);
typename LSM_ITER::handle * block1_scratch =
(typename LSM_ITER::handle*) malloc(sizeof(typename LSM_ITER::handle));
still_open = (bool*)malloc(sizeof(bool));
*still_open = 1;
recordid * ridp = (recordid*)malloc(sizeof(recordid));
*ridp = h.bigTreeAllocState;
recordid ** block1_scratch_p = (recordid**)malloc(sizeof(block1_scratch));
*block1_scratch_p = block1_scratch;
merge_args<PAGELAYOUT, LSM_ITER, LSM_ITER> * args1 = (merge_args<PAGELAYOUT,LSM_ITER,LSM_ITER>*)malloc(sizeof(merge_args<PAGELAYOUT,LSM_ITER,LSM_ITER>));
merge_args<PAGELAYOUT, LSM_ITER, LSM_ITER> tmpargs1 =
{
TlsmRegionAllocRid,
ridp,
block_ready_mut,
block1_needed_cond,
block2_needed_cond,
block1_ready_cond,
block2_ready_cond,
still_open,
0,
block1_scratch_p
};
*args1 = tmpargs1;
void * (*merger1)(void*) = mergeThread
<PAGELAYOUT, LSM_ITER, LSM_ITER>;
ridp = (recordid*)malloc(sizeof(recordid));
*ridp = h.mediumTreeAllocState;
merge_args<PAGELAYOUT, LSM_ITER, RB_ITER> * args2 = (merge_args<PAGELAYOUT,LSM_ITER,RB_ITER>*)malloc(sizeof(merge_args<PAGELAYOUT,LSM_ITER,RB_ITER>));
merge_args<PAGELAYOUT, LSM_ITER, RB_ITER> tmpargs2 =
{
TlsmRegionAllocRid,
ridp,
block_ready_mut,
block0_needed_cond,
block1_needed_cond,
block0_ready_cond,
block1_ready_cond,
still_open,
block1_scratch_p,
0 // XXX how does this thing get fed new trees of tuples?
};
*args2 = tmpargs2;
void * (*merger2)(void*) = mergeThread
<PAGELAYOUT, LSM_ITER, RB_ITER>;
pthread_create(&merge1_thread, 0, merger1, args1);
pthread_create(&merge2_thread, 0, merger2, args2);
} }
template<class PAGELAYOUT> template<class PAGELAYOUT>
void TlsmTableStop(recordid tree) { void TlsmTableStop(recordid tree) {
*still_open = 0;
pthread_join(merge1_thread,0);
pthread_join(merge2_thread,0);
} }
} }