add memTreeComponent::batchedRevalidatingIterator; amortize cost of red-black latching during merge
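The idea: rather than re-latching the C0 red-black tree once per tuple (as the existing revalidatingIterator does), copy up to batch_size tuples out per latch acquisition, and re-seek with upper_bound() only when the batch runs dry. A minimal standalone sketch of the pattern follows; the names are hypothetical, and std::set<int> stands in for the tuple tree:

    #include <pthread.h>
    #include <set>
    #include <vector>

    static std::set<int> tree;   // stands in for the C0 red-black tree
    static pthread_mutex_t tree_mut = PTHREAD_MUTEX_INITIALIZER;

    // Copy out up to `batch` keys greater than `after`, taking the latch
    // once per batch instead of once per key.
    static size_t fetch_batch(int after, size_t batch, std::vector<int> &out) {
      out.clear();
      pthread_mutex_lock(&tree_mut);
      for(std::set<int>::const_iterator it = tree.upper_bound(after);
          it != tree.end() && out.size() < batch; ++it) {
        out.push_back(*it);      // create_copy() in the real iterator
      }
      pthread_mutex_unlock(&tree_mut);
      return out.size();
    }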

git-svn-id: svn+ssh://svn.corp.yahoo.com/yahoo/yrl/labs/pnuts/code/logstore@1017 8dad8b1f-cf64-0410-95b6-bcf113ffbcfe
sears 2010-08-18 17:29:25 +00:00
parent 1e487bbc54
commit b894cebaf7
7 changed files with 171 additions and 9 deletions


@@ -249,11 +249,11 @@ recordid diskTreeComponent::internalNodes::appendPage(int xid,
   slotid_t numslots = stasis_record_last(xid, p).slot+1;
   recordid rid;
   rid.page = p->id;
+  // XXX writelock lc here? no need, since it's not installed in the tree yet
   for(rid.slot = FIRST_SLOT; rid.slot < numslots; rid.slot++) {
     //read the record from the root page
     rid.size = stasis_record_length_read(xid, p, rid);
     const indexnode_rec *nr = (const indexnode_rec*)stasis_record_read_begin(xid, p, rid);
     recordid cnext = stasis_record_alloc_begin(xid, lc,rid.size);
     assert(rid.slot == cnext.slot);
@@ -288,7 +288,7 @@ recordid diskTreeComponent::internalNodes::appendPage(int xid,
   // don't overwrite key...
   nr->ptr = child;
   stasis_record_write_done(xid,p,pFirstSlot,(byte*)nr);
+  // XXX move this up before we insert LC into the root? Removes write lock on lc.
   if(!depth) {
     lastLeaf = lc->id;
     pageid_t tmpid = -1;


@@ -86,7 +86,7 @@ void logtable<TUPLE>::init_stasis() {
   DataPage<datatuple>::register_stasis_page_impl();
   // XXX Workaround Stasis' (still broken) default concurrent buffer manager
   stasis_buffer_manager_size = 1024 * 1024; // 4GB = 2^10 pages:
-  stasis_buffer_manager_factory = stasis_buffer_manager_hash_factory;
+  //  stasis_buffer_manager_factory = stasis_buffer_manager_hash_factory;
   Tinit();


@@ -353,7 +353,7 @@ public:
   logtable * ltable;
   uint64_t epoch;
   typedef mergeManyIterator<
-    typename memTreeComponent<TUPLE>::revalidatingIterator,
+    typename memTreeComponent<TUPLE>::batchedRevalidatingIterator,
     typename memTreeComponent<TUPLE>::iterator> inner_merge_it_t;
   typedef mergeManyIterator<
     inner_merge_it_t,
@@ -374,7 +374,7 @@ public:
   void validate() {
-    typename memTreeComponent<TUPLE>::revalidatingIterator * c0_it;
+    typename memTreeComponent<TUPLE>::batchedRevalidatingIterator * c0_it;
     typename memTreeComponent<TUPLE>::iterator *c0_mergeable_it[1];
     diskTreeComponent::iterator * disk_it[4];
     epoch = ltable->get_epoch();
@@ -388,7 +388,7 @@ public:
       t = NULL;
     }
-    c0_it = new typename memTreeComponent<TUPLE>::revalidatingIterator(ltable->get_tree_c0(), &ltable->rb_mut, t);
+    c0_it = new typename memTreeComponent<TUPLE>::batchedRevalidatingIterator(ltable->get_tree_c0(), 100, &ltable->rb_mut, t);
     c0_mergeable_it[0] = new typename memTreeComponent<TUPLE>::iterator (ltable->get_tree_c0_mergeable(), t);
     if(ltable->get_tree_c1_prime()) {
       disk_it[0] = ltable->get_tree_c1_prime()->open_iterator(t);


@@ -1,6 +1,7 @@
 #ifndef _MEMTREECOMPONENT_H_
 #define _MEMTREECOMPONENT_H_
 #include <set>
+#include <assert.h>

 template<class TUPLE>
 class memTreeComponent {
@@ -141,6 +142,113 @@ public:
     TUPLE * next_ret_;
     pthread_mutex_t * mut_;
   };
+
+  ///////////////////////////////////////////////////////////////
+  // Batched revalidating iterator; automatically copes with changes
+  // to the underlying tree, and amortizes the cost of latching it by
+  // copying tuples out in batches.
+  ///////////////////////////////////////////////////////////////
+  class batchedRevalidatingIterator
+  {
+  private:
+    typedef typename rbtree_t::const_iterator MTITER;
+
+    // Copy up to batch_size_ tuples out of the tree, starting at it.
+    // The caller must hold mut_.
+    void populate_next_ret_impl(std::_Rb_tree_const_iterator<TUPLE*>/*MTITER*/ it) {
+      num_batched_ = 0;
+      cur_off_ = 0;
+      while(it != s_->end() && num_batched_ < batch_size_) {
+        next_ret_[num_batched_] = (*it)->create_copy();
+        num_batched_++;
+        it++;
+      }
+    }
+    // Refill the batch if it is exhausted, taking mut_ once per refill.
+    void populate_next_ret(TUPLE *key=NULL) {
+      if(cur_off_ == num_batched_) {
+        if(mut_) pthread_mutex_lock(mut_);
+        if(key) {
+          populate_next_ret_impl(s_->upper_bound(key));
+        } else {
+          populate_next_ret_impl(s_->begin());
+        }
+        if(mut_) pthread_mutex_unlock(mut_);
+      }
+    }
+  public:
+    batchedRevalidatingIterator( rbtree_t *s, int batch_size, pthread_mutex_t * rb_mut ) : s_(s), batch_size_(batch_size), num_batched_(batch_size), cur_off_(batch_size), mut_(rb_mut) {
+      next_ret_ = (TUPLE**)malloc(sizeof(next_ret_[0]) * batch_size_);
+      populate_next_ret();
+      /* if(mut_) pthread_mutex_lock(mut_);
+      if(s_->begin() == s_->end()) {
+        next_ret_ = NULL;
+      } else {
+        next_ret_ = (*s_->begin())->create_copy(); // the create_copy() calls have to happen before we release mut_...
+      }
+      if(mut_) pthread_mutex_unlock(mut_); */
+    }
+    batchedRevalidatingIterator( rbtree_t *s, int batch_size, pthread_mutex_t * rb_mut, TUPLE *&key ) : s_(s), batch_size_(batch_size), num_batched_(batch_size), cur_off_(batch_size), mut_(rb_mut) {
+      next_ret_ = (TUPLE**)malloc(sizeof(next_ret_[0]) * batch_size_);
+      populate_next_ret(key);
+      /* if(mut_) pthread_mutex_lock(mut_);
+      if(key) {
+        if(s_->find(key) != s_->end()) {
+          next_ret_ = (*(s_->find(key)))->create_copy();
+        } else if(s_->upper_bound(key) != s_->end()) {
+          next_ret_ = (*(s_->upper_bound(key)))->create_copy();
+        } else {
+          next_ret_ = NULL;
+        }
+      } else {
+        if(s_->begin() == s_->end()) {
+          next_ret_ = NULL;
+        } else {
+          next_ret_ = (*s_->begin())->create_copy(); // the create_copy() calls have to happen before we release mut_...
+        }
+      }
+      // DEBUG("changing mem next ret = %s key = %s\n", next_ret_ ? (const char*)next_ret_->key() : "NONE", key ? (const char*)key->key() : "NULL");
+      if(mut_) pthread_mutex_unlock(mut_); */
+    }
+    ~batchedRevalidatingIterator() {
+      // Free any copies that were batched but never returned to the caller.
+      for(int i = cur_off_; i < num_batched_; i++) {
+        TUPLE::freetuple(next_ret_[i]);
+      }
+      free(next_ret_);
+      // if(next_ret_) TUPLE::freetuple(next_ret_);
+    }
+    TUPLE* next_callerFrees() {
+      /* if(mut_) pthread_mutex_lock(mut_);
+      TUPLE * ret = next_ret_;
+      if(next_ret_) {
+        if(s_->upper_bound(next_ret_) == s_->end()) {
+          next_ret_ = 0;
+        } else {
+          next_ret_ = (*s_->upper_bound(next_ret_))->create_copy();
+        }
+      }
+      if(mut_) pthread_mutex_unlock(mut_); */
+      if(cur_off_ == num_batched_) { return NULL; } // the last thing we did was call populate_next_ret(), which only leaves us in this state at the end of the iterator.
+      TUPLE * ret = next_ret_[cur_off_];
+      cur_off_++;
+      populate_next_ret(ret);
+      return ret;
+    }
+
+  private:
+    explicit batchedRevalidatingIterator() { abort(); }
+    void operator=(batchedRevalidatingIterator & t) { abort(); }
+    int operator-(batchedRevalidatingIterator & t) { abort(); }
+
+    rbtree_t *s_;
+    TUPLE ** next_ret_;   // batch of up to batch_size_ tuple copies
+    int batch_size_;
+    int num_batched_;     // number of valid entries in next_ret_
+    int cur_off_;         // next entry to hand to the caller
+    pthread_mutex_t * mut_;
+  };
 };
 #endif //_MEMTREECOMPONENT_H_
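Rough amortization, from the code above: with n tuples in C0 and batch size b, the old revalidatingIterator took rb_mut and re-seeked the tree (an O(log n) upper_bound()) once per tuple; batchedRevalidatingIterator pays one latch acquisition and one upper_bound() per refill, i.e. about ceil(n/b) of each, while still performing the same n create_copy() calls. With the b = 100 used by the call sites in this commit, that is roughly a 100x reduction in C0 latch traffic during a merge. A usage sketch matching those call sites (the caller owns and frees each returned copy):

    memTreeComponent<datatuple>::batchedRevalidatingIterator *it =
      new memTreeComponent<datatuple>::batchedRevalidatingIterator(
            ltable->get_tree_c0(), 100, &ltable->rb_mut);
    datatuple *t;
    while((t = it->next_callerFrees()) != NULL) {
      // ... merge t into the next component ...
      datatuple::freetuple(t);
    }
    delete it;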


@@ -271,9 +271,11 @@ void mergeManager::tick(mergeStats * s, bool block, bool force) {
       double_to_ts(&sleep_until, sleeptime + tv_to_double(&now));
       sleeping[s->merge_level] = true;
       if(s->merge_level == 0) abort();
+      rwlc_unlock(ltable->header_mut);
       struct timespec ts;
       double_to_ts(&ts, sleeptime);
       nanosleep(&ts, 0);
+      rwlc_writelock(ltable->header_mut);
       sleeping[s->merge_level] = false;
       pthread_cond_broadcast(&throttle_wokeup_cond);
       gettimeofday(&now, 0);
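Dropping header_mut around the throttle nap matters: without the unlock/relock pair, this thread would hold the header lock for the whole sleep interval, stalling writers and the other merge levels for exactly as long as the throttle is trying to slow this one down. The sleeping[] flag and the throttle_wokeup_cond broadcast are still manipulated with the write lock re-acquired.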


@@ -175,8 +175,10 @@ void* memMergeThread(void*arg)
     memTreeComponent<datatuple>::iterator *itrB =
       new memTreeComponent<datatuple>::iterator(ltable->get_tree_c0_mergeable());
 #else
-    memTreeComponent<datatuple>::revalidatingIterator *itrB =
-      new memTreeComponent<datatuple>::revalidatingIterator(ltable->get_tree_c0(), &ltable->rb_mut);
+    //    memTreeComponent<datatuple>::revalidatingIterator *itrB =
+    //      new memTreeComponent<datatuple>::revalidatingIterator(ltable->get_tree_c0(), &ltable->rb_mut);
+    memTreeComponent<datatuple>::batchedRevalidatingIterator *itrB =
+      new memTreeComponent<datatuple>::batchedRevalidatingIterator(ltable->get_tree_c0(), 100, &ltable->rb_mut);
 #endif

     //create a new tree
@@ -397,6 +399,37 @@ static void periodically_force(int xid, int *i, diskTreeComponent * forceMe, sta
   }
 }

+static int garbage_collect(logtable<datatuple> * ltable, datatuple ** garbage, int garbage_len, int next_garbage, bool force = false) {
+  if(next_garbage == garbage_len || force) {
+    pthread_mutex_lock(&ltable->rb_mut);
+    for(int i = 0; i < next_garbage; i++) {
+      datatuple * t2tmp = NULL;
+      {
+        memTreeComponent<datatuple>::rbtree_t::iterator rbitr = ltable->get_tree_c0()->find(garbage[i]);
+        if(rbitr != ltable->get_tree_c0()->end()) {
+          t2tmp = *rbitr;
+          if((t2tmp->datalen() == garbage[i]->datalen()) &&
+             !memcmp(t2tmp->data(), garbage[i]->data(), garbage[i]->datalen())) {
+            // they match; delete t2tmp below
+          } else {
+            t2tmp = NULL; // tuple was overwritten after we merged it; leave it in C0
+          }
+        }
+      } // close rbitr before touching the tree.
+      if(t2tmp) {
+        ltable->get_tree_c0()->erase(garbage[i]);
+        ltable->tree_bytes -= garbage[i]->byte_length();
+        datatuple::freetuple(t2tmp);
+      }
+      datatuple::freetuple(garbage[i]);
+    }
+    pthread_mutex_unlock(&ltable->rb_mut);
+    return 0;
+  } else {
+    return next_garbage;
+  }
+}
+
 template <class ITA, class ITB>
 void merge_iterators(int xid,
                      diskTreeComponent * forceMe,
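garbage_collect() applies the same batching trick to C0 cleanup: tuples the merge has already written to C1 accumulate in the caller's garbage[] array, and rb_mut is taken once per garbage_len tuples instead of once per tuple. The datalen()/memcmp() check ensures a tuple is only erased from C0 if it is still byte-for-byte the version that was merged; if a concurrent writer replaced it, the newer version stays in the tree and only the merge's private copy is freed.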
@@ -413,6 +446,10 @@ void merge_iterators(int xid,
     ltable->merge_mgr->read_tuple_from_large_component(stats->merge_level, t1);
     datatuple *t2 = 0;

+    int garbage_len = 100;
+    int next_garbage = 0;
+    datatuple ** garbage = (datatuple**)malloc(sizeof(garbage[0]) * garbage_len);
+
     int i = 0;
     while( (t2=itrB->next_callerFrees()) != 0)
@@ -467,6 +504,12 @@ void merge_iterators(int xid,
       // cannot free any tuples here; they may still be read through a lookup
     }
 #ifndef NO_SNOWSHOVEL
+    if(stats->merge_level == 1) {
+      next_garbage = garbage_collect(ltable, garbage, garbage_len, next_garbage);
+      garbage[next_garbage] = t2;
+      next_garbage++;
+    }
+#if 0
     pthread_mutex_lock(&ltable->rb_mut);
     if(stats->merge_level == 1) {
       datatuple * t2tmp = NULL;
@@ -487,8 +530,14 @@ void merge_iterators(int xid,
     }
     pthread_mutex_unlock(&ltable->rb_mut);
 #endif
+    if(stats->merge_level != 1) {
       datatuple::freetuple(t2);
+    }
+#else
+    datatuple::freetuple(t2);
+#endif
+  }

   while(t1 != 0) {// t1 is less than t2
     scratch_tree->insertTuple(xid, t1);
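Note the ownership hand-off above: when stats->merge_level == 1, t2 is not freed at the bottom of the loop; it is parked in garbage[] and freed later by garbage_collect() (along with the matching C0 copy, if it survived). For other levels, and when NO_SNOWSHOVEL is defined, t2 is freed immediately as before.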
@@ -503,5 +552,8 @@ void merge_iterators(int xid,
   }

   DEBUG("dpages: %d\tnpages: %d\tntuples: %d\n", dpages, npages, ntuples);

+  next_garbage = garbage_collect(ltable, garbage, garbage_len, next_garbage, true);
+  free(garbage);
+
   scratch_tree->writes_done();
 }


@@ -35,7 +35,7 @@ int main(int argc, char *argv[])
   }
   if(argc == 2 && !strcmp(argv[1], "--benchmark")) {
-    c0_size = 1024 * 1024 * 1024 * 1;
+    c0_size = 1024 * 1024 * 768 * 1;
     printf("note: running w/ 2GB c0 for benchmarking\n"); // XXX build a separate test server and deployment server?
   }
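Note: the benchmark C0 shrinks from 1024 * 1024 * 1024 = 1 GB to 1024 * 1024 * 768 = 768 MB, while the printf still claims a 2 GB C0; the message was stale before this change and remains so.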