add memTreeComponent::batchedRevalidatingIterator; amortize cost of red-black tree latching during merge
git-svn-id: svn+ssh://svn.corp.yahoo.com/yahoo/yrl/labs/pnuts/code/logstore@1017 8dad8b1f-cf64-0410-95b6-bcf113ffbcfe
This commit is contained in:
parent 1e487bbc54
commit b894cebaf7

7 changed files with 171 additions and 9 deletions
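The full diff follows. The core idea: during a merge, the old revalidatingIterator latched C0's red-black tree once per tuple; the new memTreeComponent::batchedRevalidatingIterator latches it once per batch of (here) 100 tuples, copies the batch out while the latch is held, and re-enters the tree via upper_bound() on the last tuple it returned, so inserts and erases that happen between batches are tolerated. A minimal standalone sketch of that pattern, with illustrative names (copy_batch is not from this codebase):

#include <pthread.h>
#include <set>
#include <vector>

// One latch acquisition pays for up to batch_size copies. The caller
// resumes by passing the last element it consumed; the tree is re-entered
// through upper_bound(), so concurrent modification between batches is OK.
template<class T>
size_t copy_batch(std::set<T> &tree, const T *resume_after, size_t batch_size,
                  std::vector<T> &out, pthread_mutex_t *mut) {
  pthread_mutex_lock(mut);
  typename std::set<T>::const_iterator it =
      resume_after ? tree.upper_bound(*resume_after) : tree.begin();
  size_t n = 0;
  for(; it != tree.end() && n < batch_size; ++it, ++n) {
    out.push_back(*it); // copy out while the latch is held
  }
  pthread_mutex_unlock(mut);
  return n; // 0 => iteration is complete
}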
@@ -249,11 +249,11 @@ recordid diskTreeComponent::internalNodes::appendPage(int xid,
   slotid_t numslots = stasis_record_last(xid, p).slot+1;
   recordid rid;
   rid.page = p->id;
-
+  // XXX writelock lc here? no need, since it's not installed in the tree yet
   for(rid.slot = FIRST_SLOT; rid.slot < numslots; rid.slot++) {
     //read the record from the root page
     rid.size = stasis_record_length_read(xid, p, rid);
     const indexnode_rec *nr = (const indexnode_rec*)stasis_record_read_begin(xid, p, rid);

     recordid cnext = stasis_record_alloc_begin(xid, lc,rid.size);

     assert(rid.slot == cnext.slot);
@@ -288,7 +288,7 @@ recordid diskTreeComponent::internalNodes::appendPage(int xid,
       // don't overwrite key...
       nr->ptr = child;
       stasis_record_write_done(xid,p,pFirstSlot,(byte*)nr);
-
+      // XXX move this up before we insert LC into the root? Removes write lock on lc.
       if(!depth) {
         lastLeaf = lc->id;
         pageid_t tmpid = -1;
@@ -86,7 +86,7 @@ void logtable<TUPLE>::init_stasis() {
   DataPage<datatuple>::register_stasis_page_impl();
   // XXX Workaround Stasis' (still broken) default concurrent buffer manager
   stasis_buffer_manager_size = 1024 * 1024; // 4GB = 2^20 pages
-  stasis_buffer_manager_factory = stasis_buffer_manager_hash_factory;
+  //  stasis_buffer_manager_factory = stasis_buffer_manager_hash_factory;

   Tinit();
@@ -353,7 +353,7 @@ public:
   logtable * ltable;
   uint64_t epoch;
   typedef mergeManyIterator<
-    typename memTreeComponent<TUPLE>::revalidatingIterator,
+    typename memTreeComponent<TUPLE>::batchedRevalidatingIterator,
     typename memTreeComponent<TUPLE>::iterator> inner_merge_it_t;
   typedef mergeManyIterator<
     inner_merge_it_t,
@@ -374,7 +374,7 @@ public:


   void validate() {
-    typename memTreeComponent<TUPLE>::revalidatingIterator * c0_it;
+    typename memTreeComponent<TUPLE>::batchedRevalidatingIterator * c0_it;
     typename memTreeComponent<TUPLE>::iterator *c0_mergeable_it[1];
     diskTreeComponent::iterator * disk_it[4];
     epoch = ltable->get_epoch();
@@ -388,7 +388,7 @@ public:
       t = NULL;
     }

-    c0_it = new typename memTreeComponent<TUPLE>::revalidatingIterator(ltable->get_tree_c0(), &ltable->rb_mut, t);
+    c0_it = new typename memTreeComponent<TUPLE>::batchedRevalidatingIterator(ltable->get_tree_c0(), 100, &ltable->rb_mut, t);
     c0_mergeable_it[0] = new typename memTreeComponent<TUPLE>::iterator (ltable->get_tree_c0_mergeable(), t);
     if(ltable->get_tree_c1_prime()) {
       disk_it[0] = ltable->get_tree_c1_prime()->open_iterator(t);
@@ -1,6 +1,7 @@
 #ifndef _MEMTREECOMPONENT_H_
 #define _MEMTREECOMPONENT_H_
 #include <set>
+#include <assert.h>

 template<class TUPLE>
 class memTreeComponent {
@@ -141,6 +142,113 @@ public:
     TUPLE * next_ret_;
     pthread_mutex_t * mut_;
   };

+  ///////////////////////////////////////////////////////////////
+  // Batched revalidating iterator; automatically copes with changes
+  // to the underlying tree, and amortizes the cost of latching it by
+  // copying tuples out in batches.
+  ///////////////////////////////////////////////////////////////
+
+  class batchedRevalidatingIterator
+  {
+  private:
+    typedef typename rbtree_t::const_iterator MTITER;
+
+    // Copy up to batch_size_ tuples out of the tree, starting at it.
+    // The caller must hold mut_.  (libstdc++'s std::_Rb_tree_const_iterator
+    // is spelled out here; MTITER is the portable typedef.)
+    void populate_next_ret_impl(std::_Rb_tree_const_iterator<TUPLE*>/*MTITER*/ it) {
+      num_batched_ = 0;
+      cur_off_ = 0;
+      while(it != s_->end() && num_batched_ < batch_size_) {
+        next_ret_[num_batched_] = (*it)->create_copy();
+        num_batched_++;
+        it++;
+      }
+    }
+    // Refill the buffer once it is drained; takes mut_ once per batch.
+    void populate_next_ret(TUPLE *key=NULL) {
+      if(cur_off_ == num_batched_) {
+        if(mut_) pthread_mutex_lock(mut_);
+        if(key) {
+          populate_next_ret_impl(s_->upper_bound(key));
+        } else {
+          populate_next_ret_impl(s_->begin());
+        }
+        if(mut_) pthread_mutex_unlock(mut_);
+      }
+    }
+
+  public:
+    // num_batched_ and cur_off_ start out equal to batch_size so that the
+    // buffer counts as drained and the first populate_next_ret() fills it.
+    batchedRevalidatingIterator( rbtree_t *s, int batch_size, pthread_mutex_t * rb_mut ) : s_(s), batch_size_(batch_size), num_batched_(batch_size), cur_off_(batch_size), mut_(rb_mut) {
+      next_ret_ = (TUPLE**)malloc(sizeof(next_ret_[0]) * batch_size_);
+      populate_next_ret();
+      /* if(mut_) pthread_mutex_lock(mut_);
+      if(s_->begin() == s_->end()) {
+        next_ret_ = NULL;
+      } else {
+        next_ret_ = (*s_->begin())->create_copy(); // the create_copy() calls have to happen before we release mut_...
+      }
+      if(mut_) pthread_mutex_unlock(mut_); */
+    }
+    batchedRevalidatingIterator( rbtree_t *s, int batch_size, pthread_mutex_t * rb_mut, TUPLE *&key ) : s_(s), batch_size_(batch_size), num_batched_(batch_size), cur_off_(batch_size), mut_(rb_mut) {
+      next_ret_ = (TUPLE**)malloc(sizeof(next_ret_[0]) * batch_size_);
+      populate_next_ret(key);
+      /* if(mut_) pthread_mutex_lock(mut_);
+      if(key) {
+        if(s_->find(key) != s_->end()) {
+          next_ret_ = (*(s_->find(key)))->create_copy();
+        } else if(s_->upper_bound(key) != s_->end()) {
+          next_ret_ = (*(s_->upper_bound(key)))->create_copy();
+        } else {
+          next_ret_ = NULL;
+        }
+      } else {
+        if(s_->begin() == s_->end()) {
+          next_ret_ = NULL;
+        } else {
+          next_ret_ = (*s_->begin())->create_copy(); // the create_copy() calls have to happen before we release mut_...
+        }
+      }
+      // DEBUG("changing mem next ret = %s key = %s\n", next_ret_ ? (const char*)next_ret_->key() : "NONE", key ? (const char*)key->key() : "NULL");
+      if(mut_) pthread_mutex_unlock(mut_); */
+    }
+
+    ~batchedRevalidatingIterator() {
+      for(int i = cur_off_; i < num_batched_; i++) {
+        TUPLE::freetuple(next_ret_[i]); // free tuples that were copied but never handed out
+      }
+      free(next_ret_);
+      // if(next_ret_) TUPLE::freetuple(next_ret_);
+    }
+
+    TUPLE* next_callerFrees() {
+      /* if(mut_) pthread_mutex_lock(mut_);
+      TUPLE * ret = next_ret_;
+      if(next_ret_) {
+        if(s_->upper_bound(next_ret_) == s_->end()) {
+          next_ret_ = 0;
+        } else {
+          next_ret_ = (*s_->upper_bound(next_ret_))->create_copy();
+        }
+      }
+      if(mut_) pthread_mutex_unlock(mut_); */
+      if(cur_off_ == num_batched_) { return NULL; } // the last thing we did was call populate_next_ret(), which only leaves us in this state at the end of the iterator.
+      TUPLE * ret = next_ret_[cur_off_];
+      cur_off_++;
+      populate_next_ret(ret);
+      return ret;
+    }
+
+  private:
+    explicit batchedRevalidatingIterator() { abort(); }
+    void operator=(batchedRevalidatingIterator & t) { abort(); }
+    int operator-(batchedRevalidatingIterator & t) { abort(); }
+
+    rbtree_t *s_;
+    TUPLE ** next_ret_; // buffer of up to batch_size_ copied tuples
+    int batch_size_;
+    int num_batched_;   // number of tuples currently in next_ret_
+    int cur_off_;       // next tuple to hand out; == num_batched_ when drained
+    pthread_mutex_t * mut_;
+  };
 };

 #endif //_MEMTREECOMPONENT_H_
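A usage sketch for the new class, mirroring the call sites in merger.cpp below; it assumes only the logtable<datatuple> interface visible in this diff (get_tree_c0(), rb_mut, datatuple::freetuple()), and batch size 100 is the value this commit picks:

// Drain C0 through a batched iterator; each returned tuple is a copy
// that the caller owns and must free.
static void drain_c0(logtable<datatuple> * ltable) {
  memTreeComponent<datatuple>::batchedRevalidatingIterator itr(
      ltable->get_tree_c0(), 100, &ltable->rb_mut);
  datatuple * t;
  while((t = itr.next_callerFrees()) != 0) {
    // ... merge t into the next component down ...
    datatuple::freetuple(t);
  }
}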
@@ -271,9 +271,11 @@ void mergeManager::tick(mergeStats * s, bool block, bool force) {
       double_to_ts(&sleep_until, sleeptime + tv_to_double(&now));
       sleeping[s->merge_level] = true;
       if(s->merge_level == 0) abort();
+      rwlc_unlock(ltable->header_mut);
       struct timespec ts;
       double_to_ts(&ts, sleeptime);
       nanosleep(&ts, 0);
+      rwlc_writelock(ltable->header_mut);
       sleeping[s->merge_level] = false;
       pthread_cond_broadcast(&throttle_wokeup_cond);
       gettimeofday(&now, 0);
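The two added lines above drop the header latch for the duration of the throttling sleep, so other threads can make progress while this merge thread stalls itself; any latch-guarded state must be re-read after the writelock is retaken. The same pattern in plain pthreads (rwlc_* is the rwlatch/condition wrapper this code uses; this sketch is illustrative, not from the repo):

#include <pthread.h>
#include <time.h>

// Sleep without holding the lock; callers must re-validate guarded
// state after the lock is reacquired.
static void throttle(pthread_rwlock_t *lock, double seconds) {
  pthread_rwlock_unlock(lock);
  struct timespec ts;
  ts.tv_sec  = (time_t)seconds;
  ts.tv_nsec = (long)((seconds - (double)ts.tv_sec) * 1e9);
  nanosleep(&ts, 0);
  pthread_rwlock_wrlock(lock);
}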
merger.cpp (56 changes)
@@ -175,8 +175,10 @@ void* memMergeThread(void*arg)
     memTreeComponent<datatuple>::iterator *itrB =
         new memTreeComponent<datatuple>::iterator(ltable->get_tree_c0_mergeable());
 #else
-    memTreeComponent<datatuple>::revalidatingIterator *itrB =
-        new memTreeComponent<datatuple>::revalidatingIterator(ltable->get_tree_c0(), &ltable->rb_mut);
+//    memTreeComponent<datatuple>::revalidatingIterator *itrB =
+//        new memTreeComponent<datatuple>::revalidatingIterator(ltable->get_tree_c0(), &ltable->rb_mut);
+    memTreeComponent<datatuple>::batchedRevalidatingIterator *itrB =
+        new memTreeComponent<datatuple>::batchedRevalidatingIterator(ltable->get_tree_c0(), 100, &ltable->rb_mut);
 #endif

     //create a new tree
@@ -397,6 +399,37 @@ static void periodically_force(int xid, int *i, diskTreeComponent * forceMe, sta
   }
 }

+static int garbage_collect(logtable<datatuple> * ltable, datatuple ** garbage, int garbage_len, int next_garbage, bool force = false) {
+  if(next_garbage == garbage_len || force) {
+    pthread_mutex_lock(&ltable->rb_mut);
+    for(int i = 0; i < next_garbage; i++) {
+      datatuple * t2tmp = NULL;
+      {
+        memTreeComponent<datatuple>::rbtree_t::iterator rbitr = ltable->get_tree_c0()->find(garbage[i]);
+        if(rbitr != ltable->get_tree_c0()->end()) {
+          t2tmp = *rbitr;
+          if((t2tmp->datalen() == garbage[i]->datalen()) &&
+             !memcmp(t2tmp->data(), garbage[i]->data(), garbage[i]->datalen())) {
+            // they match, delete t2tmp
+          } else {
+            t2tmp = NULL;
+          }
+        }
+      } // close rbitr before touching the tree.
+      if(t2tmp) {
+        ltable->get_tree_c0()->erase(garbage[i]);
+        ltable->tree_bytes -= garbage[i]->byte_length();
+        datatuple::freetuple(t2tmp);
+      }
+      datatuple::freetuple(garbage[i]);
+    }
+    pthread_mutex_unlock(&ltable->rb_mut);
+    return 0;
+  } else {
+    return next_garbage;
+  }
+}
+
 template <class ITA, class ITB>
 void merge_iterators(int xid,
                      diskTreeComponent * forceMe,
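garbage_collect() is the C0 side of the same amortization: tuples that have been merged into C1 (the copies handed out by next_callerFrees()) are buffered, and once garbage_len of them accumulate they are all erased from C0 under a single rb_mut acquisition. The byte-compare against the live C0 tuple guards against racing updates: if the tuple changed after the copy was taken, it is left in place. Assembled from the call sites in the hunks below, for reference (not a new API):

int garbage_len = 100;
int next_garbage = 0;
datatuple ** garbage = (datatuple**)malloc(sizeof(garbage[0]) * garbage_len);

// per tuple t2 merged out of C0 (merge_level == 1 only):
next_garbage = garbage_collect(ltable, garbage, garbage_len, next_garbage);
garbage[next_garbage] = t2; // garbage_collect() returns 0 after a flush, so this never overflows
next_garbage++;

// once the merge completes, force a final flush:
next_garbage = garbage_collect(ltable, garbage, garbage_len, next_garbage, true);
free(garbage);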
@@ -413,6 +446,10 @@ void merge_iterators(int xid,
   ltable->merge_mgr->read_tuple_from_large_component(stats->merge_level, t1);
   datatuple *t2 = 0;

+  int garbage_len = 100;
+  int next_garbage = 0;
+  datatuple ** garbage = (datatuple**)malloc(sizeof(garbage[0]) * garbage_len);
+
   int i = 0;

   while( (t2=itrB->next_callerFrees()) != 0)
@@ -467,6 +504,12 @@ void merge_iterators(int xid,
       // cannot free any tuples here; they may still be read through a lookup
     }
 #ifndef NO_SNOWSHOVEL
+    if(stats->merge_level == 1) {
+      next_garbage = garbage_collect(ltable, garbage, garbage_len, next_garbage);
+      garbage[next_garbage] = t2;
+      next_garbage++;
+    }
+#if 0
     pthread_mutex_lock(&ltable->rb_mut);
     if(stats->merge_level == 1) {
       datatuple * t2tmp = NULL;
@@ -487,7 +530,13 @@ void merge_iterators(int xid,
     }
     pthread_mutex_unlock(&ltable->rb_mut);
 #endif
+    if(stats->merge_level != 1) {
+      datatuple::freetuple(t2);
+    }
+#else
     datatuple::freetuple(t2);
+#endif

   }

   while(t1 != 0) {// t1 is less than t2
@@ -503,5 +552,8 @@ void merge_iterators(int xid,
   }
   DEBUG("dpages: %d\tnpages: %d\tntuples: %d\n", dpages, npages, ntuples);

+  next_garbage = garbage_collect(ltable, garbage, garbage_len, next_garbage, true);
+  free(garbage);
+
   scratch_tree->writes_done();
 }
@@ -35,7 +35,7 @@ int main(int argc, char *argv[])
   }

   if(argc == 2 && !strcmp(argv[1], "--benchmark")) {
-    c0_size = 1024 * 1024 * 1024 * 1;
+    c0_size = 1024 * 1024 * 768 * 1;
     printf("note: running w/ 2GB c0 for benchmarking\n"); // XXX build a separate test server and deployment server?
   }