add memTreeComponent::batchedRevalidatingIterator; amortize cost of red-black tree latching during merge
git-svn-id: svn+ssh://svn.corp.yahoo.com/yahoo/yrl/labs/pnuts/code/logstore@1017 8dad8b1f-cf64-0410-95b6-bcf113ffbcfe
This commit is contained in:
parent 1e487bbc54
commit b894cebaf7

7 changed files with 171 additions and 9 deletions
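For context, a minimal sketch of the batching idea behind the new memTreeComponent::batchedRevalidatingIterator (illustrative only, not the committed code): instead of re-acquiring the C0 red-black tree mutex for every tuple the merge reads, up to batch_size tuples are copied out under a single lock acquisition and then handed out without the latch. The helper name copy_batch and the RBTREE/TUPLE parameters are stand-ins patterned on the diff below (rb_mut, create_copy, batch_size).

// Hedged sketch only -- simplified stand-in for the batching idea, not logstore code.
// RBTREE is assumed to be a std::set of TUPLE* ordered by tuple key (logstore's
// rbtree_t); TUPLE is assumed to provide create_copy(), as in the diff below.
#include <pthread.h>
#include <vector>

template<class TUPLE, class RBTREE>
size_t copy_batch(RBTREE *c0, pthread_mutex_t *rb_mut, TUPLE *after,
                  size_t batch_size, std::vector<TUPLE*> &out) {
  pthread_mutex_lock(rb_mut);               // one latch acquisition per batch...
  typename RBTREE::const_iterator it = after ? c0->upper_bound(after)
                                             : c0->begin();
  while(it != c0->end() && out.size() < batch_size) {
    out.push_back((*it)->create_copy());    // ...amortized over up to batch_size copies
    ++it;
  }
  pthread_mutex_unlock(rb_mut);             // caller consumes out[] without the latch
  return out.size();
}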
@@ -249,11 +249,11 @@ recordid diskTreeComponent::internalNodes::appendPage(int xid,
  slotid_t numslots = stasis_record_last(xid, p).slot+1;
  recordid rid;
  rid.page = p->id;
  // XXX writelock lc here? no need, since it's not installed in the tree yet
  for(rid.slot = FIRST_SLOT; rid.slot < numslots; rid.slot++) {
    //read the record from the root page
    rid.size = stasis_record_length_read(xid, p, rid);
    const indexnode_rec *nr = (const indexnode_rec*)stasis_record_read_begin(xid, p, rid);

    recordid cnext = stasis_record_alloc_begin(xid, lc,rid.size);

    assert(rid.slot == cnext.slot);
@@ -288,7 +288,7 @@ recordid diskTreeComponent::internalNodes::appendPage(int xid,
    // don't overwrite key...
    nr->ptr = child;
    stasis_record_write_done(xid,p,pFirstSlot,(byte*)nr);

    // XXX move this up before we insert LC into the root? Removes write lock on lc.
    if(!depth) {
      lastLeaf = lc->id;
      pageid_t tmpid = -1;
@@ -86,7 +86,7 @@ void logtable<TUPLE>::init_stasis() {
  DataPage<datatuple>::register_stasis_page_impl();
  // XXX Workaround Stasis' (still broken) default concurrent buffer manager
  stasis_buffer_manager_size = 1024 * 1024; // 4GB = 2^20 pages * 4KB/page
  stasis_buffer_manager_factory = stasis_buffer_manager_hash_factory;
  // stasis_buffer_manager_factory = stasis_buffer_manager_hash_factory;

  Tinit();
@@ -353,7 +353,7 @@ public:
    logtable * ltable;
    uint64_t epoch;
    typedef mergeManyIterator<
      typename memTreeComponent<TUPLE>::revalidatingIterator,
      typename memTreeComponent<TUPLE>::batchedRevalidatingIterator,
      typename memTreeComponent<TUPLE>::iterator> inner_merge_it_t;
    typedef mergeManyIterator<
      inner_merge_it_t,
@@ -374,7 +374,7 @@ public:

    void validate() {
      typename memTreeComponent<TUPLE>::revalidatingIterator * c0_it;
      typename memTreeComponent<TUPLE>::batchedRevalidatingIterator * c0_it;
      typename memTreeComponent<TUPLE>::iterator *c0_mergeable_it[1];
      diskTreeComponent::iterator * disk_it[4];
      epoch = ltable->get_epoch();
@@ -388,7 +388,7 @@ public:
        t = NULL;
      }

      c0_it = new typename memTreeComponent<TUPLE>::revalidatingIterator(ltable->get_tree_c0(), &ltable->rb_mut, t);
      c0_it = new typename memTreeComponent<TUPLE>::batchedRevalidatingIterator(ltable->get_tree_c0(), 100, &ltable->rb_mut, t);
      c0_mergeable_it[0] = new typename memTreeComponent<TUPLE>::iterator (ltable->get_tree_c0_mergeable(), t);
      if(ltable->get_tree_c1_prime()) {
        disk_it[0] = ltable->get_tree_c1_prime()->open_iterator(t);
@@ -1,6 +1,7 @@
#ifndef _MEMTREECOMPONENT_H_
#define _MEMTREECOMPONENT_H_
#include <set>
#include <assert.h>

template<class TUPLE>
class memTreeComponent {
@@ -141,6 +142,113 @@ public:
    TUPLE * next_ret_;
    pthread_mutex_t * mut_;
  };

  ///////////////////////////////////////////////////////////////
  // Revalidating iterator; automatically copes with changes to underlying tree
  ///////////////////////////////////////////////////////////////

  class batchedRevalidatingIterator
  {
  private:
    typedef typename rbtree_t::const_iterator MTITER;

    void populate_next_ret_impl(std::_Rb_tree_const_iterator<TUPLE*>/*MTITER*/ it) {
      num_batched_ = 0;
      cur_off_ = 0;
      while(it != s_->end() && num_batched_ < batch_size_) {
        next_ret_[num_batched_] = (*it)->create_copy();
        num_batched_++;
        it++;
      }
    }
    void populate_next_ret(TUPLE *key=NULL) {
      if(cur_off_ == num_batched_) {
        if(mut_) pthread_mutex_lock(mut_);
        if(key) {
          populate_next_ret_impl(s_->upper_bound(key));
        } else {
          populate_next_ret_impl(s_->begin());
        }
        if(mut_) pthread_mutex_unlock(mut_);
      }
    }

  public:
    batchedRevalidatingIterator( rbtree_t *s, int batch_size, pthread_mutex_t * rb_mut ) : s_(s), batch_size_(batch_size), num_batched_(batch_size), cur_off_(batch_size), mut_(rb_mut) {
      next_ret_ = (TUPLE**)malloc(sizeof(next_ret_[0]) * batch_size_);
      populate_next_ret();
      /* if(mut_) pthread_mutex_lock(mut_);
      if(s_->begin() == s_->end()) {
        next_ret_ = NULL;
      } else {
        next_ret_ = (*s_->begin())->create_copy(); // the create_copy() calls have to happen before we release mut_...
      }
      if(mut_) pthread_mutex_unlock(mut_); */
    }
    batchedRevalidatingIterator( rbtree_t *s, int batch_size, pthread_mutex_t * rb_mut, TUPLE *&key ) : s_(s), batch_size_(batch_size), num_batched_(batch_size), cur_off_(batch_size), mut_(rb_mut) {
      next_ret_ = (TUPLE**)malloc(sizeof(next_ret_[0]) * batch_size_);
      populate_next_ret(key);
      /* if(mut_) pthread_mutex_lock(mut_);
      if(key) {
        if(s_->find(key) != s_->end()) {
          next_ret_ = (*(s_->find(key)))->create_copy();
        } else if(s_->upper_bound(key) != s_->end()) {
          next_ret_ = (*(s_->upper_bound(key)))->create_copy();
        } else {
          next_ret_ = NULL;
        }
      } else {
        if(s_->begin() == s_->end()) {
          next_ret_ = NULL;
        } else {
          next_ret_ = (*s_->begin())->create_copy(); // the create_copy() calls have to happen before we release mut_...
        }
      }
      // DEBUG("changing mem next ret = %s key = %s\n", next_ret_ ? (const char*)next_ret_->key() : "NONE", key ? (const char*)key->key() : "NULL");
      if(mut_) pthread_mutex_unlock(mut_); */
    }

    ~batchedRevalidatingIterator() {
      for(int i = cur_off_; i < num_batched_; i++) {
        TUPLE::freetuple(next_ret_[i]); // free the batched copies that were never handed out
      }
      free(next_ret_);
      // if(next_ret_) TUPLE::freetuple(next_ret_);
    }

    TUPLE* next_callerFrees() {
      /* if(mut_) pthread_mutex_lock(mut_);
      TUPLE * ret = next_ret_;
      if(next_ret_) {
        if(s_->upper_bound(next_ret_) == s_->end()) {
          next_ret_ = 0;
        } else {
          next_ret_ = (*s_->upper_bound(next_ret_))->create_copy();
        }
      }
      if(mut_) pthread_mutex_unlock(mut_); */
      if(cur_off_ == num_batched_) { return NULL; } // the last thing we did is call populate_next_ret_(), which only leaves us in this state at the end of the iterator.
      TUPLE * ret = next_ret_[cur_off_];
      cur_off_++;
      populate_next_ret(ret);
      return ret;
    }

  private:
    explicit batchedRevalidatingIterator() { abort(); }
    void operator=(batchedRevalidatingIterator & t) { abort(); }
    int operator-(batchedRevalidatingIterator & t) { abort(); }

    rbtree_t *s_;
    TUPLE ** next_ret_;
    int batch_size_;
    int num_batched_;
    int cur_off_;
    pthread_mutex_t * mut_;
  };

};

#endif //_MEMTREECOMPONENT_H_
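As a companion to the header above, here is a hedged usage sketch (assumed, not part of the commit) of draining C0 through the new batched iterator. The constructor call mirrors the one that appears in the merger.cpp hunk below (batch size 100, guarded by ltable->rb_mut); the simple consume-and-free loop is an illustrative simplification, since the real merge defers some frees to the garbage collector shown later.

// Hedged usage sketch only; next_callerFrees() hands back copies, so the
// caller owns (and must eventually free) every returned tuple.
memTreeComponent<datatuple>::batchedRevalidatingIterator *itr =
    new memTreeComponent<datatuple>::batchedRevalidatingIterator(
        ltable->get_tree_c0(), 100, &ltable->rb_mut);
datatuple *t;
while((t = itr->next_callerFrees()) != 0) {
  // ... consume the copy, e.g. insert it into the C1 tree being built ...
  datatuple::freetuple(t);   // caller-owned copy
}
delete itr;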
@@ -271,9 +271,11 @@ void mergeManager::tick(mergeStats * s, bool block, bool force) {
      double_to_ts(&sleep_until, sleeptime + tv_to_double(&now));
      sleeping[s->merge_level] = true;
      if(s->merge_level == 0) abort();
      rwlc_unlock(ltable->header_mut);
      struct timespec ts;
      double_to_ts(&ts, sleeptime);
      nanosleep(&ts, 0);
      rwlc_writelock(ltable->header_mut);
      sleeping[s->merge_level] = false;
      pthread_cond_broadcast(&throttle_wokeup_cond);
      gettimeofday(&now, 0);
merger.cpp (56 changed lines)
@@ -175,8 +175,10 @@ void* memMergeThread(void*arg)
    memTreeComponent<datatuple>::iterator *itrB =
      new memTreeComponent<datatuple>::iterator(ltable->get_tree_c0_mergeable());
#else
    memTreeComponent<datatuple>::revalidatingIterator *itrB =
      new memTreeComponent<datatuple>::revalidatingIterator(ltable->get_tree_c0(), &ltable->rb_mut);
//  memTreeComponent<datatuple>::revalidatingIterator *itrB =
//    new memTreeComponent<datatuple>::revalidatingIterator(ltable->get_tree_c0(), &ltable->rb_mut);
    memTreeComponent<datatuple>::batchedRevalidatingIterator *itrB =
      new memTreeComponent<datatuple>::batchedRevalidatingIterator(ltable->get_tree_c0(), 100, &ltable->rb_mut);
#endif

    //create a new tree
@@ -397,6 +399,37 @@ static void periodically_force(int xid, int *i, diskTreeComponent * forceMe, sta
  }
}

static int garbage_collect(logtable<datatuple> * ltable, datatuple ** garbage, int garbage_len, int next_garbage, bool force = false) {
  if(next_garbage == garbage_len || force) {
    pthread_mutex_lock(&ltable->rb_mut);
    for(int i = 0; i < next_garbage; i++) {
      datatuple * t2tmp = NULL;
      {
        memTreeComponent<datatuple>::rbtree_t::iterator rbitr = ltable->get_tree_c0()->find(garbage[i]);
        if(rbitr != ltable->get_tree_c0()->end()) {
          t2tmp = *rbitr;
          if((t2tmp->datalen() == garbage[i]->datalen()) &&
             !memcmp(t2tmp->data(), garbage[i]->data(), garbage[i]->datalen())) {
            // they match, delete t2tmp
          } else {
            t2tmp = NULL;
          }
        }
      } // close rbitr before touching the tree.
      if(t2tmp) {
        ltable->get_tree_c0()->erase(garbage[i]);
        ltable->tree_bytes -= garbage[i]->byte_length();
        datatuple::freetuple(t2tmp);
      }
      datatuple::freetuple(garbage[i]);
    }
    pthread_mutex_unlock(&ltable->rb_mut);
    return 0;
  } else {
    return next_garbage;
  }
}

template <class ITA, class ITB>
void merge_iterators(int xid,
                     diskTreeComponent * forceMe,
@@ -413,6 +446,10 @@ void merge_iterators(int xid,
  ltable->merge_mgr->read_tuple_from_large_component(stats->merge_level, t1);
  datatuple *t2 = 0;

  int garbage_len = 100;
  int next_garbage = 0;
  datatuple ** garbage = (datatuple**)malloc(sizeof(garbage[0]) * garbage_len);

  int i = 0;

  while( (t2=itrB->next_callerFrees()) != 0)
@@ -467,6 +504,12 @@ void merge_iterators(int xid,
      // cannot free any tuples here; they may still be read through a lookup
    }
#ifndef NO_SNOWSHOVEL
    if(stats->merge_level == 1) {
      next_garbage = garbage_collect(ltable, garbage, garbage_len, next_garbage);
      garbage[next_garbage] = t2;
      next_garbage++;
    }
#if 0
    pthread_mutex_lock(&ltable->rb_mut);
    if(stats->merge_level == 1) {
      datatuple * t2tmp = NULL;
@@ -487,7 +530,13 @@ void merge_iterators(int xid,
      }
      pthread_mutex_unlock(&ltable->rb_mut);
#endif
    if(stats->merge_level != 1) {
      datatuple::freetuple(t2);
    }
#else
    datatuple::freetuple(t2);
#endif

  }

  while(t1 != 0) {// t1 is less than t2
@@ -503,5 +552,8 @@ void merge_iterators(int xid,
  }
  DEBUG("dpages: %d\tnpages: %d\tntuples: %d\n", dpages, npages, ntuples);

  next_garbage = garbage_collect(ltable, garbage, garbage_len, next_garbage, true);
  free(garbage);

  scratch_tree->writes_done();
}
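Pulling the merge-side hunks above together, here is a hedged sketch (an assumed simplification, not the committed code) of the call pattern that amortizes rb_mut on the delete path: matched C0 tuples are queued in a fixed-size array, and garbage_collect() takes the mutex only once per garbage_len tuples, plus one forced flush at the end. The wrapper name drain_c0_with_batched_gc is hypothetical; the variable names mirror merge_iterators().

// Hedged sketch of the pattern merge_iterators() uses with garbage_collect().
static void drain_c0_with_batched_gc(logtable<datatuple> * ltable,
                                     memTreeComponent<datatuple>::batchedRevalidatingIterator * itrB,
                                     int merge_level) {
  int garbage_len = 100;
  int next_garbage = 0;
  datatuple ** garbage = (datatuple**)malloc(sizeof(garbage[0]) * garbage_len);
  datatuple * t2;
  while((t2 = itrB->next_callerFrees()) != 0) {
    // ... merge t2 into the new on-disk tree (elided) ...
    if(merge_level == 1) {
      // garbage_collect() only takes rb_mut when the buffer is full (or forced)
      next_garbage = garbage_collect(ltable, garbage, garbage_len, next_garbage);
      garbage[next_garbage++] = t2;   // defer the C0 erase and free
    } else {
      datatuple::freetuple(t2);
    }
  }
  // forced flush: erase the queued tuples from C0 under one lock acquisition
  next_garbage = garbage_collect(ltable, garbage, garbage_len, next_garbage, true);
  free(garbage);
}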
@@ -35,7 +35,7 @@ int main(int argc, char *argv[])
  }

  if(argc == 2 && !strcmp(argv[1], "--benchmark")) {
    c0_size = 1024 * 1024 * 1024 * 1;
    c0_size = 1024 * 1024 * 768 * 1;
    printf("note: running w/ 2GB c0 for benchmarking\n"); // XXX build a separate test server and deployment server?
  }