Implemented snowshoveling, with caveats.
This commit includes concurrency control for datapages, new backpressure for the c0-c1 merger, and (untested) support for defining NO_SNOWSHOVEL to fall back to the old approach. There are a number of important limitations:
- statistics gathering is somewhat messed up
- the c0-c1 merge thread polls for new data
- on a sequential write, if the writers stall, the merge thread can empty memory and initiate an unnecessary merge; instead, we should never finish a merge while c0 is less than N% full
- this commit introduces yet another batch of nasty mutex contention issues

git-svn-id: svn+ssh://svn.corp.yahoo.com/yahoo/yrl/labs/pnuts/code/logstore@955 8dad8b1f-cf64-0410-95b6-bcf113ffbcfe
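For readers skimming the diff, the heart of the new backpressure is in mergeManager::tick(): with snowshoveling enabled, writers are slowed in proportion to how full c0 is and stall outright once it overruns. Below is a minimal, self-contained sketch of that idea only; the names (MemBudget, throttle_writer, merged_bytes) are illustrative and are not part of logstore's API, and the real code coordinates through header_mut/tick_mut rather than an atomic counter.

#include <atomic>
#include <chrono>
#include <cstdint>
#include <thread>

// Hypothetical stand-in for the c0 accounting that logtable/mergeManager share.
struct MemBudget {
  std::atomic<int64_t> tree_bytes{0};  // bytes currently buffered in c0
  int64_t max_c0_size;                 // target size of c0

  explicit MemBudget(int64_t max_bytes) : max_c0_size(max_bytes) {}

  // Writer side: call after inserting `delta` bytes into c0.
  void throttle_writer(int64_t delta) {
    int64_t cur = tree_bytes.fetch_add(delta) + delta;
    // Hard stall: c0 overran its budget; wait for the merge thread to drain it.
    while (cur > max_c0_size) {
      std::this_thread::sleep_for(std::chrono::seconds(1));
      cur = tree_bytes.load();
    }
    // Soft throttle: above 90% of the budget, sleep proportionally to the overshoot.
    const double hi = 0.9 * static_cast<double>(max_c0_size);
    if (static_cast<double>(cur) > hi) {
      double slp = 0.01 + (static_cast<double>(cur) - hi) / static_cast<double>(max_c0_size);
      std::this_thread::sleep_for(std::chrono::duration<double>(slp));
    }
  }

  // Merger side: call as tuples are shoveled out of c0 into c1.
  void merged_bytes(int64_t delta) { tree_bytes.fetch_sub(delta); }
};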
parent 00d6cd2440
commit dc75f6d1f0

8 changed files with 275 additions and 106 deletions
datapage.cpp (147 lines changed)

@ -108,8 +108,6 @@ void DataPage<TUPLE>::initialize_page(pageid_t pageid) {
#else
  p = loadUninitializedPage(xid_, pageid);
#endif
  //XXX this is pretty obnoxious. Perhaps stasis shouldn't check for the latch
  writelock(p->rwlatch,0);

  DEBUG("\t\t\t\t\t\t->%lld\n", pageid);

@ -128,28 +126,31 @@ void DataPage<TUPLE>::initialize_page(pageid_t pageid) {
  //set the page dirty
  stasis_page_lsn_write(xid_, p, alloc_->get_lsn(xid_));

  //release the page
  unlock(p->rwlatch);
  releasePage(p);
}
template <class TUPLE>
size_t DataPage<TUPLE>::write_bytes(const byte * buf, ssize_t remaining) {
  recordid chunk = calc_chunk_from_offset(write_offset_);
  if(chunk.size > remaining) {
    chunk.size = remaining;
  }
  if(chunk.page >= first_page_ + page_count_) {
    chunk.size = 0; // no space (should not happen)
  } else {
    Page *p = alloc_ ? alloc_->load_page(xid_, chunk.page) : loadPage(xid_, chunk.page);
    memcpy(data_at_offset_ptr(p, chunk.slot), buf, chunk.size);
    writelock(p->rwlatch,0);
    stasis_page_lsn_write(xid_, p, alloc_->get_lsn(xid_));
    unlock(p->rwlatch);
    releasePage(p);
    write_offset_ += chunk.size;
  }
  return chunk.size;
size_t DataPage<TUPLE>::write_bytes(const byte * buf, ssize_t remaining, Page ** latch_p) {
  if(latch_p) { *latch_p = NULL; }
  recordid chunk = calc_chunk_from_offset(write_offset_);
  if(chunk.size > remaining) {
    chunk.size = remaining;
  }
  if(chunk.page >= first_page_ + page_count_) {
    chunk.size = 0; // no space (should not happen)
  } else {
    Page *p = alloc_ ? alloc_->load_page(xid_, chunk.page) : loadPage(xid_, chunk.page);
    assert(chunk.size);
    memcpy(data_at_offset_ptr(p, chunk.slot), buf, chunk.size);
    stasis_page_lsn_write(xid_, p, alloc_->get_lsn(xid_));
    if(latch_p && !*latch_p) {
      writelock(p->rwlatch,0);
      *latch_p = p;
    } else {
      releasePage(p);
    }
    write_offset_ += chunk.size;
  }
  return chunk.size;
}
template <class TUPLE>
size_t DataPage<TUPLE>::read_bytes(byte * buf, off_t offset, ssize_t remaining) {

@ -191,39 +192,64 @@ bool DataPage<TUPLE>::initialize_next_page() {
  Page *p = alloc_ ? alloc_->load_page(xid_, rid.page-1) : loadPage(xid_, rid.page-1);
  *is_another_page_ptr(p) = (rid.page-1 == first_page_) ? 2 : 1;
  writelock(p->rwlatch, 0);
  stasis_page_lsn_write(xid_, p, alloc_->get_lsn(xid_));
  unlock(p->rwlatch);
  releasePage(p);

  initialize_page(rid.page);
  return true;
}

template<class TUPLE>
Page * DataPage<TUPLE>::write_data_and_latch(const byte * buf, size_t len, bool init_next, bool latch) {
  bool first = true;
  Page * p = 0;
  while(1) {
    assert(len > 0);
//    if(latch) {
//      if(first) { assert(!p); } else { assert(p); }
//    } else {
//      assert(!p);
//    }
    size_t written;
    if(latch && first ) {
      written = write_bytes(buf, len, &p);
    } else {
      written = write_bytes(buf, len);
    }
    if(written == 0) {
      assert(!p);
      return 0; // fail
    }
    if(written == len) {
      if(latch) {
        return p;
      } else {
//        assert(!p);
        return (Page*)1;
      }
    }
    if(len > PAGE_SIZE && ! first) {
      assert(written > 4000);
    }
    buf += written;
    len -= written;
    if(init_next) {
      if(!initialize_next_page()) {
        if(p) {
//          assert(latch);
          unlock(p->rwlatch);
          releasePage(p);
        }
        return 0; // fail
      }
    }
    first = false;
  }
}

template <class TUPLE>
bool DataPage<TUPLE>::write_data(const byte * buf, size_t len, bool init_next) {
  bool first = true;
  while(1) {
    assert(len > 0);
    size_t written = write_bytes(buf, len);
    if(written == 0) {
      return false; // fail
    }
    if(written == len) {
      return true; // success
    }
    if(len > PAGE_SIZE && ! first) {
      assert(written > 4000);
    }
    buf += written;
    len -= written;
    if(init_next) {
      if(!initialize_next_page()) {
        return false; // fail
      }
    }
    first = false;
  }
  return 0 != write_data_and_latch(buf, len, init_next, false);
}
template <class TUPLE>
bool DataPage<TUPLE>::read_data(byte * buf, off_t offset, size_t len) {

@ -255,9 +281,12 @@ bool DataPage<TUPLE>::append(TUPLE const * dat)
  byte * buf = dat->to_bytes(); // TODO could be more efficient; this does a malloc and memcpy. The alternative couples us more strongly to datapage, but simplifies datapage.
  len_t dat_len = dat->byte_length();

  bool succ = write_data((const byte*)&dat_len, sizeof(dat_len));
  if(succ) {
  Page * p = write_data_and_latch((const byte*)&dat_len, sizeof(dat_len));
  bool succ = false;
  if(p) {
    succ = write_data(buf, dat_len);
    unlock(p->rwlatch);
    releasePage(p);
  }

  free(buf);

@ -306,14 +335,21 @@ TUPLE* DataPage<TUPLE>::iterator::getnext()
  len_t len;
  bool succ;
  if(dp == NULL) { return NULL; }
  // XXX hack: read latch the page that the record will live on.
  // This should be handled by a read_data_in_latch function, or something...
  Page * p = loadPage(dp->xid_, dp->calc_chunk_from_offset(read_offset_).page);
  readlock(p->rwlatch, 0);
  succ = dp->read_data((byte*)&len, read_offset_, sizeof(len));
  if((!succ) || (len == 0)) { return NULL; }
  read_offset_ += sizeof(len);

  byte * buf = (byte*)malloc(len);

  succ = dp->read_data(buf, read_offset_, len);

  // release hacky latch
  unlock(p->rwlatch);
  releasePage(p);

  if(!succ) { read_offset_ -= sizeof(len); free(buf); return NULL; }

  read_offset_ += len;

@ -325,19 +361,4 @@ TUPLE* DataPage<TUPLE>::iterator::getnext()
  return ret;
}

/*template <class TUPLE>
void DataPage<TUPLE>::RecordIterator::advance(int xid, int count)
{
  len_t len;
  bool succ;
  for(int i = 0; i < count; i++) {
    succ = dp->read_data(xid, (byte*)&len, read_offset_, sizeof(len));
    if((!succ) || (len == 0)) { return; }
    read_offset_ += sizeof(len);
    read_offset_ += len;
  }
}*/

template class DataPage<datatuple>;

@ -128,8 +128,9 @@ private:
    assert(ret.size);
    return ret;
  }
  size_t write_bytes(const byte * buf, ssize_t remaining);
  size_t write_bytes(const byte * buf, ssize_t remaining, Page ** latch_p = NULL);
  size_t read_bytes(byte * buf, off_t offset, ssize_t remaining);
  Page * write_data_and_latch(const byte * buf, size_t len, bool init_next = true, bool latch = true);
  bool write_data(const byte * buf, size_t len, bool init_next = true);
  bool read_data(byte * buf, off_t offset, size_t len);
  bool initialize_next_page();
logstore.cpp (96 lines changed)

@ -26,6 +26,8 @@ logtable<TUPLE>::logtable(pageid_t internal_region_size, pageid_t datapage_regio
  r_val = MIN_R;
  tree_c0 = NULL;
  tree_c0_mergeable = NULL;
  c0_is_merging = false;
  tree_c1_prime = NULL;
  tree_c1 = NULL;
  tree_c1_mergeable = NULL;
  tree_c2 = NULL;

@ -81,6 +83,7 @@ void logtable<TUPLE>::init_stasis() {
  DataPage<datatuple>::register_stasis_page_impl();
  // XXX Workaround Stasis' (still broken) default concurrent buffer manager
  stasis_buffer_manager_size = 1024 * 1024; // 4GB = 2^10 pages:
  stasis_buffer_manager_factory = stasis_buffer_manager_hash_factory;

  Tinit();

@ -173,35 +176,42 @@ void logtable<TUPLE>::flushTable()
  gettimeofday(&start_tv,0);
  start = tv_to_double(start_tv);

  int expmcount = merge_count;
  merge_mgr->finished_merge(0);

  //this is for waiting the previous merger of the mem-tree
  //hopefullly this wont happen

  bool blocked = false;

  int expmcount = merge_count;
  //this waits for the previous merger of the mem-tree
  //hopefullly this wont happen

#ifdef NO_SNOWSHOVEL
  while(get_tree_c0_mergeable()) {
#else
  while(get_c0_is_merging()) {
#endif
    rwlc_cond_wait(&c0_needed, header_mut);
    blocked = true;
    if(expmcount != merge_count) {
      return;
    }
  }
  set_c0_is_merging(true);

  c0_stats->handed_off_tree();
  merge_mgr->new_merge(0);

  gettimeofday(&stop_tv,0);
  stop = tv_to_double(stop_tv);
#ifdef NO_SNOWSHOVEL
  set_tree_c0_mergeable(get_tree_c0());
#endif
  pthread_cond_signal(&c0_ready);
  DEBUG("Signaled c0-c1 merge thread\n");

  merge_count ++;
#ifdef NO_SNOWSHOVEL
  set_tree_c0(new memTreeComponent<datatuple>::rbtree_t);
#endif
  c0_stats->starting_merge();

  tsize = 0;

@ -273,8 +283,37 @@ datatuple * logtable<TUPLE>::findTuple(int xid, const datatuple::key_t key, size
    }
  }

  //step 3: check c1
  //step 2.5: check new c1 if exists
  if(!done && get_tree_c1_prime() != 0)
  {
    DEBUG("old c1 tree not null\n");
    datatuple *tuple_oc1 = get_tree_c1_prime()->findTuple(xid, key, keySize);

    if(tuple_oc1 != NULL)
    {
      bool use_copy = false;
      if(tuple_oc1->isDelete())
        done = true;
      else if(ret_tuple != 0) //merge the two
      {
        datatuple *mtuple = tmerger->merge(tuple_oc1, ret_tuple); //merge the two
        datatuple::freetuple(ret_tuple); //free tuple from before
        ret_tuple = mtuple; //set return tuple to merge result
      }
      else //found for the first time
      {
        use_copy = true;
        ret_tuple = tuple_oc1;
      }

      if(!use_copy)
      {
        datatuple::freetuple(tuple_oc1); //free tuple from tree old c1
      }
    }
  }

  //step 3: check c1
  if(!done)
  {
    datatuple *tuple_c1 = get_tree_c1()->findTuple(xid, key, keySize);

@ -282,13 +321,13 @@ datatuple * logtable<TUPLE>::findTuple(int xid, const datatuple::key_t key, size
    {
      bool use_copy = false;
      if(tuple_c1->isDelete()) //tuple deleted
        done = true;
        done = true;
      else if(ret_tuple != 0) //merge the two
      {
        datatuple *mtuple = tmerger->merge(tuple_c1, ret_tuple); //merge the two
        datatuple::freetuple(ret_tuple); //free tuple from before
        ret_tuple = mtuple; //set return tuple to merge result
      }
        ret_tuple = mtuple; //set return tuple to merge result
      }
      else //found for the first time
      {
        use_copy = true;

@ -411,6 +450,19 @@ datatuple * logtable<TUPLE>::findTuple_first(int xid, datatuple::key_t key, size
    }
  }

  if(ret_tuple == 0)
  {
    DEBUG("Not in first disk tree\n");

    //step 4: check in progress c1 if exists
    if( get_tree_c1_prime() != 0)
    {
      DEBUG("old c1 tree not null\n");
      ret_tuple = get_tree_c1_prime()->findTuple(xid, key, keySize);
    }
  }

  if(ret_tuple == 0)
  {
    DEBUG("Not in old mem tree\n");

@ -453,15 +505,16 @@ void logtable<TUPLE>::insertTuple(datatuple *tuple)
{
  //lock the red-black tree
  merge_mgr->read_tuple_from_small_component(0, tuple); // has to be before rb_mut, since it calls tick with block = true, and that releases header_mut.
  datatuple * pre_t = 0; // this is a pointer to any data tuples that we'll be deleting below. We need to update the merge_mgr statistics with it, but have to do so outside of the rb_mut region.

  pthread_mutex_lock(&rb_mut);
  //find the previous tuple with same key in the memtree if exists
  memTreeComponent<datatuple>::rbtree_t::iterator rbitr = tree_c0->find(tuple);
  datatuple * t = 0;
  if(rbitr != tree_c0->end())
  {
    datatuple *pre_t = *rbitr;
    pre_t = *rbitr;
    //do the merging
    merge_mgr->read_tuple_from_large_component(0, pre_t);
    datatuple *new_t = tmerger->merge(pre_t, tuple);
    c0_stats->merged_tuples(new_t, tuple, pre_t);
    t = new_t;

@ -472,7 +525,6 @@ void logtable<TUPLE>::insertTuple(datatuple *tuple)
    //update the tree size (+ new_t size - pre_t size)
    tree_bytes += ((int64_t)new_t->byte_length() - (int64_t)pre_t->byte_length());

    datatuple::freetuple(pre_t); //free the previous tuple
  }
  else //no tuple with same key exists in mem-tree
  {

@ -486,9 +538,9 @@ void logtable<TUPLE>::insertTuple(datatuple *tuple)
    tree_bytes += t->byte_length();// + RB_TREE_OVERHEAD;

  }
  merge_mgr->wrote_tuple(0, t); // needs to be here; doesn't grab a mutex.

  merge_mgr->wrote_tuple(0, t);

#ifdef NO_SNOWSHOVEL
  //flushing logic
  if(tree_bytes >= max_c0_size )
  {

@ -501,13 +553,19 @@ void logtable<TUPLE>::insertTuple(datatuple *tuple)
    rwlc_writelock(header_mut);
    // the test of tree size needs to be atomic with the flushTable, and flushTable needs a writelock.
    if(tree_bytes >= max_c0_size) {
      flushTable();
      flushTable(); // this needs to hold rb_mut if snowshoveling is disabled, but can't hold rb_mut if snowshoveling is enabled.
    }
    rwlc_unlock(header_mut);
  }

#endif
  pthread_mutex_unlock(&rb_mut);

  // XXX is it OK to move this after the NO_SNOWSHOVEL block?
  if(pre_t) {
    // needs to be here; calls update_progress, which sometimes grabs mutexes..
    merge_mgr->read_tuple_from_large_component(0, pre_t); // was interspersed with the erase, insert above...
    datatuple::freetuple(pre_t); //free the previous tuple
  }

  DEBUG("tree size %d tuples %lld bytes.\n", tsize, tree_bytes);
}
logstore.h (27 lines changed)

@ -63,9 +63,11 @@ public:
  inline diskTreeComponent * get_tree_c2(){return tree_c2;}
  inline diskTreeComponent * get_tree_c1(){return tree_c1;}
  inline diskTreeComponent * get_tree_c1_mergeable(){return tree_c1_mergeable;}
  inline diskTreeComponent * get_tree_c1_prime(){return tree_c1_prime;}

  inline void set_tree_c1(diskTreeComponent *t){tree_c1=t; bump_epoch(); }
  inline void set_tree_c1_mergeable(diskTreeComponent *t){tree_c1_mergeable=t; bump_epoch(); }
  inline void set_tree_c1_prime(diskTreeComponent *t){tree_c1_prime=t; bump_epoch(); }
  inline void set_tree_c2(diskTreeComponent *t){tree_c2=t; bump_epoch(); }
  pthread_cond_t c0_needed;
  pthread_cond_t c0_ready;

@ -80,6 +82,8 @@ public:
    merge_mgr->set_c0_size(max_c0_size);
    merge_mgr->get_merge_stats(1);
  }
  bool get_c0_is_merging() { return c0_is_merging; }
  void set_c0_is_merging(bool is_merging) { c0_is_merging = is_merging; }
  void set_tree_c0_mergeable(memTreeComponent<datatuple>::rbtree_ptr_t newtree){tree_c0_mergeable = newtree; bump_epoch(); }
  void update_persistent_header(int xid, int merge_level);

@ -128,13 +132,15 @@ private:
  diskTreeComponent *tree_c2; //big tree
  diskTreeComponent *tree_c1; //small tree
  diskTreeComponent *tree_c1_mergeable; //small tree: ready to be merged with c2
  diskTreeComponent *tree_c1_prime; //small tree: ready to be merged with c2
  memTreeComponent<datatuple>::rbtree_ptr_t tree_c0; // in-mem red black tree
  memTreeComponent<datatuple>::rbtree_ptr_t tree_c0_mergeable; // in-mem red black tree: ready to be merged with c1.
  bool c0_is_merging;

  int tsize; //number of tuples
public:
  int64_t tree_bytes; //number of bytes

public:
  //DATA PAGE SETTINGS
  pageid_t internal_region_size; // in number of pages
  pageid_t datapage_region_size; // "

@ -369,7 +375,7 @@ public:
  void validate() {
    typename memTreeComponent<TUPLE>::revalidatingIterator * c0_it;
    typename memTreeComponent<TUPLE>::iterator *c0_mergeable_it[1];
    diskTreeComponent::iterator * disk_it[3];
    diskTreeComponent::iterator * disk_it[4];
    epoch = ltable->get_epoch();

    datatuple *t;

@ -383,17 +389,22 @@ public:
    c0_it = new typename memTreeComponent<TUPLE>::revalidatingIterator(ltable->get_tree_c0(), &ltable->rb_mut, t);
    c0_mergeable_it[0] = new typename memTreeComponent<TUPLE>::iterator (ltable->get_tree_c0_mergeable(), t);
    disk_it[0] = ltable->get_tree_c1()->open_iterator(t);
    if(ltable->get_tree_c1_mergeable()) {
      disk_it[1] = ltable->get_tree_c1_mergeable()->open_iterator(t);
    if(ltable->get_tree_c1_prime()) {
      disk_it[0] = ltable->get_tree_c1_prime()->open_iterator(t);
    } else {
      disk_it[1] = NULL;
      disk_it[0] = NULL;
    }
    disk_it[2] = ltable->get_tree_c2()->open_iterator(t);
    disk_it[1] = ltable->get_tree_c1()->open_iterator(t);
    if(ltable->get_tree_c1_mergeable()) {
      disk_it[2] = ltable->get_tree_c1_mergeable()->open_iterator(t);
    } else {
      disk_it[2] = NULL;
    }
    disk_it[3] = ltable->get_tree_c2()->open_iterator(t);

    inner_merge_it_t * inner_merge_it =
      new inner_merge_it_t(c0_it, c0_mergeable_it, 1, NULL, TUPLE::compare_obj);
    merge_it_ = new merge_it_t(inner_merge_it, disk_it, 3, NULL, TUPLE::compare_obj); // XXX Hardcodes comparator, and does not handle merges
    merge_it_ = new merge_it_t(inner_merge_it, disk_it, 4, NULL, TUPLE::compare_obj); // XXX Hardcodes comparator, and does not handle merges
    if(last_returned) {
      TUPLE * junk = merge_it_->peek();
      if(junk && !TUPLE::compare(junk->key(), junk->keylen(), last_returned->key(), last_returned->keylen())) {
@ -56,6 +56,8 @@ void mergeManager::update_progress(mergeStats * s, int delta) {
  if(s->merge_level < 2 && s->mergeable_size && delta) {
    int64_t effective_max_delta = (int64_t)(UPDATE_PROGRESS_PERIOD * s->bps);

    if(s->merge_level == 0) { s->base_size = ltable->tree_bytes; }

    if(s->mini_delta > effective_max_delta) {
      struct timeval now;
      gettimeofday(&now, 0);

@ -92,7 +94,15 @@ void mergeManager::update_progress(mergeStats * s, int delta) {
      s->out_progress = 0.0;
    }
  }
#ifdef NO_SNOWSHOVEL
  s->current_size = s->base_size + s->bytes_out - s->bytes_in_large;
#else
  if(s->merge_level == 0 && delta) {
    s->current_size = s->bytes_out - s->bytes_in_large;
  } else {
    s->current_size = s->base_size + s->bytes_out - s->bytes_in_large;
  }
#endif
  struct timeval now;
  gettimeofday(&now, 0);
  double elapsed_delta = tv_to_double(&now) - ts_to_double(&s->last_tick);

@ -101,14 +111,12 @@ void mergeManager::update_progress(mergeStats * s, int delta) {
  s->lifetime_consumed += s->bytes_in_small_delta;
  double tau = 60.0; // number of seconds to look back for window computation. (this is the expected mean residence time in an exponential decay model, so the units are not so intuitive...)
  double decay = exp((0.0-elapsed_delta)/tau);
//  s->window_elapsed = (decay * s->window_elapsed) + elapsed_delta;
//  s->window_consumed = (decay * s->window_consumed) + s->bytes_in_small_delta;

  double_to_ts(&s->last_tick, tv_to_double(&now));

  double window_bps = ((double)s->bytes_in_small_delta) / (double)elapsed_delta;

  s->bps = (1.0-decay) * window_bps + decay * s->bps; //s->window_consumed / s->window_elapsed;
  s->bps = (1.0-decay) * window_bps + decay * s->bps;

  s->bytes_in_small_delta = 0;

@ -139,13 +147,17 @@ void mergeManager::update_progress(mergeStats * s, int delta) {
 * bytes_consumed_by_merger = sum(bytes_in_small_delta)
 */
void mergeManager::tick(mergeStats * s, bool block, bool force) {
#define PRINT_SKIP 100
#define PRINT_SKIP 10000
  if(block) {
    // sleep(((double)delta)/[s+1]->bps); // XXX We currently sleep based on the past performance of the current tree. In the limit, this is fine, but it would be better to sleep based on the past throughput of the tree component we're waiting for. fill in the parameters
  }
  if(force || s->need_tick) {

    if(block) {
    if(block
#ifndef NO_SNOWSHOVEL
       && s->merge_level == 0
#endif
       ) {
      pthread_mutex_lock(&ltable->tick_mut);
      rwlc_readlock(ltable->header_mut);

@ -154,7 +166,7 @@ void mergeManager::tick(mergeStats * s, bool block, bool force) {
        pthread_cond_wait(&throttle_wokeup_cond, &ltable->tick_mut);
        rwlc_readlock(ltable->header_mut);
      }

#ifdef NO_SNOWSHOVEL
      int64_t overshoot = 0;
      int64_t overshoot2 = 0;
      int64_t raw_overshoot = 0;

@ -257,6 +269,23 @@ void mergeManager::tick(mergeStats * s, bool block, bool force) {
          break;
        }
      } while(1);
#else
      while(/*s->current_size*/ltable->tree_bytes > ltable->max_c0_size) {
        rwlc_unlock(ltable->header_mut);
        printf("\nMEMORY OVERRUN!!!! SLEEP!!!!\n");
        sleep(1);
        rwlc_readlock(ltable->header_mut);
      }
      if(/*s->current_size*/ltable->tree_bytes > 0.9 * (double)ltable->max_c0_size) {
        double slp = 0.01 + (double)(((double)ltable->tree_bytes)-0.9*(double)ltable->max_c0_size) / (double)(ltable->max_c0_size);
        DEBUG("\nsleeping %0.6f tree_megabytes %0.3f\n", slp, ((double)ltable->tree_bytes)/(1024.0*1024.0));
        struct timespec sleeptime;
        double_to_ts(&sleeptime, slp);
        rwlc_unlock(ltable->header_mut);
        nanosleep(&sleeptime, 0);
        rwlc_readlock(ltable->header_mut);
      }
#endif
      rwlc_unlock(ltable->header_mut);
      pthread_mutex_unlock(&ltable->tick_mut);
    } else {

@ -360,8 +389,10 @@ void mergeManager::pretty_print(FILE * out) {
  double c0_c1_out_progress = 100.0 * c1->current_size / c1->target_size;
  double c1_c2_progress = 100.0 * (c2->bytes_in_large + c2->bytes_in_small) / (c1->mergeable_size + c2->base_size);

#ifdef NO_SNOWSHOVEL
  assert((!c1->active) || (c0_c1_in_progress >= -1 && c0_c1_in_progress < 102));
  assert((!c2->active) || (c1_c2_progress >= -1 && c1_c2_progress < 102));
#endif

  fprintf(out,"[merge progress MB/s window (lifetime)]: app [%s %6lldMB ~ %3.0f%% %6.1fsec %4.1f (%4.1f)] %s %s [%s %3.0f%% ~ %3.0f%% %4.1f (%4.1f)] %s %s [%s %3.0f%% %4.1f (%4.1f)] %s ",
    c0->active ? "RUN" : "---", (long long)(c0->lifetime_consumed / mb), c0_out_progress, c0->lifetime_elapsed, c0->bps/((double)mb), c0->lifetime_consumed/(((double)mb)*c0->lifetime_elapsed),

@ -118,11 +118,11 @@ class mergeStats {
  struct timespec last_mini_tick;
  struct timespec last_tick;
public: // XXX only accessed during initialization.
  pageid_t base_size;
  pageid_t base_size; // size of table at beginning of merge. for c0, size of table at beginning of current c0-c1 merge round, plus data written since then. (this minus c1->bytes_in_small is the current size)
  pageid_t mergeable_size; // protected by mutex.
protected:
  pageid_t target_size;
  pageid_t current_size;
protected:

  pageid_t bytes_out_with_overhead;// How many bytes did we write (including internal tree nodes)?
public:
merger.cpp (60 lines changed)

@ -56,9 +56,11 @@ void merge_scheduler::startlogtable(int index, int64_t MAX_C0_SIZE)
  ltable->set_tree_c0(new memTreeComponent<datatuple>::rbtree_t);

  //disk merger args

#ifdef NO_SNOWSHOVEL
  ltable->set_max_c0_size(MAX_C0_SIZE);
#else
  ltable->set_max_c0_size(MAX_C0_SIZE*2); // XXX blatant hack.
#endif
  diskTreeComponent ** block1_scratch = new diskTreeComponent*;
  *block1_scratch=0;

@ -121,6 +123,7 @@ void* memMergeThread(void*arg)
  ltable->merge_mgr->new_merge(1);
  int done = 0;
  // 2: wait for c0_mergable
#ifdef NO_SNOWSHOVEL
  while(!ltable->get_tree_c0_mergeable())
  {
    pthread_cond_signal(&ltable->c0_needed);

@ -136,7 +139,21 @@ void* memMergeThread(void*arg)
      DEBUG("mmt:\tblock ready\n");

    }
  }
#else
  if(!ltable->is_still_running()) {
    done = 1;
  }
  while(ltable->tree_bytes < 0.5 * (double)ltable->max_c0_size && ! done) {
    rwlc_unlock(ltable->header_mut);
    sleep(1); // XXX fixme!
    rwlc_writelock(ltable->header_mut);

    if(!ltable->is_still_running()) {
      done = 1;
    }
  }
#endif

  if(done==1)
  {

@ -154,13 +171,19 @@ void* memMergeThread(void*arg)
  //create the iterators
  diskTreeComponent::iterator *itrA = ltable->get_tree_c1()->open_iterator();
#ifdef NO_SNOWSHOVEL
  memTreeComponent<datatuple>::iterator *itrB =
    new memTreeComponent<datatuple>::iterator(ltable->get_tree_c0_mergeable());
#else
  memTreeComponent<datatuple>::revalidatingIterator *itrB =
    new memTreeComponent<datatuple>::revalidatingIterator(ltable->get_tree_c0(), &ltable->rb_mut);
#endif

  //create a new tree
  diskTreeComponent * c1_prime = new diskTreeComponent(xid, ltable->internal_region_size, ltable->datapage_region_size, ltable->datapage_size, stats);

  ltable->set_tree_c1_prime(c1_prime);

  rwlc_unlock(ltable->header_mut);

  //: do the merge

@ -191,11 +214,15 @@ void* memMergeThread(void*arg)
  // 10: c1 = c1'
  ltable->set_tree_c1(c1_prime);
  ltable->set_tree_c1_prime(0);

#ifdef NO_SNOWSHOVEL
  // 11.5: delete old c0_mergeable
  memTreeComponent<datatuple>::tearDownTree(ltable->get_tree_c0_mergeable());
  // 11: c0_mergeable = NULL
  ltable->set_tree_c0_mergeable(NULL);
#endif
  ltable->set_c0_is_merging(false);
  double new_c1_size = stats->output_size();
  pthread_cond_signal(&ltable->c0_needed);

@ -226,7 +253,7 @@ void* memMergeThread(void*arg)
  // we just set c1 = c1'. Want to move c1 -> c1 mergeable, clean out c1.

  // 7: and perhaps c1_mergeable
  ltable->set_tree_c1_mergeable(c1_prime); // c1_prime == c1.
  ltable->set_tree_c1_mergeable(ltable->get_tree_c1()); // c1_prime == c1.
  stats->handed_off_tree();

  // 8: c1 = new empty.

@ -344,7 +371,7 @@ void *diskMergeThread(void*arg)
  DEBUG("\nR = %f\n", *(ltable->R()));

  DEBUG("dmt:\tmerge_count %lld\t#written bytes: %lld\n optimal r %.2f", stats.merge_count, stats.output_size(), *(a->r_i));
  // 10: C2 is never to big
  // 10: C2 is never too big
  ltable->set_tree_c2(c2_prime);
  stats->handed_off_tree();

@ -439,6 +466,27 @@ void merge_iterators(int xid,
      periodically_force(xid, &i, forceMe, log);
      // cannot free any tuples here; they may still be read through a lookup
    }
#ifndef NO_SNOWSHOVEL
    pthread_mutex_lock(&ltable->rb_mut);
    if(stats->merge_level == 1) {
      datatuple * t2tmp = NULL;
      {
        memTreeComponent<datatuple>::rbtree_t::iterator rbitr = ltable->get_tree_c0()->find(t2);
        if(rbitr != ltable->get_tree_c0()->end()) {
          t2tmp = *rbitr;
          if((t2tmp->datalen() == t2->datalen()) &&
             !memcmp(t2tmp->data(), t2->data(), t2->datalen())) {
          }
        }
      }
      if(t2tmp) {
        ltable->get_tree_c0()->erase(t2);
        ltable->tree_bytes -= t2->byte_length();
        datatuple::freetuple(t2tmp);
      }
    }
    pthread_mutex_unlock(&ltable->rb_mut);
#endif
    datatuple::freetuple(t2);
  }

@ -12,7 +12,6 @@
#undef try
#undef end

class RegionAllocator
{
 public: