From 0652b0bf848001677c9d2fb94969e66cadd78f36 Mon Sep 17 00:00:00 2001 From: sears Date: Tue, 20 Mar 2012 22:46:29 +0000 Subject: [PATCH] moved bloom filter into stasis git-svn-id: svn+ssh://svn.corp.yahoo.com/yahoo/yrl/labs/pnuts/code/logstore@3896 8dad8b1f-cf64-0410-95b6-bcf113ffbcfe --- CMakeLists.txt | 2 +- bLSM.h | 8 +- bloomFilter.c | 159 ------------------------------------- bloomFilter.h | 50 ------------ diskTreeComponent.cpp | 4 +- diskTreeComponent.h | 10 +-- test/CMakeLists.txt | 2 - test/check_bloomFilter.cpp | 116 --------------------------- 8 files changed, 12 insertions(+), 339 deletions(-) delete mode 100644 bloomFilter.c delete mode 100644 bloomFilter.h delete mode 100644 test/check_bloomFilter.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 6233121..8bcaefa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,5 +96,5 @@ ENDIF ( "${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" ) #CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config.h) IF ( HAVE_STASIS ) - ADD_LIBRARY(blsm bLSM.cpp diskTreeComponent.cpp memTreeComponent.cpp dataPage.cpp mergeScheduler.cpp tupleMerger.cpp mergeStats.cpp mergeManager.cpp bloomFilter.c) + ADD_LIBRARY(blsm bLSM.cpp diskTreeComponent.cpp memTreeComponent.cpp dataPage.cpp mergeScheduler.cpp tupleMerger.cpp mergeStats.cpp mergeManager.cpp) ENDIF ( HAVE_STASIS ) diff --git a/bLSM.h b/bLSM.h index a3c5fd0..010f695 100644 --- a/bLSM.h +++ b/bLSM.h @@ -189,11 +189,11 @@ public: bool mightBeOnDisk(dataTuple * t) { if(tree_c1) { if(!tree_c1->bloom_filter) { DEBUG("no c1 bloom filter\n"); return true; } - if(bloom_filter_lookup(tree_c1->bloom_filter, (const char*)t->strippedkey(), t->strippedkeylen())) { DEBUG("in c1\n"); return true; } + if(stasis_bloom_filter_lookup(tree_c1->bloom_filter, (const char*)t->strippedkey(), t->strippedkeylen())) { DEBUG("in c1\n"); return true; } } if(tree_c1_prime) { if(!tree_c1_prime->bloom_filter) { DEBUG("no c1' bloom filter\n"); return true; } - if(bloom_filter_lookup(tree_c1_prime->bloom_filter, (const char*)t->strippedkey(), t->strippedkeylen())) { DEBUG("in c1'\n"); return true; } + if(stasis_bloom_filter_lookup(tree_c1_prime->bloom_filter, (const char*)t->strippedkey(), t->strippedkeylen())) { DEBUG("in c1'\n"); return true; } } return mightBeAfterMemMerge(t); } @@ -202,13 +202,13 @@ public: if(tree_c1_mergeable) { if(!tree_c1_mergeable->bloom_filter) { DEBUG("no c1m bloom filter\n"); return true; } - if(bloom_filter_lookup(tree_c1_mergeable->bloom_filter, (const char*)t->strippedkey(), t->strippedkeylen())) { DEBUG("in c1m'\n");return true; } + if(stasis_bloom_filter_lookup(tree_c1_mergeable->bloom_filter, (const char*)t->strippedkey(), t->strippedkeylen())) { DEBUG("in c1m'\n");return true; } } if(tree_c2) { if(!tree_c2->bloom_filter) { DEBUG("no c2 bloom filter\n"); return true; } - if(bloom_filter_lookup(tree_c2->bloom_filter, (const char*)t->strippedkey(), t->strippedkeylen())) { DEBUG("in c2\n");return true; } + if(stasis_bloom_filter_lookup(tree_c2->bloom_filter, (const char*)t->strippedkey(), t->strippedkeylen())) { DEBUG("in c2\n");return true; } } return false; } diff --git a/bloomFilter.c b/bloomFilter.c deleted file mode 100644 index 7be11f6..0000000 --- a/bloomFilter.c +++ /dev/null @@ -1,159 +0,0 @@ -/* - * bloomFilter.c - * - * Copyright 2010-2012 Yahoo! Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * Author: sears - */ -#include -#include -#include -#include "bloomFilter.h" -/** - Variable names: - m: number of bloom filter bits - n: number of bloom filter entries - k: number of hash functions = ln(2) * (m/n) - c: m/n - f: false positive rate = (1/2)^k ~= 0.6185)^(m/n) ; - taking log_0.6185 of both sides: k log_0.6185(1/2) = m/n ; - applying change of base: k log(1/2) / log(6.128) = m / n - (but that's not useful; this is:) - - f ~= 0.6185 ^ (m/n) - log_0.6185(f) = m/n - log(f) / log(0.6185) = m / n - m = n log f / log 0.6185 - p: probability a given bit is 1 ~= e^(-kn/m) - */ - -static uint64_t bloom_filter_calc_num_buckets(uint64_t num_expected_items, - double false_positive_rate) { - // m = n log f / log 0.6185 - return ((uint64_t) ceil(((double)num_expected_items) * - log(false_positive_rate) / log(0.6185))); - // m = - n ln f / ln 2 ^ 2 = - n ln f / 0.4804 = - n log f / (0.4343 * 0.4804) = -n log f / 0.2086 -} -static int bloom_filter_calc_num_functions(uint64_t num_expected_items, - uint64_t num_buckets) { - // k = ln(2) * (m/n) - int ret = floor((log(2) / log(exp(1.0))) - * ((double) num_buckets) / (double) num_expected_items); - if(ret == 0) { - return 1; - } else { - return ret; - } -} -static double bloom_filter_current_false_positive_rate(uint64_t actual_number_of_items, - uint64_t num_buckets) { - // 0.6185^(m/n) - return pow(0.6185, ((double)num_buckets)/(double)actual_number_of_items); -} - -struct bloom_filter_t { - uint64_t (*func_a)(const char *, int); - uint64_t (*func_b)(const char *, int); - uint64_t num_expected_items; - double desired_false_positive_rate; - uint64_t num_buckets; - uint8_t * buckets; - uint64_t num_functions; - uint64_t*result_scratch_space; - uint64_t actual_number_of_items; -}; -bloom_filter_t * bloom_filter_create(uint64_t(*func_a)(const char*,int), - uint64_t(*func_b)(const char*,int), - uint64_t num_expected_items, - double false_positive_rate) { - bloom_filter_t * ret = malloc(sizeof(*ret)); - ret->func_a = func_a; - ret->func_b = func_b; - ret->num_expected_items = num_expected_items; - ret->desired_false_positive_rate = false_positive_rate; - ret->num_buckets = bloom_filter_calc_num_buckets(ret->num_expected_items, ret->desired_false_positive_rate); - ret->buckets = calloc((ret->num_buckets / 8) + ((ret->num_buckets % 8 == 0) ? 0 : 1), 1); - ret->num_functions = bloom_filter_calc_num_functions(ret->num_expected_items, ret->num_buckets); - ret->result_scratch_space = malloc(sizeof(*ret->result_scratch_space) * ret->num_functions); - ret->actual_number_of_items = 0; - return ret; -} -void bloom_filter_destroy(bloom_filter_t* bf) { - free(bf->buckets); - free(bf->result_scratch_space); - free(bf); -} -// TODO this uses %. It would be better if it used &, but that would potentially double the memory we use. #define a flag. -static void bloom_filter_calc_functions(bloom_filter_t * bf, uint64_t* results, const char * key, int keylen) { - uint64_t fa = bf->func_a(key, keylen); - uint64_t fb = bf->func_b(key, keylen); - - results[0] = (fa + fb) % bf->num_buckets; - for(int i = 1; i < bf->num_functions; i++) { - results[i] = (results[i-1] + fb ) % bf->num_buckets; - } -} - -static const uint8_t bloom_filter_bit_masks[] = { 1, 2, 4, 8, 16, 32, 64, 128 }; -static void bloom_filter_set_bit(bloom_filter_t *bf, uint64_t bit) { - uint64_t array_offset = bit >> 3; - uint8_t bit_number = bit & 7; - - assert(bit < bf->num_buckets); - - bf->buckets[array_offset] |= bloom_filter_bit_masks[bit_number]; - -} -/** - @return 0 if the bit is not set, true otherwise. - */ -static uint8_t bloom_filter_get_bit(bloom_filter_t *bf, uint64_t bit) { - uint64_t array_offset = bit >> 3; - uint8_t bit_number = bit & 7; - - assert(bit < bf->num_buckets); - - return bf->buckets[array_offset] & bloom_filter_bit_masks[bit_number]; -} -void bloom_filter_insert(bloom_filter_t * bf, const char *key, int len) { - bloom_filter_calc_functions(bf, bf->result_scratch_space, key, len); - for(int i = 0; i < bf->num_functions; i++) { - bloom_filter_set_bit(bf, bf->result_scratch_space[i]); - } - bf->actual_number_of_items++; -} -int bloom_filter_lookup(bloom_filter_t * bf, const char * key, int len) { - int ret = 1; - uint64_t * scratch = malloc(sizeof(*scratch) * bf->num_functions); - bloom_filter_calc_functions(bf, scratch, key, len); - for(int i = 0; i < bf->num_functions; i++) { - ret = ret && bloom_filter_get_bit(bf, scratch[i]); - } - free(scratch); - return ret; -} - -void bloom_filter_print_stats(bloom_filter_t * bf) { - printf("Design capacity %lld design false positive %f\n" - "Current item count %lld current false positive %f\n" - "Number of buckets %lld (%f MB), number of hash functions %lld\n", - (long long)bf->num_expected_items, bf->desired_false_positive_rate, - (long long)bf->actual_number_of_items, - bloom_filter_current_false_positive_rate(bf->actual_number_of_items, - bf->num_buckets), - (long long)bf->num_buckets, - ((double)bf->num_buckets) / (8.0 * 1024.0 * 1024.0), - (long long)bf->num_functions); -} diff --git a/bloomFilter.h b/bloomFilter.h deleted file mode 100644 index 5eb51bf..0000000 --- a/bloomFilter.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * bloomFilter.h - * - * Copyright 2010-2012 Yahoo! Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * Author: sears - */ -#ifndef BLOOM_FILTER_H -#define BLOOM_FILTER_H - -#include - -BEGIN_C_DECLS - -typedef struct bloom_filter_t bloom_filter_t; - -/** - @return 0 if there is not enough memory, or some other error occurred; a - pointer to the new bloom filter otherwise. - */ -bloom_filter_t * bloom_filter_create(uint64_t(*hash_func_a)(const char*,int), - uint64_t(*hash_func_b)(const char*,int), - uint64_t num_expected_items, - double false_positive_rate); - -void bloom_filter_destroy(bloom_filter_t*); - -void bloom_filter_insert(bloom_filter_t * bf, const char* key, int len); -/** - @return 1 if the value might be in the bloom filter, 0 otherwise - */ -int bloom_filter_lookup(bloom_filter_t * bf, const char* key, int len); - -void bloom_filter_print_stats(bloom_filter_t * bf); - -END_C_DECLS - -#endif diff --git a/diskTreeComponent.cpp b/diskTreeComponent.cpp index 0487709..ca0492f 100644 --- a/diskTreeComponent.cpp +++ b/diskTreeComponent.cpp @@ -79,7 +79,7 @@ void diskTreeComponent::writes_done() { int diskTreeComponent::insertTuple(int xid, dataTuple *t) { if(bloom_filter) { - bloom_filter_insert(bloom_filter, (const char*)t->strippedkey(), t->strippedkeylen()); + stasis_bloom_filter_insert(bloom_filter, (const char*)t->strippedkey(), t->strippedkeylen()); } int ret = 0; // no error. if(dp==0) { @@ -135,7 +135,7 @@ dataTuple * diskTreeComponent::findTuple(int xid, dataTuple::key_t key, size_t k dataTuple * tup=0; if(bloom_filter) { - if(!bloom_filter_lookup(bloom_filter, (const char*)key, keySize)) { + if(!stasis_bloom_filter_lookup(bloom_filter, (const char*)key, keySize)) { return NULL; } } diff --git a/diskTreeComponent.h b/diskTreeComponent.h index d2a9af3..c9364ce 100644 --- a/diskTreeComponent.h +++ b/diskTreeComponent.h @@ -25,7 +25,7 @@ #include "dataPage.h" #include "dataTuple.h" #include "mergeStats.h" -#include "bloomFilter.h" +#include #include extern "C" { @@ -50,10 +50,10 @@ class diskTreeComponent { stats(stats), bloom_filter(bloom_filter_size == 0 ? 0 - : bloom_filter_create(diskTreeComponent_hash_func_a, + : stasis_bloom_filter_create(diskTreeComponent_hash_func_a, diskTreeComponent_hash_func_b, bloom_filter_size, 0.01)) { - if(bloom_filter) bloom_filter_print_stats(bloom_filter); + if(bloom_filter) stasis_bloom_filter_print_stats(bloom_filter); } diskTreeComponent(int xid, recordid root, recordid internal_node_state, @@ -65,7 +65,7 @@ class diskTreeComponent { bloom_filter(0) {} ~diskTreeComponent() { - if(bloom_filter) bloom_filter_destroy(bloom_filter); + if(bloom_filter) stasis_bloom_filter_destroy(bloom_filter); delete dp; delete ltree; } @@ -208,7 +208,7 @@ class diskTreeComponent { }; }; - bloom_filter_t * bloom_filter; + stasis_bloom_filter_t * bloom_filter; class iterator { diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index db34846..873787f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -15,8 +15,6 @@ # limitations under the License. IF( HAVE_STASIS ) CREATE_CHECK(check_gen) - CREATE_CHECK(check_bloomFilter) - CREATE_CHECK(check_testAndSet) CREATE_CHECK(check_logtree) CREATE_CHECK(check_datapage) CREATE_CHECK(check_logtable) diff --git a/test/check_bloomFilter.cpp b/test/check_bloomFilter.cpp deleted file mode 100644 index 70f9a57..0000000 --- a/test/check_bloomFilter.cpp +++ /dev/null @@ -1,116 +0,0 @@ -/* - * check_bloomFilter.cpp - * - * Copyright 2010-2012 Yahoo! Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * Created on: Oct 2, 2010 - * Author: sears - */ -#include -#include -#include -#include -#include -#include - -/* - * This file can test CRC and FNV-1 based hash functions. Based on early experiments: - * - * CRC32 insert/lookup: 11/13 seconds, 1.1% false positive - * FNV-1 insert/lookup: 8/9 seconds, 2.8% false positive - * - * Expected false positive rate is 1%. - */ - -static uint64_t hash_a(const char* a, int len) { - return stasis_crc32(a,len,0xcafebabe); -} - -static uint64_t hash_b(const char* a, int len) { - return stasis_crc32(a,len,0xdeadbeef); -} -static uint64_t hash_a_fnv(const char* a, int len) { - return stasis_util_hash_fnv_1_uint32_t((const byte*)a, len); -} -static uint64_t hash_b_fnv(const char* a, int len) { - return stasis_util_hash_fnv_1_uint64_t((const byte*)a, len); -} - -static char * malloc_random_string(int group) { - char * str = 0; - int strlen = 0; - while(!strlen) strlen = 128 + (rand() & 127); - str = (char*)malloc(strlen + 1); - str[0] = group; - - for(int i = 1; i < strlen; i++) { - str[i] = (rand() & 128) + 1; - } - str[strlen] = 0; - return str; -} - -int main(int argc, char * argv[]) { - (void)hash_a; (void)hash_b; - (void)hash_a_fnv; (void)hash_b_fnv; - - const int num_inserts = 1000000; - char ** strings = (char**)malloc(num_inserts * sizeof(char*)); - uint64_t sum_strlen = 0; - struct timeval start, stop; - gettimeofday(&start, 0); - printf("seed: %lld\n", (long long)start.tv_sec); - srand(start.tv_sec); - for(int i = 0; i < num_inserts; i++) { - strings[i] = malloc_random_string(1); - sum_strlen += strlen(strings[i]); - } - gettimeofday(&stop,0); - printf("Generated strings in %d seconds. Mean string length: %f\n", (int)(stop.tv_sec - start.tv_sec), (double)(sum_strlen)/(double)num_inserts); - - bloom_filter_t * bf = bloom_filter_create(hash_a, hash_b, num_inserts, 0.01); - bloom_filter_print_stats(bf); - gettimeofday(&start, 0); - for(int i = 0; i < num_inserts; i++) { - bloom_filter_insert(bf,strings[i], strlen(strings[i])); - } - gettimeofday(&stop, 0); - printf("Inserted strings in %d seconds.\n", (int)(stop.tv_sec - start.tv_sec)); - - gettimeofday(&start, 0); - for(int i = 0; i < num_inserts; i++) { - assert(bloom_filter_lookup(bf, strings[i], strlen(strings[i]))); - } - gettimeofday(&stop, 0); - printf("Looked up strings in %d seconds.\n", (int)(stop.tv_sec - start.tv_sec)); - bloom_filter_print_stats(bf); - - uint64_t false_positives = 0; - gettimeofday(&start, 0); - for(int i = 0; i < num_inserts; i++) { - char * str = malloc_random_string(2); - if(bloom_filter_lookup(bf, str, strlen(str))) { - false_positives ++; - } - assert(bloom_filter_lookup(bf, strings[i], strlen(strings[i]))); - free(str); - } - gettimeofday(&stop, 0); - printf("Generated and looked up non-existant strings in %d seconds\n" - "false positive rate was %lf\n", (int)(stop.tv_sec - start.tv_sec), - ((double)false_positives)/(double)num_inserts); - - return 0; -}