initial checkin of bloom filter

git-svn-id: svn+ssh://svn.corp.yahoo.com/yahoo/yrl/labs/pnuts/code/logstore@1232 8dad8b1f-cf64-0410-95b6-bcf113ffbcfe
This commit is contained in:
sears 2010-10-03 18:06:19 +00:00
parent 6eaf5da8de
commit 3619e86271
5 changed files with 263 additions and 1 deletions

View file

@ -76,7 +76,7 @@ ENDIF ( "${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" )
#CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config.h)
IF ( HAVE_STASIS )
ADD_LIBRARY(logstore requestDispatch.cpp simpleServer.cpp logserver.cpp logstore.cpp diskTreeComponent.cpp memTreeComponent.cpp datapage.cpp merger.cpp tuplemerger.cpp mergeStats.cpp mergeManager.cpp)
ADD_LIBRARY(logstore requestDispatch.cpp simpleServer.cpp logserver.cpp logstore.cpp diskTreeComponent.cpp memTreeComponent.cpp datapage.cpp merger.cpp tuplemerger.cpp mergeStats.cpp mergeManager.cpp bloomFilter.c)
CREATE_EXECUTABLE(server)
CREATE_EXECUTABLE(newserver)
ENDIF ( HAVE_STASIS )

131
bloomFilter.c Normal file
View file

@ -0,0 +1,131 @@
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include "bloomFilter.h"
/**
Variable names:
m: number of bloom filter bits
n: number of bloom filter entries
k: number of hash functions = ln(2) * (m/n)
c: m/n
f: false positive rate = (1/2)^k ~= (0.6185)^(m/n) ;
taking log_0.6185 of both sides: k log_0.6185(1/2) = m/n ;
applying change of base: k log(1/2) / log(0.6185) = m / n
(but that's not useful; this is:)
f ~= 0.6185 ^ (m/n)
log_0.6185(f) = m/n
log(f) / log(0.6185) = m / n
m = n log f / log 0.6185
p: probability a given bit is still 0 ~= e^(-kn/m); probability it is 1 ~= 1 - e^(-kn/m)
*/
static uint64_t bloom_filter_calc_num_buckets(uint64_t num_expected_items, double false_positive_rate) {
//m = n log f / log 0.6185
return ((uint64_t) ceil(((double)num_expected_items) * log(false_positive_rate) / log(0.6185)));
// m = - n ln f / ln 2 ^ 2 = - n ln f / 0.4804 = - n log f / (0.4343 * 0.4804) = -n log f / 0.2086
}
static int bloom_filter_calc_num_functions(uint64_t num_expected_items, uint64_t num_buckets) {
// k = ln(2) * (m/n)
int ret = floor((log(2) / log(exp(1.0))) * ((double) num_buckets) / (double) num_expected_items);
if(ret == 0) {
return 1;
} else {
return ret;
}
}
static double bloom_filter_current_false_positive_rate(uint64_t actual_number_of_items, uint64_t num_buckets) {
// 0.6185^(m/n)
return pow(0.6185, ((double)num_buckets)/(double)actual_number_of_items);
}
/**
   State of one bloom filter.  The layout is private to this file; callers
   only ever hold a pointer to it.
*/
struct bloom_filter_t {
  /* The two caller-supplied hash functions; each takes a key and its length. */
  uint64_t (*func_a)(const char *, int);
  uint64_t (*func_b)(const char *, int);

  /* Design parameters the filter was created with. */
  uint64_t num_expected_items;
  double   desired_false_positive_rate;

  /* Number of bits in the filter, and the byte array that stores them. */
  uint64_t  num_buckets;
  uint8_t  *buckets;

  /* Number of bit positions computed for each key. */
  uint64_t  num_functions;
  /* Array of num_functions bit positions, used as working space by insert. */
  uint64_t *result_scratch_space;

  /* Incremented once per call to insert. */
  uint64_t actual_number_of_items;
};
/**
   Allocate and initialize an empty bloom filter.

   @param func_a              first hash function; called with a key and its length.
   @param func_b              second hash function; called with a key and its length.
   @param num_expected_items  number of entries the filter is sized for.  A value
                              of 0 is recorded as given, but the filter is sized as
                              if for one entry so that it always has at least one bit.
   @param false_positive_rate desired false positive rate; must lie strictly
                              between 0 and 1.
   @return a new filter that must be released with bloom_filter_destroy(), or
           NULL if false_positive_rate is out of range or memory is exhausted.
*/
bloom_filter_t * bloom_filter_create(uint64_t(*func_a)(const char*,int),
                                     uint64_t(*func_b)(const char*,int),
                                     uint64_t num_expected_items,
                                     double false_positive_rate) {
  // The sizing formula takes log(false_positive_rate); outside (0,1) it yields
  // a non-positive or non-finite bit count.  Written as !(...) so NaN is
  // rejected as well.
  if(!(false_positive_rate > 0.0 && false_positive_rate < 1.0)) {
    return NULL;
  }

  bloom_filter_t * ret = malloc(sizeof(*ret));
  if(ret == NULL) {
    return NULL;
  }

  // num_buckets is used as a modulus when computing bit positions, so it must
  // never be zero; sizing for at least one item guarantees that.
  const uint64_t sizing_items = (num_expected_items != 0) ? num_expected_items : 1;

  ret->func_a = func_a;
  ret->func_b = func_b;
  ret->num_expected_items = num_expected_items;
  ret->desired_false_positive_rate = false_positive_rate;
  ret->num_buckets = bloom_filter_calc_num_buckets(sizing_items, false_positive_rate);
  ret->num_functions = bloom_filter_calc_num_functions(sizing_items, ret->num_buckets);
  ret->actual_number_of_items = 0;

  // One bit per bucket, rounded up to a whole number of bytes; calloc clears them.
  ret->buckets = calloc((ret->num_buckets / 8) + ((ret->num_buckets % 8 == 0) ? 0 : 1), 1);
  ret->result_scratch_space = malloc(sizeof(*ret->result_scratch_space) * ret->num_functions);

  if(ret->buckets == NULL || ret->result_scratch_space == NULL) {
    // free(NULL) is a no-op, so whichever allocation succeeded is released.
    free(ret->result_scratch_space);
    free(ret->buckets);
    free(ret);
    return NULL;
  }
  return ret;
}
/**
   Release a filter and everything it owns: the bit array, the scratch array
   of bit positions, and the filter object itself.

   @param bf the filter to free; passing NULL is a no-op.
*/
void bloom_filter_destroy(bloom_filter_t* bf) {
  if(bf == NULL) {
    return;
  }
  free(bf->buckets);
  // Allocated in bloom_filter_create alongside the bit array.
  free(bf->result_scratch_space);
  free(bf);
}
// XXX Bit positions are reduced with %.  Using & instead would be better, but
// could as much as double the memory we use.  #define a flag.
/**
   Compute the bf->num_functions bit positions for a key.  Both hash functions
   are evaluated once; position 0 is (a + b) mod m, and every later position
   is the previous one plus b, mod m, where m is bf->num_buckets.

   @param bf      the filter whose hash functions and size are used.
   @param results output array with room for bf->num_functions entries.
   @param key     the key to hash.
   @param keylen  length of key, passed through to the hash functions.
*/
static void bloom_filter_calc_functions(bloom_filter_t * bf, uint64_t* results, const char * key, int keylen) {
  const uint64_t base   = bf->func_a(key, keylen);
  const uint64_t stride = bf->func_b(key, keylen);

  uint64_t position = (base + stride) % bf->num_buckets;
  results[0] = position;

  for(int i = 1; i < bf->num_functions; i++) {
    position = (position + stride) % bf->num_buckets;
    results[i] = position;
  }
}
/* bloom_filter_bit_masks[i] has only bit i set (bit 0 is the least significant). */
static const uint8_t bloom_filter_bit_masks[] = {
  0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
};
/**
   Set one bit of the filter.

   @param bf  the filter to modify.
   @param bit index of the bit to set; must be less than bf->num_buckets.
*/
static void bloom_filter_set_bit(bloom_filter_t *bf, uint64_t bit) {
  assert(bit < bf->num_buckets);
  // Eight bits per byte: the byte index is bit / 8, the position within it bit % 8.
  const uint64_t byte_index  = bit / 8;
  const uint8_t  bit_in_byte = (uint8_t)(bit % 8);
  bf->buckets[byte_index] |= bloom_filter_bit_masks[bit_in_byte];
}
/**
   Test one bit of the filter.

   @param bf  the filter to read.
   @param bit index of the bit to test; must be less than bf->num_buckets.
   @return 0 if the bit is not set; otherwise a non-zero value (the bit's mask
           within its byte).
*/
static uint8_t bloom_filter_get_bit(bloom_filter_t *bf, uint64_t bit) {
  assert(bit < bf->num_buckets);
  // Eight bits per byte: the byte index is bit / 8, the position within it bit % 8.
  const uint64_t byte_index  = bit / 8;
  const uint8_t  bit_in_byte = (uint8_t)(bit % 8);
  return bf->buckets[byte_index] & bloom_filter_bit_masks[bit_in_byte];
}
/**
   Add a key to the filter: compute its bit positions into the filter's own
   scratch array, set each of those bits, and bump the item count.

   @param bf  the filter to insert into.
   @param key the key to add.
   @param len length of key, passed through to the hash functions.
*/
void bloom_filter_insert(bloom_filter_t * bf, const char *key, int len) {
  uint64_t * positions = bf->result_scratch_space;
  bloom_filter_calc_functions(bf, positions, key, len);

  int i = 0;
  while(i < bf->num_functions) {
    bloom_filter_set_bit(bf, positions[i]);
    i++;
  }

  bf->actual_number_of_items += 1;
}
/**
   Check whether a key might have been inserted into the filter.

   The bit positions are computed into a private, per-call array rather than
   the filter's own scratch space, so a lookup does not modify the filter.

   @param bf  the filter to query.
   @param key the key to look up.
   @param len length of key, passed through to the hash functions.
   @return 1 if the key might be in the filter, 0 if it is definitely not.
           If the temporary array cannot be allocated, 1 is returned: "might
           be present" is the only answer that can never be a false negative.
*/
int bloom_filter_lookup(bloom_filter_t * bf, const char * key, int len) {
  uint64_t * scratch = malloc(sizeof(*scratch) * bf->num_functions);
  if(scratch == NULL) {
    return 1;
  }
  bloom_filter_calc_functions(bf, scratch, key, len);

  int ret = 1;
  for(uint64_t i = 0; i < bf->num_functions; i++) {
    if(!bloom_filter_get_bit(bf, scratch[i])) {
      // One clear bit proves the key was never inserted; no need to test the rest.
      ret = 0;
      break;
    }
  }

  free(scratch);
  return ret;
}
/**
   Print a summary of the filter to stdout: the design capacity and design
   false positive rate, the current item count and the false positive rate
   estimated for it, the number of bits (also expressed in MB), and the number
   of hash functions.

   @param bf the filter to describe.
*/
void bloom_filter_print_stats(bloom_filter_t * bf) {
  // The counters are uint64_t; cast them to unsigned long long so they match
  // %llu on every platform, instead of passing them to a signed %lld.
  printf("Design capacity %llu design false positive %f\n"
         "Current item count %llu current false positive %f\n"
         "Number of buckets %llu (%f MB), number of hash functions %llu\n",
         (unsigned long long)bf->num_expected_items, bf->desired_false_positive_rate,
         (unsigned long long)bf->actual_number_of_items,
         bloom_filter_current_false_positive_rate(bf->actual_number_of_items, bf->num_buckets),
         (unsigned long long)bf->num_buckets,
         ((double)bf->num_buckets) / (8.0 * 1024.0 * 1024.0),
         (unsigned long long)bf->num_functions);
}

28
bloomFilter.h Normal file
View file

@ -0,0 +1,28 @@
#ifndef BLOOM_FILTER_H
#define BLOOM_FILTER_H
#include <stasis/common.h>
BEGIN_C_DECLS
/** Opaque bloom filter handle; the structure is defined in the implementation file. */
typedef struct bloom_filter_t bloom_filter_t;
/**
Create a bloom filter sized from the expected number of items and the desired false positive rate.
@param hash_func_a first hash function; it is called with a key and the key's length.
@param hash_func_b second hash function; it is called with a key and the key's length.
@param num_expected_items number of items the filter is sized to hold.
@param false_positive_rate desired false positive rate at that capacity (the accompanying test passes 0.01 for 1%).
@return 0 if there is not enough memory, or some other error occurred; a pointer to the new bloom filter otherwise.
*/
bloom_filter_t * bloom_filter_create(uint64_t(*hash_func_a)(const char*,int), uint64_t(*hash_func_b)(const char*,int),
uint64_t num_expected_items, double false_positive_rate);
/**
Free a bloom filter that was returned by bloom_filter_create().
*/
void bloom_filter_destroy(bloom_filter_t*);
/**
Add a key to the bloom filter and increment its item count.
@param bf the filter to insert into.
@param key the key to add.
@param len length of key; it is passed to the hash functions.
*/
void bloom_filter_insert(bloom_filter_t * bf, const char* key, int len);
/**
Check whether a key may have been inserted.
@param bf the filter to query.
@param key the key to look up.
@param len length of key; it is passed to the hash functions.
@return 1 if the value might be in the bloom filter, 0 otherwise
*/
int bloom_filter_lookup(bloom_filter_t * bf, const char* key, int len);
/**
Print the filter's design capacity, current item count, design and current false positive rates, number of buckets and number of hash functions to stdout.
*/
void bloom_filter_print_stats(bloom_filter_t * bf);
END_C_DECLS
#endif

View file

@ -1,5 +1,6 @@
IF( HAVE_STASIS )
CREATE_CHECK(check_gen)
CREATE_CHECK(check_bloomFilter)
CREATE_CHECK(check_testAndSet)
CREATE_CHECK(check_logtree)
CREATE_CHECK(check_datapage)

102
test/check_bloomFilter.cpp Normal file
View file

@ -0,0 +1,102 @@
/*
* check_bloomFilter.cpp
*
* Created on: Oct 2, 2010
* Author: sears
*/
#include <stasis/util/hashFunctions.h>
#include <bloomFilter.h>
#include <assert.h>
#include <stdio.h>
#include <sys/time.h>
#include <stasis/crc32.h>
/*
* This file can test CRC and FNV-1 based hash functions. Based on early experiments:
*
* CRC32 insert/lookup: 11/13 seconds, 1.1% false positive
* FNV-1 insert/lookup: 8/9 seconds, 2.8% false positive
*
* Expected false positive rate is 1%.
*/
/**
 * First CRC-based hash: stasis_crc32 over the key, with 0xcafebabe as the
 * third argument (presumably the initial CRC value -- confirm against
 * stasis/crc32.h).
 */
static uint64_t hash_a(const char* key, int key_len) {
  uint64_t digest = stasis_crc32(key, key_len, 0xcafebabe);
  return digest;
}
/**
 * Second CRC-based hash: stasis_crc32 over the key, with 0xdeadbeef as the
 * third argument (presumably the initial CRC value -- confirm against
 * stasis/crc32.h).
 */
static uint64_t hash_b(const char* key, int key_len) {
  uint64_t digest = stasis_crc32(key, key_len, 0xdeadbeef);
  return digest;
}
static uint64_t hash_a_fnv(const char* a, int len) {
return stasis_util_hash_fnv_1_uint32_t((const byte*)a, len);
}
static uint64_t hash_b_fnv(const char* a, int len) {
return stasis_util_hash_fnv_1_uint64_t((const byte*)a, len);
}
/**
 * Allocate a random NUL-terminated string of 128 to 255 characters.
 *
 * The first character is `group`, so strings generated with different group
 * values can never be equal.  Every other character is a random value in
 * 1..127, so the string contains no embedded NUL and its strlen() equals the
 * chosen length.
 *
 * @param group value stored in the first character; must be non-zero for the
 *              result to be a non-empty C string.
 * @return a malloc()ed string that the caller must free().
 */
static char * malloc_random_string(int group) {
  const int len = 128 + (rand() & 127);
  char * str = (char*)malloc(len + 1);
  assert(str);
  str[0] = (char)group;
  for(int i = 1; i < len; i++) {
    // rand() % 127 is 0..126; adding 1 keeps every character non-NUL and
    // within the 7-bit range, using the full range instead of a single bit.
    str[i] = (char)((rand() % 127) + 1);
  }
  str[len] = 0;
  return str;
}
int main(int argc, char * argv[]) {
(void)hash_a; (void)hash_b;
(void)hash_a_fnv; (void)hash_b_fnv;
const int num_inserts = 1000000;
char ** strings = (char**)malloc(num_inserts * sizeof(char*));
uint64_t sum_strlen = 0;
struct timeval start, stop;
gettimeofday(&start, 0);
printf("seed: %lld\n", (long long)start.tv_sec);
srand(start.tv_sec);
for(int i = 0; i < num_inserts; i++) {
strings[i] = malloc_random_string(1);
sum_strlen += strlen(strings[i]);
}
gettimeofday(&stop,0);
printf("Generated strings in %d seconds. Mean string length: %f\n", (int)(stop.tv_sec - start.tv_sec), (double)(sum_strlen)/(double)num_inserts);
bloom_filter_t * bf = bloom_filter_create(hash_a, hash_b, num_inserts, 0.01);
bloom_filter_print_stats(bf);
gettimeofday(&start, 0);
for(int i = 0; i < num_inserts; i++) {
bloom_filter_insert(bf,strings[i], strlen(strings[i]));
}
gettimeofday(&stop, 0);
printf("Inserted strings in %d seconds.\n", (int)(stop.tv_sec - start.tv_sec));
gettimeofday(&start, 0);
for(int i = 0; i < num_inserts; i++) {
assert(bloom_filter_lookup(bf, strings[i], strlen(strings[i])));
}
gettimeofday(&stop, 0);
printf("Looked up strings in %d seconds.\n", (int)(stop.tv_sec - start.tv_sec));
bloom_filter_print_stats(bf);
uint64_t false_positives = 0;
gettimeofday(&start, 0);
for(int i = 0; i < num_inserts; i++) {
char * str = malloc_random_string(2);
if(bloom_filter_lookup(bf, str, strlen(str))) {
false_positives ++;
}
assert(bloom_filter_lookup(bf, strings[i], strlen(strings[i])));
free(str);
}
gettimeofday(&stop, 0);
printf("Generated and looked up non-existant strings in %d seconds\n"
"false positive rate was %lf\n", (int)(stop.tv_sec - start.tv_sec),
((double)false_positives)/(double)num_inserts);
return 0;
}