Switch to a multiplicative rolling hash for good distribution properties.
This commit is contained in:
parent
d94be4e314
commit
24e6f4e629
2 changed files with 22 additions and 18 deletions
|
@ -84,7 +84,7 @@ extern int bspatch(u_char *pbuf, u_char *old, bsize_t oldsize, u_char *new,
|
||||||
bsize_t *_newsize);
|
bsize_t *_newsize);
|
||||||
|
|
||||||
static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
|
static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||||
uint64_t ir[256];
|
uint64_t ir[256], out[256];
|
||||||
static int inited = 0;
|
static int inited = 0;
|
||||||
|
|
||||||
static uint32_t
|
static uint32_t
|
||||||
|
@ -130,16 +130,22 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
|
||||||
pthread_mutex_lock(&init_lock);
|
pthread_mutex_lock(&init_lock);
|
||||||
if (!inited) {
|
if (!inited) {
|
||||||
int term, j;
|
int term, j;
|
||||||
uint64_t val;
|
uint64_t val, poly_pow;
|
||||||
|
|
||||||
|
poly_pow = 1;
|
||||||
|
for (j = 0; j < RAB_POLYNOMIAL_WIN_SIZE; j++) {
|
||||||
|
poly_pow = (poly_pow * RAB_POLYNOMIAL_CONST) & POLY_MASK;
|
||||||
|
}
|
||||||
|
|
||||||
for (j = 0; j < 256; j++) {
|
for (j = 0; j < 256; j++) {
|
||||||
term = 1;
|
term = 1;
|
||||||
val = 0;
|
val = 1;
|
||||||
|
out[j] = (j * poly_pow) & POLY_MASK;
|
||||||
for (i=0; i<RAB_POLYNOMIAL_WIN_SIZE; i++) {
|
for (i=0; i<RAB_POLYNOMIAL_WIN_SIZE; i++) {
|
||||||
if (term & FP_POLY) {
|
if (term & FP_POLY) {
|
||||||
val += term * j;
|
val += ((term * j) & POLY_MASK);
|
||||||
}
|
}
|
||||||
term <<= 1;
|
term = (term * RAB_POLYNOMIAL_CONST) & POLY_MASK;
|
||||||
}
|
}
|
||||||
ir[j] = val;
|
ir[j] = val;
|
||||||
}
|
}
|
||||||
|
@ -346,10 +352,11 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offs
|
||||||
for (i=offset; i<*size; i++) {
|
for (i=offset; i<*size; i++) {
|
||||||
uchar_t cur_byte = buf1[i];
|
uchar_t cur_byte = buf1[i];
|
||||||
uint64_t pushed_out = ctx->current_window_data[ctx->window_pos];
|
uint64_t pushed_out = ctx->current_window_data[ctx->window_pos];
|
||||||
|
|
||||||
ctx->current_window_data[ctx->window_pos] = cur_byte;
|
ctx->current_window_data[ctx->window_pos] = cur_byte;
|
||||||
cur_roll_checksum = (cur_roll_checksum << 1) + cur_byte;
|
|
||||||
cur_roll_checksum -= (pushed_out << RAB_POLYNOMIAL_WIN_SIZE);
|
cur_roll_checksum = (cur_roll_checksum * RAB_POLYNOMIAL_CONST) & POLY_MASK;
|
||||||
|
cur_roll_checksum += cur_byte;
|
||||||
|
cur_roll_checksum -= out[pushed_out];
|
||||||
cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out];
|
cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out];
|
||||||
|
|
||||||
ctx->window_pos = (ctx->window_pos + 1) & (RAB_POLYNOMIAL_WIN_SIZE-1);
|
ctx->window_pos = (ctx->window_pos + 1) & (RAB_POLYNOMIAL_WIN_SIZE-1);
|
||||||
|
@ -377,15 +384,10 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offs
|
||||||
uchar_t cur_byte = buf1[i];
|
uchar_t cur_byte = buf1[i];
|
||||||
uint64_t pushed_out = ctx->current_window_data[ctx->window_pos];
|
uint64_t pushed_out = ctx->current_window_data[ctx->window_pos];
|
||||||
ctx->current_window_data[ctx->window_pos] = cur_byte;
|
ctx->current_window_data[ctx->window_pos] = cur_byte;
|
||||||
/*
|
|
||||||
* We want to do:
|
cur_roll_checksum = (cur_roll_checksum * RAB_POLYNOMIAL_CONST) & POLY_MASK;
|
||||||
* cur_roll_checksum = cur_roll_checksum * RAB_POLYNOMIAL_CONST + cur_byte;
|
cur_roll_checksum += cur_byte;
|
||||||
* cur_roll_checksum -= pushed_out * polynomial_pow;
|
cur_roll_checksum -= out[pushed_out];
|
||||||
*
|
|
||||||
* However since RAB_POLYNOMIAL_CONST == 2, we use shifts.
|
|
||||||
*/
|
|
||||||
cur_roll_checksum = (cur_roll_checksum << 1) + cur_byte;
|
|
||||||
cur_roll_checksum -= (pushed_out << RAB_POLYNOMIAL_WIN_SIZE);
|
|
||||||
cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out];
|
cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out];
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -63,7 +63,9 @@
|
||||||
|
|
||||||
//List of constants, mostly constraints and defaults for various parameters
|
//List of constants, mostly constraints and defaults for various parameters
|
||||||
//to the Rabin Fingerprinting algorithm
|
//to the Rabin Fingerprinting algorithm
|
||||||
#define RAB_POLYNOMIAL_CONST 2
|
//Use the prime constant from Bulat Ziganshin's REP. It seems to work best across a wide range of data.
|
||||||
|
#define RAB_POLYNOMIAL_CONST 153191
|
||||||
|
#define POLY_MASK (0xffffffffffULL)
|
||||||
#define RAB_BLK_DEFAULT 1
|
#define RAB_BLK_DEFAULT 1
|
||||||
#define RAB_BLK_MIN_BITS 11
|
#define RAB_BLK_MIN_BITS 11
|
||||||
#define LZMA_WINDOW_MAX (128L * 1024L * 1024L)
|
#define LZMA_WINDOW_MAX (128L * 1024L * 1024L)
|
||||||
|
|
Loading…
Reference in a new issue