Use a rolling-checksum-based sketch value for a Rabin chunk instead of a CRC64 checksum.

Avoids additional table-lookup memory access.
Reduce the Rabin window size to avoid overflows in the sketch value.
No need to maintain the rolling checksum in the Rabin context.
A few comment cleanups.
This commit is contained in:
Moinak Ghosh 2012-07-13 22:06:55 +05:30
parent 0091a0da02
commit 1da2c40888
2 changed files with 19 additions and 41 deletions

View file

@ -62,18 +62,6 @@
#include <allocator.h> #include <allocator.h>
#include <utils.h> #include <utils.h>
// CRC64 pieces from LZMA's implementation -----------------
#include <crc_macros.h>
#ifdef WORDS_BIGENDIAN
# define A1(x) ((x) >> 56)
#else
# define A1 A
#endif
extern const uint64_t lzma_crc64_table[4][256];
// ---------------------------------------------------------
#include "rabin_polynomial.h" #include "rabin_polynomial.h"
extern int lzma_init(void **data, int *level, ssize_t chunksize); extern int lzma_init(void **data, int *level, ssize_t chunksize);
@ -175,8 +163,6 @@ reset_rabin_context(rabin_context_t *ctx)
{ {
memset(ctx->current_window_data, 0, RAB_POLYNOMIAL_WIN_SIZE); memset(ctx->current_window_data, 0, RAB_POLYNOMIAL_WIN_SIZE);
ctx->window_pos = 0; ctx->window_pos = 0;
ctx->cur_roll_checksum = 0;
ctx->cur_checksum = 0;
} }
void void
@ -214,16 +200,18 @@ cmpblks(const void *a, const void *b)
uint32_t uint32_t
rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, ssize_t *rabin_pos) rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, ssize_t *rabin_pos)
{ {
ssize_t i, last_offset,j; ssize_t i, last_offset, j;
uint32_t blknum; uint32_t blknum;
char *buf1 = (char *)buf; char *buf1 = (char *)buf;
uint32_t length; uint32_t length;
uint64_t cur_roll_checksum[2];
length = offset; length = offset;
last_offset = 0; last_offset = 0;
blknum = 0; blknum = 0;
ctx->valid = 0; ctx->valid = 0;
ctx->cur_checksum = 0; cur_roll_checksum[0] = 0;
cur_roll_checksum[1] = 0;
j = 0; j = 0;
/* /*
@ -243,40 +231,40 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
* We want to do: * We want to do:
* cur_roll_checksum = cur_roll_checksum * RAB_POLYNOMIAL_CONST + cur_byte; * cur_roll_checksum = cur_roll_checksum * RAB_POLYNOMIAL_CONST + cur_byte;
* cur_roll_checksum -= pushed_out * polynomial_pow; * cur_roll_checksum -= pushed_out * polynomial_pow;
* cur_checksum = cur_checksum * RAB_POLYNOMIAL_CONST + cur_byte;
* *
* However since RAB_POLYNOMIAL_CONST == 2, we use shifts. * However since RAB_POLYNOMIAL_CONST == 2, we use shifts.
*/ */
ctx->cur_roll_checksum = (ctx->cur_roll_checksum << 1) + cur_byte; cur_roll_checksum[1] = (cur_roll_checksum[1] << 1) + cur_byte;
ctx->cur_roll_checksum -= (pushed_out << RAB_POLYNOMIAL_WIN_SIZE); cur_roll_checksum[1] -= (pushed_out << RAB_POLYNOMIAL_WIN_SIZE);
// CRC64 Calculation swiped from LZMA // Compute Sum 0 mod 25 Sketch. We are avoiding a branch here.
ctx->cur_checksum = lzma_crc64_table[0][cur_byte ^ A1(ctx->cur_checksum)] ^ S8(ctx->cur_checksum); // See: http://www.armedia.com/wp/SimilarityIndex.pdf
j += cur_roll_checksum[(cur_roll_checksum[1] % 25 == 0)];
// Count how many bytes have msb set. Needed to detect 7-bit text data. /*
j += (cur_byte >> 7); * Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1
* We avoid a branch here by masking. This requires RAB_POLYNOMIAL_WIN_SIZE
// Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1 * to be power of 2
// This requires RAB_POLYNOMIAL_WIN_SIZE to be power of 2 */
ctx->window_pos = (ctx->window_pos + 1) & (RAB_POLYNOMIAL_WIN_SIZE-1); ctx->window_pos = (ctx->window_pos + 1) & (RAB_POLYNOMIAL_WIN_SIZE-1);
length++; length++;
if (length < ctx->rabin_poly_min_block_size) continue; if (length < ctx->rabin_poly_min_block_size) continue;
// If we hit our special value or reached the max block size update block offset // If we hit our special value or reached the max block size update block offset
if ((ctx->cur_roll_checksum & ctx->rabin_avg_block_mask) == ctx->rabin_break_patt || if ((cur_roll_checksum[1] & ctx->rabin_avg_block_mask) == ctx->rabin_break_patt ||
length >= rabin_polynomial_max_block_size) { length >= rabin_polynomial_max_block_size) {
if (rabin_pos == NULL) { if (rabin_pos == NULL) {
ctx->blocks[blknum].offset = last_offset; ctx->blocks[blknum].offset = last_offset;
ctx->blocks[blknum].index = blknum; // Need to store for sorting ctx->blocks[blknum].index = blknum; // Need to store for sorting
ctx->blocks[blknum].cksum_n_offset = ctx->cur_checksum; ctx->blocks[blknum].cksum_n_offset = j;
ctx->blocks[blknum].length = length; ctx->blocks[blknum].length = length;
ctx->blocks[blknum].refcount = 0; ctx->blocks[blknum].refcount = 0;
blknum++; blknum++;
} }
ctx->cur_checksum = 0;
last_offset = i+1; last_offset = i+1;
length = 0; length = 0;
j = 0;
} }
} }
@ -284,10 +272,6 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
*rabin_pos = last_offset; *rabin_pos = last_offset;
return (0); return (0);
} }
if (j > *size * 0.40)
ctx->data_type = DATA_BINARY;
else
ctx->data_type = DATA_TEXT;
// If we found at least a few chunks, perform dedup. // If we found at least a few chunks, perform dedup.
if (blknum > 2) { if (blknum > 2) {
@ -303,11 +287,10 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
if (last_offset < *size) { if (last_offset < *size) {
ctx->blocks[blknum].offset = last_offset; ctx->blocks[blknum].offset = last_offset;
ctx->blocks[blknum].index = blknum; ctx->blocks[blknum].index = blknum;
ctx->blocks[blknum].cksum_n_offset = ctx->cur_checksum; ctx->blocks[blknum].cksum_n_offset = j;
ctx->blocks[blknum].length = *size - last_offset; ctx->blocks[blknum].length = *size - last_offset;
ctx->blocks[blknum].refcount = 0; ctx->blocks[blknum].refcount = 0;
blknum++; blknum++;
ctx->cur_checksum = 0;
last_offset = *size; last_offset = *size;
} }
@ -355,14 +338,12 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
matchlen += prev_length; matchlen += prev_length;
continue; continue;
} }
prev_offset = buf1 + ctx->blocks[blk].offset; prev_offset = buf1 + ctx->blocks[blk].offset;
prev_cksum = ctx->blocks[blk].cksum_n_offset; prev_cksum = ctx->blocks[blk].cksum_n_offset;
prev_length = ctx->blocks[blk].length; prev_length = ctx->blocks[blk].length;
prev_index = ctx->blocks[blk].index; prev_index = ctx->blocks[blk].index;
prev_blk = blk; prev_blk = blk;
} }
if (matchlen < rabin_index_sz) { if (matchlen < rabin_index_sz) {
ctx->valid = 0; ctx->valid = 0;
return; return;

View file

@ -80,7 +80,7 @@
#define RAB_POLYNOMIAL_MIN_BLOCK_SIZE2 2048 #define RAB_POLYNOMIAL_MIN_BLOCK_SIZE2 2048
#define LZMA_WINDOW_MAX (128L * 1024L * 1024L) #define LZMA_WINDOW_MAX (128L * 1024L * 1024L)
#define RAB_POLYNOMIAL_WIN_SIZE 32 #define RAB_POLYNOMIAL_WIN_SIZE 16
#define RAB_POLYNOMIAL_MIN_WIN_SIZE 8 #define RAB_POLYNOMIAL_MIN_WIN_SIZE 8
#define RAB_POLYNOMIAL_MAX_WIN_SIZE 64 #define RAB_POLYNOMIAL_MAX_WIN_SIZE 64
@ -126,8 +126,6 @@ typedef struct {
rabin_blockentry_t *blocks; rabin_blockentry_t *blocks;
unsigned char *cbuf; unsigned char *cbuf;
int window_pos; int window_pos;
uint64_t cur_roll_checksum;
uint64_t cur_checksum;
uint32_t rabin_poly_max_block_size; uint32_t rabin_poly_max_block_size;
uint32_t rabin_poly_min_block_size; uint32_t rabin_poly_min_block_size;
uint32_t rabin_poly_avg_block_size; uint32_t rabin_poly_avg_block_size;
@ -135,7 +133,6 @@ typedef struct {
uint32_t rabin_break_patt; uint32_t rabin_break_patt;
uint64_t real_chunksize; uint64_t real_chunksize;
short valid; short valid;
short data_type;
void *lzma_data; void *lzma_data;
int level; int level;
} rabin_context_t; } rabin_context_t;