Separate initial Rabin boundary detection and block splitting for performance.
Also fix a rare, latent corner-case bug.
This commit is contained in:
parent
962a2cae8a
commit
7e14909ad1
1 changed file with 51 additions and 33 deletions
|
@ -260,9 +260,36 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
||||||
* If rabin_pos is non-zero then we are being asked to scan for the last rabin boundary
|
* If rabin_pos is non-zero then we are being asked to scan for the last rabin boundary
|
||||||
* in the chunk. We start scanning at chunk end - max rabin block size. We avoid doing
|
* in the chunk. We start scanning at chunk end - max rabin block size. We avoid doing
|
||||||
* a full chunk scan.
|
* a full chunk scan.
|
||||||
|
*
|
||||||
|
* !!!NOTE!!!: Code duplication below for performance.
|
||||||
*/
|
*/
|
||||||
if (rabin_pos) {
|
if (rabin_pos) {
|
||||||
offset = *size - RAB_POLYNOMIAL_MAX_BLOCK_SIZE;
|
offset = *size - RAB_POLYNOMIAL_MAX_BLOCK_SIZE;
|
||||||
|
for (i=offset; i<*size; i++) {
|
||||||
|
char cur_byte = buf1[i];
|
||||||
|
uint64_t pushed_out = ctx->current_window_data[ctx->window_pos];
|
||||||
|
|
||||||
|
ctx->current_window_data[ctx->window_pos] = cur_byte;
|
||||||
|
cur_roll_checksum = (cur_roll_checksum << 1) + cur_byte;
|
||||||
|
cur_roll_checksum -= (pushed_out << RAB_POLYNOMIAL_WIN_SIZE);
|
||||||
|
|
||||||
|
ctx->window_pos = (ctx->window_pos + 1) & (RAB_POLYNOMIAL_WIN_SIZE-1);
|
||||||
|
length++;
|
||||||
|
if (length < ctx->rabin_poly_min_block_size) continue;
|
||||||
|
|
||||||
|
// If we hit our special value or reached the max block size update block offset
|
||||||
|
if ((cur_roll_checksum & ctx->rabin_avg_block_mask) == ctx->rabin_break_patt ||
|
||||||
|
length >= rabin_polynomial_max_block_size) {
|
||||||
|
last_offset = i+1;
|
||||||
|
length = 0;
|
||||||
|
j = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (last_offset < *size) {
|
||||||
|
*rabin_pos = last_offset;
|
||||||
|
}
|
||||||
|
return (0);
|
||||||
}
|
}
|
||||||
if (*size < ctx->rabin_poly_avg_block_size) return;
|
if (*size < ctx->rabin_poly_avg_block_size) return;
|
||||||
for (i=offset; i<*size; i++) {
|
for (i=offset; i<*size; i++) {
|
||||||
|
@ -293,21 +320,19 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
||||||
* variant of some approaches detailed in:
|
* variant of some approaches detailed in:
|
||||||
* http://www.armedia.com/wp/SimilarityIndex.pdf
|
* http://www.armedia.com/wp/SimilarityIndex.pdf
|
||||||
*/
|
*/
|
||||||
if (rabin_pos == NULL) {
|
len1++;
|
||||||
len1++;
|
j = cur_roll_checksum & ctx->rabin_avg_block_mask;
|
||||||
j = cur_roll_checksum & ctx->rabin_avg_block_mask;
|
fplist[j] += cur_roll_checksum;
|
||||||
fplist[j] += cur_roll_checksum;
|
if (fplist[j] > fplist[fpos]) fpos = j;
|
||||||
if (fplist[j] > fplist[fpos]) fpos = j;
|
if (len1 == SKETCH_BASIC_BLOCK_SZ) {
|
||||||
if (len1 == SKETCH_BASIC_BLOCK_SZ) {
|
/*
|
||||||
/*
|
* Compute the super sketch value by summing all the representative
|
||||||
* Compute the super sketch value by summing all the representative
|
* fingerprints of the block.
|
||||||
* fingerprints of the block.
|
*/
|
||||||
*/
|
cur_sketch += fplist[fpos];
|
||||||
cur_sketch += fplist[fpos];
|
memset(fplist, 0, fplist_sz);
|
||||||
memset(fplist, 0, fplist_sz);
|
fpos = 0;
|
||||||
fpos = 0;
|
len1 = 0;
|
||||||
len1 = 0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
/*
|
/*
|
||||||
* Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1
|
* Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1
|
||||||
|
@ -322,30 +347,23 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
||||||
// If we hit our special value or reached the max block size update block offset
|
// If we hit our special value or reached the max block size update block offset
|
||||||
if ((cur_roll_checksum & ctx->rabin_avg_block_mask) == ctx->rabin_break_patt ||
|
if ((cur_roll_checksum & ctx->rabin_avg_block_mask) == ctx->rabin_break_patt ||
|
||||||
length >= rabin_polynomial_max_block_size) {
|
length >= rabin_polynomial_max_block_size) {
|
||||||
if (rabin_pos == NULL) {
|
ctx->blocks[blknum].offset = last_offset;
|
||||||
ctx->blocks[blknum].offset = last_offset;
|
ctx->blocks[blknum].index = blknum; // Need to store for sorting
|
||||||
ctx->blocks[blknum].index = blknum; // Need to store for sorting
|
ctx->blocks[blknum].length = length;
|
||||||
ctx->blocks[blknum].length = length;
|
ctx->blocks[blknum].refcount = 0;
|
||||||
ctx->blocks[blknum].refcount = 0;
|
ctx->blocks[blknum].similar = 0;
|
||||||
ctx->blocks[blknum].similar = 0;
|
ctx->blocks[blknum].cksum_n_offset = cur_sketch;
|
||||||
ctx->blocks[blknum].cksum_n_offset = cur_sketch;
|
memset(fplist, 0, fplist_sz);
|
||||||
memset(fplist, 0, fplist_sz);
|
fpos = 0;
|
||||||
fpos = 0;
|
len1 = 0;
|
||||||
len1 = 0;
|
cur_sketch = 0;
|
||||||
cur_sketch = 0;
|
blknum++;
|
||||||
blknum++;
|
|
||||||
}
|
|
||||||
last_offset = i+1;
|
last_offset = i+1;
|
||||||
length = 0;
|
length = 0;
|
||||||
j = 0;
|
j = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (rabin_pos && last_offset < *size) {
|
|
||||||
*rabin_pos = last_offset;
|
|
||||||
return (0);
|
|
||||||
}
|
|
||||||
|
|
||||||
// If we found at least a few chunks, perform dedup.
|
// If we found at least a few chunks, perform dedup.
|
||||||
if (blknum > 2) {
|
if (blknum > 2) {
|
||||||
uint64_t prev_cksum;
|
uint64_t prev_cksum;
|
||||||
|
|
Loading…
Reference in a new issue