Tweak percentage intervals computation to improve segmented dedupe ratio.

Avoid repeated processing of already-processed segments.
Moinak Ghosh 2013-04-23 18:53:56 +05:30
parent d29f125ca7
commit 6b7d883393
3 changed files with 14 additions and 8 deletions

@@ -37,7 +37,7 @@ extern "C" {
#define DEFAULT_CHUNK_CKSUM CKSUM_SHA256
#define DEFAULT_SIMILARITY_CKSUM CKSUM_BLAKE256
#define DEFAULT_COMPRESS COMPRESS_LZ4
-#define DEFAULT_PCT_INTERVAL 10
+#define DEFAULT_PCT_INTERVAL 5
#define CONTAINER_ITEMS 2048
#define MIN_CK 1
#define MAX_CK 5

@@ -169,7 +169,8 @@ set_cfg:
*pct_interval = 0;
} else {
cfg->intervals = 100 / *pct_interval;
-cfg->sub_intervals = (cfg->segment_sz + 1) / cfg->intervals;
+cfg->sub_intervals = (cfg->segment_sz-2) / cfg->intervals * 2;
+cfg->intervals--;
*hash_slots = file_sz / cfg->segment_sz_bytes + 1;
*hash_slots *= (cfg->intervals + cfg->sub_intervals);
}
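
A worked illustration of the new interval arithmetic may help. This is only a
sketch: pct_interval = 5 comes from the new DEFAULT_PCT_INTERVAL above, while
segment_sz = 2048 chunks per segment is an assumed example value, not something
stated in this commit.

#include <stdio.h>

int
main(void)
{
    unsigned int pct_interval = 5;    /* new DEFAULT_PCT_INTERVAL */
    unsigned int segment_sz = 2048;   /* assumed chunks per segment */

    unsigned int intervals = 100 / pct_interval;                    /* 20 */
    unsigned int sub_intervals = (segment_sz - 2) / intervals * 2;  /* 204 */
    intervals--;                                                    /* 19 */

    /*
     * intervals + sub_intervals sizes the hash-slot estimate here and is
     * also (roughly) how many similarity hashes are probed per segment
     * in the lookup loop further down.
     */
    printf("intervals=%u sub_intervals=%u hashes/segment=%u\n",
        intervals, sub_intervals, intervals + sub_intervals);
    return (0);
}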

@@ -846,7 +846,6 @@ process_blocks:
*/
blks = cfg->segment_sz;
if (blks > blknum-i) blks = blknum-i;
-len = 0;
length = 0;
tgt = seg_heap;
for (j=0; j<blks; j++) {
@@ -855,11 +854,12 @@ process_blocks:
tgt += cfg->chunk_cksum_sz;
}
blks = j+i;
-qsort(seg_heap, length/8, 8, cmpint);
/*
-* Compute the range similarity hashes.
+* Sort concatenated chunk hash buffer by raw 64-bit integer
+* magnitudes.
*/
+qsort(seg_heap, length/8, 8, cmpint);
sim_ck = ctx->similarity_cksums;
crc = 0;
sub_i = cfg->sub_intervals;
@@ -869,6 +869,10 @@ process_blocks:
sub_i--;
increment = (length / cfg->intervals) / sub_i;
}
+/*
+* Compute the range similarity hashes.
+*/
+len = length;
for (j = 0; j<sub_i; j++) {
crc = lzma_crc64(tgt, increment, 0);
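
To see these two hunks in isolation: the per-chunk checksums are concatenated
into seg_heap, the buffer is sorted as raw 64-bit integers, and each fixed-size
slice is then hashed with CRC64 to give one similarity hash per interval. The
sketch below reproduces that pattern under stated assumptions; cmpint's body
and the slice sizing are guesses for illustration, while lzma_crc64() is
liblzma's real CRC64 routine (link with -llzma).

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <lzma.h>

/* Assumed comparator: order 8-byte groups by 64-bit integer magnitude. */
static int
cmpint(const void *a, const void *b)
{
    uint64_t x, y;

    memcpy(&x, a, sizeof (x));
    memcpy(&y, b, sizeof (y));
    return (x < y ? -1 : (x > y ? 1 : 0));
}

/*
 * Sort the concatenated chunk-checksum buffer, then emit one CRC64
 * similarity hash per equal-sized slice of it.
 */
static void
range_similarity_hashes(uint8_t *seg_heap, uint64_t length,
    unsigned int nslices, uint64_t *out)
{
    uint64_t increment = length / nslices;
    uint8_t *tgt = seg_heap;
    unsigned int j;

    qsort(seg_heap, length / 8, 8, cmpint);
    for (j = 0; j < nslices; j++) {
        out[j] = lzma_crc64(tgt, increment, 0);
        tgt += increment;
    }
}

int
main(void)
{
    uint8_t seg_heap[32];   /* fake chunk checksums: four 8-byte groups */
    uint64_t sim[4];
    unsigned int i;

    for (i = 0; i < sizeof (seg_heap); i++)
        seg_heap[i] = (uint8_t)(251 - i * 7);
    range_similarity_hashes(seg_heap, sizeof (seg_heap), 4, sim);
    for (i = 0; i < 4; i++)
        printf("similarity hash %u: %016llx\n", i,
            (unsigned long long)sim[i]);
    return (0);
}
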
@@ -951,18 +955,19 @@ process_blocks:
}
/*
-* Now lookup the similarity minhashes starting at the highest
+* Now lookup the similarity hashes starting at the highest
* significance level.
*/
for (j=cfg->intervals + sub_i; j > 0; j--) {
-hash_entry_t *he;
+hash_entry_t *he = NULL, *he1 = NULL;
he = db_lookup_insert_s(cfg, sim_ck, 0, seg_offset, 0, 1);
-if (he) {
+if (he && he != he1) {
/*
* Match found. Load segment metadata from disk and perform
* identity deduplication with the segment chunks.
*/
+he1 = he;
offset = he->item_offset;
if (db_segcache_map(cfg, ctx->id, &o_blks, &offset,
(uchar_t **)&seg_blocks) == -1) {
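
The he/he1 pair added above is what the commit message calls avoiding repeated
processing: a lookup that returns the entry most recently matched is skipped,
since that segment's metadata has already been mapped and deduplicated against.
Below is a self-contained sketch of the same skip pattern; entry_t and the
canned results[] array are hypothetical stand-ins for hash_entry_t and
db_lookup_insert_s().

#include <stdio.h>
#include <stddef.h>

typedef struct entry {
    unsigned long long item_offset;
} entry_t;

int
main(void)
{
    entry_t a = { 100 }, b = { 200 };
    /* What successive similarity-hash lookups might return, highest
     * significance level first. */
    entry_t *results[] = { &a, &a, NULL, &b, &b };
    entry_t *he, *he1 = NULL;
    size_t j;

    for (j = 0; j < sizeof (results) / sizeof (results[0]); j++) {
        he = results[j];
        if (he && he != he1) {
            /* New match: map its segment metadata and dedupe against it. */
            printf("process segment metadata at offset %llu\n",
                he->item_offset);
            he1 = he;
        }
        /* Otherwise: same entry as the last match, or no match; skip. */
    }
    return (0);
}

Note that he1 is only updated when a match is processed, so a lookup returning
the same entry is skipped even if an empty lookup occurred in between, which
mirrors the patched loop.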