Tweak percentage intervals computation to improve segmented dedupe ratio.

Avoid repeat processing of already processed segments.
Moinak Ghosh 2013-04-23 18:53:56 +05:30
parent d29f125ca7
commit 6b7d883393
3 changed files with 14 additions and 8 deletions

@@ -37,7 +37,7 @@ extern "C" {
 #define DEFAULT_CHUNK_CKSUM CKSUM_SHA256
 #define DEFAULT_SIMILARITY_CKSUM CKSUM_BLAKE256
 #define DEFAULT_COMPRESS COMPRESS_LZ4
-#define DEFAULT_PCT_INTERVAL 10
+#define DEFAULT_PCT_INTERVAL 5
 #define CONTAINER_ITEMS 2048
 #define MIN_CK 1
 #define MAX_CK 5

@@ -169,7 +169,8 @@ set_cfg:
 *pct_interval = 0;
 } else {
 cfg->intervals = 100 / *pct_interval;
-cfg->sub_intervals = (cfg->segment_sz + 1) / cfg->intervals;
+cfg->sub_intervals = (cfg->segment_sz-2) / cfg->intervals * 2;
+cfg->intervals--;
 *hash_slots = file_sz / cfg->segment_sz_bytes + 1;
 *hash_slots *= (cfg->intervals + cfg->sub_intervals);
 }
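For a concrete sense of what the new formula does, here is a small self-contained sketch that plugs example numbers into the same expressions. The segment_sz, segment_sz_bytes and file_sz values are made-up illustrations, not Pcompress defaults; only the arithmetic mirrors the diff.

#include <stdio.h>
#include <stdint.h>

/*
 * Standalone sketch of the interval math introduced in this commit.
 * segment_sz, segment_sz_bytes and file_sz are assumed example values.
 */
int
main(void)
{
	uint32_t pct_interval = 5;                      /* new DEFAULT_PCT_INTERVAL */
	uint32_t segment_sz = 2048;                     /* assumed blocks per segment */
	uint64_t segment_sz_bytes = 8ULL * 1024 * 1024; /* assumed */
	uint64_t file_sz = 1024ULL * 1024 * 1024;       /* 1 GiB example */

	uint32_t intervals = 100 / pct_interval;                    /* 20 */
	uint32_t sub_intervals = (segment_sz - 2) / intervals * 2;  /* 204 */
	intervals--;                                                /* 19 */

	uint64_t hash_slots = file_sz / segment_sz_bytes + 1;       /* 129 */
	hash_slots *= (intervals + sub_intervals);                  /* 129 * 223 */

	printf("intervals=%u sub_intervals=%u hash_slots=%llu\n",
	    intervals, sub_intervals, (unsigned long long)hash_slots);
	return (0);
}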

@@ -846,7 +846,6 @@ process_blocks:
  */
 blks = cfg->segment_sz;
 if (blks > blknum-i) blks = blknum-i;
-len = 0;
 length = 0;
 tgt = seg_heap;
 for (j=0; j<blks; j++) {
@@ -855,11 +854,12 @@ process_blocks:
 tgt += cfg->chunk_cksum_sz;
 }
 blks = j+i;
-qsort(seg_heap, length/8, 8, cmpint);
 /*
- * Compute the range similarity hashes.
+ * Sort concatenated chunk hash buffer by raw 64-bit integer
+ * magnitudes.
  */
+qsort(seg_heap, length/8, 8, cmpint);
 sim_ck = ctx->similarity_cksums;
 crc = 0;
 sub_i = cfg->sub_intervals;
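The qsort() itself is unchanged by this hunk; it only moves below the comment that now describes it. Since cmpint is not part of the diff, the comparator below is only an assumed shape for it, shown to make the "sort 8-byte slots as raw 64-bit integers" idea concrete.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/*
 * Hypothetical stand-in for the cmpint comparator referenced by the
 * qsort() call: order each 8-byte slot of the segment heap by its
 * value as an unsigned 64-bit integer.
 */
int
cmpint(const void *a, const void *b)
{
	uint64_t x, y;

	memcpy(&x, a, sizeof (x));
	memcpy(&y, b, sizeof (y));
	if (x < y) return (-1);
	if (x > y) return (1);
	return (0);
}

/* Usage mirroring the diff: qsort(seg_heap, length/8, 8, cmpint); */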
@@ -869,6 +869,10 @@ process_blocks:
 sub_i--;
 increment = (length / cfg->intervals) / sub_i;
 }
+/*
+ * Compute the range similarity hashes.
+ */
 len = length;
 for (j = 0; j<sub_i; j++) {
 crc = lzma_crc64(tgt, increment, 0);
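The three added lines are purely a comment; the loop they describe was already deriving one CRC64 per sub-interval over slices of the sorted hash buffer. A minimal sketch of that idea, assuming liblzma's lzma_crc64() as used in the original; the even split below simplifies the increment logic visible in the diff, and sim_ck storage is reduced to a plain array.

#include <stdint.h>
#include <stddef.h>
#include <lzma.h>	/* liblzma: lzma_crc64() */

/*
 * Sketch: derive one "range similarity hash" per sub-interval by
 * running CRC64 over successive slices of the sorted checksum buffer.
 */
void
range_similarity_hashes(const uint8_t *sorted_heap, size_t length,
    uint32_t sub_i, uint64_t *sim_ck)
{
	size_t increment = length / sub_i;	/* assumed even split */
	const uint8_t *tgt = sorted_heap;
	uint32_t j;

	for (j = 0; j < sub_i; j++) {
		sim_ck[j] = lzma_crc64(tgt, increment, 0);
		tgt += increment;
	}
}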
@@ -951,18 +955,19 @@ process_blocks:
 }
 /*
- * Now lookup the similarity minhashes starting at the highest
+ * Now lookup the similarity hashes starting at the highest
  * significance level.
  */
 for (j=cfg->intervals + sub_i; j > 0; j--) {
-hash_entry_t *he;
+hash_entry_t *he = NULL, *he1 = NULL;
 he = db_lookup_insert_s(cfg, sim_ck, 0, seg_offset, 0, 1);
-if (he) {
+if (he && he != he1) {
 /*
  * Match found. Load segment metadata from disk and perform
  * identity deduplication with the segment chunks.
  */
+he1 = he;
 offset = he->item_offset;
 if (db_segcache_map(cfg, ctx->id, &o_blks, &offset,
 (uchar_t **)&seg_blocks) == -1) {
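The he1 bookkeeping is what "avoid repeat processing" in the commit message refers to: when successive similarity hashes resolve to the same stored entry, the segment metadata load and identity dedupe run only once. Note that in the diff he1 is declared inside the for loop; the sketch below hoists both pointers out of the loop so the last match persists across iterations, which is the behaviour the message describes. The lookup and dedupe helpers here are placeholders, not Pcompress APIs.

/*
 * Sketch of the skip-repeat pattern: remember the last matched hash
 * entry and only reload/dedupe a segment when the match changes.
 * lookup_similarity_hash() and dedupe_against_segment() are
 * hypothetical placeholders for db_lookup_insert_s() and the segcache
 * processing in the real code.
 */
#include <stddef.h>

typedef struct hash_entry hash_entry_t;

hash_entry_t *lookup_similarity_hash(const unsigned char *sim_ck);
void dedupe_against_segment(hash_entry_t *he);

void
lookup_all_levels(unsigned char *sim_ck, unsigned int levels, size_t cksum_sz)
{
	hash_entry_t *he = NULL, *he1 = NULL;
	unsigned int j;

	for (j = levels; j > 0; j--) {
		he = lookup_similarity_hash(sim_ck);
		if (he && he != he1) {
			/* New match: process this segment once. */
			he1 = he;
			dedupe_against_segment(he);
		}
		sim_ck -= cksum_sz;	/* step direction between levels is assumed */
	}
}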