Tweak percentage intervals computation to improve segmented dedupe ratio.
Avoid repeated processing of already-processed segments.
This commit is contained in:
parent
d29f125ca7
commit
6b7d883393
3 changed files with 14 additions and 8 deletions
|
@ -37,7 +37,7 @@ extern "C" {
|
|||
#define DEFAULT_CHUNK_CKSUM CKSUM_SHA256
|
||||
#define DEFAULT_SIMILARITY_CKSUM CKSUM_BLAKE256
|
||||
#define DEFAULT_COMPRESS COMPRESS_LZ4
|
||||
#define DEFAULT_PCT_INTERVAL 10
|
||||
#define DEFAULT_PCT_INTERVAL 5
|
||||
#define CONTAINER_ITEMS 2048
|
||||
#define MIN_CK 1
|
||||
#define MAX_CK 5
|
||||
|
|
|
@ -169,7 +169,8 @@ set_cfg:
|
|||
*pct_interval = 0;
|
||||
} else {
|
||||
cfg->intervals = 100 / *pct_interval;
|
||||
cfg->sub_intervals = (cfg->segment_sz + 1) / cfg->intervals;
|
||||
cfg->sub_intervals = (cfg->segment_sz-2) / cfg->intervals * 2;
|
||||
cfg->intervals--;
|
||||
*hash_slots = file_sz / cfg->segment_sz_bytes + 1;
|
||||
*hash_slots *= (cfg->intervals + cfg->sub_intervals);
|
||||
}
|
||||
|
|
|
@ -846,7 +846,6 @@ process_blocks:
|
|||
*/
|
||||
blks = cfg->segment_sz;
|
||||
if (blks > blknum-i) blks = blknum-i;
|
||||
len = 0;
|
||||
length = 0;
|
||||
tgt = seg_heap;
|
||||
for (j=0; j<blks; j++) {
|
||||
|
@ -855,11 +854,12 @@ process_blocks:
|
|||
tgt += cfg->chunk_cksum_sz;
|
||||
}
|
||||
blks = j+i;
|
||||
qsort(seg_heap, length/8, 8, cmpint);
|
||||
|
||||
/*
|
||||
* Compute the range similarity hashes.
|
||||
* Sort concatenated chunk hash buffer by raw 64-bit integer
|
||||
* magnitudes.
|
||||
*/
|
||||
qsort(seg_heap, length/8, 8, cmpint);
|
||||
sim_ck = ctx->similarity_cksums;
|
||||
crc = 0;
|
||||
sub_i = cfg->sub_intervals;
|
||||
|
@ -869,6 +869,10 @@ process_blocks:
|
|||
sub_i--;
|
||||
increment = (length / cfg->intervals) / sub_i;
|
||||
}
|
||||
|
||||
/*
|
||||
* Compute the range similarity hashes.
|
||||
*/
|
||||
len = length;
|
||||
for (j = 0; j<sub_i; j++) {
|
||||
crc = lzma_crc64(tgt, increment, 0);
|
||||
|
@ -951,18 +955,19 @@ process_blocks:
|
|||
}
|
||||
|
||||
/*
|
||||
* Now lookup the similarity minhashes starting at the highest
|
||||
* Now lookup the similarity hashes starting at the highest
|
||||
* significance level.
|
||||
*/
|
||||
for (j=cfg->intervals + sub_i; j > 0; j--) {
|
||||
hash_entry_t *he;
|
||||
hash_entry_t *he = NULL, *he1 = NULL;
|
||||
|
||||
he = db_lookup_insert_s(cfg, sim_ck, 0, seg_offset, 0, 1);
|
||||
if (he) {
|
||||
if (he && he != he1) {
|
||||
/*
|
||||
* Match found. Load segment metadata from disk and perform
|
||||
* identity deduplication with the segment chunks.
|
||||
*/
|
||||
he1 = he;
|
||||
offset = he->item_offset;
|
||||
if (db_segcache_map(cfg, ctx->id, &o_blks, &offset,
|
||||
(uchar_t **)&seg_blocks) == -1) {
|
||||
|
|
Loading…
Reference in a new issue