Tweak the percentage-interval computation to improve the segmented dedupe ratio.
Avoid reprocessing segments that have already been processed.
This commit is contained in:
parent
d29f125ca7
commit
6b7d883393
3 changed files with 14 additions and 8 deletions
|
@ -37,7 +37,7 @@ extern "C" {
|
||||||
#define DEFAULT_CHUNK_CKSUM CKSUM_SHA256
|
#define DEFAULT_CHUNK_CKSUM CKSUM_SHA256
|
||||||
#define DEFAULT_SIMILARITY_CKSUM CKSUM_BLAKE256
|
#define DEFAULT_SIMILARITY_CKSUM CKSUM_BLAKE256
|
||||||
#define DEFAULT_COMPRESS COMPRESS_LZ4
|
#define DEFAULT_COMPRESS COMPRESS_LZ4
|
||||||
#define DEFAULT_PCT_INTERVAL 10
|
#define DEFAULT_PCT_INTERVAL 5
|
||||||
#define CONTAINER_ITEMS 2048
|
#define CONTAINER_ITEMS 2048
|
||||||
#define MIN_CK 1
|
#define MIN_CK 1
|
||||||
#define MAX_CK 5
|
#define MAX_CK 5
|
||||||
|
|
|
@ -169,7 +169,8 @@ set_cfg:
|
||||||
*pct_interval = 0;
|
*pct_interval = 0;
|
||||||
} else {
|
} else {
|
||||||
cfg->intervals = 100 / *pct_interval;
|
cfg->intervals = 100 / *pct_interval;
|
||||||
cfg->sub_intervals = (cfg->segment_sz + 1) / cfg->intervals;
|
cfg->sub_intervals = (cfg->segment_sz-2) / cfg->intervals * 2;
|
||||||
|
cfg->intervals--;
|
||||||
*hash_slots = file_sz / cfg->segment_sz_bytes + 1;
|
*hash_slots = file_sz / cfg->segment_sz_bytes + 1;
|
||||||
*hash_slots *= (cfg->intervals + cfg->sub_intervals);
|
*hash_slots *= (cfg->intervals + cfg->sub_intervals);
|
||||||
}
|
}
|
||||||
|
|
|
@ -846,7 +846,6 @@ process_blocks:
|
||||||
*/
|
*/
|
||||||
blks = cfg->segment_sz;
|
blks = cfg->segment_sz;
|
||||||
if (blks > blknum-i) blks = blknum-i;
|
if (blks > blknum-i) blks = blknum-i;
|
||||||
len = 0;
|
|
||||||
length = 0;
|
length = 0;
|
||||||
tgt = seg_heap;
|
tgt = seg_heap;
|
||||||
for (j=0; j<blks; j++) {
|
for (j=0; j<blks; j++) {
|
||||||
|
@ -855,11 +854,12 @@ process_blocks:
|
||||||
tgt += cfg->chunk_cksum_sz;
|
tgt += cfg->chunk_cksum_sz;
|
||||||
}
|
}
|
||||||
blks = j+i;
|
blks = j+i;
|
||||||
qsort(seg_heap, length/8, 8, cmpint);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Compute the range similarity hashes.
|
* Sort concatenated chunk hash buffer by raw 64-bit integer
|
||||||
|
* magnitudes.
|
||||||
*/
|
*/
|
||||||
|
qsort(seg_heap, length/8, 8, cmpint);
|
||||||
sim_ck = ctx->similarity_cksums;
|
sim_ck = ctx->similarity_cksums;
|
||||||
crc = 0;
|
crc = 0;
|
||||||
sub_i = cfg->sub_intervals;
|
sub_i = cfg->sub_intervals;
|
||||||
|
@ -869,6 +869,10 @@ process_blocks:
|
||||||
sub_i--;
|
sub_i--;
|
||||||
increment = (length / cfg->intervals) / sub_i;
|
increment = (length / cfg->intervals) / sub_i;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Compute the range similarity hashes.
|
||||||
|
*/
|
||||||
len = length;
|
len = length;
|
||||||
for (j = 0; j<sub_i; j++) {
|
for (j = 0; j<sub_i; j++) {
|
||||||
crc = lzma_crc64(tgt, increment, 0);
|
crc = lzma_crc64(tgt, increment, 0);
|
||||||
|
@ -951,18 +955,19 @@ process_blocks:
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Now lookup the similarity minhashes starting at the highest
|
* Now lookup the similarity hashes starting at the highest
|
||||||
* significance level.
|
* significance level.
|
||||||
*/
|
*/
|
||||||
for (j=cfg->intervals + sub_i; j > 0; j--) {
|
for (j=cfg->intervals + sub_i; j > 0; j--) {
|
||||||
hash_entry_t *he;
|
hash_entry_t *he = NULL, *he1 = NULL;
|
||||||
|
|
||||||
he = db_lookup_insert_s(cfg, sim_ck, 0, seg_offset, 0, 1);
|
he = db_lookup_insert_s(cfg, sim_ck, 0, seg_offset, 0, 1);
|
||||||
if (he) {
|
if (he && he != he1) {
|
||||||
/*
|
/*
|
||||||
* Match found. Load segment metadata from disk and perform
|
* Match found. Load segment metadata from disk and perform
|
||||||
* identity deduplication with the segment chunks.
|
* identity deduplication with the segment chunks.
|
||||||
*/
|
*/
|
||||||
|
he1 = he;
|
||||||
offset = he->item_offset;
|
offset = he->item_offset;
|
||||||
if (db_segcache_map(cfg, ctx->id, &o_blks, &offset,
|
if (db_segcache_map(cfg, ctx->id, &o_blks, &offset,
|
||||||
(uchar_t **)&seg_blocks) == -1) {
|
(uchar_t **)&seg_blocks) == -1) {
|
||||||
|
|
Loading…
Reference in a new issue