Improve segment similarity detection and drastically reduce index size.
This commit is contained in:
parent
b32f4b3f9a
commit
eabd670790
2 changed files with 17 additions and 32 deletions
|
@ -158,21 +158,18 @@ set_cfg:
|
||||||
// Compute total hashtable entries first
|
// Compute total hashtable entries first
|
||||||
*hash_entry_size = sizeof (hash_entry_t) + cfg->similarity_cksum_sz - 1;
|
*hash_entry_size = sizeof (hash_entry_t) + cfg->similarity_cksum_sz - 1;
|
||||||
if (*pct_interval == 0) {
|
if (*pct_interval == 0) {
|
||||||
cfg->intervals = 1;
|
cfg->sub_intervals = 1;
|
||||||
cfg->sub_intervals = 0;
|
|
||||||
*hash_slots = file_sz / cfg->chunk_sz_bytes + 1;
|
*hash_slots = file_sz / cfg->chunk_sz_bytes + 1;
|
||||||
|
|
||||||
} else if (*pct_interval == 100) {
|
} else if (*pct_interval == 100) {
|
||||||
cfg->intervals = 1;
|
cfg->sub_intervals = 1;
|
||||||
cfg->sub_intervals = 0;
|
|
||||||
*hash_slots = SLOTS_FOR_MEM(memlimit, *hash_entry_size);
|
*hash_slots = SLOTS_FOR_MEM(memlimit, *hash_entry_size);
|
||||||
*pct_interval = 0;
|
*pct_interval = 0;
|
||||||
} else {
|
} else {
|
||||||
cfg->intervals = 100 / *pct_interval;
|
cfg->intervals = 100 / *pct_interval;
|
||||||
cfg->sub_intervals = (cfg->segment_sz-2) / cfg->intervals * 2;
|
cfg->sub_intervals = (cfg->segment_sz / cfg->intervals) >> 1;
|
||||||
cfg->intervals--;
|
|
||||||
*hash_slots = file_sz / cfg->segment_sz_bytes + 1;
|
*hash_slots = file_sz / cfg->segment_sz_bytes + 1;
|
||||||
*hash_slots *= (cfg->intervals + cfg->sub_intervals);
|
*hash_slots *= cfg->sub_intervals;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -236,7 +233,7 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_ch
|
||||||
if (cfg->dedupe_mode == MODE_SIMILARITY)
|
if (cfg->dedupe_mode == MODE_SIMILARITY)
|
||||||
intervals = 1;
|
intervals = 1;
|
||||||
else
|
else
|
||||||
intervals = cfg->intervals + cfg->sub_intervals;
|
intervals = cfg->sub_intervals;
|
||||||
indx->memlimit = memlimit - (hash_entry_size << 2);
|
indx->memlimit = memlimit - (hash_entry_size << 2);
|
||||||
indx->list = (htab_t *)calloc(intervals, sizeof (htab_t));
|
indx->list = (htab_t *)calloc(intervals, sizeof (htab_t));
|
||||||
indx->hash_entry_size = hash_entry_size;
|
indx->hash_entry_size = hash_entry_size;
|
||||||
|
|
|
@ -310,7 +310,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
|
||||||
|
|
||||||
if (arc && dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL) {
|
if (arc && dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL) {
|
||||||
ctx->similarity_cksums = (uchar_t *)slab_calloc(NULL,
|
ctx->similarity_cksums = (uchar_t *)slab_calloc(NULL,
|
||||||
arc->intervals + arc->sub_intervals,
|
arc->sub_intervals,
|
||||||
arc->similarity_cksum_sz);
|
arc->similarity_cksum_sz);
|
||||||
if (!ctx->similarity_cksums) {
|
if (!ctx->similarity_cksums) {
|
||||||
fprintf(stderr,
|
fprintf(stderr,
|
||||||
|
@ -860,31 +860,19 @@ process_blocks:
|
||||||
* magnitudes.
|
* magnitudes.
|
||||||
*/
|
*/
|
||||||
qsort(seg_heap, length/8, 8, cmpint);
|
qsort(seg_heap, length/8, 8, cmpint);
|
||||||
sim_ck = ctx->similarity_cksums;
|
|
||||||
crc = 0;
|
|
||||||
sub_i = cfg->sub_intervals;
|
|
||||||
increment = (length / cfg->intervals) / sub_i;
|
|
||||||
tgt = seg_heap;
|
|
||||||
while (increment < cfg->chunk_cksum_sz/4 && sub_i > 1) {
|
|
||||||
sub_i--;
|
|
||||||
increment = (length / cfg->intervals) / sub_i;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Compute the range similarity hashes.
|
* Compute the min-values range similarity hashes.
|
||||||
*/
|
*/
|
||||||
|
sim_ck = ctx->similarity_cksums;
|
||||||
|
sub_i = cfg->sub_intervals;
|
||||||
len = length;
|
len = length;
|
||||||
|
tgt = seg_heap;
|
||||||
|
increment = cfg->chunk_cksum_sz;
|
||||||
|
if (increment * sub_i > len)
|
||||||
|
sub_i = len / increment;
|
||||||
for (j = 0; j<sub_i; j++) {
|
for (j = 0; j<sub_i; j++) {
|
||||||
crc = lzma_crc64(tgt, increment, 0);
|
crc = lzma_crc64(tgt, increment/4, 0);
|
||||||
*((uint64_t *)sim_ck) = crc;
|
|
||||||
tgt += increment;
|
|
||||||
len -= increment;
|
|
||||||
sim_ck += cfg->similarity_cksum_sz;
|
|
||||||
}
|
|
||||||
|
|
||||||
increment = length / cfg->intervals;
|
|
||||||
for (j=0; j<cfg->intervals-1; j++) {
|
|
||||||
crc = lzma_crc64(tgt, increment/8, 0);
|
|
||||||
*((uint64_t *)sim_ck) = crc;
|
*((uint64_t *)sim_ck) = crc;
|
||||||
tgt += increment;
|
tgt += increment;
|
||||||
len -= increment;
|
len -= increment;
|
||||||
|
@ -956,12 +944,12 @@ process_blocks:
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Now lookup all the similarity hashes. We sort the hashes first so that
|
* Now lookup all the similarity hashes. We sort the hashes first so that
|
||||||
* all duplicates can be easily detected.
|
* all duplicate hash values can be easily eliminated.
|
||||||
*/
|
*/
|
||||||
qsort(ctx->similarity_cksums, cfg->intervals + sub_i - 1, 8, cmpint);
|
qsort(ctx->similarity_cksums, sub_i, 8, cmpint);
|
||||||
crc = 0;
|
crc = 0;
|
||||||
off1 = UINT64_MAX;
|
off1 = UINT64_MAX;
|
||||||
for (j=cfg->intervals + sub_i; j > 0; j--) {
|
for (j=sub_i; j > 0; j--) {
|
||||||
hash_entry_t *he = NULL;
|
hash_entry_t *he = NULL;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
Loading…
Reference in a new issue