Improve segment similarity detection and drastically reduce index size.

Moinak Ghosh 2013-04-23 23:15:32 +05:30
parent b32f4b3f9a
commit eabd670790
2 changed files with 17 additions and 32 deletions

File 1 of 2:

@@ -158,21 +158,18 @@ set_cfg:
 	// Compute total hashtable entries first
 	*hash_entry_size = sizeof (hash_entry_t) + cfg->similarity_cksum_sz - 1;
 	if (*pct_interval == 0) {
 		cfg->intervals = 1;
-		cfg->sub_intervals = 0;
+		cfg->sub_intervals = 1;
 		*hash_slots = file_sz / cfg->chunk_sz_bytes + 1;
 	} else if (*pct_interval == 100) {
 		cfg->intervals = 1;
-		cfg->sub_intervals = 0;
+		cfg->sub_intervals = 1;
 		*hash_slots = SLOTS_FOR_MEM(memlimit, *hash_entry_size);
 		*pct_interval = 0;
 	} else {
 		cfg->intervals = 100 / *pct_interval;
-		cfg->sub_intervals = (cfg->segment_sz-2) / cfg->intervals * 2;
+		cfg->intervals--;
+		cfg->sub_intervals = (cfg->segment_sz / cfg->intervals) >> 1;
 		*hash_slots = file_sz / cfg->segment_sz_bytes + 1;
-		*hash_slots *= (cfg->intervals + cfg->sub_intervals);
+		*hash_slots *= cfg->sub_intervals;
 	}
 	/*
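
The new sizing arithmetic is easier to see in isolation. A minimal standalone sketch of the else-branch, with hypothetical input sizes standing in for the real config values:

/*
 * Standalone sketch of the new hash-slot sizing. All values below are
 * hypothetical, not taken from the commit.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t file_sz = 1ULL << 30;          /* 1 GiB input (assumed) */
	uint64_t segment_sz_bytes = 8ULL << 20; /* 8 MiB segments (assumed) */
	uint32_t segment_sz = 2048;             /* chunks per segment (assumed) */
	int pct_interval = 10;                  /* similarity interval (assumed) */

	uint32_t intervals = 100 / pct_interval;
	intervals--;
	uint32_t sub_intervals = (segment_sz / intervals) >> 1;

	/*
	 * The index now needs sub_intervals slots per segment instead of
	 * intervals + sub_intervals, hence the reduced size.
	 */
	uint64_t hash_slots = file_sz / segment_sz_bytes + 1;
	hash_slots *= sub_intervals;

	printf("intervals=%u sub_intervals=%u hash_slots=%llu\n",
	    (unsigned)intervals, (unsigned)sub_intervals,
	    (unsigned long long)hash_slots);
	return (0);
}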
@@ -236,7 +233,7 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_ch
 	if (cfg->dedupe_mode == MODE_SIMILARITY)
 		intervals = 1;
 	else
-		intervals = cfg->intervals + cfg->sub_intervals;
+		intervals = cfg->sub_intervals;
 	indx->memlimit = memlimit - (hash_entry_size << 2);
 	indx->list = (htab_t *)calloc(intervals, sizeof (htab_t));
 	indx->hash_entry_size = hash_entry_size;
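
A minimal sketch of what the reduced table count means for the index allocation; htab_t, index_t and MODE_SIMILARITY here are stand-ins, not the project's actual definitions:

#include <stdint.h>
#include <stdlib.h>

typedef struct { void *buckets; } htab_t;	/* stand-in type */
typedef struct {
	htab_t *list;
	uint64_t memlimit;
	uint32_t hash_entry_size;
} index_t;					/* stand-in type */

enum { MODE_SIMILARITY = 1 };			/* placeholder constant */

int
init_index(index_t *indx, int dedupe_mode, uint32_t sub_intervals,
    uint64_t memlimit, uint32_t hash_entry_size)
{
	uint32_t intervals;

	if (dedupe_mode == MODE_SIMILARITY)
		intervals = 1;
	else
		intervals = sub_intervals; /* was intervals + sub_intervals */

	indx->memlimit = memlimit - (hash_entry_size << 2);
	indx->list = calloc(intervals, sizeof (htab_t));
	indx->hash_entry_size = hash_entry_size;
	return (indx->list != NULL ? 0 : -1);
}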

File 2 of 2:

@@ -310,7 +310,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
 	if (arc && dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL) {
 		ctx->similarity_cksums = (uchar_t *)slab_calloc(NULL,
-		    arc->intervals + arc->sub_intervals,
+		    arc->sub_intervals,
 		    arc->similarity_cksum_sz);
 		if (!ctx->similarity_cksums) {
 			fprintf(stderr,
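
The per-context checksum buffer shrinks the same way. A sketch with plain calloc() standing in for the project's slab_calloc():

#include <stdint.h>
#include <stdlib.h>

unsigned char *
alloc_similarity_cksums(uint32_t sub_intervals, uint32_t similarity_cksum_sz)
{
	/* was: intervals + sub_intervals checksum slots */
	return (calloc(sub_intervals, similarity_cksum_sz));
}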
@@ -860,31 +860,19 @@ process_blocks:
 			 * magnitudes.
 			 */
 			qsort(seg_heap, length/8, 8, cmpint);
-			sim_ck = ctx->similarity_cksums;
-			crc = 0;
-			sub_i = cfg->sub_intervals;
-			increment = (length / cfg->intervals) / sub_i;
-			tgt = seg_heap;
-			while (increment < cfg->chunk_cksum_sz/4 && sub_i > 1) {
-				sub_i--;
-				increment = (length / cfg->intervals) / sub_i;
-			}
 			/*
-			 * Compute the range similarity hashes.
+			 * Compute the min-values range similarity hashes.
 			 */
+			sim_ck = ctx->similarity_cksums;
+			sub_i = cfg->sub_intervals;
+			len = length;
+			tgt = seg_heap;
+			increment = cfg->chunk_cksum_sz;
+			if (increment * sub_i > len)
+				sub_i = len / increment;
 			for (j = 0; j<sub_i; j++) {
-				crc = lzma_crc64(tgt, increment, 0);
-				*((uint64_t *)sim_ck) = crc;
-				tgt += increment;
-				len -= increment;
-				sim_ck += cfg->similarity_cksum_sz;
-			}
-			increment = length / cfg->intervals;
-			for (j=0; j<cfg->intervals-1; j++) {
-				crc = lzma_crc64(tgt, increment/8, 0);
+				crc = lzma_crc64(tgt, increment/4, 0);
 				*((uint64_t *)sim_ck) = crc;
 				tgt += increment;
 				len -= increment;
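
Pulled out of context, the new min-values hashing amounts to: sort the segment's 64-bit chunk hashes so the low end of the heap holds the smallest values, then fold each chunk_cksum_sz-sized increment into one CRC64 over its first quarter. A compilable sketch (link with -llzma); the comparator and the flat output array are simplified stand-ins, since the real code writes into similarity_cksum_sz-spaced slots:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <lzma.h>

static int
cmpint(const void *a, const void *b)
{
	uint64_t x, y;

	memcpy(&x, a, 8);
	memcpy(&y, b, 8);
	return (x < y ? -1 : (x > y ? 1 : 0));
}

uint32_t
min_values_hashes(uint8_t *seg_heap, uint64_t length, uint32_t sub_i,
    uint32_t chunk_cksum_sz, uint64_t *sim_ck)
{
	uint8_t *tgt = seg_heap;
	uint64_t len = length;
	uint32_t increment = chunk_cksum_sz, j;

	/* sort the segment's 64-bit chunk hashes into ascending order */
	qsort(seg_heap, length / 8, 8, cmpint);
	if ((uint64_t)increment * sub_i > len)
		sub_i = len / increment;	/* clamp to the heap size */
	for (j = 0; j < sub_i; j++) {
		/* one CRC64 over the first quarter of each increment */
		sim_ck[j] = lzma_crc64(tgt, increment / 4, 0);
		tgt += increment;
		len -= increment;
	}
	return (sub_i);
}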
@@ -956,12 +944,12 @@ process_blocks:
 			/*
 			 * Now lookup all the similarity hashes. We sort the hashes first so that
-			 * all duplicates can be easily detected.
+			 * all duplicate hash values can be easily eliminated.
 			 */
-			qsort(ctx->similarity_cksums, cfg->intervals + sub_i - 1, 8, cmpint);
+			qsort(ctx->similarity_cksums, sub_i, 8, cmpint);
 			crc = 0;
 			off1 = UINT64_MAX;
-			for (j=cfg->intervals + sub_i; j > 0; j--) {
+			for (j=sub_i; j > 0; j--) {
 				hash_entry_t *he = NULL;
 				/*
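
The point of sorting before lookup is that equal checksum values become adjacent, so each distinct value needs only one index probe. A sketch of that skip logic, with a hypothetical lookup_index() in place of the real hashtable probe:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static int
cmpint(const void *a, const void *b)
{
	uint64_t x, y;

	memcpy(&x, a, 8);
	memcpy(&y, b, 8);
	return (x < y ? -1 : (x > y ? 1 : 0));
}

void *lookup_index(uint64_t crc);	/* hypothetical index probe */

void
lookup_similarity_hashes(uint64_t *sim_ck, uint32_t sub_i)
{
	uint64_t prev = 0;
	uint32_t j;
	int have_prev = 0;

	qsort(sim_ck, sub_i, 8, cmpint);
	for (j = sub_i; j > 0; j--) {	/* largest to smallest, as above */
		uint64_t crc = sim_ck[j - 1];
		if (have_prev && crc == prev)
			continue;	/* equal values are adjacent: probe once */
		(void) lookup_index(crc);
		prev = crc;
		have_prev = 1;
	}
}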