From eabd6707907d1724ddc46ceac3dc74547d775f68 Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Tue, 23 Apr 2013 23:15:32 +0530 Subject: [PATCH] Improve segment similarity detection and drastically reduce index size. --- rabin/global/index.c | 13 +++++-------- rabin/rabin_dedup.c | 36 ++++++++++++------------------------ 2 files changed, 17 insertions(+), 32 deletions(-) diff --git a/rabin/global/index.c b/rabin/global/index.c index 7bf526e..c8705eb 100644 --- a/rabin/global/index.c +++ b/rabin/global/index.c @@ -158,21 +158,18 @@ set_cfg: // Compute total hashtable entries first *hash_entry_size = sizeof (hash_entry_t) + cfg->similarity_cksum_sz - 1; if (*pct_interval == 0) { - cfg->intervals = 1; - cfg->sub_intervals = 0; + cfg->sub_intervals = 1; *hash_slots = file_sz / cfg->chunk_sz_bytes + 1; } else if (*pct_interval == 100) { - cfg->intervals = 1; - cfg->sub_intervals = 0; + cfg->sub_intervals = 1; *hash_slots = SLOTS_FOR_MEM(memlimit, *hash_entry_size); *pct_interval = 0; } else { cfg->intervals = 100 / *pct_interval; - cfg->sub_intervals = (cfg->segment_sz-2) / cfg->intervals * 2; - cfg->intervals--; + cfg->sub_intervals = (cfg->segment_sz / cfg->intervals) >> 1; *hash_slots = file_sz / cfg->segment_sz_bytes + 1; - *hash_slots *= (cfg->intervals + cfg->sub_intervals); + *hash_slots *= cfg->sub_intervals; } /* @@ -236,7 +233,7 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_ch if (cfg->dedupe_mode == MODE_SIMILARITY) intervals = 1; else - intervals = cfg->intervals + cfg->sub_intervals; + intervals = cfg->sub_intervals; indx->memlimit = memlimit - (hash_entry_size << 2); indx->list = (htab_t *)calloc(intervals, sizeof (htab_t)); indx->hash_entry_size = hash_entry_size; diff --git a/rabin/rabin_dedup.c b/rabin/rabin_dedup.c index f509302..1cb6e7a 100755 --- a/rabin/rabin_dedup.c +++ b/rabin/rabin_dedup.c @@ -310,7 +310,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s if (arc && dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL) { ctx->similarity_cksums = (uchar_t *)slab_calloc(NULL, - arc->intervals + arc->sub_intervals, + arc->sub_intervals, arc->similarity_cksum_sz); if (!ctx->similarity_cksums) { fprintf(stderr, @@ -860,31 +860,19 @@ process_blocks: * magnitudes. */ qsort(seg_heap, length/8, 8, cmpint); - sim_ck = ctx->similarity_cksums; - crc = 0; - sub_i = cfg->sub_intervals; - increment = (length / cfg->intervals) / sub_i; - tgt = seg_heap; - while (increment < cfg->chunk_cksum_sz/4 && sub_i > 1) { - sub_i--; - increment = (length / cfg->intervals) / sub_i; - } /* - * Compute the range similarity hashes. + * Compute the min-values range similarity hashes. */ + sim_ck = ctx->similarity_cksums; + sub_i = cfg->sub_intervals; len = length; + tgt = seg_heap; + increment = cfg->chunk_cksum_sz; + if (increment * sub_i > len) + sub_i = len / increment; for (j = 0; jsimilarity_cksum_sz; - } - - increment = length / cfg->intervals; - for (j=0; jintervals-1; j++) { - crc = lzma_crc64(tgt, increment/8, 0); + crc = lzma_crc64(tgt, increment/4, 0); *((uint64_t *)sim_ck) = crc; tgt += increment; len -= increment; @@ -956,12 +944,12 @@ process_blocks: /* * Now lookup all the similarity hashes. We sort the hashes first so that - * all duplicates can be easily detected. + * all duplicate hash values can be easily eliminated. */ - qsort(ctx->similarity_cksums, cfg->intervals + sub_i - 1, 8, cmpint); + qsort(ctx->similarity_cksums, sub_i, 8, cmpint); crc = 0; off1 = UINT64_MAX; - for (j=cfg->intervals + sub_i; j > 0; j--) { + for (j=sub_i; j > 0; j--) { hash_entry_t *he = NULL; /*