From 2c4024792a2e776b39c219df32a6b3b7e2df5786 Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Mon, 22 Apr 2013 22:07:07 +0530 Subject: [PATCH] Several bugfixes. Avoid matching with self during hash lookup. --- rabin/global/index.c | 5 +++-- rabin/rabin_dedup.c | 12 ++++-------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/rabin/global/index.c b/rabin/global/index.c index 060f82a..9a96357 100644 --- a/rabin/global/index.c +++ b/rabin/global/index.c @@ -432,7 +432,7 @@ db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval, if (cfg->pct_interval == 0) { // Global dedupe with simple index while (ent) { if (mycmp(sim_cksum, ent->cksum, cfg->similarity_cksum_sz) == 0 && - ent->item_size == item_size) { + ent->item_size == item_size && ent->item_offset != item_offset) { return (ent); } pent = &(ent->next); @@ -440,7 +440,8 @@ db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval, } } else { while (ent) { - if (mycmp(sim_cksum, ent->cksum, cfg->similarity_cksum_sz) == 0) { + if (mycmp(sim_cksum, ent->cksum, cfg->similarity_cksum_sz) == 0 && + ent->item_offset != item_offset) { return (ent); } pent = &(ent->next); diff --git a/rabin/rabin_dedup.c b/rabin/rabin_dedup.c index 401a5b3..2818616 100755 --- a/rabin/rabin_dedup.c +++ b/rabin/rabin_dedup.c @@ -858,14 +858,14 @@ process_blocks: qsort(seg_heap, length/8, 8, cmpint); /* - * Compute the range similarity minhashes. + * Compute the range similarity hashes. 
*/ sim_ck = ctx->similarity_cksums; crc = 0; sub_i = cfg->sub_intervals; increment = (length / cfg->intervals) / sub_i; tgt = seg_heap; - while (increment < cfg->chunk_cksum_sz/4 && sub_i > 0) { + while (increment < cfg->chunk_cksum_sz/4 && sub_i > 1) { sub_i--; increment = (length / cfg->intervals) / sub_i; } @@ -880,7 +880,7 @@ process_blocks: increment = length / cfg->intervals; for (j=0; j<cfg->intervals-1; j++) { - crc = lzma_crc64(tgt, increment/2, 0); + crc = lzma_crc64(tgt, increment/8, 0); *((uint64_t *)sim_ck) = crc; tgt += increment; len -= increment; @@ -958,11 +958,7 @@ process_blocks: hash_entry_t *he; he = db_lookup_insert_s(cfg, sim_ck, 0, seg_offset, 0, 1); - - /* - * If match found also check that match is not with self! - */ - if (he && he->item_offset != seg_offset) { + if (he) { /* * Match found. Load segment metadata from disk and perform * identity deduplication with the segment chunks.