diff --git a/rabin/global/dedupe_config.h b/rabin/global/dedupe_config.h index f4a5c9a..afbed5f 100644 --- a/rabin/global/dedupe_config.h +++ b/rabin/global/dedupe_config.h @@ -37,7 +37,7 @@ extern "C" { #define DEFAULT_CHUNK_CKSUM CKSUM_SHA256 #define DEFAULT_SIMILARITY_CKSUM CKSUM_BLAKE256 #define DEFAULT_COMPRESS COMPRESS_LZ4 -#define DEFAULT_PCT_INTERVAL 10 +#define DEFAULT_PCT_INTERVAL 5 #define CONTAINER_ITEMS 2048 #define MIN_CK 1 #define MAX_CK 5 diff --git a/rabin/global/index.c b/rabin/global/index.c index 5babaef..7bf526e 100644 --- a/rabin/global/index.c +++ b/rabin/global/index.c @@ -169,7 +169,8 @@ set_cfg: *pct_interval = 0; } else { cfg->intervals = 100 / *pct_interval; - cfg->sub_intervals = (cfg->segment_sz + 1) / cfg->intervals; + cfg->sub_intervals = (cfg->segment_sz-2) / cfg->intervals * 2; + cfg->intervals--; *hash_slots = file_sz / cfg->segment_sz_bytes + 1; *hash_slots *= (cfg->intervals + cfg->sub_intervals); } diff --git a/rabin/rabin_dedup.c b/rabin/rabin_dedup.c index 2818616..7486009 100755 --- a/rabin/rabin_dedup.c +++ b/rabin/rabin_dedup.c @@ -846,7 +846,6 @@ process_blocks: */ blks = cfg->segment_sz; if (blks > blknum-i) blks = blknum-i; - len = 0; length = 0; tgt = seg_heap; for (j=0; jchunk_cksum_sz; } blks = j+i; - qsort(seg_heap, length/8, 8, cmpint); /* - * Compute the range similarity hashes. + * Sort concatenated chunk hash buffer by raw 64-bit integer + * magnitudes. */ + qsort(seg_heap, length/8, 8, cmpint); sim_ck = ctx->similarity_cksums; crc = 0; sub_i = cfg->sub_intervals; @@ -869,6 +869,10 @@ process_blocks: sub_i--; increment = (length / cfg->intervals) / sub_i; } + + /* + * Compute the range similarity hashes. + */ len = length; for (j = 0; jintervals + sub_i; j > 0; j--) { - hash_entry_t *he; + hash_entry_t *he = NULL, *he1 = NULL; he = db_lookup_insert_s(cfg, sim_ck, 0, seg_offset, 0, 1); - if (he) { + if (he && he != he1) { /* * Match found. Load segment metadata from disk and perform * identity deduplication with the segment chunks. */ + he1 = he; offset = he->item_offset; if (db_segcache_map(cfg, ctx->id, &o_blks, &offset, (uchar_t **)&seg_blocks) == -1) {