Tweak percentage intervals computation to improve segmented dedupe ratio.

Avoid repeat processing of already processed segments.
Moinak Ghosh 2013-04-23 18:53:56 +05:30
parent d29f125ca7
commit 6b7d883393
3 changed files with 14 additions and 8 deletions

@@ -37,7 +37,7 @@ extern "C" {
 #define DEFAULT_CHUNK_CKSUM CKSUM_SHA256
 #define DEFAULT_SIMILARITY_CKSUM CKSUM_BLAKE256
 #define DEFAULT_COMPRESS COMPRESS_LZ4
-#define DEFAULT_PCT_INTERVAL 10
+#define DEFAULT_PCT_INTERVAL 5
 #define CONTAINER_ITEMS 2048
 #define MIN_CK 1
 #define MAX_CK 5

@@ -169,7 +169,8 @@ set_cfg:
 *pct_interval = 0;
 } else {
 cfg->intervals = 100 / *pct_interval;
-cfg->sub_intervals = (cfg->segment_sz + 1) / cfg->intervals;
+cfg->sub_intervals = (cfg->segment_sz-2) / cfg->intervals * 2;
+cfg->intervals--;
 *hash_slots = file_sz / cfg->segment_sz_bytes + 1;
 *hash_slots *= (cfg->intervals + cfg->sub_intervals);
 }
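For a concrete sense of what the new formula does, here is a small self-contained sketch that plugs example numbers into the same expressions. The segment_sz, segment_sz_bytes and file_sz values are made-up illustrations, not Pcompress defaults; only the arithmetic mirrors the diff.

#include <stdio.h>
#include <stdint.h>

/*
 * Standalone sketch of the interval math introduced in this commit.
 * segment_sz, segment_sz_bytes and file_sz are assumed example values.
 */
int
main(void)
{
	uint32_t pct_interval = 5;                      /* new DEFAULT_PCT_INTERVAL */
	uint32_t segment_sz = 2048;                     /* assumed blocks per segment */
	uint64_t segment_sz_bytes = 8ULL * 1024 * 1024; /* assumed */
	uint64_t file_sz = 1024ULL * 1024 * 1024;       /* 1 GiB example */

	uint32_t intervals = 100 / pct_interval;                    /* 20 */
	uint32_t sub_intervals = (segment_sz - 2) / intervals * 2;  /* 204 */
	intervals--;                                                /* 19 */

	uint64_t hash_slots = file_sz / segment_sz_bytes + 1;       /* 129 */
	hash_slots *= (intervals + sub_intervals);                  /* 129 * 223 */

	printf("intervals=%u sub_intervals=%u hash_slots=%llu\n",
	    intervals, sub_intervals, (unsigned long long)hash_slots);
	return (0);
}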

@@ -846,7 +846,6 @@ process_blocks:
  */
 blks = cfg->segment_sz;
 if (blks > blknum-i) blks = blknum-i;
-len = 0;
 length = 0;
 tgt = seg_heap;
 for (j=0; j<blks; j++) {
@@ -855,11 +854,12 @@ process_blocks:
 tgt += cfg->chunk_cksum_sz;
 }
 blks = j+i;
-qsort(seg_heap, length/8, 8, cmpint);
 /*
- * Compute the range similarity hashes.
+ * Sort concatenated chunk hash buffer by raw 64-bit integer
+ * magnitudes.
  */
+qsort(seg_heap, length/8, 8, cmpint);
 sim_ck = ctx->similarity_cksums;
 crc = 0;
 sub_i = cfg->sub_intervals;
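The qsort() itself is unchanged by this hunk; it only moves below the comment that now describes it. Since cmpint is not part of the diff, the comparator below is only an assumed shape for it, shown to make the "sort 8-byte slots as raw 64-bit integers" idea concrete.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/*
 * Hypothetical stand-in for the cmpint comparator referenced by the
 * qsort() call: order each 8-byte slot of the segment heap by its
 * value as an unsigned 64-bit integer.
 */
int
cmpint(const void *a, const void *b)
{
	uint64_t x, y;

	memcpy(&x, a, sizeof (x));
	memcpy(&y, b, sizeof (y));
	if (x < y) return (-1);
	if (x > y) return (1);
	return (0);
}

/* Usage mirroring the diff: qsort(seg_heap, length/8, 8, cmpint); */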
@@ -869,6 +869,10 @@ process_blocks:
 sub_i--;
 increment = (length / cfg->intervals) / sub_i;
 }
+/*
+ * Compute the range similarity hashes.
+ */
 len = length;
 for (j = 0; j<sub_i; j++) {
 crc = lzma_crc64(tgt, increment, 0);
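The three added lines are purely a comment; the loop they describe was already deriving one CRC64 per sub-interval over slices of the sorted hash buffer. A minimal sketch of that idea, assuming liblzma's lzma_crc64() as used in the original; the even split below simplifies the increment logic visible in the diff, and sim_ck storage is reduced to a plain array.

#include <stdint.h>
#include <stddef.h>
#include <lzma.h>	/* liblzma: lzma_crc64() */

/*
 * Sketch: derive one "range similarity hash" per sub-interval by
 * running CRC64 over successive slices of the sorted checksum buffer.
 */
void
range_similarity_hashes(const uint8_t *sorted_heap, size_t length,
    uint32_t sub_i, uint64_t *sim_ck)
{
	size_t increment = length / sub_i;	/* assumed even split */
	const uint8_t *tgt = sorted_heap;
	uint32_t j;

	for (j = 0; j < sub_i; j++) {
		sim_ck[j] = lzma_crc64(tgt, increment, 0);
		tgt += increment;
	}
}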
@@ -951,18 +955,19 @@ process_blocks:
 }
 /*
- * Now lookup the similarity minhashes starting at the highest
+ * Now lookup the similarity hashes starting at the highest
  * significance level.
  */
 for (j=cfg->intervals + sub_i; j > 0; j--) {
-hash_entry_t *he;
+hash_entry_t *he = NULL, *he1 = NULL;
 he = db_lookup_insert_s(cfg, sim_ck, 0, seg_offset, 0, 1);
-if (he) {
+if (he && he != he1) {
 /*
  * Match found. Load segment metadata from disk and perform
  * identity deduplication with the segment chunks.
  */
+he1 = he;
 offset = he->item_offset;
 if (db_segcache_map(cfg, ctx->id, &o_blks, &offset,
 (uchar_t **)&seg_blocks) == -1) {
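The he1 bookkeeping is what "avoid repeat processing" in the commit message refers to: when successive similarity hashes resolve to the same stored entry, the segment metadata load and identity dedupe run only once. Note that in the diff he1 is declared inside the for loop; the sketch below hoists both pointers out of the loop so the last match persists across iterations, which is the behaviour the message describes. The lookup and dedupe helpers here are placeholders, not Pcompress APIs.

/*
 * Sketch of the skip-repeat pattern: remember the last matched hash
 * entry and only reload/dedupe a segment when the match changes.
 * lookup_similarity_hash() and dedupe_against_segment() are
 * hypothetical placeholders for db_lookup_insert_s() and the segcache
 * processing in the real code.
 */
#include <stddef.h>

typedef struct hash_entry hash_entry_t;

hash_entry_t *lookup_similarity_hash(const unsigned char *sim_ck);
void dedupe_against_segment(hash_entry_t *he);

void
lookup_all_levels(unsigned char *sim_ck, unsigned int levels, size_t cksum_sz)
{
	hash_entry_t *he = NULL, *he1 = NULL;
	unsigned int j;

	for (j = levels; j > 0; j--) {
		he = lookup_similarity_hash(sim_ck);
		if (he && he != he1) {
			/* New match: process this segment once. */
			he1 = he;
			dedupe_against_segment(he);
		}
		sim_ck -= cksum_sz;	/* step direction between levels is assumed */
	}
}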