Improve accuracy of the KMV sketch computation and speed it up.

This commit is contained in:
Moinak Ghosh 2013-07-03 19:24:06 +05:30
parent 6b67e98747
commit e10a13ad94

View file

@ -76,7 +76,6 @@
#include <heap.h> #include <heap.h>
#include <xxhash.h> #include <xxhash.h>
#include <qsort.h> #include <qsort.h>
#include <lzma_crc.h>
#include "rabin_dedup.h" #include "rabin_dedup.h"
#if defined(__USE_SSE_INTRIN__) && defined(__SSE4_1__) && RAB_POLYNOMIAL_WIN_SIZE == 16 #if defined(__USE_SSE_INTRIN__) && defined(__SSE4_1__) && RAB_POLYNOMIAL_WIN_SIZE == 16
@ -874,7 +873,7 @@ process_blocks:
} else { } else {
uchar_t *seg_heap, *sim_ck, *sim_offsets; uchar_t *seg_heap, *sim_ck, *sim_offsets;
archive_config_t *cfg; archive_config_t *cfg;
uint32_t increment, len, blks, o_blks, k; uint32_t len, blks, o_blks, k;
global_blockentry_t *seg_blocks; global_blockentry_t *seg_blocks;
uint64_t seg_offset, offset; uint64_t seg_offset, offset;
global_blockentry_t **htab, *be; global_blockentry_t **htab, *be;
@ -897,6 +896,7 @@ process_blocks:
htab = (global_blockentry_t **)(src - ary_sz); htab = (global_blockentry_t **)(src - ary_sz);
for (i=0; i<blknum;) { for (i=0; i<blknum;) {
uint64_t crc, off1; uint64_t crc, off1;
uint64_t a, b;
length = 0; length = 0;
/* /*
@ -943,25 +943,29 @@ process_blocks:
blks = j+i; blks = j+i;
/* /*
* Sort concatenated chunk hash buffer by raw 64-bit integer * Treat the concatenated chunk hash buffer as an array of 64-bit
* magnitudes. * integers and sort them in ascending order.
*/ */
do_qsort((uint64_t *)seg_heap, length/8); do_qsort((uint64_t *)seg_heap, length/8);
/* /*
* Compute the min-values range similarity hashes. * Compute the K min values sketch where K == 20 in this case.
*/ */
sim_ck = ctx->similarity_cksums; sim_ck = ctx->similarity_cksums;
sub_i = cfg->sub_intervals;
tgt = seg_heap; tgt = seg_heap;
increment = cfg->chunk_cksum_sz / 2; sub_i = 0;
if (increment * sub_i > length)
sub_i = length / increment; *((uint64_t *)sim_ck) = 0;
for (j = 0; j<sub_i; j++) { a = 0;
crc = lzma_crc64(tgt, increment/2, 0); for (j = 0; j < length && sub_i < cfg->sub_intervals;) {
*((uint64_t *)sim_ck) = crc; b = *((uint64_t *)tgt);
tgt += increment; tgt += sizeof (uint64_t);
sim_ck += cfg->similarity_cksum_sz; if (b != a) {
*((uint64_t *)sim_ck) = b;
sim_ck += sizeof (uint64_t);
a = b;
sub_i++;
}
} }
/* /*
@ -984,14 +988,10 @@ process_blocks:
} }
/* /*
* Now look up all the similarity hashes. We sort the hashes first so that * Now look up all the similarity hashes.
* all duplicate hash values can be easily eliminated.
*
* The matching segment offsets in the segcache are stored in a list. Entries * The matching segment offsets in the segcache are stored in a list. Entries
* that were not found are stored with offset of UINT64_MAX. * that were not found are stored with offset of UINT64_MAX.
*/ */
isort_uint64((uint64_t *)(ctx->similarity_cksums), sub_i);
sim_ck = ctx->similarity_cksums; sim_ck = ctx->similarity_cksums;
tgt = src + 1; // One byte for number of entries tgt = src + 1; // One byte for number of entries
crc = 0; crc = 0;