From 75f62d6a36be8abc98486d5cb24aa7c7fd1f463c Mon Sep 17 00:00:00 2001
From: Moinak Ghosh <moinakg@gmail.com>
Date: Fri, 26 Apr 2013 10:56:29 +0530
Subject: [PATCH] Simplify segment lookup loop. Fix assertion.

---
 rabin/rabin_dedup.c | 69 +++++++++++++++++++++------------------------
 1 file changed, 32 insertions(+), 37 deletions(-)

diff --git a/rabin/rabin_dedup.c b/rabin/rabin_dedup.c
index b099f7c..e9c55bf 100755
--- a/rabin/rabin_dedup.c
+++ b/rabin/rabin_dedup.c
@@ -866,7 +866,7 @@ process_blocks:
 				 */
 
 				cfg = ctx->arc;
-				assert(cfg->similarity_cksum_sz >= sizeof (uint64_t));
+				assert(cfg->similarity_cksum_sz == sizeof (uint64_t));
 				seg_heap = (uchar_t *)(ctx->g_blocks) - cfg->segment_sz * cfg->chunk_cksum_sz;
 				ary_sz = (cfg->sub_intervals * cfg->similarity_cksum_sz + sizeof (blks) + 1) *
 				    ((blknum+1) / cfg->segment_sz) + 3;
@@ -935,47 +935,49 @@ process_blocks:
 					 * Now lookup all the similarity hashes. We sort the hashes first so that
 					 * all duplicate hash values can be easily eliminated.
 					 * 
-					 * The matching segment offsets in the segcache are stored in a list.
+					 * The matching segment offsets in the segcache are stored in a list. Entries
+					 * that were not found are stored with offset of UINT64_MAX.
 					 */
-					if (cfg->similarity_cksum_sz == 8) {
-						isort_uint64((uint64_t *)(ctx->similarity_cksums), sub_i);
-					} else {
-						fprintf(stderr, "Similarity Checksum Size: %d not implemented.\n",
-							cfg->similarity_cksum_sz);
-						ctx->valid = 0;
-						sem_post(ctx->index_sem_next);
-						return (0);
-					}
+					isort_uint64((uint64_t *)(ctx->similarity_cksums), sub_i);
 
 					sim_ck = ctx->similarity_cksums;
 					tgt = src + 1; // One byte for number of entries
 					crc = 0;
 					off1 = UINT64_MAX;
 					k = 0;
+
 					for (j=0; j < sub_i; j++) {
 						hash_entry_t *he = NULL;
-
-						/*
-						 * Check for duplicate checksum which need not be looked up
-						 * again.
-						 */
-						if (crc == *((uint64_t *)sim_ck)) {
-							he = NULL;
-						} else {
+						if (j > 0 && crc != *((uint64_t *)sim_ck)) {
 							he = db_lookup_insert_s(cfg, sim_ck, 0, seg_offset, 0, 1);
-							/*
-							 * Check for different checksum but same segment match.
-							 * This is not a complete check but does help to reduce
-							 * wasted processing.
-							 */
-							if (he && off1 == he->item_offset) {
-								crc = *((uint64_t *)sim_ck);
-								he = NULL;
-							}
+						} else {
+							he = NULL;
 						}
 						if (he) {
-							crc = *((uint64_t *)sim_ck);
-							off1 = he->item_offset;
+							*((uint64_t *)tgt) = he->item_offset;
+						} else {
+							*((uint64_t *)tgt) = UINT64_MAX;
+						}
+						crc = *((uint64_t *)sim_ck);
+						sim_ck += cfg->similarity_cksum_sz;
+						tgt += cfg->similarity_cksum_sz;
+					}
+
+					/*
+					 * At this point we have a list of segment offsets from the segcache
+					 * file. Sort the offsets to avoid subsequent random access.
+					 */
+					tgt = src + 1;
+					isort_uint64((uint64_t *)tgt, k);
+
+					/*
+					 * Now eliminate duplicate offsets and UINT64_MAX offset entries which
+					 * indicate entries that were not found.
+					 */
+					sim_ck = tgt;
+					for (j=0; j < sub_i; j++) {
+						if (off1 != *((uint64_t *)sim_ck) && *((uint64_t *)sim_ck) != UINT64_MAX) {
+							off1 = *((uint64_t *)sim_ck);
 							*((uint64_t *)tgt) = off1;
 							tgt += cfg->similarity_cksum_sz;
 							k++;
@@ -983,13 +985,6 @@ process_blocks:
 						sim_ck += cfg->similarity_cksum_sz;
 					}
 					*src = k; // Number of entries
-					src++;
-
-					/*
-					 * At this point we have a list of segment offsets from the segcache
-					 * file. Sort the offsets to avoid subsequent random access.
-					 */
-					isort_uint64((uint64_t *)src, k);
 					src = tgt;
 					i = blks;
 				}