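Update usage text to document '-G' (Global Dedupe), reject CRC64 checksum when deduplication is enabled, and tweak index sizing thresholds.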

2013-04-18 22:55:49 +05:30 · 2013-04-18 22:55:49 +05:30 · 2f6ccca6e5
commit 2f6ccca6e5
parent 426c0d0bf2
2 changed files with 25 additions and 10 deletions
--- a/main.c
+++ b/main.c
@ -150,11 +150,14 @@ usage(void)
 	    "4) Attempt Rabin fingerprinting based deduplication on chunks:\n"
 	    "   %s -D ...\n"
 	    "   %s -D -r ... - Do NOT split chunks at a rabin boundary. Default is to split.\n\n"
-	    "5) Perform Delta Encoding in addition to Identical Dedup:\n"
+	    "5) Perform Deduplication across the entire dataset (Global Dedupe):\n"
+	    "   %s -G <-D|-F> - This option requires one of '-D' or '-F' to be specified\n"
+	    "             to identify the block splitting method.\n"
+	    "6) Perform Delta Encoding in addition to Identical Dedupe:\n"
 	    "   %s -E ... - This also implies '-D'. This checks for at least 60%% similarity.\n"
 	    "   The flag can be repeated as in '-EE' to indicate at least 40%% similarity.\n\n"
-	    "6) Number of threads can optionally be specified: -t <1 - 256 count>\n"
-	    "7) Other flags:\n"
+	    "7) Number of threads can optionally be specified: -t <1 - 256 count>\n"
+	    "8) Other flags:\n"
 	    "   '-L'    - Enable LZP pre-compression. This improves compression ratio of all\n"
 	    "             algorithms with some extra CPU and very low RAM overhead.\n"
 	    "   '-P'    - Enable Adaptive Delta Encoding. It can improve compresion ratio for\n"
@ -164,11 +167,12 @@ usage(void)
 	    "             datasets.\n"
 	    "   '-S' <cksum>\n"
 	    "           - Specify chunk checksum to use:\n\n",
-	    UTILITY_VERSION, exec_name, exec_name, exec_name, exec_name, exec_name, exec_name);
+	    UTILITY_VERSION, exec_name, exec_name, exec_name, exec_name, exec_name, exec_name,
+	    exec_name);
 	list_checksums(stderr, "             ");
 	fprintf(stderr, "\n"
-	    "   '-F'    - Perform Fixed-Block Deduplication. Faster than '-D' in some cases\n"
-	    "             but with lower deduplication ratio.\n"
+	    "   '-F'    - Perform Fixed-Block Deduplication. Faster than '-D' but with lower\n"
+	    "             deduplication ratio.\n"
 	    "   '-B' <1..5>\n"
 	    "           - Specify an average Dedupe block size. 1 - 4K, 2 - 8K ... 5 - 64K.\n"
 	    "   '-M'    - Display memory allocator statistics\n"
@ -2669,6 +2673,11 @@ main(int argc, char *argv[])
 	if (cksum == 0)
 		get_checksum_props(DEFAULT_CKSUM, &cksum, &cksum_bytes, &mac_bytes, 0);

+	if ((enable_rabin_scan || enable_fixed_scan) && cksum == CKSUM_CRC64) {
+		fprintf(stderr, "CRC64 checksum is not suitable for Deduplication.\n");
+		exit(1);
+	}
+
 	if (!encrypt_type) {
 		/*
 		 * If not encrypting we compute a header CRC32.
--- a/rabin/global/index.c
+++ b/rabin/global/index.c
@ -169,16 +169,22 @@ set_cfg:
 		*pct_interval = 0;
 	} else {
 		cfg->intervals = 100 / *pct_interval;
-		cfg->sub_intervals = cfg->segment_sz / cfg->intervals;
+		cfg->sub_intervals = (cfg->segment_sz + 1) / cfg->intervals;
 		*hash_slots = file_sz / cfg->segment_sz_bytes + 1;
 		*hash_slots *= (cfg->intervals + cfg->sub_intervals);
 	}

-	// Compute memory required to hold all hash entries assuming worst case 50%
-	// occupancy.
+	/*
+	 * Compute memory required to hold all hash entries assuming worst case 50%
+	 * occupancy.
+	 */
 	*memreqd = MEM_REQD(*hash_slots, *hash_entry_size);

-	if (*memreqd > (memlimit + (memlimit >> 1)) && cfg->dedupe_mode == MODE_SIMPLE &&
+	/*
+	 * If memory required is more than twice the indicated memory limit then
+	 * we switch to Segmented Cumulative Similarity based dedupe.
+	 */
+	if (*memreqd > (memlimit * 2) && cfg->dedupe_mode == MODE_SIMPLE &&
 	    *pct_interval == 0) {
 		*pct_interval = DEFAULT_PCT_INTERVAL;
 		set_user = 1;