Support for deduplication using 2KB block size.
parent ef98422bd4
commit 3db5188445
4 changed files with 27 additions and 7 deletions
README.md (+14)
@@ -156,6 +156,20 @@ NOTE: The option "libbsc" uses Ilya Grebnov's block sorting compression library
 '-B' <1..5>
        - Specify an average Dedupe block size. 1 - 4K, 2 - 8K ... 5 - 64K.
+         Default deduplication block size is 4KB.
+'-B' 0
+       - This uses blocks as small as 2KB for deduplication. This option can
+         be used for datasets from a few GBs to a few hundred TBs in size,
+         depending on available RAM.
+
+         Caveats:
+         In some cases, such as LZMA at extreme compression levels with '-L'
+         and '-P' preprocessing enabled, this can result in lower compression
+         than with '-B 1'.
+         Fast compression algorithms like LZ4 and Zlib should always benefit;
+         however, please test on your sample data with your desired
+         compression algorithm to verify the results.
+
 '-M' - Display memory allocator statistics
 '-C' - Display compression statistics
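The '-B' levels follow a simple doubling scheme from the new 2KB floor. A minimal sketch of that mapping, assuming a 2KB minimum that doubles per level (the helper name is hypothetical, not from this codebase):

#include <stdint.h>

/*
 * Hypothetical helper, for illustration only: maps a '-B' level to the
 * average dedupe block size. 0 -> 2KB, 1 -> 4KB, 2 -> 8KB, ... 5 -> 64KB.
 */
static uint32_t
avg_dedupe_blksz(int level)
{
	if (level < 0 || level > 5)
		level = 1;		/* assumed 4KB default */
	return (2048U << level);
}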
@@ -147,6 +147,8 @@ usage(pc_ctx_t *pctx)
 	    " deduplication ratio.\n"
 	    " '-B' <1..5>\n"
 	    "   - Specify an average Dedupe block size. 1 - 4K, 2 - 8K ... 5 - 64K.\n"
+	    " '-B' 0\n"
+	    "   - Use ultra-small 2KB blocks for deduplication. See README for caveats.\n"
 	    " '-M' - Display memory allocator statistics\n"
 	    " '-C' - Display compression statistics\n\n");
 	fprintf(stderr, "\n"
@@ -2640,8 +2642,8 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[])
 		case 'B':
 			pctx->rab_blk_size = atoi(optarg);
-			if (pctx->rab_blk_size < 1 || pctx->rab_blk_size > 5) {
-				err_print(0, "Average Dedupe block size must be in range 1 (4k) - 5 (64k)\n");
+			if (pctx->rab_blk_size < 0 || pctx->rab_blk_size > 5) {
+				err_print(0, "Average Dedupe block size must be in range 0 (2k), 1 (4k) .. 5 (64k)\n");
 				return (1);
 			}
 			break;
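One side effect worth noting: atoi() returns 0 for non-numeric input, and 0 is now a valid level, so an argument like '-B garbage' would silently select 2KB blocks. A sketch of a stricter parse using strtol() (the helper is hypothetical, not part of this commit):

#include <stdlib.h>

/*
 * Hypothetical stricter parse: rejects non-numeric input instead of
 * letting atoi()'s 0 fall through as a valid 2KB selection.
 */
static int
parse_rab_blk_size(const char *arg, int *out)
{
	char *end;
	long v = strtol(arg, &end, 10);

	if (end == arg || *end != '\0' || v < 0 || v > 5)
		return (-1);	/* not a number, or out of range */
	*out = (int)v;
	return (0);
}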
@@ -378,9 +378,13 @@ set_config_s(archive_config_t *cfg, const char *algo, cksum_t ck, cksum_t ck_sim
 		cfg->dedupe_mode = MODE_SIMPLE;
 		cfg->segment_sz_bytes = user_chunk_sz;
 		cfg->similarity_cksum_sz = cfg->chunk_cksum_sz;
 	} else {
-		cfg->segment_sz_bytes = EIGHT_MB * cfg->chunk_sz;
+		if (cfg->chunk_sz == 0) {
+			cfg->segment_sz_bytes = FOUR_MB;
+		} else {
+			cfg->segment_sz_bytes = EIGHT_MB * cfg->chunk_sz;
+		}
 	}
 
 	cfg->segment_sz = cfg->segment_sz_bytes / cfg->chunk_sz_bytes;
 	return (0);
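For a feel of what the new branch changes, a worked sizing example, assuming FOUR_MB and EIGHT_MB are the usual byte constants and that chunk_sz_bytes follows the 2048 << chunk_sz mapping (both assumptions, not confirmed by this diff):

#include <stdint.h>

/* Assumed constants and mapping, for illustration only. */
#define FOUR_MB_ASSUMED		(4ULL * 1024 * 1024)
#define EIGHT_MB_ASSUMED	(8ULL * 1024 * 1024)

static uint64_t
chunks_per_segment(int chunk_sz)	/* the '-B' level, 0..5 */
{
	uint64_t seg_bytes = (chunk_sz == 0) ? FOUR_MB_ASSUMED :
	    EIGHT_MB_ASSUMED * (uint64_t)chunk_sz;
	uint64_t blk_bytes = 2048ULL << chunk_sz;	/* assumed: 0 -> 2KB */

	/* e.g. level 0: 4MB / 2KB = 2048; level 1: 8MB / 4KB = 2048 */
	return (seg_bytes / blk_bytes);
}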
@@ -125,7 +125,7 @@ dedupe_min_blksz(int rab_blk_sz)
 uint32_t
 dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag)
 {
-	if (rab_blk_sz < 1 || rab_blk_sz > 5)
+	if (rab_blk_sz < 0 || rab_blk_sz > 5)
 		rab_blk_sz = RAB_BLK_DEFAULT;
 
 	return ((chunksize / dedupe_min_blksz(rab_blk_sz)) * sizeof (uint32_t));
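The extra space amounts to one uint32_t index slot per minimum-size block in the chunk. Rough arithmetic, assuming purely for illustration that dedupe_min_blksz(0) returns 2048 for the new level:

#include <stdint.h>

/* Illustration: a 4MB chunk at an assumed 2KB minimum block size
 * reserves (4194304 / 2048) * sizeof (uint32_t) = 8192 bytes. */
uint64_t extra = (4194304ULL / 2048ULL) * sizeof (uint32_t);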
@@ -166,7 +166,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
 	dedupe_context_t *ctx;
 	uint32_t i;
 
-	if (rab_blk_sz < 1 || rab_blk_sz > 5)
+	if (rab_blk_sz < 0 || rab_blk_sz > 5)
 		rab_blk_sz = RAB_BLK_DEFAULT;
 
 	if (dedupe_flag == RABIN_DEDUPE_FIXED || dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL) {
@@ -209,7 +209,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
 	 * It is essentially a hashtable that is used for crypto-hash based
 	 * chunk matching.
 	 */
-	if (dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL && op == COMPRESS && rab_blk_sz > 0) {
+	if (dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL && op == COMPRESS && rab_blk_sz >= 0) {
 		my_sysinfo msys_info;
 		int pct_interval, chunk_cksum, cksum_bytes, mac_bytes;
 		char *ck;