diff --git a/README.md b/README.md
index 593c9a8..cee09c9 100644
--- a/README.md
+++ b/README.md
@@ -136,7 +136,7 @@ NOTE: The option "libbsc" uses Ilya Grebnov's block sorting compression library
          datasets.
 
        '-S'
-             - Specify chunk checksum to use: 
+             - Specify chunk checksum to use:
 
              CRC64 - Extremely Fast 64-bit CRC from LZMA SDK.
              SHA256 - SHA512/256 version of Intel's optimized (SSE,AVX) SHA2 for x86.
@@ -156,6 +156,20 @@ NOTE: The option "libbsc" uses Ilya Grebnov's block sorting compression library
        '-B' <1..5>
              - Specify an average Dedupe block size. 1 - 4K, 2 - 8K ... 5 - 64K.
+               Default deduplication block size is 4KB.
+       '-B' 0
+             - This uses blocks as small as 2KB for deduplication. This option can be
+               used for datasets of a few GBs to a few hundred TBs in size depending on
+               available RAM.
+
+               Caveats:
+               In some cases like LZMA with extreme compression levels and with '-L' and
+               '-P' preprocessing enabled, this can result in lower compression as compared
+               to using '-B 1'.
+               For fast compression algorithms like LZ4 and Zlib this should always benefit.
+               However please test on your sample data with your desired compression
+               algorithm to verify the results.
+
        '-M' - Display memory allocator statistics
        '-C' - Display compression statistics
diff --git a/pcompress.c b/pcompress.c
index 4e23ff3..c55b4a0 100644
--- a/pcompress.c
+++ b/pcompress.c
@@ -147,6 +147,8 @@ usage(pc_ctx_t *pctx)
 	    "   deduplication ratio.\n"
 	    "   '-B' <1..5>\n"
 	    "         - Specify an average Dedupe block size. 1 - 4K, 2 - 8K ... 5 - 64K.\n"
+	    "   '-B' 0\n"
+	    "         - Use ultra-small 2KB blocks for deduplication. See README for caveats.\n"
 	    "   '-M' - Display memory allocator statistics\n"
 	    "   '-C' - Display compression statistics\n\n");
 	fprintf(stderr, "\n"
@@ -2640,8 +2642,8 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[])
 		case 'B':
 			pctx->rab_blk_size = atoi(optarg);
-			if (pctx->rab_blk_size < 1 || pctx->rab_blk_size > 5) {
-				err_print(0, "Average Dedupe block size must be in range 1 (4k) - 5 (64k)\n");
+			if (pctx->rab_blk_size < 0 || pctx->rab_blk_size > 5) {
+				err_print(0, "Average Dedupe block size must be in range 0 (2k), 1 (4k) .. 5 (64k)\n");
 				return (1);
 			}
 			break;
diff --git a/rabin/global/dedupe_config.c b/rabin/global/dedupe_config.c
index 02f38e5..24655c2 100644
--- a/rabin/global/dedupe_config.c
+++ b/rabin/global/dedupe_config.c
@@ -379,7 +379,11 @@ set_config_s(archive_config_t *cfg, const char *algo, cksum_t ck, cksum_t ck_sim
 		cfg->segment_sz_bytes = user_chunk_sz;
 		cfg->similarity_cksum_sz = cfg->chunk_cksum_sz;
 	} else {
-		cfg->segment_sz_bytes = EIGHT_MB * cfg->chunk_sz;
+		if (cfg->chunk_sz == 0) {
+			cfg->segment_sz_bytes = FOUR_MB;
+		} else {
+			cfg->segment_sz_bytes = EIGHT_MB * cfg->chunk_sz;
+		}
 	}
 
 	cfg->segment_sz = cfg->segment_sz_bytes / cfg->chunk_sz_bytes;
diff --git a/rabin/rabin_dedup.c b/rabin/rabin_dedup.c
index 6dd6538..e3af483 100755
--- a/rabin/rabin_dedup.c
+++ b/rabin/rabin_dedup.c
@@ -125,7 +125,7 @@ dedupe_min_blksz(int rab_blk_sz)
 uint32_t
 dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag)
 {
-	if (rab_blk_sz < 1 || rab_blk_sz > 5)
+	if (rab_blk_sz < 0 || rab_blk_sz > 5)
 		rab_blk_sz = RAB_BLK_DEFAULT;
 
 	return ((chunksize / dedupe_min_blksz(rab_blk_sz)) * sizeof (uint32_t));
@@ -166,7 +166,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
 	dedupe_context_t *ctx;
 	uint32_t i;
 
-	if (rab_blk_sz < 1 || rab_blk_sz > 5)
+	if (rab_blk_sz < 0 || rab_blk_sz > 5)
 		rab_blk_sz = RAB_BLK_DEFAULT;
 
 	if (dedupe_flag == RABIN_DEDUPE_FIXED || dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL) {
@@ -209,7 +209,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
 	 * It is essentially a hashtable that is used for crypto-hash based
 	 * chunk matching.
 	 */
-	if (dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL && op == COMPRESS && rab_blk_sz > 0) {
+	if (dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL && op == COMPRESS && rab_blk_sz >= 0) {
 		my_sysinfo msys_info;
 		int pct_interval, chunk_cksum, cksum_bytes, mac_bytes;
 		char *ck;