From 3db51884459f48bf62b147b832f6272a17d5c440 Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Mon, 19 Aug 2013 13:38:52 +0530 Subject: [PATCH] Support for deduplication using 2KB block size. --- README.md | 16 +++++++++++++++- pcompress.c | 6 ++++-- rabin/global/dedupe_config.c | 6 +++++- rabin/rabin_dedup.c | 6 +++--- 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 593c9a8..cee09c9 100644 --- a/README.md +++ b/README.md @@ -136,7 +136,7 @@ NOTE: The option "libbsc" uses Ilya Grebnov's block sorting compression library datasets. '-S' - - Specify chunk checksum to use: + - Specify chunk checksum to use: CRC64 - Extremely Fast 64-bit CRC from LZMA SDK. SHA256 - SHA512/256 version of Intel's optimized (SSE,AVX) SHA2 for x86. @@ -156,6 +156,20 @@ NOTE: The option "libbsc" uses Ilya Grebnov's block sorting compression library '-B' <1..5> - Specify an average Dedupe block size. 1 - 4K, 2 - 8K ... 5 - 64K. + Default deduplication block size is 4KB. + '-B' 0 + - This uses blocks as small as 2KB for deduplication. This option can be + used for datasets of a few GBs to a few hundred TBs in size depending on + available RAM. + + Caveats: + In some cases like LZMA with extreme compression levels and with '-L' and + '-P' preprocessing enabled, this can result in lower compression as compared + to using '-B 1'. + For fast compression algorithms like LZ4 and Zlib this should always benefit. + However please test on your sample data with your desired compression + algorithm to verify the results. + '-M' - Display memory allocator statistics '-C' - Display compression statistics diff --git a/pcompress.c b/pcompress.c index 4e23ff3..c55b4a0 100644 --- a/pcompress.c +++ b/pcompress.c @@ -147,6 +147,8 @@ usage(pc_ctx_t *pctx) " deduplication ratio.\n" " '-B' <1..5>\n" " - Specify an average Dedupe block size. 1 - 4K, 2 - 8K ... 5 - 64K.\n" + " '-B' 0\n" + " - Use ultra-small 2KB blocks for deduplication. See README for caveats.\n" " '-M' - Display memory allocator statistics\n" " '-C' - Display compression statistics\n\n"); fprintf(stderr, "\n" @@ -2640,8 +2642,8 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[]) case 'B': pctx->rab_blk_size = atoi(optarg); - if (pctx->rab_blk_size < 1 || pctx->rab_blk_size > 5) { - err_print(0, "Average Dedupe block size must be in range 1 (4k) - 5 (64k)\n"); + if (pctx->rab_blk_size < 0 || pctx->rab_blk_size > 5) { + err_print(0, "Average Dedupe block size must be in range 0 (2k), 1 (4k) .. 5 (64k)\n"); return (1); } break; diff --git a/rabin/global/dedupe_config.c b/rabin/global/dedupe_config.c index 02f38e5..24655c2 100644 --- a/rabin/global/dedupe_config.c +++ b/rabin/global/dedupe_config.c @@ -379,7 +379,11 @@ set_config_s(archive_config_t *cfg, const char *algo, cksum_t ck, cksum_t ck_sim cfg->segment_sz_bytes = user_chunk_sz; cfg->similarity_cksum_sz = cfg->chunk_cksum_sz; } else { - cfg->segment_sz_bytes = EIGHT_MB * cfg->chunk_sz; + if (cfg->chunk_sz == 0) { + cfg->segment_sz_bytes = FOUR_MB; + } else { + cfg->segment_sz_bytes = EIGHT_MB * cfg->chunk_sz; + } } cfg->segment_sz = cfg->segment_sz_bytes / cfg->chunk_sz_bytes; diff --git a/rabin/rabin_dedup.c b/rabin/rabin_dedup.c index 6dd6538..e3af483 100755 --- a/rabin/rabin_dedup.c +++ b/rabin/rabin_dedup.c @@ -125,7 +125,7 @@ dedupe_min_blksz(int rab_blk_sz) uint32_t dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag) { - if (rab_blk_sz < 1 || rab_blk_sz > 5) + if (rab_blk_sz < 0 || rab_blk_sz > 5) rab_blk_sz = RAB_BLK_DEFAULT; return ((chunksize / dedupe_min_blksz(rab_blk_sz)) * sizeof (uint32_t)); @@ -166,7 +166,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s dedupe_context_t *ctx; uint32_t i; - if (rab_blk_sz < 1 || rab_blk_sz > 5) + if (rab_blk_sz < 0 || rab_blk_sz > 5) rab_blk_sz = RAB_BLK_DEFAULT; if (dedupe_flag == RABIN_DEDUPE_FIXED || dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL) { @@ -209,7 +209,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s * It is essentially a hashtable that is used for crypto-hash based * chunk matching. */ - if (dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL && op == COMPRESS && rab_blk_sz > 0) { + if (dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL && op == COMPRESS && rab_blk_sz >= 0) { my_sysinfo msys_info; int pct_interval, chunk_cksum, cksum_bytes, mac_bytes; char *ck;