Support for deduplication using 2KB block size.

This commit is contained in:
Moinak Ghosh 2013-08-19 13:38:52 +05:30
parent ef98422bd4
commit 3db5188445
4 changed files with 27 additions and 7 deletions

View file

@ -136,7 +136,7 @@ NOTE: The option "libbsc" uses Ilya Grebnov's block sorting compression library
datasets. datasets.
'-S' <cksum> '-S' <cksum>
- Specify chunk checksum to use: - Specify chunk checksum to use:
CRC64 - Extremely Fast 64-bit CRC from LZMA SDK. CRC64 - Extremely Fast 64-bit CRC from LZMA SDK.
SHA256 - SHA512/256 version of Intel's optimized (SSE,AVX) SHA2 for x86. SHA256 - SHA512/256 version of Intel's optimized (SSE,AVX) SHA2 for x86.
@ -156,6 +156,20 @@ NOTE: The option "libbsc" uses Ilya Grebnov's block sorting compression library
'-B' <1..5> '-B' <1..5>
- Specify an average Dedupe block size. 1 - 4K, 2 - 8K ... 5 - 64K. - Specify an average Dedupe block size. 1 - 4K, 2 - 8K ... 5 - 64K.
Default deduplication block size is 4KB.
'-B' 0
- This uses blocks as small as 2KB for deduplication. This option can be
used for datasets ranging from a few GBs to a few hundred TBs in size,
depending on available RAM.
Caveats:
In some cases, such as LZMA with extreme compression levels and with '-L' and
'-P' preprocessing enabled, this can result in lower compression compared
to using '-B 1'.
For fast compression algorithms like LZ4 and Zlib, this should always be beneficial.
However, please test on your sample data with your desired compression
algorithm to verify the results.
'-M' - Display memory allocator statistics '-M' - Display memory allocator statistics
'-C' - Display compression statistics '-C' - Display compression statistics

View file

@ -147,6 +147,8 @@ usage(pc_ctx_t *pctx)
" deduplication ratio.\n" " deduplication ratio.\n"
" '-B' <1..5>\n" " '-B' <1..5>\n"
" - Specify an average Dedupe block size. 1 - 4K, 2 - 8K ... 5 - 64K.\n" " - Specify an average Dedupe block size. 1 - 4K, 2 - 8K ... 5 - 64K.\n"
" '-B' 0\n"
" - Use ultra-small 2KB blocks for deduplication. See README for caveats.\n"
" '-M' - Display memory allocator statistics\n" " '-M' - Display memory allocator statistics\n"
" '-C' - Display compression statistics\n\n"); " '-C' - Display compression statistics\n\n");
fprintf(stderr, "\n" fprintf(stderr, "\n"
@ -2640,8 +2642,8 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[])
case 'B': case 'B':
pctx->rab_blk_size = atoi(optarg); pctx->rab_blk_size = atoi(optarg);
if (pctx->rab_blk_size < 1 || pctx->rab_blk_size > 5) { if (pctx->rab_blk_size < 0 || pctx->rab_blk_size > 5) {
err_print(0, "Average Dedupe block size must be in range 1 (4k) - 5 (64k)\n"); err_print(0, "Average Dedupe block size must be in range 0 (2k), 1 (4k) .. 5 (64k)\n");
return (1); return (1);
} }
break; break;

View file

@ -379,7 +379,11 @@ set_config_s(archive_config_t *cfg, const char *algo, cksum_t ck, cksum_t ck_sim
cfg->segment_sz_bytes = user_chunk_sz; cfg->segment_sz_bytes = user_chunk_sz;
cfg->similarity_cksum_sz = cfg->chunk_cksum_sz; cfg->similarity_cksum_sz = cfg->chunk_cksum_sz;
} else { } else {
cfg->segment_sz_bytes = EIGHT_MB * cfg->chunk_sz; if (cfg->chunk_sz == 0) {
cfg->segment_sz_bytes = FOUR_MB;
} else {
cfg->segment_sz_bytes = EIGHT_MB * cfg->chunk_sz;
}
} }
cfg->segment_sz = cfg->segment_sz_bytes / cfg->chunk_sz_bytes; cfg->segment_sz = cfg->segment_sz_bytes / cfg->chunk_sz_bytes;

View file

@ -125,7 +125,7 @@ dedupe_min_blksz(int rab_blk_sz)
uint32_t uint32_t
dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag) dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag)
{ {
if (rab_blk_sz < 1 || rab_blk_sz > 5) if (rab_blk_sz < 0 || rab_blk_sz > 5)
rab_blk_sz = RAB_BLK_DEFAULT; rab_blk_sz = RAB_BLK_DEFAULT;
return ((chunksize / dedupe_min_blksz(rab_blk_sz)) * sizeof (uint32_t)); return ((chunksize / dedupe_min_blksz(rab_blk_sz)) * sizeof (uint32_t));
@ -166,7 +166,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
dedupe_context_t *ctx; dedupe_context_t *ctx;
uint32_t i; uint32_t i;
if (rab_blk_sz < 1 || rab_blk_sz > 5) if (rab_blk_sz < 0 || rab_blk_sz > 5)
rab_blk_sz = RAB_BLK_DEFAULT; rab_blk_sz = RAB_BLK_DEFAULT;
if (dedupe_flag == RABIN_DEDUPE_FIXED || dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL) { if (dedupe_flag == RABIN_DEDUPE_FIXED || dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL) {
@ -209,7 +209,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
* It is essentially a hashtable that is used for crypto-hash based * It is essentially a hashtable that is used for crypto-hash based
* chunk matching. * chunk matching.
*/ */
if (dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL && op == COMPRESS && rab_blk_sz > 0) { if (dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL && op == COMPRESS && rab_blk_sz >= 0) {
my_sysinfo msys_info; my_sysinfo msys_info;
int pct_interval, chunk_cksum, cksum_bytes, mac_bytes; int pct_interval, chunk_cksum, cksum_bytes, mac_bytes;
char *ck; char *ck;