Support for deduplication using 2KB block size.
parent ef98422bd4
commit 3db5188445
4 changed files with 27 additions and 7 deletions
README.md (16 changes)
@@ -136,7 +136,7 @@ NOTE: The option "libbsc" uses Ilya Grebnov's block sorting compression library
   datasets.
 
 '-S' <cksum>
        - Specify chunk checksum to use:
 
        CRC64 - Extremely Fast 64-bit CRC from LZMA SDK.
        SHA256 - SHA512/256 version of Intel's optimized (SSE,AVX) SHA2 for x86.
@@ -156,6 +156,20 @@ NOTE: The option "libbsc" uses Ilya Grebnov's block sorting compression library
 
 '-B' <1..5>
        - Specify an average Dedupe block size. 1 - 4K, 2 - 8K ... 5 - 64K.
+         Default deduplication block size is 4KB.
+'-B' 0
+       - This uses blocks as small as 2KB for deduplication. This option can be
+         used for datasets of a few GBs to a few hundred TBs in size depending on
+         available RAM.
+
+         Caveats:
+         In some cases like LZMA with extreme compression levels and with '-L' and
+         '-P' preprocessing enabled, this can result in lower compression as compared
+         to using '-B 1'.
+         For fast compression algorithms like LZ4 and Zlib this should always benefit.
+         However please test on your sample data with your desired compression
+         algorithm to verify the results.
+
 '-M' - Display memory allocator statistics
 '-C' - Display compression statistics
 
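
The README text above implies a simple power-of-two progression for the '-B' levels. A minimal sketch of that mapping, assuming the average block size doubles per level starting at 2KB; the helper below is illustrative only and is not code from this commit or from pcompress:

/* Illustrative mapping of the documented '-B' levels to average dedupe
 * block sizes: 0 -> 2K, 1 -> 4K, 2 -> 8K ... 5 -> 64K. Hypothetical helper. */
#include <stdio.h>

static unsigned int avg_dedupe_blksz(int b_level)
{
	return (2048u << b_level);	/* 2KB doubled once per level */
}

int main(void)
{
	int b;

	for (b = 0; b <= 5; b++)
		printf("-B %d -> %uKB average block\n", b, avg_dedupe_blksz(b) / 1024);
	return (0);
}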
@@ -147,6 +147,8 @@ usage(pc_ctx_t *pctx)
 	    "                 deduplication ratio.\n"
 	    "   '-B' <1..5>\n"
 	    "               - Specify an average Dedupe block size. 1 - 4K, 2 - 8K ... 5 - 64K.\n"
+	    "   '-B' 0\n"
+	    "               - Use ultra-small 2KB blocks for deduplication. See README for caveats.\n"
 	    "   '-M' - Display memory allocator statistics\n"
 	    "   '-C' - Display compression statistics\n\n");
 	fprintf(stderr, "\n"
@@ -2640,8 +2642,8 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[])
 
 		case 'B':
 			pctx->rab_blk_size = atoi(optarg);
-			if (pctx->rab_blk_size < 1 || pctx->rab_blk_size > 5) {
-				err_print(0, "Average Dedupe block size must be in range 1 (4k) - 5 (64k)\n");
+			if (pctx->rab_blk_size < 0 || pctx->rab_blk_size > 5) {
+				err_print(0, "Average Dedupe block size must be in range 0 (2k), 1 (4k) .. 5 (64k)\n");
 				return (1);
 			}
 			break;
@@ -379,7 +379,11 @@ set_config_s(archive_config_t *cfg, const char *algo, cksum_t ck, cksum_t ck_sim
 		cfg->segment_sz_bytes = user_chunk_sz;
 		cfg->similarity_cksum_sz = cfg->chunk_cksum_sz;
 	} else {
-		cfg->segment_sz_bytes = EIGHT_MB * cfg->chunk_sz;
+		if (cfg->chunk_sz == 0) {
+			cfg->segment_sz_bytes = FOUR_MB;
+		} else {
+			cfg->segment_sz_bytes = EIGHT_MB * cfg->chunk_sz;
+		}
 	}
 
 	cfg->segment_sz = cfg->segment_sz_bytes / cfg->chunk_sz_bytes;
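
The new branch above appears to guard against a zero segment size: cfg->chunk_sz seems to carry the '-B' level, so at the new 2KB level (0) the old formula EIGHT_MB * cfg->chunk_sz would evaluate to zero. A small arithmetic sketch under that assumption, taking the FOUR_MB/EIGHT_MB constants at face value from their names (4MiB/8MiB):

/* Sketch of the segment sizing implied by the hunk above. Assumes
 * chunk_sz is the '-B' level and FOUR_MB/EIGHT_MB are 4MiB/8MiB. */
#include <stdio.h>
#include <stdint.h>

#define FOUR_MB		(4ULL * 1024 * 1024)
#define EIGHT_MB	(8ULL * 1024 * 1024)

int main(void)
{
	int chunk_sz;

	for (chunk_sz = 0; chunk_sz <= 5; chunk_sz++) {
		uint64_t seg = (chunk_sz == 0) ? FOUR_MB : EIGHT_MB * (uint64_t)chunk_sz;
		printf("chunk_sz %d -> segment %llu MB\n", chunk_sz,
		    (unsigned long long)(seg >> 20));
	}
	return (0);
}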
@@ -125,7 +125,7 @@ dedupe_min_blksz(int rab_blk_sz)
 uint32_t
 dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag)
 {
-	if (rab_blk_sz < 1 || rab_blk_sz > 5)
+	if (rab_blk_sz < 0 || rab_blk_sz > 5)
 		rab_blk_sz = RAB_BLK_DEFAULT;
 
 	return ((chunksize / dedupe_min_blksz(rab_blk_sz)) * sizeof (uint32_t));
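
For a sense of scale, the formula above sizes the per-chunk block index, so halving the minimum block size doubles the number of uint32_t slots needed. A rough sizing sketch; the 2048-byte minimum is an assumed return value for dedupe_min_blksz(0), not a figure taken from the source:

/* Rough index-overhead arithmetic for dedupe_buf_extra(). The 2048-byte
 * minimum block size is an assumption for the new '-B 0' level. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t chunksize = 8ULL * 1024 * 1024;	/* one 8MB compression chunk */
	uint64_t min_blksz = 2048;			/* assumed dedupe_min_blksz(0) */
	uint64_t extra = (chunksize / min_blksz) * sizeof (uint32_t);

	printf("extra index bytes: %llu\n", (unsigned long long)extra);	/* 16384 */
	return (0);
}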
@@ -166,7 +166,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
 	dedupe_context_t *ctx;
 	uint32_t i;
 
-	if (rab_blk_sz < 1 || rab_blk_sz > 5)
+	if (rab_blk_sz < 0 || rab_blk_sz > 5)
 		rab_blk_sz = RAB_BLK_DEFAULT;
 
 	if (dedupe_flag == RABIN_DEDUPE_FIXED || dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL) {
@@ -209,7 +209,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
 	 * It is essentially a hashtable that is used for crypto-hash based
 	 * chunk matching.
 	 */
-	if (dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL && op == COMPRESS && rab_blk_sz > 0) {
+	if (dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL && op == COMPRESS && rab_blk_sz >= 0) {
 		my_sysinfo msys_info;
 		int pct_interval, chunk_cksum, cksum_bytes, mac_bytes;
 		char *ck;