Support for deduplication using 2KB block size.
This commit is contained in:
parent
ef98422bd4
commit
3db5188445
4 changed files with 27 additions and 7 deletions
16
README.md
16
README.md
|
@ -136,7 +136,7 @@ NOTE: The option "libbsc" uses Ilya Grebnov's block sorting compression library
|
|||
datasets.
|
||||
|
||||
'-S' <cksum>
|
||||
- Specify chunk checksum to use:
|
||||
- Specify chunk checksum to use:
|
||||
|
||||
CRC64 - Extremely Fast 64-bit CRC from LZMA SDK.
|
||||
SHA256 - SHA512/256 version of Intel's optimized (SSE,AVX) SHA2 for x86.
|
||||
|
@ -156,6 +156,20 @@ NOTE: The option "libbsc" uses Ilya Grebnov's block sorting compression library
|
|||
|
||||
'-B' <1..5>
|
||||
- Specify an average Dedupe block size. 1 - 4K, 2 - 8K ... 5 - 64K.
|
||||
Default deduplication block size is 4KB.
|
||||
'-B' 0
|
||||
- This uses blocks as small as 2KB for deduplication. This option can be
|
||||
used for datasets of a few GBs to a few hundred TBs in size depending on
|
||||
available RAM.
|
||||
|
||||
Caveats:
|
||||
In some cases, such as LZMA with extreme compression levels and with '-L' and
|
||||
'-P' preprocessing enabled, this can result in lower compression as compared
|
||||
to using '-B 1'.
|
||||
For fast compression algorithms like LZ4 and Zlib this should always be beneficial.
|
||||
However, please test on your sample data with your desired compression
|
||||
algorithm to verify the results.
|
||||
|
||||
'-M' - Display memory allocator statistics
|
||||
'-C' - Display compression statistics
|
||||
|
||||
|
|
|
@ -147,6 +147,8 @@ usage(pc_ctx_t *pctx)
|
|||
" deduplication ratio.\n"
|
||||
" '-B' <1..5>\n"
|
||||
" - Specify an average Dedupe block size. 1 - 4K, 2 - 8K ... 5 - 64K.\n"
|
||||
" '-B' 0\n"
|
||||
" - Use ultra-small 2KB blocks for deduplication. See README for caveats.\n"
|
||||
" '-M' - Display memory allocator statistics\n"
|
||||
" '-C' - Display compression statistics\n\n");
|
||||
fprintf(stderr, "\n"
|
||||
|
@ -2640,8 +2642,8 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[])
|
|||
|
||||
case 'B':
|
||||
pctx->rab_blk_size = atoi(optarg);
|
||||
if (pctx->rab_blk_size < 1 || pctx->rab_blk_size > 5) {
|
||||
err_print(0, "Average Dedupe block size must be in range 1 (4k) - 5 (64k)\n");
|
||||
if (pctx->rab_blk_size < 0 || pctx->rab_blk_size > 5) {
|
||||
err_print(0, "Average Dedupe block size must be in range 0 (2k), 1 (4k) .. 5 (64k)\n");
|
||||
return (1);
|
||||
}
|
||||
break;
|
||||
|
|
|
@ -379,7 +379,11 @@ set_config_s(archive_config_t *cfg, const char *algo, cksum_t ck, cksum_t ck_sim
|
|||
cfg->segment_sz_bytes = user_chunk_sz;
|
||||
cfg->similarity_cksum_sz = cfg->chunk_cksum_sz;
|
||||
} else {
|
||||
cfg->segment_sz_bytes = EIGHT_MB * cfg->chunk_sz;
|
||||
if (cfg->chunk_sz == 0) {
|
||||
cfg->segment_sz_bytes = FOUR_MB;
|
||||
} else {
|
||||
cfg->segment_sz_bytes = EIGHT_MB * cfg->chunk_sz;
|
||||
}
|
||||
}
|
||||
|
||||
cfg->segment_sz = cfg->segment_sz_bytes / cfg->chunk_sz_bytes;
|
||||
|
|
|
@ -125,7 +125,7 @@ dedupe_min_blksz(int rab_blk_sz)
|
|||
uint32_t
|
||||
dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag)
|
||||
{
|
||||
if (rab_blk_sz < 1 || rab_blk_sz > 5)
|
||||
if (rab_blk_sz < 0 || rab_blk_sz > 5)
|
||||
rab_blk_sz = RAB_BLK_DEFAULT;
|
||||
|
||||
return ((chunksize / dedupe_min_blksz(rab_blk_sz)) * sizeof (uint32_t));
|
||||
|
@ -166,7 +166,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
|
|||
dedupe_context_t *ctx;
|
||||
uint32_t i;
|
||||
|
||||
if (rab_blk_sz < 1 || rab_blk_sz > 5)
|
||||
if (rab_blk_sz < 0 || rab_blk_sz > 5)
|
||||
rab_blk_sz = RAB_BLK_DEFAULT;
|
||||
|
||||
if (dedupe_flag == RABIN_DEDUPE_FIXED || dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL) {
|
||||
|
@ -209,7 +209,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
|
|||
* It is essentially a hashtable that is used for crypto-hash based
|
||||
* chunk matching.
|
||||
*/
|
||||
if (dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL && op == COMPRESS && rab_blk_sz > 0) {
|
||||
if (dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL && op == COMPRESS && rab_blk_sz >= 0) {
|
||||
my_sysinfo msys_info;
|
||||
int pct_interval, chunk_cksum, cksum_bytes, mac_bytes;
|
||||
char *ck;
|
||||
|
|
Loading…
Reference in a new issue