diff --git a/main.c b/main.c index 3d27ded..008753f 100644 --- a/main.c +++ b/main.c @@ -150,11 +150,14 @@ usage(void) "4) Attempt Rabin fingerprinting based deduplication on chunks:\n" " %s -D ...\n" " %s -D -r ... - Do NOT split chunks at a rabin boundary. Default is to split.\n\n" - "5) Perform Delta Encoding in addition to Identical Dedup:\n" + "5) Perform Deduplication across the entire dataset (Global Dedupe):\n" + " %s -G <-D|-F> - This option requires one of '-D' or '-F' to be specified\n" + " to identify the block splitting method.\n" + "6) Perform Delta Encoding in addition to Identical Dedupe:\n" " %s -E ... - This also implies '-D'. This checks for at least 60%% similarity.\n" " The flag can be repeated as in '-EE' to indicate at least 40%% similarity.\n\n" - "6) Number of threads can optionally be specified: -t <1 - 256 count>\n" - "7) Other flags:\n" + "7) Number of threads can optionally be specified: -t <1 - 256 count>\n" + "8) Other flags:\n" " '-L' - Enable LZP pre-compression. This improves compression ratio of all\n" " algorithms with some extra CPU and very low RAM overhead.\n" " '-P' - Enable Adaptive Delta Encoding. It can improve compresion ratio for\n" @@ -164,11 +167,12 @@ usage(void) " datasets.\n" " '-S' \n" " - Specify chunk checksum to use:\n\n", - UTILITY_VERSION, exec_name, exec_name, exec_name, exec_name, exec_name, exec_name); + UTILITY_VERSION, exec_name, exec_name, exec_name, exec_name, exec_name, exec_name, + exec_name); list_checksums(stderr, " "); fprintf(stderr, "\n" - " '-F' - Perform Fixed-Block Deduplication. Faster than '-D' in some cases\n" - " but with lower deduplication ratio.\n" + " '-F' - Perform Fixed-Block Deduplication. Faster than '-D' but with lower\n" + " deduplication ratio.\n" " '-B' <1..5>\n" " - Specify an average Dedupe block size. 1 - 4K, 2 - 8K ... 5 - 64K.\n" " '-M' - Display memory allocator statistics\n" @@ -2669,6 +2673,11 @@ main(int argc, char *argv[]) if (cksum == 0) get_checksum_props(DEFAULT_CKSUM, &cksum, &cksum_bytes, &mac_bytes, 0); + if ((enable_rabin_scan || enable_fixed_scan) && cksum == CKSUM_CRC64) { + fprintf(stderr, "CRC64 checksum is not suitable for Deduplication.\n"); + exit(1); + } + if (!encrypt_type) { /* * If not encrypting we compute a header CRC32. diff --git a/rabin/global/index.c b/rabin/global/index.c index 078a210..d67072b 100644 --- a/rabin/global/index.c +++ b/rabin/global/index.c @@ -169,16 +169,22 @@ set_cfg: *pct_interval = 0; } else { cfg->intervals = 100 / *pct_interval; - cfg->sub_intervals = cfg->segment_sz / cfg->intervals; + cfg->sub_intervals = (cfg->segment_sz + 1) / cfg->intervals; *hash_slots = file_sz / cfg->segment_sz_bytes + 1; *hash_slots *= (cfg->intervals + cfg->sub_intervals); } - // Compute memory required to hold all hash entries assuming worst case 50% - // occupancy. + /* + * Compute memory required to hold all hash entries assuming worst case 50% + * occupancy. + */ *memreqd = MEM_REQD(*hash_slots, *hash_entry_size); - if (*memreqd > (memlimit + (memlimit >> 1)) && cfg->dedupe_mode == MODE_SIMPLE && + /* + * If memory required is more than twice the indicated memory limit then + * we switch to Segmented Cumulative Similarity based dedupe. + */ + if (*memreqd > (memlimit * 2) && cfg->dedupe_mode == MODE_SIMPLE && *pct_interval == 0) { *pct_interval = DEFAULT_PCT_INTERVAL; set_user = 1;