diff --git a/crypto/xsalsa20/xsalsa20_xor.c b/crypto/xsalsa20/xsalsa20_xor.c index 514681e..90d7c4f 100644 --- a/crypto/xsalsa20/xsalsa20_xor.c +++ b/crypto/xsalsa20/xsalsa20_xor.c @@ -130,7 +130,7 @@ salsa20_init(salsa20_ctx_t *ctx, uchar_t *salt, int saltlen, uchar_t *pwd, int p n = (uint64_t *)IV; n1 = (uint64_t *)(ctx->nonce); for (i = 0; i < XSALSA20_CRYPTO_NONCEBYTES/8; i++) { - *n1 = ntohll(*n); + *n1 = LE64(*n); n++; n1++; } diff --git a/main.c b/main.c index 5278cee..ef25835 100644 --- a/main.c +++ b/main.c @@ -693,13 +693,22 @@ start_decompress(const char *filename, const char *to_filename) * Open files and do sanity checks. */ if (!pipe_mode) { - if ((compfd = open(filename, O_RDONLY, 0)) == -1) - err_exit(1, "Cannot open: %s", filename); + if (filename == NULL) { + compfd = fileno(stdin); + if (compfd == -1) { + perror("fileno "); + UNCOMP_BAIL; + } + sbuf.st_size = 0; + } else { + if ((compfd = open(filename, O_RDONLY, 0)) == -1) + err_exit(1, "Cannot open: %s", filename); - if (fstat(compfd, &sbuf) == -1) - err_exit(1, "Cannot stat: %s", filename); - if (sbuf.st_size == 0) - return (1); + if (fstat(compfd, &sbuf) == -1) + err_exit(1, "Cannot stat: %s", filename); + if (sbuf.st_size == 0) + return (1); + } if ((uncompfd = open(to_filename, O_WRONLY|O_CREAT|O_TRUNC, S_IRUSR | S_IWUSR)) == -1) { close(compfd); @@ -726,7 +735,10 @@ start_decompress(const char *filename, const char *to_filename) UNCOMP_BAIL; } if (init_algo(algorithm, 0) != 0) { - fprintf(stderr, "%s is not a pcompressed file.\n", filename); + if (pipe_mode || filename == NULL) + fprintf(stderr, "Input stream is not pcompressed.\n"); + else + fprintf(stderr, "%s is not a pcompressed file.\n", filename); UNCOMP_BAIL; } algo = algorithm; @@ -1279,9 +1291,11 @@ uncomp_done: /* * Ownership and mode of target should be same as original. */ - fchmod(uncompfd, sbuf.st_mode); - if (fchown(uncompfd, sbuf.st_uid, sbuf.st_gid) == -1) - perror("Chown "); + if (filename != NULL) { + fchmod(uncompfd, sbuf.st_mode); + if (fchown(uncompfd, sbuf.st_uid, sbuf.st_gid) == -1) + perror("Chown "); + } if (dary != NULL) { for (i = 0; i < nprocs; i++) { if (!dary[i]) continue; @@ -1299,7 +1313,7 @@ uncomp_done: slab_free(NULL, dary); } if (!pipe_mode) { - if (compfd != -1) close(compfd); + if (filename && compfd != -1) close(compfd); if (uncompfd != -1) close(uncompfd); } @@ -1667,14 +1681,12 @@ start_compress(const char *filename, uint64_t chunksize, int level) } flags = 0; + sbuf.st_size = 0; dedupe_flag = RABIN_DEDUPE_SEGMENTED; // Silence the compiler if (enable_rabin_scan || enable_fixed_scan || enable_rabin_global) { if (enable_rabin_global) { flags |= (FLAG_DEDUP | FLAG_DEDUP_FIXED); dedupe_flag = RABIN_DEDUPE_FILE_GLOBAL; - if (pipe_mode) { - return (1); - } } else if (enable_rabin_scan) { flags |= FLAG_DEDUP; dedupe_flag = RABIN_DEDUPE_SEGMENTED; @@ -2572,11 +2584,6 @@ main(int argc, char *argv[]) enable_rabin_split = 1; } - if (enable_rabin_global && pipe_mode) { - fprintf(stderr, "Global Deduplication is not supported in pipe mode.\n"); - exit(1); - } - if (enable_rabin_global && enable_delta_encode) { fprintf(stderr, "Global Deduplication does not support Delta Compression.\n"); exit(1); @@ -2605,8 +2612,15 @@ main(int argc, char *argv[]) } } else if (num_rem == 2) { if (do_uncompress) { - if ((filename = realpath(argv[optind], NULL)) == NULL) - err_exit(1, "%s", argv[optind]); + /* + * While decompressing, input can be stdin and output a physical file. + */ + if (*(argv[optind]) == '-') { + filename = NULL; + } else { + if ((filename = realpath(argv[optind], NULL)) == NULL) + err_exit(1, "%s", argv[optind]); + } optind++; if ((to_filename = realpath(argv[optind], NULL)) != NULL) { free(filename); diff --git a/rabin/global/db.c b/rabin/global/db.c index 5b1c186..11d2ac5 100644 --- a/rabin/global/db.c +++ b/rabin/global/db.c @@ -88,6 +88,9 @@ static cleanup_indx(index_t *indx) } } +#define MEM_PER_UNIT ( (hash_entry_size + sizeof (hash_entry_t *) + \ + (sizeof (hash_entry_t *)) / 2) + sizeof (hash_entry_t **) ) + archive_config_t * init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_chunk_sz, int pct_interval, const char *algo, cksum_t ck, cksum_t ck_sim, @@ -97,11 +100,26 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_ch int rv; float diff; + /* + * file_sz = 0 and pct_interval = 0 means we are in pipe mode and want a simple + * index. Set pct_interval to 100 to indicate that we need to use all of memlimit + * for the simple index. + * + * If file_sz != 0 but pct_interval = 0 then we need to create a simple index + * sized for the given file. + * + * If file_sz = 0 and pct_interval = 100 then we are in pipe mode and want a segmented + * index. This is typically for WAN deduplication of large data transfers. + */ + if (file_sz == 0 && pct_interval == 0) + pct_interval = 100; + cfg = calloc(1, sizeof (archive_config_t)); rv = set_config_s(cfg, algo, ck, ck_sim, chunksize, file_sz, user_chunk_sz, pct_interval); if (cfg->dedupe_mode == MODE_SIMPLE) { - pct_interval = 0; + if (pct_interval != 100) + pct_interval = 0; cfg->pct_interval = 0; } @@ -115,30 +133,36 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_ch int hash_entry_size; index_t *indx; + hash_entry_size = sizeof (hash_entry_t) + cfg->similarity_cksum_sz - 1; + // Compute total hashtable entries first if (pct_interval == 0) { intervals = 1; hash_slots = file_sz / cfg->chunk_sz_bytes + 1; + + } else if (pct_interval == 100) { + intervals = 1; + hash_slots = memlimit / MEM_PER_UNIT - 5; + pct_interval = 0; } else { intervals = 100 / pct_interval - 1; hash_slots = file_sz / cfg->segment_sz_bytes + 1; hash_slots *= intervals; } - hash_entry_size = sizeof (hash_entry_t) + cfg->similarity_cksum_sz - 1; // Compute memory required to hold all hash entries assuming worst case 50% // occupancy. - memreqd = hash_slots * (hash_entry_size + sizeof (hash_entry_t *) + - (sizeof (hash_entry_t *)) / 2); - memreqd += hash_slots * sizeof (hash_entry_t **); + memreqd = hash_slots * MEM_PER_UNIT; diff = (float)pct_interval / 100.0; // Reduce hash_slots to remain within memlimit while (memreqd > memlimit) { - hash_slots -= (hash_slots * diff); - memreqd = hash_slots * (hash_entry_size + sizeof (hash_entry_t *) + - (sizeof (hash_entry_t *)) / 2); - memreqd += hash_slots * sizeof (hash_entry_t **); + if (pct_interval == 0) { + hash_slots--; + } else { + hash_slots -= (hash_slots * diff); + } + memreqd = hash_slots * MEM_PER_UNIT; } // Now create as many hash tables as there are similarity match intervals @@ -251,7 +275,7 @@ db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval, } } if (do_insert) { - if (indx->memused + indx->hash_entry_size >= indx->memlimit) { + if (indx->memused + indx->hash_entry_size >= indx->memlimit && htab[htab_entry] != NULL) { ent = htab[htab_entry]; htab[htab_entry] = htab[htab_entry]->next; } else { diff --git a/rabin/global/dedupe_config.c b/rabin/global/dedupe_config.c index e9bcf7f..688b9a3 100644 --- a/rabin/global/dedupe_config.c +++ b/rabin/global/dedupe_config.c @@ -360,7 +360,7 @@ set_config_s(archive_config_t *cfg, const char *algo, cksum_t ck, cksum_t ck_sim cfg->archive_sz = file_sz; cfg->dedupe_mode = MODE_SIMILARITY; - if (cfg->archive_sz <= SIXTEEN_GB || pct_interval == 0) { + if (cfg->archive_sz <= SIXTEEN_GB || pct_interval == 0 || pct_interval == 100) { cfg->dedupe_mode = MODE_SIMPLE; cfg->segment_sz_bytes = user_chunk_sz; diff --git a/rabin/rabin_dedup.c b/rabin/rabin_dedup.c index 842de2d..33e922e 100755 --- a/rabin/rabin_dedup.c +++ b/rabin/rabin_dedup.c @@ -188,9 +188,9 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s get_sysinfo(&msys_info); /* - * Use a maximum of approx 62% of free RAM for the index. + * Use a maximum of approx 75% of free RAM for the index. */ - msys_info.freeram = (msys_info.freeram >> 1) + (msys_info.freeram >> 3); + msys_info.freeram = (msys_info.freeram >> 1) + (msys_info.freeram >> 2); arc = init_global_db_s(NULL, NULL, rab_blk_sz, chunksize, 0, algo, props->cksum, props->cksum, file_size, msys_info.freeram, props->nthreads); @@ -1107,7 +1107,7 @@ dedupe_decompress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size) * RAM. Just mem-copy it. * Otherwise it will be in the current output file. We mmap() the relevant * region and copy it. The way deduplication is done it is guaranteed that - * all duplicate reference will be backward references so this approach works. + * all duplicate references will be backward references so this approach works. * * However this approach precludes pipe-mode streamed decompression since * it requires random access to the output file.