From c7cc7b469ce02b41d8af0831e72d0695d58aee0d Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Fri, 27 Jul 2012 22:03:24 +0530 Subject: [PATCH] Update chunk size computation to reduce memory usage. Implement runtime bypass of custom allocator. Update README. --- README.md | 82 +++++++++++++++++++++++++++++++++++++++- allocator.c | 13 ++++++- main.c | 21 ++++++---- pcompress.h | 1 + rabin/rabin_polynomial.c | 6 +++ rabin/rabin_polynomial.h | 3 +- zlib_compress.c | 8 ++++ 7 files changed, 122 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index b447833..f3bd059 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,82 @@ -pcompress +Pcompress ========= -A Parallel Compression/Decompression utility \ No newline at end of file +Copyright (C) 2012 Moinak Ghosh. All rights reserved. +Use is subject to license terms. + +Pcompress is a utility to do compression and decompression in parallel by +splitting input data into chunks. It has a modular structure and includes +support for multiple algorithms like LZMA, Bzip2, PPMD, etc., with CRC64 +chunk checksums. SSE optimizations for the bundled LZMA are included. It +also implements chunk-level Content-Aware Deduplication and Delta +Compression features based on a Semi-Rabin Fingerprinting scheme. Delta +Compression is implemented via the widely popular bsdiff algorithm. +Similarity is detected using a custom hashing of maximal features of a +block. When doing chunk-level dedupe it attempts to merge adjacent +non-duplicate blocks index entries into a single larger entry to reduce +metadata. In addition to all these it can internally split chunks at +rabin boundaries to help dedupe and compression. + +It has low metadata overhead and overlaps I/O and compression to achieve +maximum parallelism. It also bundles a simple slab allocator to speed +repeated allocation of similar chunks. It can work in pipe mode, reading +from stdin and writing to stdout. 
It also provides some adaptive compression +modes in which multiple algorithms are tried per chunk to determine the best +one for the given chunk. Finally it supports 14 compression levels to allow +for ultra compression modes in some algorithms. + +Usage +===== + + To compress a file: + pcompress -c [-l ] [-s ] + Where can be the following: + lzfx - Very fast and small algorithm based on LZF. + lz4 - Ultra fast, high-throughput algorithm reaching RAM B/W at level1. + zlib - The base Zlib format compression (not Gzip). + lzma - The LZMA (Lempel-Ziv Markov) algorithm from 7Zip. + bzip2 - Bzip2 Algorithm from libbzip2. + ppmd - The PPMd algorithm excellent for textual data. PPMd requires + at least 64MB X CPUs more memory than the other modes. + adapt - Adaptive mode where ppmd or bzip2 will be used per chunk, + depending on which one produces better compression. This mode + is obviously fairly slow and requires lots of memory. + adapt2 - Adaptive mode which includes ppmd and lzma. This requires + more memory than adapt mode, is slower and potentially gives + the best compression. + - This can be in bytes or can use the following suffixes: + g - Gigabyte, m - Megabyte, k - Kilobyte. + Larger chunks produce better compression at the cost of memory. + - Can be a number from 0 meaning minimum and 14 meaning + maximum compression. + + To decompress a file compressed using above command: + pcompress -d + + To operate as a pipe, read from stdin and write to stdout: + pcompress -p ... + + Attempt Rabin fingerprinting based deduplication on chunks: + pcompress -D ... + pcompress -D -r ... - Do NOT split chunks at a rabin boundary. Default is to split. + + Perform Delta Encoding in addition to Exact Dedup: + pcompress -E ... - This also implies '-D'. 
+ + Number of threads can optionally be specified: -t <1 - 256 count> + Pass '-M' to display memory allocator statistics + Pass '-C' to display compression statistics + +Examples +======== + +Compress "file.tar" using bzip2 level 6, 64MB chunk size and use 4 threads. In +addition perform exact deduplication and delta compression prior to compression. + + pcompress -D -E -c bzip2 -l6 -s64m -t4 file.tar + +Compress "file.tar" using extreme compression mode of LZMA and a chunk size +of 1GB. Allow pcompress to detect the number of CPU cores and use as many threads. + + pcompress -c lzma -l14 -s1g file.tar + diff --git a/allocator.c b/allocator.c index b0ac4cf..8562c15 100644 --- a/allocator.c +++ b/allocator.c @@ -96,7 +96,7 @@ static struct bufentry **htable; static pthread_mutex_t *hbucket_locks; static pthread_mutex_t htable_lock = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t slab_table_lock = PTHREAD_MUTEX_INITIALIZER; -static int inited = 0; +static int inited = 0, bypass = 0; static uint64_t total_allocs, oversize_allocs, hash_collisions, hash_entries; @@ -124,6 +124,12 @@ slab_init() size_t slab_sz; int nprocs; + /* Check bypass env variable. */ + if (getenv("ALLOCATOR_BYPASS") != NULL) { + bypass = 1; + return; + } + /* Initialize first NUM_POW2 power of 2 slots. */ slab_sz = SLAB_START_SZ; for (i = 0; i < NUM_POW2; i++) { @@ -177,6 +183,7 @@ slab_cleanup(int quiet) uint64_t nonfreed_oversize; if (!inited) return; + if (bypass) return; if (!quiet) { fprintf(stderr, "Slab Allocation Stats\n"); @@ -276,6 +283,7 @@ void * slab_calloc(void *p, size_t items, size_t size) { void *ptr; + if (bypass) return(calloc(items, size)); ptr = slab_alloc(p, items * size); memset(ptr, 0, items * size); return (ptr); @@ -338,6 +346,7 @@ slab_cache_add(size_t size) { uint32_t sindx; struct slabentry *slab; + if (bypass) return (0); if (try_dynamic_slab(size)) return (0); /* Already added. */ /* Locate the hash slot for the size. 
*/ @@ -375,6 +384,7 @@ slab_alloc(void *p, size_t size) void *ptr; struct slabentry *slab; + if (bypass) return (malloc(size)); ATOMIC_ADD(total_allocs, 1); slab = NULL; @@ -444,6 +454,7 @@ slab_free(void *p, void *address) uint32_t hindx; if (!address) return; + if (bypass) { free(address); return; } hindx = hash6432shift((uint64_t)(address)) & (HTABLE_SZ - 1); pthread_mutex_lock(&hbucket_locks[hindx]); diff --git a/main.c b/main.c index 5ff3599..6fa7101 100644 --- a/main.c +++ b/main.c @@ -45,6 +45,7 @@ #include #include #include +#include /* Needed for CLzmaEncprops. */ #include @@ -788,7 +789,7 @@ start_compress(const char *filename, uint64_t chunksize, int level) rabin_context_t *rctx; /* - * Compressed buffer size must include zlib scratch space and + * Compressed buffer size must include zlib/dedup scratch space and * chunk header space. * See http://www.zlib.net/manual.html#compress2 * @@ -799,19 +800,23 @@ start_compress(const char *filename, uint64_t chunksize, int level) * See start_decompress() routine for details of chunk header. * We also keep extra 8-byte space for the last chunk's size. */ - compressed_chunksize = chunksize + (chunksize >> 6) + - sizeof (chunksize) + sizeof (uint64_t) + sizeof (chunksize); - err = 0; + compressed_chunksize = chunksize + sizeof (chunksize) + + sizeof (uint64_t) + sizeof (chunksize) + zlib_buf_extra(chunksize); + flags = 0; + if (enable_rabin_scan) { + flags |= FLAG_DEDUP; + /* Additional scratch space for dedup arrays. */ + compressed_chunksize += (rabin_buf_extra(chunksize) - + (compressed_chunksize - chunksize)); + } + + err = 0; thread = 0; slab_cache_add(chunksize); slab_cache_add(compressed_chunksize + CHDR_SZ); slab_cache_add(sizeof (struct cmp_data)); - if (enable_rabin_scan) { - flags |= FLAG_DEDUP; - } - /* A host of sanity checks. 
*/ if (!pipe_mode) { if ((uncompfd = open(filename, O_RDWR, 0)) == -1) diff --git a/pcompress.h b/pcompress.h index 1eaa7f2..05cdfc7 100644 --- a/pcompress.h +++ b/pcompress.h @@ -54,6 +54,7 @@ extern "C" { extern uint64_t lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc); extern uint64_t lzma_crc64_8bchk(const uint8_t *buf, size_t size, uint64_t crc, uint64_t *cnt); +extern uint32_t zlib_buf_extra(ssize_t buflen); extern int zlib_compress(void *src, size_t srclen, void *dst, size_t *destlen, int level, uchar_t chdr, void *data); diff --git a/rabin/rabin_polynomial.c b/rabin/rabin_polynomial.c index 7dc0789..b655a34 100755 --- a/rabin/rabin_polynomial.c +++ b/rabin/rabin_polynomial.c @@ -81,6 +81,12 @@ extern int bspatch(u_char *pbuf, u_char *old, bsize_t oldsize, u_char *new, uint32_t rabin_polynomial_max_block_size = RAB_POLYNOMIAL_MAX_BLOCK_SIZE; +uint32_t +rabin_buf_extra(uint64_t chunksize) +{ + return ((chunksize / RAB_POLYNOMIAL_MIN_BLOCK_SIZE2) * sizeof (uint32_t)); +} + /* * Initialize the algorithm with the default params. 
*/ diff --git a/rabin/rabin_polynomial.h b/rabin/rabin_polynomial.h index 91eb54e..2365c00 100644 --- a/rabin/rabin_polynomial.h +++ b/rabin/rabin_polynomial.h @@ -168,5 +168,6 @@ extern void rabin_parse_hdr(uchar_t *buf, unsigned int *blknum, ssize_t *rabin_i extern void rabin_update_hdr(uchar_t *buf, ssize_t rabin_index_sz_cmp, ssize_t rabin_data_sz_cmp); extern void reset_rabin_context(rabin_context_t *ctx); +extern uint32_t rabin_buf_extra(uint64_t chunksize); -#endif /* _RABIN_POLY_H_ */ \ No newline at end of file +#endif /* _RABIN_POLY_H_ */ \ No newline at end of file diff --git a/zlib_compress.c b/zlib_compress.c index c2a3723..382b612 100644 --- a/zlib_compress.c +++ b/zlib_compress.c @@ -43,6 +43,14 @@ slab_alloc_ui(void *p, unsigned int items, unsigned int size) { return (ptr); } +uint32_t +zlib_buf_extra(ssize_t buflen) +{ + if (buflen > SINGLE_CALL_MAX) + buflen = SINGLE_CALL_MAX; + return (compressBound(buflen) - buflen); +} + int zlib_init(void **data, int *level, ssize_t chunksize) {