Fix buffer size computation when allocating Rabin block array.

Reduce memory usage of Rabin block array.
Add an SSE optimization for bsdiff.
Move integer hashing function to utils file.
More updates to README.
This commit is contained in:
Moinak Ghosh 2012-07-28 23:55:24 +05:30
parent f83652aa90
commit 94563a7ecd
7 changed files with 100 additions and 29 deletions

View file

@ -22,7 +22,7 @@ maximum parallelism. It also bundles a simple slab allocator to speed
repeated allocation of similar chunks. It can work in pipe mode, reading repeated allocation of similar chunks. It can work in pipe mode, reading
from stdin and writing to stdout. It also provides some adaptive compression from stdin and writing to stdout. It also provides some adaptive compression
modes in which multiple algorithms are tried per chunk to determine the best modes in which multiple algorithms are tried per chunk to determine the best
one for the given chunk. Finally it support 14 compression levels to allow one for the given chunk. Finally it supports 14 compression levels to allow
for ultra compression modes in some algorithms. for ultra compression modes in some algorithms.
Usage Usage
@ -58,7 +58,8 @@ Usage
Attempt Rabin fingerprinting based deduplication on chunks: Attempt Rabin fingerprinting based deduplication on chunks:
pcompress -D ... pcompress -D ...
pcompress -D -r ... - Do NOT split chunks at a rabin boundary. Default is to split. pcompress -D -r ... - Do NOT split chunks at a rabin boundary. Default
is to split.
Perform Delta Encoding in addition to Exact Dedup: Perform Delta Encoding in addition to Exact Dedup:
pcompress -E ... - This also implies '-D'. pcompress -E ... - This also implies '-D'.
@ -67,6 +68,13 @@ Usage
Pass '-M' to display memory allocator statistics Pass '-M' to display memory allocator statistics
Pass '-C' to display compression statistics Pass '-C' to display compression statistics
Environment Variables
=====================
Set ALLOCATOR_BYPASS=1 in the environment to avoid using the built-in
allocator. Due to the way it rounds up an allocation request to the nearest
slab, the built-in allocator can allocate extra unused memory.
Examples Examples
======== ========
@ -80,4 +88,41 @@ of 1GB. Allow pcompress to detect the number of CPU cores and use as many thread
pcompress -c lzma -l14 -s1g file.tar pcompress -c lzma -l14 -s1g file.tar
Compression Algorithms
======================
LZFX - Ultra Fast, average compression. This algorithm is the fastest overall.
Levels: 1 - 5
LZ4 - Very Fast, better compression than LZFX.
Levels: 1 - 3
Zlib - Fast, better compression.
Levels: 1 - 9
Bzip2 - Slow, much better compression than Zlib.
Levels: 1 - 9
LZMA - Very slow. Extreme compression.
Levels: 1 - 14
PPMD - Slow. Extreme compression for Text, average compression for binary.
Levels: 1 - 14.
Adapt - Very slow synthetic mode. Both Bzip2 and PPMD are tried per chunk and
better result selected.
Levels: 1 - 14
Adapt2 - Ultra slow synthetic mode. Both LZMA and PPMD are tried per chunk and
         the better result selected. Can give the best compression ratio when
         splitting the file into multiple chunks.
Levels: 1 - 14
It is possible for a single chunk to span the entire file if enough RAM is
available. However, for adaptive modes to be effective for large files, especially
multi-file archives, splitting into chunks is required so that the best compression
algorithm can be selected for textual and binary portions.
Caveats
=======
This utility can gobble up RAM depending on compression algorithm,
compression level, and dedupe being enabled. Larger chunk sizes can give
better compression ratio but at the same time use more RAM.
In some cases, for files less than a gigabyte, using Delta Compression in addition
to exact Dedupe can have a slight negative impact on LZMA compression ratio,
especially when using the large-window ultra compression levels above 12.

View file

@ -100,23 +100,6 @@ static int inited = 0, bypass = 0;
static uint64_t total_allocs, oversize_allocs, hash_collisions, hash_entries; static uint64_t total_allocs, oversize_allocs, hash_collisions, hash_entries;
/*
* Hash function for 64Bit pointers that generates a 32Bit hash value.
* Taken from Thomas Wang's Integer hashing paper:
* http://www.cris.com/~Ttwang/tech/inthash.htm
*/
uint32_t
hash6432shift(uint64_t key)
{
key = (~key) + (key << 18); // key = (key << 18) - key - 1;
key = key ^ (key >> 31);
key = key * 21; // key = (key + (key << 2)) + (key << 4);
key = key ^ (key >> 11);
key = key + (key << 6);
key = key ^ (key >> 22);
return (uint32_t) key;
}
void void
slab_init() slab_init()
{ {

View file

@ -61,6 +61,11 @@ __FBSDID("$FreeBSD: src/usr.bin/bsdiff/bsdiff/bsdiff.c,v 1.1 2005/08/06 01:59:05
#include <unistd.h> #include <unistd.h>
#include <allocator.h> #include <allocator.h>
#include <utils.h> #include <utils.h>
#ifdef __USE_SSE_INTRIN__
#include <emmintrin.h>
#endif
#include "bscommon.h" #include "bscommon.h"
#define MIN(x,y) (((x)<(y)) ? (x) : (y)) #define MIN(x,y) (((x)<(y)) ? (x) : (y))
@ -131,7 +136,28 @@ static void qsufsort(bsize_t *I,bsize_t *V,u_char *old,bsize_t oldsize)
bsize_t buckets[256]; bsize_t buckets[256];
bsize_t i,h,len; bsize_t i,h,len;
#ifdef __USE_SSE_INTRIN__
if (((bsize_t)buckets & (16 - 1)) == 0) { // 16-byte aligned ?
int iters;
uchar_t *pos;
iters = (256 * sizeof (bsize_t)) / (16 * 4);
__m128i zero = _mm_setzero_si128 ();
pos = (uchar_t *)buckets;
for (i=0; i<iters; i++) {
_mm_store_si128((__m128i *)pos, zero);
_mm_store_si128((__m128i *)(pos+16), zero);
_mm_store_si128((__m128i *)(pos+32), zero);
_mm_store_si128((__m128i *)(pos+48), zero);
pos += 64;
}
} else {
#endif
for(i=0;i<256;i++) buckets[i]=0; for(i=0;i<256;i++) buckets[i]=0;
#ifdef __USE_SSE_INTRIN__
}
#endif
for(i=0;i<oldsize;i++) buckets[old[i]]++; for(i=0;i<oldsize;i++) buckets[old[i]]++;
for(i=1;i<256;i++) buckets[i]+=buckets[i-1]; for(i=1;i<256;i++) buckets[i]+=buckets[i-1];
for(i=255;i>0;i--) buckets[i]=buckets[i-1]; for(i=255;i>0;i--) buckets[i]=buckets[i-1];

View file

@ -151,7 +151,7 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *al
ctx->blocks = NULL; ctx->blocks = NULL;
if (real_chunksize > 0) { if (real_chunksize > 0) {
ctx->blocks = (rabin_blockentry_t *)slab_alloc(NULL, ctx->blocks = (rabin_blockentry_t *)slab_alloc(NULL,
blknum * ctx->rabin_poly_min_block_size); blknum * sizeof (rabin_blockentry_t));
} }
if(ctx == NULL || current_window_data == NULL || (ctx->blocks == NULL && real_chunksize > 0)) { if(ctx == NULL || current_window_data == NULL || (ctx->blocks == NULL && real_chunksize > 0)) {
fprintf(stderr, fprintf(stderr,
@ -370,7 +370,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
ctx->blocks[blknum].offset = last_offset; ctx->blocks[blknum].offset = last_offset;
ctx->blocks[blknum].index = blknum; // Need to store for sorting ctx->blocks[blknum].index = blknum; // Need to store for sorting
ctx->blocks[blknum].length = length; ctx->blocks[blknum].length = length;
ctx->blocks[blknum].refcount = 0; ctx->blocks[blknum].ref = 0;
ctx->blocks[blknum].similar = 0; ctx->blocks[blknum].similar = 0;
ctx->blocks[blknum].cksum_n_offset = cur_sketch; ctx->blocks[blknum].cksum_n_offset = cur_sketch;
memset(fplist, 0, fplist_sz); memset(fplist, 0, fplist_sz);
@ -399,7 +399,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
ctx->blocks[blknum].offset = last_offset; ctx->blocks[blknum].offset = last_offset;
ctx->blocks[blknum].index = blknum; ctx->blocks[blknum].index = blknum;
ctx->blocks[blknum].length = *size - last_offset; ctx->blocks[blknum].length = *size - last_offset;
ctx->blocks[blknum].refcount = 0; ctx->blocks[blknum].ref = 0;
ctx->blocks[blknum].similar = 0; ctx->blocks[blknum].similar = 0;
ctx->blocks[blknum].cksum_n_offset = cur_sketch; ctx->blocks[blknum].cksum_n_offset = cur_sketch;
blknum++; blknum++;
@ -445,7 +445,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
memcmp(prev_offset, buf1 + ctx->blocks[blk].offset, prev_length) == 0) { memcmp(prev_offset, buf1 + ctx->blocks[blk].offset, prev_length) == 0) {
ctx->blocks[blk].similar = SIMILAR_EXACT; ctx->blocks[blk].similar = SIMILAR_EXACT;
ctx->blocks[blk].index = prev_index; ctx->blocks[blk].index = prev_index;
(ctx->blocks[prev_blk].refcount)++; ctx->blocks[prev_blk].ref = 1;
matchlen += prev_length; matchlen += prev_length;
continue; continue;
} }
@ -460,11 +460,11 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
for (blk = 0; blk < blknum; blk++) { for (blk = 0; blk < blknum; blk++) {
if (ctx->blocks[blk].similar) continue; if (ctx->blocks[blk].similar) continue;
if (blk > 0 && ctx->blocks[blk].refcount == 0 && if (blk > 0 && ctx->blocks[blk].ref == 0 &&
ctx->blocks[blk].cksum_n_offset == prev_cksum) { ctx->blocks[blk].cksum_n_offset == prev_cksum) {
ctx->blocks[blk].index = prev_index; ctx->blocks[blk].index = prev_index;
ctx->blocks[blk].similar = SIMILAR_PARTIAL; ctx->blocks[blk].similar = SIMILAR_PARTIAL;
(ctx->blocks[prev_blk].refcount)++; ctx->blocks[prev_blk].ref = 1;
matchlen += prev_length/2; matchlen += prev_length/2;
continue; continue;
} }
@ -502,7 +502,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
* size. * size.
*/ */
if (prev_index == 0) { if (prev_index == 0) {
if (be->refcount == 0) { if (be->ref == 0) {
prev_index = pos; prev_index = pos;
prev_length = be->length; prev_length = be->length;
} }
@ -511,7 +511,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
trans[blk] = pos; trans[blk] = pos;
pos++; pos++;
} else { } else {
if (be->refcount > 0) { if (be->ref > 0) {
prev_index = 0; prev_index = 0;
prev_length = 0; prev_length = 0;
rabin_index[pos] = be->length; rabin_index[pos] = be->length;

View file

@ -136,8 +136,7 @@ typedef struct {
unsigned int index; unsigned int index;
unsigned int length; unsigned int length;
unsigned int new_length; unsigned int new_length;
unsigned short refcount; unsigned char ref, similar;
short similar;
} rabin_blockentry_t; } rabin_blockentry_t;
typedef struct { typedef struct {

17
utils.c
View file

@ -169,6 +169,23 @@ bytes_to_size(uint64_t bytes)
return (num); return (num);
} }
/*
* Hash function for 64Bit pointers that generates a 32Bit hash value.
* Taken from Thomas Wang's Integer hashing paper:
* http://www.cris.com/~Ttwang/tech/inthash.htm
*/
uint32_t
hash6432shift(uint64_t key)
{
key = (~key) + (key << 18); // key = (key << 18) - key - 1;
key = key ^ (key >> 31);
key = key * 21; // key = (key + (key << 2)) + (key << 4);
key = key ^ (key >> 11);
key = key + (key << 6);
key = key ^ (key >> 22);
return (uint32_t) key;
}
/* /*
* Read/Write helpers to ensure a full chunk is read or written * Read/Write helpers to ensure a full chunk is read or written
* unless there is an error. * unless there is an error.

View file

@ -102,6 +102,7 @@ extern void err_exit(int show_errno, const char *format, ...);
extern const char *get_execname(const char *); extern const char *get_execname(const char *);
extern int parse_numeric(ssize_t *val, const char *str); extern int parse_numeric(ssize_t *val, const char *str);
extern char *bytes_to_size(uint64_t bytes); extern char *bytes_to_size(uint64_t bytes);
extern uint32_t hash6432shift(uint64_t key);
extern ssize_t Read(int fd, void *buf, size_t count); extern ssize_t Read(int fd, void *buf, size_t count);
extern ssize_t Read_Adjusted(int fd, uchar_t *buf, size_t count, extern ssize_t Read_Adjusted(int fd, uchar_t *buf, size_t count,
ssize_t *rabin_count, void *ctx); ssize_t *rabin_count, void *ctx);