Fix buffer size computation when allocating Rabin block array.

Reduce memory usage of Rabin block array.
Add an SSE optimization for bsdiff.
Move integer hashing function to utils file.
More updates to README.
This commit is contained in:
Moinak Ghosh 2012-07-28 23:55:24 +05:30
parent f83652aa90
commit 94563a7ecd
7 changed files with 100 additions and 29 deletions

View file

@ -22,7 +22,7 @@ maximum parallelism. It also bundles a simple slab allocator to speed
repeated allocation of similar chunks. It can work in pipe mode, reading repeated allocation of similar chunks. It can work in pipe mode, reading
from stdin and writing to stdout. It also provides some adaptive compression from stdin and writing to stdout. It also provides some adaptive compression
modes in which multiple algorithms are tried per chunk to determine the best modes in which multiple algorithms are tried per chunk to determine the best
one for the given chunk. Finally it support 14 compression levels to allow one for the given chunk. Finally it supports 14 compression levels to allow
for ultra compression modes in some algorithms. for ultra compression modes in some algorithms.
Usage Usage
@ -58,7 +58,8 @@ Usage
Attempt Rabin fingerprinting based deduplication on chunks: Attempt Rabin fingerprinting based deduplication on chunks:
pcompress -D ... pcompress -D ...
pcompress -D -r ... - Do NOT split chunks at a rabin boundary. Default is to split. pcompress -D -r ... - Do NOT split chunks at a rabin boundary. Default
is to split.
Perform Delta Encoding in addition to Exact Dedup: Perform Delta Encoding in addition to Exact Dedup:
pcompress -E ... - This also implies '-D'. pcompress -E ... - This also implies '-D'.
@ -67,6 +68,13 @@ Usage
Pass '-M' to display memory allocator statistics Pass '-M' to display memory allocator statistics
Pass '-C' to display compression statistics Pass '-C' to display compression statistics
Environment Variables
=====================
Set ALLOCATOR_BYPASS=1 in the environment to avoid using the built-in
allocator. Due to the way it rounds up an allocation request to the nearest
slab, the built-in allocator can allocate extra unused memory.
Examples Examples
======== ========
@ -80,4 +88,41 @@ of 1GB. Allow pcompress to detect the number of CPU cores and use as many thread
pcompress -c lzma -l14 -s1g file.tar pcompress -c lzma -l14 -s1g file.tar
Compression Algorithms
======================
LZFX - Ultra Fast, average compression. This algorithm is the fastest overall.
Levels: 1 - 5
LZ4 - Very Fast, better compression than LZFX.
Levels: 1 - 3
Zlib - Fast, better compression.
Levels: 1 - 9
Bzip2 - Slow, much better compression than Zlib.
Levels: 1 - 9
LZMA - Very slow. Extreme compression.
Levels: 1 - 14
PPMD - Slow. Extreme compression for Text, average compression for binary.
Levels: 1 - 14.
Adapt - Very slow synthetic mode. Both Bzip2 and PPMD are tried per chunk and
better result selected.
Levels: 1 - 14
Adapt2 - Ultra slow synthetic mode. Both LZMA and PPMD are tried per chunk and
         the better result selected. Can give the best compression ratio when
         splitting the file into multiple chunks.
Levels: 1 - 14
It is possible for a single chunk to span the entire file if enough RAM is
available. However, for adaptive modes to be effective for large files, especially
multi-file archives, splitting into chunks is required so that the best compression
algorithm can be selected for textual and binary portions.
Caveats
=======
This utility can gobble up RAM depending on compression algorithm,
compression level, and dedupe being enabled. Larger chunk sizes can give
better compression ratio but at the same time use more RAM.
In some cases, for files less than a gigabyte, using Delta Compression in addition
to exact Dedupe can have a slight negative impact on LZMA compression ratio,
especially when using the large-window ultra compression levels above 12.

View file

@ -100,23 +100,6 @@ static int inited = 0, bypass = 0;
static uint64_t total_allocs, oversize_allocs, hash_collisions, hash_entries; static uint64_t total_allocs, oversize_allocs, hash_collisions, hash_entries;
/*
* Hash function for 64Bit pointers that generates a 32Bit hash value.
* Taken from Thomas Wang's Integer hashing paper:
* http://www.cris.com/~Ttwang/tech/inthash.htm
*/
uint32_t
hash6432shift(uint64_t key)
{
key = (~key) + (key << 18); // key = (key << 18) - key - 1;
key = key ^ (key >> 31);
key = key * 21; // key = (key + (key << 2)) + (key << 4);
key = key ^ (key >> 11);
key = key + (key << 6);
key = key ^ (key >> 22);
return (uint32_t) key;
}
void void
slab_init() slab_init()
{ {

View file

@ -61,6 +61,11 @@ __FBSDID("$FreeBSD: src/usr.bin/bsdiff/bsdiff/bsdiff.c,v 1.1 2005/08/06 01:59:05
#include <unistd.h> #include <unistd.h>
#include <allocator.h> #include <allocator.h>
#include <utils.h> #include <utils.h>
#ifdef __USE_SSE_INTRIN__
#include <emmintrin.h>
#endif
#include "bscommon.h" #include "bscommon.h"
#define MIN(x,y) (((x)<(y)) ? (x) : (y)) #define MIN(x,y) (((x)<(y)) ? (x) : (y))
@ -131,7 +136,28 @@ static void qsufsort(bsize_t *I,bsize_t *V,u_char *old,bsize_t oldsize)
bsize_t buckets[256]; bsize_t buckets[256];
bsize_t i,h,len; bsize_t i,h,len;
#ifdef __USE_SSE_INTRIN__
if (((bsize_t)buckets & (16 - 1)) == 0) { // 16-byte aligned ?
int iters;
uchar_t *pos;
iters = (256 * sizeof (bsize_t)) / (16 * 4);
__m128i zero = _mm_setzero_si128 ();
pos = (uchar_t *)buckets;
for (i=0; i<iters; i++) {
_mm_store_si128((__m128i *)pos, zero);
_mm_store_si128((__m128i *)(pos+16), zero);
_mm_store_si128((__m128i *)(pos+32), zero);
_mm_store_si128((__m128i *)(pos+48), zero);
pos += 64;
}
} else {
#endif
for(i=0;i<256;i++) buckets[i]=0; for(i=0;i<256;i++) buckets[i]=0;
#ifdef __USE_SSE_INTRIN__
}
#endif
for(i=0;i<oldsize;i++) buckets[old[i]]++; for(i=0;i<oldsize;i++) buckets[old[i]]++;
for(i=1;i<256;i++) buckets[i]+=buckets[i-1]; for(i=1;i<256;i++) buckets[i]+=buckets[i-1];
for(i=255;i>0;i--) buckets[i]=buckets[i-1]; for(i=255;i>0;i--) buckets[i]=buckets[i-1];

View file

@ -151,7 +151,7 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *al
ctx->blocks = NULL; ctx->blocks = NULL;
if (real_chunksize > 0) { if (real_chunksize > 0) {
ctx->blocks = (rabin_blockentry_t *)slab_alloc(NULL, ctx->blocks = (rabin_blockentry_t *)slab_alloc(NULL,
blknum * ctx->rabin_poly_min_block_size); blknum * sizeof (rabin_blockentry_t));
} }
if(ctx == NULL || current_window_data == NULL || (ctx->blocks == NULL && real_chunksize > 0)) { if(ctx == NULL || current_window_data == NULL || (ctx->blocks == NULL && real_chunksize > 0)) {
fprintf(stderr, fprintf(stderr,
@ -370,7 +370,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
ctx->blocks[blknum].offset = last_offset; ctx->blocks[blknum].offset = last_offset;
ctx->blocks[blknum].index = blknum; // Need to store for sorting ctx->blocks[blknum].index = blknum; // Need to store for sorting
ctx->blocks[blknum].length = length; ctx->blocks[blknum].length = length;
ctx->blocks[blknum].refcount = 0; ctx->blocks[blknum].ref = 0;
ctx->blocks[blknum].similar = 0; ctx->blocks[blknum].similar = 0;
ctx->blocks[blknum].cksum_n_offset = cur_sketch; ctx->blocks[blknum].cksum_n_offset = cur_sketch;
memset(fplist, 0, fplist_sz); memset(fplist, 0, fplist_sz);
@ -399,7 +399,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
ctx->blocks[blknum].offset = last_offset; ctx->blocks[blknum].offset = last_offset;
ctx->blocks[blknum].index = blknum; ctx->blocks[blknum].index = blknum;
ctx->blocks[blknum].length = *size - last_offset; ctx->blocks[blknum].length = *size - last_offset;
ctx->blocks[blknum].refcount = 0; ctx->blocks[blknum].ref = 0;
ctx->blocks[blknum].similar = 0; ctx->blocks[blknum].similar = 0;
ctx->blocks[blknum].cksum_n_offset = cur_sketch; ctx->blocks[blknum].cksum_n_offset = cur_sketch;
blknum++; blknum++;
@ -445,7 +445,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
memcmp(prev_offset, buf1 + ctx->blocks[blk].offset, prev_length) == 0) { memcmp(prev_offset, buf1 + ctx->blocks[blk].offset, prev_length) == 0) {
ctx->blocks[blk].similar = SIMILAR_EXACT; ctx->blocks[blk].similar = SIMILAR_EXACT;
ctx->blocks[blk].index = prev_index; ctx->blocks[blk].index = prev_index;
(ctx->blocks[prev_blk].refcount)++; ctx->blocks[prev_blk].ref = 1;
matchlen += prev_length; matchlen += prev_length;
continue; continue;
} }
@ -460,11 +460,11 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
for (blk = 0; blk < blknum; blk++) { for (blk = 0; blk < blknum; blk++) {
if (ctx->blocks[blk].similar) continue; if (ctx->blocks[blk].similar) continue;
if (blk > 0 && ctx->blocks[blk].refcount == 0 && if (blk > 0 && ctx->blocks[blk].ref == 0 &&
ctx->blocks[blk].cksum_n_offset == prev_cksum) { ctx->blocks[blk].cksum_n_offset == prev_cksum) {
ctx->blocks[blk].index = prev_index; ctx->blocks[blk].index = prev_index;
ctx->blocks[blk].similar = SIMILAR_PARTIAL; ctx->blocks[blk].similar = SIMILAR_PARTIAL;
(ctx->blocks[prev_blk].refcount)++; ctx->blocks[prev_blk].ref = 1;
matchlen += prev_length/2; matchlen += prev_length/2;
continue; continue;
} }
@ -502,7 +502,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
* size. * size.
*/ */
if (prev_index == 0) { if (prev_index == 0) {
if (be->refcount == 0) { if (be->ref == 0) {
prev_index = pos; prev_index = pos;
prev_length = be->length; prev_length = be->length;
} }
@ -511,7 +511,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
trans[blk] = pos; trans[blk] = pos;
pos++; pos++;
} else { } else {
if (be->refcount > 0) { if (be->ref > 0) {
prev_index = 0; prev_index = 0;
prev_length = 0; prev_length = 0;
rabin_index[pos] = be->length; rabin_index[pos] = be->length;

View file

@ -136,8 +136,7 @@ typedef struct {
unsigned int index; unsigned int index;
unsigned int length; unsigned int length;
unsigned int new_length; unsigned int new_length;
unsigned short refcount; unsigned char ref, similar;
short similar;
} rabin_blockentry_t; } rabin_blockentry_t;
typedef struct { typedef struct {

17
utils.c
View file

@ -169,6 +169,23 @@ bytes_to_size(uint64_t bytes)
return (num); return (num);
} }
/*
* Hash function for 64Bit pointers that generates a 32Bit hash value.
* Taken from Thomas Wang's Integer hashing paper:
* http://www.cris.com/~Ttwang/tech/inthash.htm
*/
uint32_t
hash6432shift(uint64_t key)
{
key = (~key) + (key << 18); // key = (key << 18) - key - 1;
key = key ^ (key >> 31);
key = key * 21; // key = (key + (key << 2)) + (key << 4);
key = key ^ (key >> 11);
key = key + (key << 6);
key = key ^ (key >> 22);
return (uint32_t) key;
}
/* /*
* Read/Write helpers to ensure a full chunk is read or written * Read/Write helpers to ensure a full chunk is read or written
* unless there is an error. * unless there is an error.

View file

@ -102,6 +102,7 @@ extern void err_exit(int show_errno, const char *format, ...);
extern const char *get_execname(const char *); extern const char *get_execname(const char *);
extern int parse_numeric(ssize_t *val, const char *str); extern int parse_numeric(ssize_t *val, const char *str);
extern char *bytes_to_size(uint64_t bytes); extern char *bytes_to_size(uint64_t bytes);
extern uint32_t hash6432shift(uint64_t key);
extern ssize_t Read(int fd, void *buf, size_t count); extern ssize_t Read(int fd, void *buf, size_t count);
extern ssize_t Read_Adjusted(int fd, uchar_t *buf, size_t count, extern ssize_t Read_Adjusted(int fd, uchar_t *buf, size_t count,
ssize_t *rabin_count, void *ctx); ssize_t *rabin_count, void *ctx);