Implement Adaptive Delta Encoding.

This commit is contained in:
Moinak Ghosh 2012-12-05 00:09:47 +05:30
parent 51249c858d
commit 29b0d8fd7b
12 changed files with 93 additions and 22 deletions

View file

@ -67,6 +67,10 @@ LZPSRCS = lzp/lzp.c
LZPHDRS = lzp/lzp.h LZPHDRS = lzp/lzp.h
LZPOBJS = $(LZPSRCS:.c=.o) LZPOBJS = $(LZPSRCS:.c=.o)
DELTA2SRCS = delta2/delta2.c
DELTA2HDRS = delta2/delta2.h
DELTA2OBJS = $(DELTA2SRCS:.c=.o)
SKEIN_BLOCK_C = crypto/skein/skein_block.c SKEIN_BLOCK_C = crypto/skein/skein_block.c
SKEIN_BLOCK_ASM = crypto/skein/skein_block_x64.s SKEIN_BLOCK_ASM = crypto/skein/skein_block_x64.s
SKEIN_BLOCK_SRC = @SKEIN_BLOCK@ SKEIN_BLOCK_SRC = @SKEIN_BLOCK@
@ -108,8 +112,8 @@ COMMON_LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block
LDLIBS = -ldl -L@LIBBZ2_DIR@ -lbz2 -L@LIBZ_DIR@ -lz -lm @LIBBSCLFLAGS@ \ LDLIBS = -ldl -L@LIBBZ2_DIR@ -lbz2 -L@LIBZ_DIR@ -lz -lm @LIBBSCLFLAGS@ \
-L@OPENSSL_LIBDIR@ -lcrypto -lrt -L@OPENSSL_LIBDIR@ -lcrypto -lrt
OBJS = $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(LZFXOBJS) $(LZ4OBJS) $(CRCOBJS) \ OBJS = $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(LZFXOBJS) $(LZ4OBJS) $(CRCOBJS) \
$(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) @LIBBSCWRAPOBJ@ $(SKEINOBJS) $(SKEIN_BLOCK_OBJ) \ $(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) $(DELTA2OBJS) @LIBBSCWRAPOBJ@ $(SKEINOBJS) \
@SHA256ASM_OBJS@ @SHA256_OBJS@ $(SKEIN_BLOCK_OBJ) @SHA256ASM_OBJS@ @SHA256_OBJS@
DEBUG_LINK = g++ -m64 -pthread -msse3 @LIBBSCGEN_OPT@ DEBUG_LINK = g++ -m64 -pthread -msse3 @LIBBSCGEN_OPT@
DEBUG_COMPILE = gcc -m64 -g -msse3 -c DEBUG_COMPILE = gcc -m64 -g -msse3 -c
@ -168,6 +172,9 @@ $(LZ4OBJS): $(LZ4SRCS) $(LZ4HDRS)
$(LZPOBJS): $(LZPSRCS) $(LZPHDRS) $(LZPOBJS): $(LZPSRCS) $(LZPHDRS)
$(COMPILE) $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@ $(COMPILE) $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
$(DELTA2OBJS): $(DELTA2SRCS) $(DELTA2HDRS)
$(COMPILE) $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
$(SKEIN_BLOCK_OBJ): $(SKEIN_BLOCK_SRC) $(SKEIN_BLOCK_OBJ): $(SKEIN_BLOCK_SRC)
$(COMPILE) $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) $(SKEIN_BLOCK_SRC) -o $@ $(COMPILE) $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) $(SKEIN_BLOCK_SRC) -o $@

View file

@ -95,6 +95,12 @@ adapt_stats(int show)
ppmd_count = 0; ppmd_count = 0;
} }
/*
 * Properties hook registered for the adaptive algorithm modes.
 *
 * The only field written is data->delta2_stride; all other members of
 * *data keep whatever values they had on entry. The level and chunksize
 * arguments are not consulted.
 */
void
adapt_props(algo_props_t *data, int level, ssize_t chunksize)
{
	const int stride = 200;

	(void) level;
	(void) chunksize;

	/* Stride later handed to delta2_encode() by the preprocessing path. */
	data->delta2_stride = stride;
}
int int
adapt_init(void **data, int *level, int nthreads, ssize_t chunksize, adapt_init(void **data, int *level, int nthreads, ssize_t chunksize,
int file_version, compress_op_t op) int file_version, compress_op_t op)

View file

@ -48,6 +48,11 @@ bzip2_stats(int show)
{ {
} }
/*
 * Properties hook for the bzip2 backend.
 *
 * Writes data->delta2_stride and nothing else; level and chunksize are
 * accepted but ignored.
 */
void
bzip2_props(algo_props_t *data, int level, ssize_t chunksize)
{
	const int stride = 200;

	(void) level;
	(void) chunksize;

	/* Stride later handed to delta2_encode() by the preprocessing path. */
	data->delta2_stride = stride;
}
int int
bzip2_init(void **data, int *level, int nthreads, ssize_t chunksize, bzip2_init(void **data, int *level, int nthreads, ssize_t chunksize,
int file_version, compress_op_t op) int file_version, compress_op_t op)

View file

@ -79,6 +79,7 @@ libbsc_props(algo_props_t *data, int level, ssize_t chunksize) {
data->buf_extra = 0; data->buf_extra = 0;
data->c_max_threads = 8; data->c_max_threads = 8;
data->d_max_threads = 8; data->d_max_threads = 8;
data->delta2_stride = 150;
} }
int int

View file

@ -56,6 +56,7 @@ lz4_props(algo_props_t *data, int level, ssize_t chunksize) {
data->compress_mt_capable = 0; data->compress_mt_capable = 0;
data->decompress_mt_capable = 0; data->decompress_mt_capable = 0;
data->buf_extra = lz4_buf_extra(chunksize); data->buf_extra = lz4_buf_extra(chunksize);
data->delta2_stride = 50;
} }
int int

View file

@ -39,6 +39,11 @@ lz_fx_stats(int show)
{ {
} }
/*
 * Properties hook for the lzfx backend.
 *
 * Writes data->delta2_stride and nothing else; level and chunksize are
 * accepted but ignored.
 */
void
lz_fx_props(algo_props_t *data, int level, ssize_t chunksize)
{
	const int stride = 50;

	(void) level;
	(void) chunksize;

	/* Stride later handed to delta2_encode() by the preprocessing path. */
	data->delta2_stride = stride;
}
int int
lz_fx_init(void **data, int *level, int nthreads, ssize_t chunksize, lz_fx_init(void **data, int *level, int nthreads, ssize_t chunksize,
int file_version, compress_op_t op) int file_version, compress_op_t op)

View file

@ -52,6 +52,7 @@ lzma_mt_props(algo_props_t *data, int level, ssize_t chunksize) {
data->decompress_mt_capable = 0; data->decompress_mt_capable = 0;
data->buf_extra = 0; data->buf_extra = 0;
data->c_max_threads = 2; data->c_max_threads = 2;
data->delta2_stride = 150;
} }
void void
@ -59,6 +60,7 @@ lzma_props(algo_props_t *data, int level, ssize_t chunksize) {
data->compress_mt_capable = 0; data->compress_mt_capable = 0;
data->decompress_mt_capable = 0; data->decompress_mt_capable = 0;
data->buf_extra = 0; data->buf_extra = 0;
data->delta2_stride = 150;
} }
/* /*

65
main.c
View file

@ -80,6 +80,7 @@ static int hide_mem_stats = 1;
static int hide_cmp_stats = 1; static int hide_cmp_stats = 1;
static int enable_rabin_scan = 0; static int enable_rabin_scan = 0;
static int enable_delta_encode = 0; static int enable_delta_encode = 0;
static int enable_delta2_encode = 0;
static int enable_rabin_split = 1; static int enable_rabin_split = 1;
static int enable_fixed_scan = 0; static int enable_fixed_scan = 0;
static int lzp_preprocess = 0; static int lzp_preprocess = 0;
@ -148,6 +149,8 @@ usage(void)
"7) Other flags:\n" "7) Other flags:\n"
" '-L' - Enable LZP pre-compression. This improves compression ratio of all\n" " '-L' - Enable LZP pre-compression. This improves compression ratio of all\n"
" algorithms with some extra CPU and very low RAM overhead.\n" " algorithms with some extra CPU and very low RAM overhead.\n"
" '-P' - Enable Adaptive Delta Encoding. This implies '-L' as well. It improves\n"
" compresion ratio further at the cost of more CPU overhead.\n"
" '-S' <cksum>\n" " '-S' <cksum>\n"
" - Specify chunk checksum to use: CRC64, SKEIN256, SKEIN512, SHA256 and\n" " - Specify chunk checksum to use: CRC64, SKEIN256, SKEIN512, SHA256 and\n"
" SHA512. Default one is SKEIN256.\n" " SHA512. Default one is SKEIN256.\n"
@ -188,7 +191,7 @@ show_compression_stats(uint64_t chunksize)
*/ */
int int
preproc_compress(compress_func_ptr cmp_func, void *src, size_t srclen, void *dst, preproc_compress(compress_func_ptr cmp_func, void *src, size_t srclen, void *dst,
size_t *dstlen, int level, uchar_t chdr, void *data) size_t *dstlen, int level, uchar_t chdr, void *data, algo_props_t *props)
{ {
uchar_t *dest = (uchar_t *)dst, type = 0; uchar_t *dest = (uchar_t *)dst, type = 0;
ssize_t result, _dstlen; ssize_t result, _dstlen;
@ -210,11 +213,14 @@ preproc_compress(compress_func_ptr cmp_func, void *src, size_t srclen, void *dst
return (-1); return (-1);
} }
_dstlen = srclen; if (enable_delta2_encode && props->delta2_stride > 0) {
result = delta2_encode(src, srclen, dst, &_dstlen, 150); _dstlen = srclen;
if (result != -1) { result = delta2_encode(src, srclen, dst, &_dstlen, props->delta2_stride);
memcpy(src, dst, _dstlen); if (result != -1) {
srclen = _dstlen; memcpy(src, dst, _dstlen);
srclen = _dstlen;
type |= PREPROC_TYPE_DELTA2;
}
} }
*dest = type; *dest = type;
@ -225,15 +231,17 @@ preproc_compress(compress_func_ptr cmp_func, void *src, size_t srclen, void *dst
*dest |= PREPROC_COMPRESSED; *dest |= PREPROC_COMPRESSED;
*dstlen = _dstlen + 9; *dstlen = _dstlen + 9;
} else { } else {
result = -1; memcpy(dest+1, src, srclen);
*dstlen = srclen + 1;
result = 0;
} }
result = 0;
return (result); return (result);
} }
int int
preproc_decompress(compress_func_ptr dec_func, void *src, size_t srclen, void *dst, preproc_decompress(compress_func_ptr dec_func, void *src, size_t srclen, void *dst,
size_t *dstlen, int level, uchar_t chdr, void *data) size_t *dstlen, int level, uchar_t chdr, void *data, algo_props_t *props)
{ {
uchar_t *sorc = (uchar_t *)src, type; uchar_t *sorc = (uchar_t *)src, type;
ssize_t result; ssize_t result;
@ -252,12 +260,14 @@ preproc_decompress(compress_func_ptr dec_func, void *src, size_t srclen, void *d
srclen = *dstlen; srclen = *dstlen;
} }
result = delta2_decode(src, srclen, dst, &_dstlen); if (type & PREPROC_TYPE_DELTA2) {
if (result != -1) { result = delta2_decode(src, srclen, dst, &_dstlen);
memcpy(src, dst, _dstlen); if (result != -1) {
srclen = _dstlen; memcpy(src, dst, _dstlen);
} else { srclen = _dstlen;
return (result); } else {
return (result);
}
} }
if (type & PREPROC_TYPE_LZP) { if (type & PREPROC_TYPE_LZP) {
@ -423,7 +433,7 @@ redo:
if (HDR & COMPRESSED) { if (HDR & COMPRESSED) {
if (HDR & CHUNK_FLAG_PREPROC) { if (HDR & CHUNK_FLAG_PREPROC) {
rv = preproc_decompress(tdat->decompress, cmpbuf, dedupe_data_sz_cmp, rv = preproc_decompress(tdat->decompress, cmpbuf, dedupe_data_sz_cmp,
ubuf, &_chunksize, tdat->level, HDR, tdat->data); ubuf, &_chunksize, tdat->level, HDR, tdat->data, tdat->props);
} else { } else {
rv = tdat->decompress(cmpbuf, dedupe_data_sz_cmp, ubuf, &_chunksize, rv = tdat->decompress(cmpbuf, dedupe_data_sz_cmp, ubuf, &_chunksize,
tdat->level, HDR, tdat->data); tdat->level, HDR, tdat->data);
@ -452,7 +462,8 @@ redo:
if (HDR & COMPRESSED) { if (HDR & COMPRESSED) {
if (HDR & CHUNK_FLAG_PREPROC) { if (HDR & CHUNK_FLAG_PREPROC) {
rv = preproc_decompress(tdat->decompress, cseg, tdat->len_cmp, rv = preproc_decompress(tdat->decompress, cseg, tdat->len_cmp,
tdat->uncompressed_chunk, &_chunksize, tdat->level, HDR, tdat->data); tdat->uncompressed_chunk, &_chunksize, tdat->level, HDR, tdat->data,
tdat->props);
} else { } else {
rv = tdat->decompress(cseg, tdat->len_cmp, tdat->uncompressed_chunk, rv = tdat->decompress(cseg, tdat->len_cmp, tdat->uncompressed_chunk,
&_chunksize, tdat->level, HDR, tdat->data); &_chunksize, tdat->level, HDR, tdat->data);
@ -875,6 +886,7 @@ start_decompress(const char *filename, const char *to_filename)
tdat->cancel = 0; tdat->cancel = 0;
tdat->level = level; tdat->level = level;
tdat->data = NULL; tdat->data = NULL;
tdat->props = &props;
sem_init(&(tdat->start_sem), 0, 0); sem_init(&(tdat->start_sem), 0, 0);
sem_init(&(tdat->cmp_done_sem), 0, 0); sem_init(&(tdat->cmp_done_sem), 0, 0);
sem_init(&(tdat->write_done_sem), 0, 1); sem_init(&(tdat->write_done_sem), 0, 1);
@ -1155,7 +1167,7 @@ redo:
rv = preproc_compress(tdat->compress, rv = preproc_compress(tdat->compress,
tdat->uncompressed_chunk + dedupe_index_sz, tdat->uncompressed_chunk + dedupe_index_sz,
_chunksize, compressed_chunk + index_size_cmp, &_chunksize, _chunksize, compressed_chunk + index_size_cmp, &_chunksize,
tdat->level, 0, tdat->data); tdat->level, 0, tdat->data, tdat->props);
} else { } else {
rv = tdat->compress(tdat->uncompressed_chunk + dedupe_index_sz, rv = tdat->compress(tdat->uncompressed_chunk + dedupe_index_sz,
_chunksize, compressed_chunk + index_size_cmp, &_chunksize, _chunksize, compressed_chunk + index_size_cmp, &_chunksize,
@ -1185,7 +1197,8 @@ plain_compress:
if (lzp_preprocess) { if (lzp_preprocess) {
rv = preproc_compress(tdat->compress, rv = preproc_compress(tdat->compress,
tdat->uncompressed_chunk, tdat->rbytes, tdat->uncompressed_chunk, tdat->rbytes,
compressed_chunk, &_chunksize, tdat->level, 0, tdat->data); compressed_chunk, &_chunksize, tdat->level, 0, tdat->data,
tdat->props);
} else { } else {
rv = tdat->compress(tdat->uncompressed_chunk, tdat->rbytes, rv = tdat->compress(tdat->uncompressed_chunk, tdat->rbytes,
compressed_chunk, &_chunksize, tdat->level, 0, tdat->data); compressed_chunk, &_chunksize, tdat->level, 0, tdat->data);
@ -1575,6 +1588,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
tdat->cancel = 0; tdat->cancel = 0;
tdat->level = level; tdat->level = level;
tdat->data = NULL; tdat->data = NULL;
tdat->props = &props;
sem_init(&(tdat->start_sem), 0, 0); sem_init(&(tdat->start_sem), 0, 0);
sem_init(&(tdat->cmp_done_sem), 0, 0); sem_init(&(tdat->cmp_done_sem), 0, 0);
sem_init(&(tdat->write_done_sem), 0, 1); sem_init(&(tdat->write_done_sem), 0, 1);
@ -1927,6 +1941,7 @@ init_algo(const char *algo, int bail)
_init_func = zlib_init; _init_func = zlib_init;
_deinit_func = zlib_deinit; _deinit_func = zlib_deinit;
_stats_func = zlib_stats; _stats_func = zlib_stats;
_props_func = zlib_props;
rv = 0; rv = 0;
} else if (memcmp(algorithm, "lzmaMt", 6) == 0) { } else if (memcmp(algorithm, "lzmaMt", 6) == 0) {
@ -1953,6 +1968,7 @@ init_algo(const char *algo, int bail)
_init_func = bzip2_init; _init_func = bzip2_init;
_deinit_func = NULL; _deinit_func = NULL;
_stats_func = bzip2_stats; _stats_func = bzip2_stats;
_props_func = bzip2_props;
rv = 0; rv = 0;
} else if (memcmp(algorithm, "ppmd", 4) == 0) { } else if (memcmp(algorithm, "ppmd", 4) == 0) {
@ -1961,6 +1977,7 @@ init_algo(const char *algo, int bail)
_init_func = ppmd_init; _init_func = ppmd_init;
_deinit_func = ppmd_deinit; _deinit_func = ppmd_deinit;
_stats_func = ppmd_stats; _stats_func = ppmd_stats;
_props_func = ppmd_props;
rv = 0; rv = 0;
} else if (memcmp(algorithm, "lzfx", 4) == 0) { } else if (memcmp(algorithm, "lzfx", 4) == 0) {
@ -1969,6 +1986,7 @@ init_algo(const char *algo, int bail)
_init_func = lz_fx_init; _init_func = lz_fx_init;
_deinit_func = lz_fx_deinit; _deinit_func = lz_fx_deinit;
_stats_func = lz_fx_stats; _stats_func = lz_fx_stats;
_props_func = lz_fx_props;
rv = 0; rv = 0;
} else if (memcmp(algorithm, "lz4", 3) == 0) { } else if (memcmp(algorithm, "lz4", 3) == 0) {
@ -1995,6 +2013,7 @@ init_algo(const char *algo, int bail)
_init_func = adapt2_init; _init_func = adapt2_init;
_deinit_func = adapt_deinit; _deinit_func = adapt_deinit;
_stats_func = adapt_stats; _stats_func = adapt_stats;
_props_func = adapt_props;
adapt_mode = 1; adapt_mode = 1;
rv = 0; rv = 0;
@ -2004,6 +2023,7 @@ init_algo(const char *algo, int bail)
_init_func = adapt_init; _init_func = adapt_init;
_deinit_func = adapt_deinit; _deinit_func = adapt_deinit;
_stats_func = adapt_stats; _stats_func = adapt_stats;
_props_func = adapt_props;
adapt_mode = 1; adapt_mode = 1;
rv = 0; rv = 0;
#ifdef ENABLE_PC_LIBBSC #ifdef ENABLE_PC_LIBBSC
@ -2034,7 +2054,7 @@ main(int argc, char *argv[])
level = 6; level = 6;
slab_init(); slab_init();
while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDEew:rLS:B:F")) != -1) { while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDEew:rLPS:B:F")) != -1) {
int ovr; int ovr;
switch (opt) { switch (opt) {
@ -2124,6 +2144,11 @@ main(int argc, char *argv[])
lzp_preprocess = 1; lzp_preprocess = 1;
break; break;
case 'P':
lzp_preprocess = 1;
enable_delta2_encode = 1;
break;
case 'r': case 'r':
enable_rabin_split = 0; enable_rabin_split = 0;
break; break;

View file

@ -56,6 +56,7 @@ extern "C" {
#define COMP_EXTN ".pz" #define COMP_EXTN ".pz"
#define PREPROC_TYPE_LZP 1 #define PREPROC_TYPE_LZP 1
#define PREPROC_TYPE_DELTA2 2
#define PREPROC_COMPRESSED 128 #define PREPROC_COMPRESSED 128
/* /*
@ -135,6 +136,11 @@ extern int none_init(void **data, int *level, int nthreads, ssize_t chunksize,
extern void lzma_props(algo_props_t *data, int level, ssize_t chunksize); extern void lzma_props(algo_props_t *data, int level, ssize_t chunksize);
extern void lzma_mt_props(algo_props_t *data, int level, ssize_t chunksize); extern void lzma_mt_props(algo_props_t *data, int level, ssize_t chunksize);
extern void lz4_props(algo_props_t *data, int level, ssize_t chunksize); extern void lz4_props(algo_props_t *data, int level, ssize_t chunksize);
extern void zlib_props(algo_props_t *data, int level, ssize_t chunksize);
extern void ppmd_props(algo_props_t *data, int level, ssize_t chunksize);
extern void lz_fx_props(algo_props_t *data, int level, ssize_t chunksize);
extern void bzip2_props(algo_props_t *data, int level, ssize_t chunksize);
extern void adapt_props(algo_props_t *data, int level, ssize_t chunksize);
extern int zlib_deinit(void **data); extern int zlib_deinit(void **data);
extern int adapt_deinit(void **data); extern int adapt_deinit(void **data);
@ -188,6 +194,7 @@ struct cmp_data {
void *data; void *data;
pthread_t thr; pthread_t thr;
mac_ctx_t chunk_hmac; mac_ctx_t chunk_hmac;
algo_props_t *props;
}; };
#ifdef __cplusplus #ifdef __cplusplus

View file

@ -61,6 +61,11 @@ ppmd_stats(int show)
{ {
} }
/*
 * Properties hook for the ppmd backend.
 *
 * Writes data->delta2_stride and nothing else; level and chunksize are
 * accepted but ignored.
 */
void
ppmd_props(algo_props_t *data, int level, ssize_t chunksize)
{
	const int stride = 100;

	(void) level;
	(void) chunksize;

	/* Stride later handed to delta2_encode() by the preprocessing path. */
	data->delta2_stride = stride;
}
int int
ppmd_init(void **data, int *level, int nthreads, ssize_t chunksize, ppmd_init(void **data, int *level, int nthreads, ssize_t chunksize,
int file_version, compress_op_t op) int file_version, compress_op_t op)

View file

@ -111,6 +111,7 @@ typedef struct {
int nthreads; int nthreads;
int c_max_threads; int c_max_threads;
int d_max_threads; int d_max_threads;
int delta2_stride;
} algo_props_t; } algo_props_t;
typedef enum { typedef enum {
@ -205,6 +206,7 @@ init_algo_props(algo_props_t *props)
props->nthreads = 1; props->nthreads = 1;
props->c_max_threads = 1; props->c_max_threads = 1;
props->d_max_threads = 1; props->d_max_threads = 1;
props->delta2_stride = 0;
} }
#ifdef __cplusplus #ifdef __cplusplus

View file

@ -89,6 +89,11 @@ zlib_stats(int show)
{ {
} }
/*
 * Properties hook for the zlib backend.
 *
 * Writes data->delta2_stride and nothing else; level and chunksize are
 * accepted but ignored.
 */
void
zlib_props(algo_props_t *data, int level, ssize_t chunksize)
{
	const int stride = 50;

	(void) level;
	(void) chunksize;

	/* Stride later handed to delta2_encode() by the preprocessing path. */
	data->delta2_stride = stride;
}
int int
zlib_deinit(void **data) zlib_deinit(void **data)
{ {