Add ability to adjust chunk boundary based on Rabin Fingerprinting to improve compression.

Remove unnecessary checks in compression loop.
This commit is contained in:
Moinak Ghosh 2012-06-21 20:27:05 +05:30
parent 7e9f636f8d
commit 733923cbf2
6 changed files with 298 additions and 25 deletions

View file

@ -24,9 +24,16 @@
PROG= pcompress
MAINSRCS = main.c utils.c allocator.c zlib_compress.c bzip2_compress.c \
lzma_compress.c ppmd_compress.c adaptive_compress.c
MAINHDRS = allocator.h pcompress.h utils.h
MAINOBJS = $(MAINSRCS:.c=.o)
RABINSRCS = rabin/rabin_polynomial.c
RABINHDRS = rabin/rabin_polynomial.h utils.h
RABINOBJS = $(RABINSRCS:.c=.o)
LZMASRCS = lzma/LzmaEnc.c lzma/LzFind.c lzma/LzmaDec.c
LZMAHDRS = lzma/CpuArch.h lzma/LzFind.h lzma/LzmaEnc.h lzma/Types.h \
lzma/LzHash.h lzma/LzmaDec.h utils.h
LZMAOBJS = $(LZMASRCS:.c=.o)
PPMDSRCS = lzma/Ppmd8.c lzma/Ppmd8Enc.c lzma/Ppmd8Dec.c
@ -34,43 +41,49 @@ PPMDHDRS = lzma/Ppmd.h lzma/Ppmd8.h
PPMDOBJS = $(PPMDSRCS:.c=.o)
CRCSRCS = lzma/crc64_fast.c lzma/crc64_table.c
CRCHDRS = lzma/crc64_table_le.h lzma/crc64_table_be.h lzma/crc_macros.h
CRCOBJS = $(CRCSRCS:.c=.o)
BAKFILES = *~ lzma/*~
RM = rm -f
CPPFLAGS = -I. -I./lzma -D_7ZIP_ST -DNODEFAULT_PROPS -DFILE_OFFSET_BITS=64 \
CPPFLAGS = -I. -I./lzma -I./rabin -D_7ZIP_ST -DNODEFAULT_PROPS -DFILE_OFFSET_BITS=64 \
-D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32
VEC_FLAGS = -ftree-vectorize
LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block
LDLIBS = -ldl -lbz2 $(ZLIB_DIR) -lz -lm
ifdef DEBUG
LINK = gcc -m64 -pthread -msse3
LINK = g++ -m64 -pthread -msse3
COMPILE = gcc -m64 -g -msse3 -c
COMPILE_cpp = g++ -m64 -g -msse3 -c
else
LINK = gcc -m64 -pthread -msse3
LINK = g++ -m64 -pthread -msse3
COMPILE = gcc -m64 -O3 -msse3 -c
COMPILE_cpp = g++ -m64 -O3 -msse3 -c
CPPFLAGS += -DNDEBUG
endif
all: $(PROG)
$(LZMAOBJS): $(LZMASRCS)
$(LZMAOBJS): $(LZMASRCS) $(LZMAHDRS)
$(COMPILE) $(CPPFLAGS) $(@:.o=.c) -o $@
$(CRCOBJS): $(CRCSRCS)
$(CRCOBJS): $(CRCSRCS) $(CRCHDRS)
$(COMPILE) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
$(PPMDOBJS): $(PPMDSRCS) $(PPMDHDRS)
$(COMPILE) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
$(MAINOBJS): $(MAINSRCS)
$(RABINOBJS): $(RABINSRCS) $(RABINHDRS)
$(COMPILE) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
$(MAINOBJS): $(MAINSRCS) $(MAINHDRS)
$(COMPILE) $(LOOP_OPTFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
$(PROG): $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS)
$(LINK) -o $@ $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(LDLIBS)
$(PROG): $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS)
$(LINK) -o $@ $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS) $(LDLIBS)
clean:
$(RM) $(PROG) $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(BAKFILES)
$(RM) $(PROG) $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS) $(BAKFILES)

61
main.c
View file

@ -44,6 +44,7 @@
#include <utils.h>
#include <pcompress.h>
#include <allocator.h>
#include <rabin_polynomial.h>
/* Needed for CLzmaEncprops. */
#include <LzmaEnc.h>
@ -75,12 +76,14 @@ static int pipe_mode = 0;
static int nthreads = 0;
static int hide_mem_stats = 1;
static int hide_cmp_stats = 1;
static int enable_rabin_scan = 0;
static unsigned int chunk_num;
static uint64_t largest_chunk, smallest_chunk, avg_chunk;
static const char *exec_name;
static const char *algo = NULL;
static int do_compress = 0;
static int do_uncompress = 0;
static rabin_context_t *rctx;
static void
usage(void)
@ -109,17 +112,22 @@ usage(void)
"2) To decompress a file compressed using above command:\n"
" %s -d <compressed file> <target file>\n"
"3) To operate as a pipe, read from stdin and write to stdout:\n"
" %s <-c ...|-d ...> -p\n"
"4) Number of threads can optionally be specified: -t <1 - 256 count>\n"
"5) Pass '-M' to display memory allocator statistics\n"
"6) Pass '-C' to display compression statistics\n\n",
exec_name, exec_name, exec_name);
" %s -p ...\n"
"4) To use Rabin Fingerprinting to adjust chunk boundaries:\n"
" %s -r -c ...\n"
" In this case <chunk_size> will specify the max chunk size and chunks\n"
" will be variable-length delimited at the rabin boundary closest to\n"
" <chunk_size> bytes. This should improve chunked compression.\n"
" This option is obviously valid only when compressing.\n"
"5) Number of threads can optionally be specified: -t <1 - 256 count>\n"
"6) Pass '-M' to display memory allocator statistics\n"
"7) Pass '-C' to display compression statistics\n\n",
exec_name, exec_name, exec_name, exec_name);
}
void
show_compression_stats(uint64_t chunksize)
{
chunk_num++;
fprintf(stderr, "\nCompression Statistics\n");
fprintf(stderr, "======================\n");
fprintf(stderr, "Total chunks : %u\n", chunk_num);
@ -614,7 +622,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
char tmpfile[MAXPATHLEN];
char to_filename[MAXPATHLEN];
ssize_t compressed_chunksize;
ssize_t n_chunksize, rbytes;
ssize_t n_chunksize, rbytes, rabin_count;
int version;
struct stat sbuf;
int compfd = -1, uncompfd = -1, err;
@ -640,6 +648,12 @@ start_compress(const char *filename, uint64_t chunksize, int level)
sizeof (chunksize) + sizeof (uint64_t) + sizeof (chunksize);
err = 0;
if (enable_rabin_scan) {
rctx = create_rabin_context();
if (rctx == NULL)
err_exit(0, "Initializing Rabin Polynomial failed\n");
}
/* A host of sanity checks. */
if (!pipe_mode) {
if ((uncompfd = open(filename, O_RDWR, 0)) == -1)
@ -794,7 +808,8 @@ start_compress(const char *filename, uint64_t chunksize, int level)
/*
* Read the first chunk into a spare buffer (a simple double-buffering).
*/
rbytes = Read(uncompfd, cread_buf, chunksize);
rabin_count = 0;
rbytes = Read2(uncompfd, cread_buf, chunksize, &rabin_count, rctx);
while (!bail) {
uchar_t *tmp;
@ -816,27 +831,33 @@ start_compress(const char *filename, uint64_t chunksize, int level)
tdat->uncompressed_chunk = cread_buf;
cread_buf = tmp;
tdat->rbytes = rbytes;
if (rabin_count) {
memcpy(cread_buf,
tdat->uncompressed_chunk + rabin_count,
rbytes - rabin_count);
tdat->rbytes = rabin_count;
rabin_count = rbytes - rabin_count;
}
if (rbytes < chunksize) {
bail = 1;
if (rbytes < 0) {
bail = 1;
perror("Read: ");
COMP_BAIL;
} else if (tdat->rbytes == 0) { /* EOF */
bail = 1;
break;
}
np = nprocs + 1;
sem_post(&tdat->start_sem);
break;
}
/* Signal the compression thread to start */
sem_post(&tdat->start_sem);
chunk_num++;
/*
* Read the next buffer we want to process while previous
* buffer is in progress.
*/
rbytes = Read(uncompfd, cread_buf, chunksize);
chunk_num++;
rbytes = Read2(uncompfd, cread_buf, chunksize, &rabin_count, rctx);
}
}
@ -996,7 +1017,7 @@ main(int argc, char *argv[])
level = 6;
slab_init();
while ((opt = getopt(argc, argv, "dc:s:l:pt:MC")) != -1) {
while ((opt = getopt(argc, argv, "dc:s:l:pt:MCr")) != -1) {
int ovr;
switch (opt) {
@ -1048,6 +1069,10 @@ main(int argc, char *argv[])
hide_cmp_stats = 0;
break;
case 'r':
enable_rabin_scan = 1;
break;
case '?':
default:
usage();
@ -1071,6 +1096,12 @@ main(int argc, char *argv[])
exit(1);
}
if (enable_rabin_scan && !do_compress) {
fprintf(stderr, "Rabin Fingerprinting is only used during compression.\n");
usage();
exit(1);
}
if (num_rem == 0 && !pipe_mode) {
usage(); /* At least 1 filename needed. */
exit(1);

135
rabin/rabin_polynomial.c Executable file
View file

@ -0,0 +1,135 @@
/*
* rabin_polynomial.c
*
* Created by Joel Lawrence Tucci on 09-March-2011.
*
* Copyright (c) 2011 Joel Lawrence Tucci
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* Neither the name of the project's author nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <allocator.h>
#include <utils.h>
#include "rabin_polynomial.h"
/*
 * Upper bound on the length of a single Rabin block. scan_rabin_chunks()
 * forces a block boundary once this many bytes have been scanned without the
 * rolling checksum matching. Initialized to the average block size.
 */
unsigned int rabin_polynomial_max_block_size = RAB_POLYNOMIAL_AVG_BLOCK_SIZE;
/*
 * Allocate and initialize a Rabin fingerprinting context with the default
 * parameters: a zero-filled sliding window of RAB_POLYNOMIAL_WIN_SIZE bytes,
 * window position 0 and a rolling checksum of 0.
 *
 * Returns the new context, or NULL (after printing a message to stderr) if
 * either allocation fails. On failure nothing stays allocated. The context
 * is released with destroy_rabin_context(). Not thread-safe.
 */
rabin_context_t *
create_rabin_context(void)
{
	rabin_context_t *ctx;
	unsigned char *current_window_data;

	ctx = slab_alloc(NULL, sizeof (*ctx));
	current_window_data = slab_alloc(NULL, RAB_POLYNOMIAL_WIN_SIZE);
	if (ctx == NULL || current_window_data == NULL) {
		fprintf(stderr,
		    "Could not allocate rabin polynomial context, out of memory\n");
		/* Only one of the two may have failed; do not leak the other. */
		if (current_window_data != NULL)
			slab_free(NULL, current_window_data);
		if (ctx != NULL)
			slab_free(NULL, ctx);
		return (NULL);
	}

	memset(current_window_data, 0, RAB_POLYNOMIAL_WIN_SIZE);
	ctx->current_window_data = current_window_data;

	/*
	 * We should compute the power for the window size.
	 * static uint64_t polynomial_pow;
	 * polynomial_pow = 1;
	 * for(index=0; index<RAB_POLYNOMIAL_WIN_SIZE; index++) {
	 *     polynomial_pow *= RAB_POLYNOMIAL_CONST;
	 * }
	 * But since RAB_POLYNOMIAL_CONST == 2, any expression of the form
	 * x * polynomial_pow can be written as x << RAB_POLYNOMIAL_WIN_SIZE
	 */
	ctx->window_pos = 0;
	ctx->cur_roll_checksum = 0;
	return (ctx);
}
/*
 * Release a context obtained from create_rabin_context(): frees the sliding
 * window buffer and then the context structure itself.
 *
 * A NULL ctx is accepted and ignored, so callers that only create a context
 * conditionally can call this unconditionally.
 */
void
destroy_rabin_context(rabin_context_t *ctx)
{
	if (ctx == NULL)
		return;

	slab_free(NULL, ctx->current_window_data);
	slab_free(NULL, ctx);
}
/**
 * Scan the bytes buf[offset] .. buf[size - 1] for Rabin block boundaries and
 * return the end offset (relative to the start of buf) of the last boundary
 * found. The last block may not end at the buffer end: the caller uses the
 * bytes up to the returned offset as the compression chunk and carries the
 * remaining bytes over to the next chunk.
 *
 * A boundary is declared after a byte when the low bits of the rolling
 * checksum (selected by RAB_POLYNOMIAL_AVG_BLOCK_MASK) equal
 * RAB_POLYNOMIAL_CONST, or when rabin_polynomial_max_block_size bytes have
 * been scanned since the previous boundary. The block-length counter is local
 * to this call, so it restarts from zero on every invocation; the sliding
 * window and rolling checksum persist in ctx across calls.
 *
 * @param ctx     context from create_rabin_context(); updated in place
 * @param buf     buffer holding the data
 * @param size    number of valid bytes in buf
 * @param offset  index of the first byte to scan
 * @return        end offset of the last boundary found, or size when no
 *                boundary was found or there was nothing to scan
 */
ssize_t
scan_rabin_chunks(rabin_context_t *ctx, void *buf, ssize_t size, ssize_t offset)
{
	const unsigned char *data = buf;
	size_t i, end, length, last_offset;

	/* Nothing to scan; also keeps the unsigned loop index in range. */
	if (size <= 0 || offset < 0 || offset >= size)
		return (size);

	end = (size_t)size;
	length = 0;
	last_offset = 0;

	for (i = (size_t)offset; i < end; i++) {
		/*
		 * Read the byte as unsigned. The value added to the checksum
		 * here must equal the value removed later via pushed_out, which
		 * comes from the unsigned window buffer. A plain (signed) char
		 * would sign-extend bytes >= 0x80 into the 64-bit checksum.
		 */
		unsigned char cur_byte = data[i];
		uint64_t pushed_out = ctx->current_window_data[ctx->window_pos];

		ctx->current_window_data[ctx->window_pos] = cur_byte;

		/*
		 * We want to do:
		 * cur_roll_checksum = cur_roll_checksum * RAB_POLYNOMIAL_CONST + cur_byte;
		 * cur_roll_checksum -= pushed_out * polynomial_pow;
		 *
		 * However since RAB_POLYNOMIAL_CONST == 2, we use shifts.
		 */
		ctx->cur_roll_checksum = (ctx->cur_roll_checksum << 1) + cur_byte;
		ctx->cur_roll_checksum -= (pushed_out << RAB_POLYNOMIAL_WIN_SIZE);

		ctx->window_pos++;
		length++;

		/* The window is a ring buffer: wrap back to its start. */
		if (ctx->window_pos == RAB_POLYNOMIAL_WIN_SIZE)
			ctx->window_pos = 0;

		/* Special checksum value or max block size reached: new block. */
		if ((ctx->cur_roll_checksum & RAB_POLYNOMIAL_AVG_BLOCK_MASK) == RAB_POLYNOMIAL_CONST ||
		    length >= rabin_polynomial_max_block_size) {
			last_offset = i + 1;
			length = 0;
		}
	}

	/* No boundary in the scanned range: treat the whole buffer as one chunk. */
	if (last_offset == 0)
		last_offset = end;

	return ((ssize_t)last_offset);
}

64
rabin/rabin_polynomial.h Normal file
View file

@ -0,0 +1,64 @@
/*
* rabin_polynomial.h
*
* Created by Joel Lawrence Tucci on 09-May-2011.
*
* Copyright (c) 2011 Joel Lawrence Tucci
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* Neither the name of the project's author nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifndef RABIN_POLYNOMIAL_H
#define RABIN_POLYNOMIAL_H

// uint64_t and ssize_t are used below; include their headers so this file
// can be included on its own.
#include <stdint.h>
#include <sys/types.h>

// List of constants, mostly constraints and defaults for various parameters
// to the Rabin Fingerprinting algorithm
#define RAB_POLYNOMIAL_CONST 2

// 1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT = Average Rabin Chunk Size
// So we are always looking at power of 2 chunk sizes to avoid doing a modulus
//
// A value of 11 below gives block size of 2048 bytes
//
#define RAB_POLYNOMIAL_AVG_BLOCK_SHIFT 11
#define RAB_POLYNOMIAL_AVG_BLOCK_SIZE (1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT)
#define RAB_POLYNOMIAL_AVG_BLOCK_MASK (RAB_POLYNOMIAL_AVG_BLOCK_SIZE - 1)

// Size in bytes of the sliding window kept in rabin_context_t
#define RAB_POLYNOMIAL_WIN_SIZE 32
#define RAB_POLYNOMIAL_MIN_WIN_SIZE 17
#define RAB_POLYNOMIAL_MAX_WIN_SIZE 63

// State of the rolling checksum, carried from one scan call to the next
typedef struct {
	unsigned char *current_window_data;	// ring buffer of the last WIN_SIZE bytes
	int window_pos;				// next slot to overwrite in the ring buffer
	uint64_t cur_roll_checksum;		// rolling checksum over the window
} rabin_context_t;

// Allocate a context with default parameters; returns NULL on allocation failure
extern rabin_context_t *create_rabin_context(void);

// Free a context returned by create_rabin_context()
extern void destroy_rabin_context(rabin_context_t *ctx);

// Scan buf[offset..size) and return the end offset of the last Rabin block
// boundary found, or size if none was found
extern ssize_t scan_rabin_chunks(rabin_context_t *ctx, void *buf,
    ssize_t size, ssize_t offset);

#endif /* RABIN_POLYNOMIAL_H */

28
utils.c
View file

@ -31,6 +31,7 @@
#include <stdio.h>
#include <errno.h>
#include <link.h>
#include <rabin_polynomial.h>
#include "utils.h"
@ -171,12 +172,15 @@ bytes_to_size(uint64_t bytes)
/*
* Read/Write helpers to ensure a full chunk is read or written
* unless there is an error.
* Additionally can be given an offset in the buf where the data
* should be inserted.
*/
ssize_t
Read(int fd, void *buf, size_t count)
{
ssize_t rcount, rem;
uchar_t *cbuf;
va_list args;
rem = count;
cbuf = (uchar_t *)buf;
@ -190,6 +194,30 @@ Read(int fd, void *buf, size_t count)
return (count - rem);
}
/*
 * Read helper that optionally adjusts the chunk end to a Rabin boundary.
 *
 * With a NULL ctx this is exactly Read(fd, buf, count).
 *
 * Otherwise, on entry *rabin_count is the number of bytes already sitting at
 * the start of buf (carried over from the previous chunk). New data is read
 * after those bytes, up to count bytes in total. If anything was read, the
 * new bytes are scanned and *rabin_count is set to the offset returned by
 * scan_rabin_chunks(); the return value is the total number of valid bytes
 * in buf. At end of file the carried-over byte count is returned and
 * *rabin_count is reset to 0. On a read error the negative result of Read()
 * is returned and *rabin_count is reset to 0.
 */
ssize_t
Read2(int fd, void *buf, size_t count, ssize_t *rabin_count, void *ctx)
{
	rabin_context_t *rabin = (rabin_context_t *)ctx;
	char *dest = (char *)buf;
	ssize_t carried, nread;

	if (ctx == NULL)
		return (Read(fd, buf, count));

	/* Skip over the bytes carried over from the previous chunk. */
	carried = *rabin_count;
	if (carried != 0) {
		dest += carried;
		count -= carried;
	}

	nread = Read(fd, dest, count);
	if (nread <= 0) {
		/* EOF: hand back the carried bytes; error: pass it through. */
		*rabin_count = 0;
		return (nread == 0 ? carried : nread);
	}

	nread += carried;
	*rabin_count = scan_rabin_chunks(rabin, buf, nread, carried);
	return (nread);
}
ssize_t
Write(int fd, const void *buf, size_t count)
{

View file

@ -98,6 +98,8 @@ extern int parse_numeric(ssize_t *val, const char *str);
extern char *bytes_to_size(uint64_t bytes);
extern ssize_t Read(int fd, void *buf, size_t count);
extern ssize_t Write(int fd, const void *buf, size_t count);
extern ssize_t Read2(int fd, void *buf, size_t count,
ssize_t *rabin_count, void *ctx);
/*
* Roundup v to the nearest power of 2. From Bit Twiddling Hacks: