From 733923cbf2c146d18f1de974900d03a87a536924 Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Thu, 21 Jun 2012 20:27:05 +0530 Subject: [PATCH] Add ability to adjust chunk boundary based on Rabin Fingerprinting to improve compression. Remove unnecessary checks in compression loop. --- Makefile | 33 +++++++--- main.c | 61 +++++++++++++----- rabin/rabin_polynomial.c | 135 +++++++++++++++++++++++++++++++++++++++ rabin/rabin_polynomial.h | 64 +++++++++++++++++++ utils.c | 28 ++++++++ utils.h | 2 + 6 files changed, 298 insertions(+), 25 deletions(-) create mode 100755 rabin/rabin_polynomial.c create mode 100644 rabin/rabin_polynomial.h diff --git a/Makefile b/Makefile index ee12b3e..13bd2ec 100644 --- a/Makefile +++ b/Makefile @@ -22,11 +22,18 @@ # PROG= pcompress -MAINSRCS= main.c utils.c allocator.c zlib_compress.c bzip2_compress.c \ +MAINSRCS = main.c utils.c allocator.c zlib_compress.c bzip2_compress.c \ lzma_compress.c ppmd_compress.c adaptive_compress.c +MAINHDRS = allocator.h pcompress.h utils.h MAINOBJS = $(MAINSRCS:.c=.o) +RABINSRCS = rabin/rabin_polynomial.c +RABINHDRS = rabin/rabin_polynomial.h utils.h +RABINOBJS = $(RABINSRCS:.c=.o) + LZMASRCS = lzma/LzmaEnc.c lzma/LzFind.c lzma/LzmaDec.c +LZMAHDRS = lzma/CpuArch.h lzma/LzFind.h lzma/LzmaEnc.h lzma/Types.h \ + lzma/LzHash.h lzma/LzmaDec.h utils.h LZMAOBJS = $(LZMASRCS:.c=.o) PPMDSRCS = lzma/Ppmd8.c lzma/Ppmd8Enc.c lzma/Ppmd8Dec.c @@ -34,43 +41,49 @@ PPMDHDRS = lzma/Ppmd.h lzma/Ppmd8.h PPMDOBJS = $(PPMDSRCS:.c=.o) CRCSRCS = lzma/crc64_fast.c lzma/crc64_table.c +CRCHDRS = lzma/crc64_table_le.h lzma/crc64_table_be.h lzma/crc_macros.h CRCOBJS = $(CRCSRCS:.c=.o) BAKFILES = *~ lzma/*~ RM = rm -f -CPPFLAGS = -I. -I./lzma -D_7ZIP_ST -DNODEFAULT_PROPS -DFILE_OFFSET_BITS=64 \ +CPPFLAGS = -I. 
-I./lzma -I./rabin -D_7ZIP_ST -DNODEFAULT_PROPS -DFILE_OFFSET_BITS=64 \ -D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32 VEC_FLAGS = -ftree-vectorize LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block LDLIBS = -ldl -lbz2 $(ZLIB_DIR) -lz -lm ifdef DEBUG -LINK = gcc -m64 -pthread -msse3 +LINK = g++ -m64 -pthread -msse3 COMPILE = gcc -m64 -g -msse3 -c +COMPILE_cpp = g++ -m64 -g -msse3 -c else -LINK = gcc -m64 -pthread -msse3 +LINK = g++ -m64 -pthread -msse3 COMPILE = gcc -m64 -O3 -msse3 -c +COMPILE_cpp = g++ -m64 -O3 -msse3 -c CPPFLAGS += -DNDEBUG endif all: $(PROG) -$(LZMAOBJS): $(LZMASRCS) +$(LZMAOBJS): $(LZMASRCS) $(LZMAHDRS) $(COMPILE) $(CPPFLAGS) $(@:.o=.c) -o $@ -$(CRCOBJS): $(CRCSRCS) +$(CRCOBJS): $(CRCSRCS) $(CRCHDRS) $(COMPILE) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@ $(PPMDOBJS): $(PPMDSRCS) $(PPMDHDRS) $(COMPILE) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@ -$(MAINOBJS): $(MAINSRCS) +$(RABINOBJS): $(RABINSRCS) $(RABINHDRS) + $(COMPILE) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@ + +$(MAINOBJS): $(MAINSRCS) $(MAINHDRS) $(COMPILE) $(LOOP_OPTFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@ -$(PROG): $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) - $(LINK) -o $@ $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(LDLIBS) +$(PROG): $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS) + $(LINK) -o $@ $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS) $(LDLIBS) clean: - $(RM) $(PROG) $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(BAKFILES) + $(RM) $(PROG) $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS) $(BAKFILES) diff --git a/main.c b/main.c index 7c1000e..cc4dd5b 100644 --- a/main.c +++ b/main.c @@ -44,6 +44,7 @@ #include #include #include +#include /* Needed for CLzmaEncprops. 
*/ #include @@ -75,12 +76,14 @@ static int pipe_mode = 0; static int nthreads = 0; static int hide_mem_stats = 1; static int hide_cmp_stats = 1; +static int enable_rabin_scan = 0; static unsigned int chunk_num; static uint64_t largest_chunk, smallest_chunk, avg_chunk; static const char *exec_name; static const char *algo = NULL; static int do_compress = 0; static int do_uncompress = 0; +static rabin_context_t *rctx; static void usage(void) @@ -109,17 +112,22 @@ usage(void) "2) To decompress a file compressed using above command:\n" " %s -d \n" "3) To operate as a pipe, read from stdin and write to stdout:\n" - " %s <-c ...|-d ...> -p\n" - "4) Number of threads can optionally be specified: -t <1 - 256 count>\n" - "5) Pass '-M' to display memory allocator statistics\n" - "6) Pass '-C' to display compression statistics\n\n", - exec_name, exec_name, exec_name); + " %s -p ...\n" + "4) To use Rabin Fingerprinting to adjust chunk boundaries:\n" + " %s -r -c ...\n" + " In this case will specify the max chunk size and chunks\n" + " will be variable-length delimited at the rabin boundary closest to\n" + " bytes. 
This should improve chunked compression.\n" + " This option is obviously valid only when compressing.\n" + "5) Number of threads can optionally be specified: -t <1 - 256 count>\n" + "6) Pass '-M' to display memory allocator statistics\n" + "7) Pass '-C' to display compression statistics\n\n", + exec_name, exec_name, exec_name, exec_name); } void show_compression_stats(uint64_t chunksize) { - chunk_num++; fprintf(stderr, "\nCompression Statistics\n"); fprintf(stderr, "======================\n"); fprintf(stderr, "Total chunks : %u\n", chunk_num); @@ -614,7 +622,7 @@ start_compress(const char *filename, uint64_t chunksize, int level) char tmpfile[MAXPATHLEN]; char to_filename[MAXPATHLEN]; ssize_t compressed_chunksize; - ssize_t n_chunksize, rbytes; + ssize_t n_chunksize, rbytes, rabin_count; int version; struct stat sbuf; int compfd = -1, uncompfd = -1, err; @@ -640,6 +648,12 @@ start_compress(const char *filename, uint64_t chunksize, int level) sizeof (chunksize) + sizeof (uint64_t) + sizeof (chunksize); err = 0; + if (enable_rabin_scan) { + rctx = create_rabin_context(); + if (rctx == NULL) + err_exit(0, "Initializing Rabin Polynomial failed\n"); + } + /* A host of sanity checks. */ if (!pipe_mode) { if ((uncompfd = open(filename, O_RDWR, 0)) == -1) @@ -794,7 +808,8 @@ start_compress(const char *filename, uint64_t chunksize, int level) /* * Read the first chunk into a spare buffer (a simple double-buffering). 
*/ - rbytes = Read(uncompfd, cread_buf, chunksize); + rabin_count = 0; + rbytes = Read2(uncompfd, cread_buf, chunksize, &rabin_count, rctx); while (!bail) { uchar_t *tmp; @@ -816,27 +831,33 @@ start_compress(const char *filename, uint64_t chunksize, int level) tdat->uncompressed_chunk = cread_buf; cread_buf = tmp; tdat->rbytes = rbytes; + if (rabin_count) { + memcpy(cread_buf, + tdat->uncompressed_chunk + rabin_count, + rbytes - rabin_count); + tdat->rbytes = rabin_count; + rabin_count = rbytes - rabin_count; + } if (rbytes < chunksize) { - bail = 1; if (rbytes < 0) { + bail = 1; perror("Read: "); COMP_BAIL; } else if (tdat->rbytes == 0) { /* EOF */ + bail = 1; break; } - np = nprocs + 1; - sem_post(&tdat->start_sem); - break; } /* Signal the compression thread to start */ sem_post(&tdat->start_sem); + chunk_num++; + /* * Read the next buffer we want to process while previous * buffer is in progress. */ - rbytes = Read(uncompfd, cread_buf, chunksize); - chunk_num++; + rbytes = Read2(uncompfd, cread_buf, chunksize, &rabin_count, rctx); } } @@ -996,7 +1017,7 @@ main(int argc, char *argv[]) level = 6; slab_init(); - while ((opt = getopt(argc, argv, "dc:s:l:pt:MC")) != -1) { + while ((opt = getopt(argc, argv, "dc:s:l:pt:MCr")) != -1) { int ovr; switch (opt) { @@ -1048,6 +1069,10 @@ main(int argc, char *argv[]) hide_cmp_stats = 0; break; + case 'r': + enable_rabin_scan = 1; + break; + case '?': default: usage(); @@ -1071,6 +1096,12 @@ main(int argc, char *argv[]) exit(1); } + if (enable_rabin_scan && !do_compress) { + fprintf(stderr, "Rabin Fingerprinting is only used during compression.\n"); + usage(); + exit(1); + } + if (num_rem == 0 && !pipe_mode) { usage(); /* At least 1 filename needed. */ exit(1); diff --git a/rabin/rabin_polynomial.c b/rabin/rabin_polynomial.c new file mode 100755 index 0000000..1271231 --- /dev/null +++ b/rabin/rabin_polynomial.c @@ -0,0 +1,135 @@ +/* + * rabin_polynomial.c + * + * Created by Joel Lawrence Tucci on 09-March-2011. 
+ * + * Copyright (c) 2011 Joel Lawrence Tucci + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * Neither the name of the project's author nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include + +#include "rabin_polynomial.h" + +unsigned int rabin_polynomial_max_block_size = RAB_POLYNOMIAL_AVG_BLOCK_SIZE; + +/* + * Initialize the algorithm with the default params. Not thread-safe. 
+ */ +rabin_context_t * +create_rabin_context() { + rabin_context_t *ctx; + unsigned char *current_window_data; + + ctx = (rabin_context_t *)slab_alloc(NULL, sizeof (rabin_context_t)); + current_window_data = slab_alloc(NULL, RAB_POLYNOMIAL_WIN_SIZE); + if(ctx == NULL || current_window_data == NULL) { + fprintf(stderr, + "Could not allocate rabin polynomial context, out of memory\n"); + return (NULL); + } + + memset(current_window_data, 0, RAB_POLYNOMIAL_WIN_SIZE); + ctx->current_window_data = current_window_data; + + /* + * We should compute the power for the window size. + * static uint64_t polynomial_pow; + * polynomial_pow = 1; + * for(index=0; indexwindow_pos = 0; + ctx->cur_roll_checksum = 0; + return (ctx); +} + +void +destroy_rabin_context(rabin_context_t *ctx) +{ + slab_free(NULL, ctx->current_window_data); + slab_free(NULL, ctx); +} + +/** + * Given a buffer, compute all the rabin chunks and return the end offset of the + * last chunk in the buffer. The last chunk may not end at the buffer end. The + * bytes up to the end of the last chunk are used as the compression chunk and the + * remaining bytes are carried over to the next chunk. + */ +ssize_t +scan_rabin_chunks(rabin_context_t *ctx, void *buf, ssize_t size, ssize_t offset) +{ + size_t i, length, last_offset; + + length = 0; + last_offset = 0; + + for (i=offset; icurrent_window_data[ctx->window_pos]; + ctx->current_window_data[ctx->window_pos] = cur_byte; + /* + * We want to do: + * cur_roll_checksum = cur_roll_checksum * RAB_POLYNOMIAL_CONST + cur_byte; + * cur_roll_checksum -= pushed_out * polynomial_pow; + * + * However since RAB_POLYNOMIAL_CONST == 2, we use shifts. 
+ */ + ctx->cur_roll_checksum = (ctx->cur_roll_checksum << 1) + cur_byte; + ctx->cur_roll_checksum -= (pushed_out << RAB_POLYNOMIAL_WIN_SIZE); + + ctx->window_pos++; + length++; + + if (ctx->window_pos == RAB_POLYNOMIAL_WIN_SIZE) // Loop back around + ctx->window_pos=0; + + // If we hit our special value or reached the max block size create a new block + if ((ctx->cur_roll_checksum & RAB_POLYNOMIAL_AVG_BLOCK_MASK) == RAB_POLYNOMIAL_CONST || + length >= rabin_polynomial_max_block_size) { + last_offset = i+1; + length = 0; + } + } + if (last_offset == 0) last_offset = size; + + return last_offset; +} + diff --git a/rabin/rabin_polynomial.h b/rabin/rabin_polynomial.h new file mode 100644 index 0000000..83799c4 --- /dev/null +++ b/rabin/rabin_polynomial.h @@ -0,0 +1,64 @@ +/* + * rabin_polynomial_constants.h + * + * Created by Joel Lawrence Tucci on 09-May-2011. + * + * Copyright (c) 2011 Joel Lawrence Tucci + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * Neither the name of the project's author nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +//List of constants, mostly constraints and defaults for various parameters +//to the Rabin Fingerprinting algorithm + +#define RAB_POLYNOMIAL_CONST 2 +// 1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT = Average Rabin Chunk Size +// So we are always looking at power of 2 chunk sizes to avoid doing a modulus +// +// A value of 11 below gives block size of 2048 bytes +// +#define RAB_POLYNOMIAL_AVG_BLOCK_SHIFT 11 +#define RAB_POLYNOMIAL_AVG_BLOCK_SIZE (1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT) +#define RAB_POLYNOMIAL_AVG_BLOCK_MASK (RAB_POLYNOMIAL_AVG_BLOCK_SIZE - 1) +#define RAB_POLYNOMIAL_WIN_SIZE 32 +#define RAB_POLYNOMIAL_MIN_WIN_SIZE 17 +#define RAB_POLYNOMIAL_MAX_WIN_SIZE 63 + +typedef struct { + unsigned char *current_window_data; + int window_pos; + uint64_t cur_roll_checksum; +} rabin_context_t; + +extern rabin_context_t *create_rabin_context(); +extern void destroy_rabin_context(rabin_context_t *ctx); +extern ssize_t scan_rabin_chunks(rabin_context_t *ctx, void *buf, + ssize_t size, ssize_t offset); + diff --git a/utils.c b/utils.c index b7ef420..e80281c 100644 --- a/utils.c +++ b/utils.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "utils.h" @@ -171,12 +172,15 @@ bytes_to_size(uint64_t bytes) /* * Read/Write helpers to ensure a full chunk is read or written * unless there is an error. + * Additionally can be given an offset in the buf where the data + * should be inserted. 
*/ ssize_t Read(int fd, void *buf, size_t count) { ssize_t rcount, rem; uchar_t *cbuf; + va_list args; rem = count; cbuf = (uchar_t *)buf; @@ -190,6 +194,30 @@ Read(int fd, void *buf, size_t count) return (count - rem); } +ssize_t +Read2(int fd, void *buf, size_t count, ssize_t *rabin_count, void *ctx) +{ + char *buf2; + ssize_t rcount; + rabin_context_t *rctx = (rabin_context_t *)ctx; + + if (!ctx) return (Read(fd, buf, count)); + buf2 = buf; + if (*rabin_count) { + buf2 = (char *)buf + *rabin_count; + count -= *rabin_count; + } + rcount = Read(fd, buf2, count); + if (rcount > 0) { + rcount += *rabin_count; + *rabin_count = scan_rabin_chunks(rctx, buf, rcount, *rabin_count); + } else { + if (rcount == 0) rcount = *rabin_count; + *rabin_count = 0; + } + return (rcount); +} + ssize_t Write(int fd, const void *buf, size_t count) { diff --git a/utils.h b/utils.h index 9a2bea2..8142e98 100644 --- a/utils.h +++ b/utils.h @@ -98,6 +98,8 @@ extern int parse_numeric(ssize_t *val, const char *str); extern char *bytes_to_size(uint64_t bytes); extern ssize_t Read(int fd, void *buf, size_t count); extern ssize_t Write(int fd, const void *buf, size_t count); +extern ssize_t Read2(int fd, void *buf, size_t count, + ssize_t *rabin_count, void *ctx); /* * Roundup v to the nearest power of 2. From Bit Twiddling Hacks: