Add ability to adjust chunk boundary based on Rabin Fingerprinting to improve compression.
Remove unnecessary checks in compression loop.
This commit is contained in:
parent
7e9f636f8d
commit
733923cbf2
6 changed files with 298 additions and 25 deletions
31
Makefile
31
Makefile
|
@ -24,9 +24,16 @@
|
||||||
PROG= pcompress
|
PROG= pcompress
|
||||||
MAINSRCS = main.c utils.c allocator.c zlib_compress.c bzip2_compress.c \
|
MAINSRCS = main.c utils.c allocator.c zlib_compress.c bzip2_compress.c \
|
||||||
lzma_compress.c ppmd_compress.c adaptive_compress.c
|
lzma_compress.c ppmd_compress.c adaptive_compress.c
|
||||||
|
MAINHDRS = allocator.h pcompress.h utils.h
|
||||||
MAINOBJS = $(MAINSRCS:.c=.o)
|
MAINOBJS = $(MAINSRCS:.c=.o)
|
||||||
|
|
||||||
|
RABINSRCS = rabin/rabin_polynomial.c
|
||||||
|
RABINHDRS = rabin/rabin_polynomial.h utils.h
|
||||||
|
RABINOBJS = $(RABINSRCS:.c=.o)
|
||||||
|
|
||||||
LZMASRCS = lzma/LzmaEnc.c lzma/LzFind.c lzma/LzmaDec.c
|
LZMASRCS = lzma/LzmaEnc.c lzma/LzFind.c lzma/LzmaDec.c
|
||||||
|
LZMAHDRS = lzma/CpuArch.h lzma/LzFind.h lzma/LzmaEnc.h lzma/Types.h \
|
||||||
|
lzma/LzHash.h lzma/LzmaDec.h utils.h
|
||||||
LZMAOBJS = $(LZMASRCS:.c=.o)
|
LZMAOBJS = $(LZMASRCS:.c=.o)
|
||||||
|
|
||||||
PPMDSRCS = lzma/Ppmd8.c lzma/Ppmd8Enc.c lzma/Ppmd8Dec.c
|
PPMDSRCS = lzma/Ppmd8.c lzma/Ppmd8Enc.c lzma/Ppmd8Dec.c
|
||||||
|
@ -34,43 +41,49 @@ PPMDHDRS = lzma/Ppmd.h lzma/Ppmd8.h
|
||||||
PPMDOBJS = $(PPMDSRCS:.c=.o)
|
PPMDOBJS = $(PPMDSRCS:.c=.o)
|
||||||
|
|
||||||
CRCSRCS = lzma/crc64_fast.c lzma/crc64_table.c
|
CRCSRCS = lzma/crc64_fast.c lzma/crc64_table.c
|
||||||
|
CRCHDRS = lzma/crc64_table_le.h lzma/crc64_table_be.h lzma/crc_macros.h
|
||||||
CRCOBJS = $(CRCSRCS:.c=.o)
|
CRCOBJS = $(CRCSRCS:.c=.o)
|
||||||
|
|
||||||
BAKFILES = *~ lzma/*~
|
BAKFILES = *~ lzma/*~
|
||||||
|
|
||||||
RM = rm -f
|
RM = rm -f
|
||||||
CPPFLAGS = -I. -I./lzma -D_7ZIP_ST -DNODEFAULT_PROPS -DFILE_OFFSET_BITS=64 \
|
CPPFLAGS = -I. -I./lzma -I./rabin -D_7ZIP_ST -DNODEFAULT_PROPS -DFILE_OFFSET_BITS=64 \
|
||||||
-D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32
|
-D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32
|
||||||
VEC_FLAGS = -ftree-vectorize
|
VEC_FLAGS = -ftree-vectorize
|
||||||
LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block
|
LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block
|
||||||
LDLIBS = -ldl -lbz2 $(ZLIB_DIR) -lz -lm
|
LDLIBS = -ldl -lbz2 $(ZLIB_DIR) -lz -lm
|
||||||
|
|
||||||
ifdef DEBUG
|
ifdef DEBUG
|
||||||
LINK = gcc -m64 -pthread -msse3
|
LINK = g++ -m64 -pthread -msse3
|
||||||
COMPILE = gcc -m64 -g -msse3 -c
|
COMPILE = gcc -m64 -g -msse3 -c
|
||||||
|
COMPILE_cpp = g++ -m64 -g -msse3 -c
|
||||||
else
|
else
|
||||||
LINK = gcc -m64 -pthread -msse3
|
LINK = g++ -m64 -pthread -msse3
|
||||||
COMPILE = gcc -m64 -O3 -msse3 -c
|
COMPILE = gcc -m64 -O3 -msse3 -c
|
||||||
|
COMPILE_cpp = g++ -m64 -O3 -msse3 -c
|
||||||
CPPFLAGS += -DNDEBUG
|
CPPFLAGS += -DNDEBUG
|
||||||
endif
|
endif
|
||||||
|
|
||||||
all: $(PROG)
|
all: $(PROG)
|
||||||
|
|
||||||
$(LZMAOBJS): $(LZMASRCS)
|
$(LZMAOBJS): $(LZMASRCS) $(LZMAHDRS)
|
||||||
$(COMPILE) $(CPPFLAGS) $(@:.o=.c) -o $@
|
$(COMPILE) $(CPPFLAGS) $(@:.o=.c) -o $@
|
||||||
|
|
||||||
$(CRCOBJS): $(CRCSRCS)
|
$(CRCOBJS): $(CRCSRCS) $(CRCHDRS)
|
||||||
$(COMPILE) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
|
$(COMPILE) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
|
||||||
|
|
||||||
$(PPMDOBJS): $(PPMDSRCS) $(PPMDHDRS)
|
$(PPMDOBJS): $(PPMDSRCS) $(PPMDHDRS)
|
||||||
$(COMPILE) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
|
$(COMPILE) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
|
||||||
|
|
||||||
$(MAINOBJS): $(MAINSRCS)
|
$(RABINOBJS): $(RABINSRCS) $(RABINHDRS)
|
||||||
|
$(COMPILE) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
|
||||||
|
|
||||||
|
$(MAINOBJS): $(MAINSRCS) $(MAINHDRS)
|
||||||
$(COMPILE) $(LOOP_OPTFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
|
$(COMPILE) $(LOOP_OPTFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
|
||||||
|
|
||||||
$(PROG): $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS)
|
$(PROG): $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS)
|
||||||
$(LINK) -o $@ $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(LDLIBS)
|
$(LINK) -o $@ $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS) $(LDLIBS)
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
$(RM) $(PROG) $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(BAKFILES)
|
$(RM) $(PROG) $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS) $(BAKFILES)
|
||||||
|
|
||||||
|
|
61
main.c
61
main.c
|
@ -44,6 +44,7 @@
|
||||||
#include <utils.h>
|
#include <utils.h>
|
||||||
#include <pcompress.h>
|
#include <pcompress.h>
|
||||||
#include <allocator.h>
|
#include <allocator.h>
|
||||||
|
#include <rabin_polynomial.h>
|
||||||
|
|
||||||
/* Needed for CLzmaEncprops. */
|
/* Needed for CLzmaEncprops. */
|
||||||
#include <LzmaEnc.h>
|
#include <LzmaEnc.h>
|
||||||
|
@ -75,12 +76,14 @@ static int pipe_mode = 0;
|
||||||
static int nthreads = 0;
|
static int nthreads = 0;
|
||||||
static int hide_mem_stats = 1;
|
static int hide_mem_stats = 1;
|
||||||
static int hide_cmp_stats = 1;
|
static int hide_cmp_stats = 1;
|
||||||
|
static int enable_rabin_scan = 0;
|
||||||
static unsigned int chunk_num;
|
static unsigned int chunk_num;
|
||||||
static uint64_t largest_chunk, smallest_chunk, avg_chunk;
|
static uint64_t largest_chunk, smallest_chunk, avg_chunk;
|
||||||
static const char *exec_name;
|
static const char *exec_name;
|
||||||
static const char *algo = NULL;
|
static const char *algo = NULL;
|
||||||
static int do_compress = 0;
|
static int do_compress = 0;
|
||||||
static int do_uncompress = 0;
|
static int do_uncompress = 0;
|
||||||
|
static rabin_context_t *rctx;
|
||||||
|
|
||||||
static void
|
static void
|
||||||
usage(void)
|
usage(void)
|
||||||
|
@ -109,17 +112,22 @@ usage(void)
|
||||||
"2) To decompress a file compressed using above command:\n"
|
"2) To decompress a file compressed using above command:\n"
|
||||||
" %s -d <compressed file> <target file>\n"
|
" %s -d <compressed file> <target file>\n"
|
||||||
"3) To operate as a pipe, read from stdin and write to stdout:\n"
|
"3) To operate as a pipe, read from stdin and write to stdout:\n"
|
||||||
" %s <-c ...|-d ...> -p\n"
|
" %s -p ...\n"
|
||||||
"4) Number of threads can optionally be specified: -t <1 - 256 count>\n"
|
"4) To use Rabin Fingerprinting to adjust chunk boundaries:\n"
|
||||||
"5) Pass '-M' to display memory allocator statistics\n"
|
" %s -r -c ...\n"
|
||||||
"6) Pass '-C' to display compression statistics\n\n",
|
" In this case <chunk_size> will specify the max chunk size and chunks\n"
|
||||||
exec_name, exec_name, exec_name);
|
" will be variable-length delimited at the rabin boundary closest to\n"
|
||||||
|
" <chunk_size> bytes. This should improve chunked compression.\n"
|
||||||
|
" This option is obviously valid only when compressing.\n"
|
||||||
|
"5) Number of threads can optionally be specified: -t <1 - 256 count>\n"
|
||||||
|
"6) Pass '-M' to display memory allocator statistics\n"
|
||||||
|
"7) Pass '-C' to display compression statistics\n\n",
|
||||||
|
exec_name, exec_name, exec_name, exec_name);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
show_compression_stats(uint64_t chunksize)
|
show_compression_stats(uint64_t chunksize)
|
||||||
{
|
{
|
||||||
chunk_num++;
|
|
||||||
fprintf(stderr, "\nCompression Statistics\n");
|
fprintf(stderr, "\nCompression Statistics\n");
|
||||||
fprintf(stderr, "======================\n");
|
fprintf(stderr, "======================\n");
|
||||||
fprintf(stderr, "Total chunks : %u\n", chunk_num);
|
fprintf(stderr, "Total chunks : %u\n", chunk_num);
|
||||||
|
@ -614,7 +622,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
||||||
char tmpfile[MAXPATHLEN];
|
char tmpfile[MAXPATHLEN];
|
||||||
char to_filename[MAXPATHLEN];
|
char to_filename[MAXPATHLEN];
|
||||||
ssize_t compressed_chunksize;
|
ssize_t compressed_chunksize;
|
||||||
ssize_t n_chunksize, rbytes;
|
ssize_t n_chunksize, rbytes, rabin_count;
|
||||||
int version;
|
int version;
|
||||||
struct stat sbuf;
|
struct stat sbuf;
|
||||||
int compfd = -1, uncompfd = -1, err;
|
int compfd = -1, uncompfd = -1, err;
|
||||||
|
@ -640,6 +648,12 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
||||||
sizeof (chunksize) + sizeof (uint64_t) + sizeof (chunksize);
|
sizeof (chunksize) + sizeof (uint64_t) + sizeof (chunksize);
|
||||||
err = 0;
|
err = 0;
|
||||||
|
|
||||||
|
if (enable_rabin_scan) {
|
||||||
|
rctx = create_rabin_context();
|
||||||
|
if (rctx == NULL)
|
||||||
|
err_exit(0, "Initializing Rabin Polynomial failed\n");
|
||||||
|
}
|
||||||
|
|
||||||
/* A host of sanity checks. */
|
/* A host of sanity checks. */
|
||||||
if (!pipe_mode) {
|
if (!pipe_mode) {
|
||||||
if ((uncompfd = open(filename, O_RDWR, 0)) == -1)
|
if ((uncompfd = open(filename, O_RDWR, 0)) == -1)
|
||||||
|
@ -794,7 +808,8 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
||||||
/*
|
/*
|
||||||
* Read the first chunk into a spare buffer (a simple double-buffering).
|
* Read the first chunk into a spare buffer (a simple double-buffering).
|
||||||
*/
|
*/
|
||||||
rbytes = Read(uncompfd, cread_buf, chunksize);
|
rabin_count = 0;
|
||||||
|
rbytes = Read2(uncompfd, cread_buf, chunksize, &rabin_count, rctx);
|
||||||
while (!bail) {
|
while (!bail) {
|
||||||
uchar_t *tmp;
|
uchar_t *tmp;
|
||||||
|
|
||||||
|
@ -816,27 +831,33 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
||||||
tdat->uncompressed_chunk = cread_buf;
|
tdat->uncompressed_chunk = cread_buf;
|
||||||
cread_buf = tmp;
|
cread_buf = tmp;
|
||||||
tdat->rbytes = rbytes;
|
tdat->rbytes = rbytes;
|
||||||
|
if (rabin_count) {
|
||||||
|
memcpy(cread_buf,
|
||||||
|
tdat->uncompressed_chunk + rabin_count,
|
||||||
|
rbytes - rabin_count);
|
||||||
|
tdat->rbytes = rabin_count;
|
||||||
|
rabin_count = rbytes - rabin_count;
|
||||||
|
}
|
||||||
if (rbytes < chunksize) {
|
if (rbytes < chunksize) {
|
||||||
bail = 1;
|
|
||||||
if (rbytes < 0) {
|
if (rbytes < 0) {
|
||||||
|
bail = 1;
|
||||||
perror("Read: ");
|
perror("Read: ");
|
||||||
COMP_BAIL;
|
COMP_BAIL;
|
||||||
|
|
||||||
} else if (tdat->rbytes == 0) { /* EOF */
|
} else if (tdat->rbytes == 0) { /* EOF */
|
||||||
|
bail = 1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
np = nprocs + 1;
|
|
||||||
sem_post(&tdat->start_sem);
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
/* Signal the compression thread to start */
|
/* Signal the compression thread to start */
|
||||||
sem_post(&tdat->start_sem);
|
sem_post(&tdat->start_sem);
|
||||||
|
chunk_num++;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Read the next buffer we want to process while previous
|
* Read the next buffer we want to process while previous
|
||||||
* buffer is in progress.
|
* buffer is in progress.
|
||||||
*/
|
*/
|
||||||
rbytes = Read(uncompfd, cread_buf, chunksize);
|
rbytes = Read2(uncompfd, cread_buf, chunksize, &rabin_count, rctx);
|
||||||
chunk_num++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -996,7 +1017,7 @@ main(int argc, char *argv[])
|
||||||
level = 6;
|
level = 6;
|
||||||
slab_init();
|
slab_init();
|
||||||
|
|
||||||
while ((opt = getopt(argc, argv, "dc:s:l:pt:MC")) != -1) {
|
while ((opt = getopt(argc, argv, "dc:s:l:pt:MCr")) != -1) {
|
||||||
int ovr;
|
int ovr;
|
||||||
|
|
||||||
switch (opt) {
|
switch (opt) {
|
||||||
|
@ -1048,6 +1069,10 @@ main(int argc, char *argv[])
|
||||||
hide_cmp_stats = 0;
|
hide_cmp_stats = 0;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case 'r':
|
||||||
|
enable_rabin_scan = 1;
|
||||||
|
break;
|
||||||
|
|
||||||
case '?':
|
case '?':
|
||||||
default:
|
default:
|
||||||
usage();
|
usage();
|
||||||
|
@ -1071,6 +1096,12 @@ main(int argc, char *argv[])
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (enable_rabin_scan && !do_compress) {
|
||||||
|
fprintf(stderr, "Rabin Fingerprinting is only used during compression.\n");
|
||||||
|
usage();
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
if (num_rem == 0 && !pipe_mode) {
|
if (num_rem == 0 && !pipe_mode) {
|
||||||
usage(); /* At least 1 filename needed. */
|
usage(); /* At least 1 filename needed. */
|
||||||
exit(1);
|
exit(1);
|
||||||
|
|
135
rabin/rabin_polynomial.c
Executable file
135
rabin/rabin_polynomial.c
Executable file
|
@ -0,0 +1,135 @@
|
||||||
|
/*
|
||||||
|
* rabin_polynomial.c
|
||||||
|
*
|
||||||
|
* Created by Joel Lawrence Tucci on 09-March-2011.
|
||||||
|
*
|
||||||
|
* Copyright (c) 2011 Joel Lawrence Tucci
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* Neither the name of the project's author nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived from
|
||||||
|
* this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||||
|
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||||||
|
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <allocator.h>
|
||||||
|
#include <utils.h>
|
||||||
|
|
||||||
|
#include "rabin_polynomial.h"
|
||||||
|
|
||||||
|
unsigned int rabin_polynomial_max_block_size = RAB_POLYNOMIAL_AVG_BLOCK_SIZE;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Initialize the algorithm with the default params. Not thread-safe.
|
||||||
|
*/
|
||||||
|
rabin_context_t *
|
||||||
|
create_rabin_context() {
|
||||||
|
rabin_context_t *ctx;
|
||||||
|
unsigned char *current_window_data;
|
||||||
|
|
||||||
|
ctx = (rabin_context_t *)slab_alloc(NULL, sizeof (rabin_context_t));
|
||||||
|
current_window_data = slab_alloc(NULL, RAB_POLYNOMIAL_WIN_SIZE);
|
||||||
|
if(ctx == NULL || current_window_data == NULL) {
|
||||||
|
fprintf(stderr,
|
||||||
|
"Could not allocate rabin polynomial context, out of memory\n");
|
||||||
|
return (NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
memset(current_window_data, 0, RAB_POLYNOMIAL_WIN_SIZE);
|
||||||
|
ctx->current_window_data = current_window_data;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We should compute the power for the window size.
|
||||||
|
* static uint64_t polynomial_pow;
|
||||||
|
* polynomial_pow = 1;
|
||||||
|
* for(index=0; index<RAB_POLYNOMIAL_WIN_SIZE; index++) {
|
||||||
|
* polynomial_pow *= RAB_POLYNOMIAL_CONST;
|
||||||
|
* }
|
||||||
|
* But since RAB_POLYNOMIAL_CONST == 2, any expression of the form
|
||||||
|
* x * polynomial_pow can we written as x << RAB_POLYNOMIAL_WIN_SIZE
|
||||||
|
*/
|
||||||
|
|
||||||
|
ctx->window_pos = 0;
|
||||||
|
ctx->cur_roll_checksum = 0;
|
||||||
|
return (ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
destroy_rabin_context(rabin_context_t *ctx)
|
||||||
|
{
|
||||||
|
slab_free(NULL, ctx->current_window_data);
|
||||||
|
slab_free(NULL, ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Given a buffer compute all the rabin chunks and return the end offset of the
|
||||||
|
* last chunk in the buffer. The last chunk may not end at the buffer end. The
|
||||||
|
* bytes till the last chunk end is used as the compression chunk and remaining
|
||||||
|
* bytes are carried over to the next chunk.
|
||||||
|
*/
|
||||||
|
ssize_t
|
||||||
|
scan_rabin_chunks(rabin_context_t *ctx, void *buf, ssize_t size, ssize_t offset)
|
||||||
|
{
|
||||||
|
size_t i, length, last_offset;
|
||||||
|
|
||||||
|
length = 0;
|
||||||
|
last_offset = 0;
|
||||||
|
|
||||||
|
for (i=offset; i<size; i++) {
|
||||||
|
char cur_byte = *((char *)(buf+i));
|
||||||
|
uint64_t pushed_out = ctx->current_window_data[ctx->window_pos];
|
||||||
|
ctx->current_window_data[ctx->window_pos] = cur_byte;
|
||||||
|
/*
|
||||||
|
* We want to do:
|
||||||
|
* cur_roll_checksum = cur_roll_checksum * RAB_POLYNOMIAL_CONST + cur_byte;
|
||||||
|
* cur_roll_checksum -= pushed_out * polynomial_pow;
|
||||||
|
*
|
||||||
|
* However since RAB_POLYNOMIAL_CONST == 2, we use shifts.
|
||||||
|
*/
|
||||||
|
ctx->cur_roll_checksum = (ctx->cur_roll_checksum << 1) + cur_byte;
|
||||||
|
ctx->cur_roll_checksum -= (pushed_out << RAB_POLYNOMIAL_WIN_SIZE);
|
||||||
|
|
||||||
|
ctx->window_pos++;
|
||||||
|
length++;
|
||||||
|
|
||||||
|
if (ctx->window_pos == RAB_POLYNOMIAL_WIN_SIZE) // Loop back around
|
||||||
|
ctx->window_pos=0;
|
||||||
|
|
||||||
|
// If we hit our special value or reached the max block size create a new block
|
||||||
|
if ((ctx->cur_roll_checksum & RAB_POLYNOMIAL_AVG_BLOCK_MASK) == RAB_POLYNOMIAL_CONST ||
|
||||||
|
length >= rabin_polynomial_max_block_size) {
|
||||||
|
last_offset = i+1;
|
||||||
|
length = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (last_offset == 0) last_offset = size;
|
||||||
|
|
||||||
|
return last_offset;
|
||||||
|
}
|
||||||
|
|
64
rabin/rabin_polynomial.h
Normal file
64
rabin/rabin_polynomial.h
Normal file
|
@ -0,0 +1,64 @@
|
||||||
|
/*
|
||||||
|
 * rabin_polynomial.h
|
||||||
|
*
|
||||||
|
* Created by Joel Lawrence Tucci on 09-May-2011.
|
||||||
|
*
|
||||||
|
* Copyright (c) 2011 Joel Lawrence Tucci
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* Neither the name of the project's author nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived from
|
||||||
|
* this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||||
|
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||||||
|
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
//List of constants, mostly constraints and defaults for various parameters
//to the Rabin Fingerprinting algorithm

#ifndef RABIN_POLYNOMIAL_H
#define RABIN_POLYNOMIAL_H

// uint64_t and ssize_t are used below; include their headers here so this
// file does not depend on the includer's include order.
#include <stdint.h>
#include <sys/types.h>

// Multiplier of the rolling checksum. It is also the value the masked
// checksum is compared against to detect a chunk boundary.
#define RAB_POLYNOMIAL_CONST 2
// 1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT = Average Rabin Chunk Size
// So we are always looking at power of 2 chunk sizes to avoid doing a modulus
//
// A value of 11 below gives block size of 2048 bytes
//
#define RAB_POLYNOMIAL_AVG_BLOCK_SHIFT 11
#define RAB_POLYNOMIAL_AVG_BLOCK_SIZE (1 << RAB_POLYNOMIAL_AVG_BLOCK_SHIFT)
#define RAB_POLYNOMIAL_AVG_BLOCK_MASK (RAB_POLYNOMIAL_AVG_BLOCK_SIZE - 1)
// Number of bytes in the sliding window (size of current_window_data).
#define RAB_POLYNOMIAL_WIN_SIZE 32
// Presumably the allowed range for a configurable window size -- TODO confirm.
#define RAB_POLYNOMIAL_MIN_WIN_SIZE 17
#define RAB_POLYNOMIAL_MAX_WIN_SIZE 63

// State of the rolling checksum, carried across scan_rabin_chunks() calls.
typedef struct {
	unsigned char *current_window_data;	// ring buffer of the last window bytes
	int window_pos;				// next slot to overwrite in the ring buffer
	uint64_t cur_roll_checksum;		// rolling checksum over the window
} rabin_context_t;

extern rabin_context_t *create_rabin_context(void);
extern void destroy_rabin_context(rabin_context_t *ctx);
extern ssize_t scan_rabin_chunks(rabin_context_t *ctx, void *buf,
	ssize_t size, ssize_t offset);

#endif /* RABIN_POLYNOMIAL_H */
|
||||||
|
|
28
utils.c
28
utils.c
|
@ -31,6 +31,7 @@
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#include <link.h>
|
#include <link.h>
|
||||||
|
#include <rabin_polynomial.h>
|
||||||
|
|
||||||
#include "utils.h"
|
#include "utils.h"
|
||||||
|
|
||||||
|
@ -171,12 +172,15 @@ bytes_to_size(uint64_t bytes)
|
||||||
/*
|
/*
|
||||||
* Read/Write helpers to ensure a full chunk is read or written
|
* Read/Write helpers to ensure a full chunk is read or written
|
||||||
* unless there is an error.
|
* unless there is an error.
|
||||||
|
* Additionally can be given an offset in the buf where the data
|
||||||
|
* should be inserted.
|
||||||
*/
|
*/
|
||||||
ssize_t
|
ssize_t
|
||||||
Read(int fd, void *buf, size_t count)
|
Read(int fd, void *buf, size_t count)
|
||||||
{
|
{
|
||||||
ssize_t rcount, rem;
|
ssize_t rcount, rem;
|
||||||
uchar_t *cbuf;
|
uchar_t *cbuf;
|
||||||
|
va_list args;
|
||||||
|
|
||||||
rem = count;
|
rem = count;
|
||||||
cbuf = (uchar_t *)buf;
|
cbuf = (uchar_t *)buf;
|
||||||
|
@ -190,6 +194,30 @@ Read(int fd, void *buf, size_t count)
|
||||||
return (count - rem);
|
return (count - rem);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ssize_t
|
||||||
|
Read2(int fd, void *buf, size_t count, ssize_t *rabin_count, void *ctx)
|
||||||
|
{
|
||||||
|
char *buf2;
|
||||||
|
ssize_t rcount;
|
||||||
|
rabin_context_t *rctx = (rabin_context_t *)ctx;
|
||||||
|
|
||||||
|
if (!ctx) return (Read(fd, buf, count));
|
||||||
|
buf2 = buf;
|
||||||
|
if (*rabin_count) {
|
||||||
|
buf2 = (char *)buf + *rabin_count;
|
||||||
|
count -= *rabin_count;
|
||||||
|
}
|
||||||
|
rcount = Read(fd, buf2, count);
|
||||||
|
if (rcount > 0) {
|
||||||
|
rcount += *rabin_count;
|
||||||
|
*rabin_count = scan_rabin_chunks(rctx, buf, rcount, *rabin_count);
|
||||||
|
} else {
|
||||||
|
if (rcount == 0) rcount = *rabin_count;
|
||||||
|
*rabin_count = 0;
|
||||||
|
}
|
||||||
|
return (rcount);
|
||||||
|
}
|
||||||
|
|
||||||
ssize_t
|
ssize_t
|
||||||
Write(int fd, const void *buf, size_t count)
|
Write(int fd, const void *buf, size_t count)
|
||||||
{
|
{
|
||||||
|
|
2
utils.h
2
utils.h
|
@ -98,6 +98,8 @@ extern int parse_numeric(ssize_t *val, const char *str);
|
||||||
extern char *bytes_to_size(uint64_t bytes);
|
extern char *bytes_to_size(uint64_t bytes);
|
||||||
extern ssize_t Read(int fd, void *buf, size_t count);
|
extern ssize_t Read(int fd, void *buf, size_t count);
|
||||||
extern ssize_t Write(int fd, const void *buf, size_t count);
|
extern ssize_t Write(int fd, const void *buf, size_t count);
|
||||||
|
extern ssize_t Read2(int fd, void *buf, size_t count,
|
||||||
|
ssize_t *rabin_count, void *ctx);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Roundup v to the nearest power of 2. From Bit Twiddling Hacks:
|
* Roundup v to the nearest power of 2. From Bit Twiddling Hacks:
|
||||||
|
|
Loading…
Reference in a new issue