diff --git a/Makefile b/Makefile index d02699f..4b092db 100644 --- a/Makefile +++ b/Makefile @@ -31,6 +31,10 @@ RABINSRCS = rabin/rabin_polynomial.c RABINHDRS = rabin/rabin_polynomial.h utils.h RABINOBJS = $(RABINSRCS:.c=.o) +BSDIFFSRCS = bsdiff/bsdiff.c bsdiff/bspatch.c bsdiff/rle_encoder.c +BSDIFFHDRS = bsdiff/bscommon.h utils.h allocator.h +BSDIFFOBJS = $(BSDIFFSRCS:.c=.o) + LZMASRCS = lzma/LzmaEnc.c lzma/LzFind.c lzma/LzmaDec.c LZMAHDRS = lzma/CpuArch.h lzma/LzFind.h lzma/LzmaEnc.h lzma/Types.h \ lzma/LzHash.h lzma/LzmaDec.h utils.h @@ -44,10 +48,10 @@ CRCSRCS = lzma/crc64_fast.c lzma/crc64_table.c CRCHDRS = lzma/crc64_table_le.h lzma/crc64_table_be.h lzma/crc_macros.h CRCOBJS = $(CRCSRCS:.c=.o) -BAKFILES = *~ lzma/*~ rabin/*~ +BAKFILES = *~ lzma/*~ rabin/*~ bsdiff/*~ RM = rm -f -CPPFLAGS = -I. -I./lzma -I./rabin -D_7ZIP_ST -DNODEFAULT_PROPS -DFILE_OFFSET_BITS=64 \ +CPPFLAGS = -I. -I./lzma -I./rabin -I./bsdiff -D_7ZIP_ST -DNODEFAULT_PROPS -DFILE_OFFSET_BITS=64 \ -D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32 VEC_FLAGS = -ftree-vectorize LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block @@ -57,6 +61,7 @@ ifdef DEBUG LINK = g++ -m64 -pthread -msse3 COMPILE = gcc -m64 -O -g -msse3 -c COMPILE_cpp = g++ -m64 -O -g -msse3 -c +VEC_FLAGS = ifdef DEBUG_NO_SLAB CPPFLAGS += -DDEBUG_NO_SLAB endif @@ -84,12 +89,15 @@ $(PPMDOBJS): $(PPMDSRCS) $(PPMDHDRS) $(RABINOBJS): $(RABINSRCS) $(RABINHDRS) $(COMPILE) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@ +$(BSDIFFOBJS): $(BSDIFFSRCS) $(BSDIFFHDRS) + $(COMPILE) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@ + $(MAINOBJS): $(MAINSRCS) $(MAINHDRS) $(COMPILE) $(LOOP_OPTFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@ -$(PROG): $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS) - $(LINK) -o $@ $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS) $(LDLIBS) +$(PROG): $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS) $(BSDIFFOBJS) + $(LINK) -o $@ $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS) $(BSDIFFOBJS) 
$(LDLIBS) clean: - $(RM) $(PROG) $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS) $(BAKFILES) + $(RM) $(PROG) $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS) $(BSDIFFOBJS) $(BAKFILES) diff --git a/bsdiff/bscommon.h b/bsdiff/bscommon.h new file mode 100644 index 0000000..d5a6dfe --- /dev/null +++ b/bsdiff/bscommon.h @@ -0,0 +1,132 @@ +/*- + * Copyright 2012 Moinak Ghosh + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted providing that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This file is a part of Pcompress, a chunked parallel multi- + * algorithm lossless compression and decompression program. + * + * Copyright (C) 2012 Moinak Ghosh. All rights reserved. + * Use is subject to license terms. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * moinakg@belenix.org, http://moinakg.wordpress.com/ + * + */ + +#ifndef _BS_COMMON_ +#define _BS_COMMON_ + +#include +#include + +// Simple stream I/O to buffer +typedef struct { + uchar_t *buf; + bsize_t pos; + bsize_t buflen; +} bufio_t; + +static int +BUFOPEN(bufio_t *bio, uchar_t *buf, bsize_t len) /* attach buf (len bytes) to bio and rewind to position 0; always returns 0 */ +{ + bio->buf = buf; bio->pos = 0; bio->buflen = len; + return (0); +} +static bsize_t +BUFWRITE(bufio_t *bio, uchar_t *buf, bsize_t len) /* copy len bytes from buf to the current position; returns len, or -1 without writing if they do not fit */ +{ + if (bio->pos + len <= bio->buflen) { /* <= so that a write may fill the buffer exactly, matching the bound BUFREAD uses */ + memcpy(bio->buf + bio->pos, buf, len); + bio->pos += len; + return (len); + } else { + return (-1); + } +} + +static bsize_t +BUFREAD(bufio_t *bio, uchar_t *buf, bsize_t len) /* copy up to len bytes into buf; returns the number of bytes copied, 0 when nothing is left */ +{ + bsize_t actual; + if (bio->pos >= bio->buflen) return (0); /* at or past the end (BUFSEEK does not range-check): nothing to read */ + + actual = len; + if (bio->pos + len > bio->buflen) { + actual = bio->buflen - bio->pos; + } + if (actual == 0) return (0); + memcpy(buf, bio->buf + bio->pos, actual); + bio->pos += actual; + return (actual); +} + +static bsize_t +BUFTELL(bufio_t *bio) /* current position, in bytes from the start of the buffer */ +{ + return (bio->pos); +} + +static void * +BUFPTR(bufio_t *bio) /* address of the byte at the current position */ +{ + return (bio->buf + bio->pos); +} + +static int +BUFSEEK(bufio_t *bio, bsize_t pos, int typ) /* SEEK_SET: absolute, SEEK_CUR: relative, anything else: relative to buflen (pos must be <= 0); the resulting position is not range-checked; returns 0, or -1 for a positive end-relative offset */ +{ + if (typ == SEEK_SET) { + bio->pos = pos; + + } else if (typ == SEEK_CUR) { + bio->pos += pos; + + } else { + if (pos > 0) { + fprintf(stderr, "Cannot seek beyond buffer end.\n"); + return (-1); + } else { + bio->pos = bio->buflen + pos; + } + } + return (0); +} + +extern int zero_rle_encode(const void *const ibuf, const unsigned int ilen, + void 
*obuf, unsigned int *const olen); +extern int zero_rle_decode(const void* ibuf, unsigned int ilen, + void* obuf, unsigned int *olen); + +#endif diff --git a/bsdiff/bsdiff.c b/bsdiff/bsdiff.c new file mode 100644 index 0000000..ebc8148 --- /dev/null +++ b/bsdiff/bsdiff.c @@ -0,0 +1,402 @@ +/*- + * Copyright 2003-2005 Colin Percival + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted providing that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This file is a part of Pcompress, a chunked parallel multi- + * algorithm lossless compression and decompression program. + * + * Copyright (C) 2012 Moinak Ghosh. All rights reserved. + * Use is subject to license terms. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * moinakg@belenix.org, http://moinakg.wordpress.com/ + * + * This is a somewhat modified bsdiff implementation. It has been modified + * to do buffer to buffer diffing instead of file to file and also use + * a custom RLE encoding rather than Bzip2 on the diff output. + */ + +#if 0 +__FBSDID("$FreeBSD: src/usr.bin/bsdiff/bsdiff/bsdiff.c,v 1.1 2005/08/06 01:59:05 cperciva Exp $"); +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bscommon.h" + +#define MIN(x,y) (((x)<(y)) ? 
(x) : (y)) + +static void split(bsize_t *I,bsize_t *V,bsize_t start,bsize_t len,bsize_t h) +{ + bsize_t i,j,k,x,tmp,jj,kk; + + if(len<16) { + for(k=start;kstart) split(I,V,start,jj-start,h); + + for(i=0;ikk) split(I,V,kk,start+len-kk,h); +} + +static void qsufsort(bsize_t *I,bsize_t *V,u_char *old,bsize_t oldsize) +{ + bsize_t buckets[256]; + bsize_t i,h,len; + + for(i=0;i<256;i++) buckets[i]=0; + for(i=0;i0;i--) buckets[i]=buckets[i-1]; + buckets[0]=0; + + for(i=0;iy) { + *pos=I[st]; + return x; + } else { + *pos=I[en]; + return y; + } + }; + + x=st+(en-st)/2; + if(memcmp(old+I[x],new,MIN(oldsize-I[x],newsize))<0) { + return search(I,old,oldsize,new,newsize,x,en,pos); + } else { + return search(I,old,oldsize,new,newsize,st,x,pos); + }; +} + +static void +valout(bsize_t x, u_char *buf) +{ + *((bsize_t *)buf) = htonll(x); +} + +static void +valouti32(bsize_t x, u_char *buf) +{ + int32_t val; + val = x; + *((int32_t *)buf) = htonl(val); +} + +bsize_t +bsdiff(u_char *old, bsize_t oldsize, u_char *new, bsize_t newsize, + u_char *diff, u_char *scratch, bsize_t scratchsize) /* writes a patch turning old into new into diff (opened below with a capacity of newsize bytes); returns the patch length, or 0 on failure */ +{ + bsize_t *I,*V; + bsize_t scan,pos,len; + bsize_t lastscan,lastpos,lastoffset; + bsize_t oldscore,scsc; + bsize_t s,Sf,lenf,Sb,lenb; + bsize_t overlap,Ss,lens; + bsize_t i, rv; + bsize_t dblen,eblen; + u_char *db,*eb; + u_char buf[sizeof (bsize_t)]; + u_char header[48]; + unsigned int sz, hdrsz, ulen; + bufio_t pf; + + sz = sizeof (bsize_t); + I = slab_alloc(NULL, (oldsize+1)*sz); + V = slab_alloc(NULL, (oldsize+1)*sz); + if(I == NULL || V == NULL) { if (I != NULL) slab_free(NULL, I); if (V != NULL) slab_free(NULL, V); return (0); } /* release whichever of the two allocations succeeded */ + + qsufsort(I,V,old,oldsize); + slab_free(NULL, V); + + if(((db=slab_alloc(NULL, newsize+1))==NULL) || + ((eb=slab_alloc(NULL, newsize+1))==NULL)) { + fprintf(stderr, "bsdiff: Memory allocation error.\n"); + slab_free(NULL, I); + if (db != NULL) slab_free(NULL, db); /* V was already freed above; db is non-NULL only when the eb allocation failed */ + return (0); + } + dblen=0; + eblen=0; + BUFOPEN(&pf, diff, newsize); + + /* Header is + 0 8 length of ctrl block + 8 8 compressed length of diff block + 16 8 actual length of diff block + 24 8 
compressed length of extra block + 32 8 actual length of extra block + 40 8 length of new file */ + /* File is + 0 32 Header + 32 ?? ctrl block + ?? ?? diff block + ?? ?? extra block */ + valout(0, header); + valout(0, header + sz); + valout(0, header + sz*2); + valout(0, header + sz*3); + valout(0, header + sz*4); + valout(newsize, header + sz*5); + if (BUFWRITE(&pf, header, sz*6) != sz*6) { + fprintf(stderr, "bsdiff: Write to compressed buffer failed.\n"); + rv = 0; + goto out; + } + hdrsz = sz*6; + + /* Compute the differences, writing ctrl as we go */ + scan=0;len=0; + lastscan=0;lastpos=0;lastoffset=0; + while(scanoldscore+sz)) break; + + if((scan+lastoffsetSf*2-lenf) { Sf=s; lenf=i; }; + }; + + lenb=0; + if(scan=lastscan+i)&&(pos>=i);i++) { + if(old[pos-i]==new[scan-i]) s++; + if(s*2-i>Sb*2-lenb) { Sb=s; lenb=i; }; + }; + }; + + if(lastscan+lenf>scan-lenb) { + overlap=(lastscan+lenf)-(scan-lenb); + s=0;Ss=0;lens=0; + for(i=0;iSs) { Ss=s; lens=i+1; }; + }; + + lenf+=lens-overlap; + lenb-=lens; + }; + + for(i=0;i newsize/2) { + rv = 0; + goto out; + } + + /* Compute size of ctrl data */ + len = BUFTELL(&pf); + valout(len-hdrsz, header); + rv = len; + + /* Write diff data */ + len = newsize - rv; + ulen = len; + if (zero_rle_encode(db, dblen, BUFPTR(&pf), &ulen) == -1) { + rv = 0; + goto out; + } + /* Output size of diff data */ + len = ulen; + valout(len, header + sz); + valout(dblen, header + sz*2); + rv += len; + BUFSEEK(&pf, len, SEEK_CUR); + + /* Write extra data */ + len = newsize - rv; + ulen = len; + if (zero_rle_encode(eb, eblen, BUFPTR(&pf), &ulen) == -1) { + rv = 0; + goto out; + } + /* Output size of extra data */ + len = ulen; + valout(len, header + sz*3); + valout(eblen, header + sz*4); + rv += len; + + /* Seek to the beginning, re-write the header.*/ + BUFSEEK(&pf, 0, SEEK_SET); + BUFWRITE(&pf, header, hdrsz); + +out: + /* Free the memory we used */ + slab_free(NULL, db); + slab_free(NULL, eb); + slab_free(NULL, I); + + return (rv); +} diff --git 
a/bsdiff/bspatch.c b/bsdiff/bspatch.c new file mode 100644 index 0000000..7c9f53e --- /dev/null +++ b/bsdiff/bspatch.c @@ -0,0 +1,218 @@ +/*- + * Copyright 2003-2005 Colin Percival + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted providing that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#if 0 +__FBSDID("$FreeBSD: src/usr.bin/bsdiff/bspatch/bspatch.c,v 1.1 2005/08/06 01:59:06 cperciva Exp $"); +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bscommon.h" + +static bsize_t +valin(u_char *buf) +{ + return ntohll(*((bsize_t *)buf)); +} + +static int32_t +valini32(u_char *buf) +{ + return ntohl(*((int32_t *)buf)); +} + +bsize_t +get_bsdiff_sz(u_char *pbuf) { + bsize_t newsize; + bsize_t ctrllen, lzdatalen, datalen, lzextralen, extralen; + int sz, hdrsz, rv; + + sz = sizeof (bsize_t); + hdrsz = sz*6; + + ctrllen = valin(pbuf); + lzdatalen = valin(pbuf+sz); + datalen = valin(pbuf+sz*2); + lzextralen = valin(pbuf+sz*3); + extralen = valin(pbuf+sz*4); + newsize = valin(pbuf+sz*5); + return (ctrllen + lzdatalen + lzextralen + hdrsz); +} + +int +bspatch(u_char *pbuf, u_char *old, bsize_t oldsize, u_char *new, bsize_t *_newsize) +{ + bsize_t newsize; + bsize_t ctrllen, lzdatalen, datalen, lzextralen, extralen; + u_char buf[8]; + u_char *diffdata, *extradata; + bsize_t oldpos,newpos; + bsize_t ctrl[3]; + bsize_t lenread; + bsize_t i; + bufio_t cpf, dpf, epf; + int sz, hdrsz, rv; + unsigned int len; + + /* + File format: + 0 8 length of ctrl block (X) + 8 8 compressed length of diff block (Y) + 16 8 actual length of diff block + 24 8 compressed length of extra block (Z) + 32 8 actual length of extra block + 40 8 length of new file + 48 X control block + 48+X Y lzfx(diff block) + 48+X+Y Z lzfx(extra block) + with control block a set of triples (x,y,z) meaning "add x bytes + from oldfile to x bytes from the diff block; copy y bytes from the + extra block; seek forwards in oldfile by z bytes". + */ + sz = sizeof (bsize_t); + hdrsz = sz*6; + rv = 1; + + /* Read lengths from header first. 
*/ + ctrllen = valin(pbuf); + lzdatalen = valin(pbuf+sz); + datalen = valin(pbuf+sz*2); + lzextralen = valin(pbuf+sz*3); + extralen = valin(pbuf+sz*4); + newsize = valin(pbuf+sz*5); + + if((ctrllen<0) || (lzdatalen<0) || (datalen<0) || (newsize<0) || (lzextralen<0) || (extralen<0)) { /* every header length comes from the patch itself; reject negatives before any of them is used as a size */ + fprintf(stderr, "1: Corrupt patch\n"); + return (0); + } + if (newsize > *_newsize) { + fprintf(stderr, "Output buffer too small.\n"); + return (0); + } + *_newsize = newsize; + + /* Allocate buffers. */ + diffdata = malloc(datalen > 0 ? datalen : 1); /* never request 0 bytes: malloc(0) may return NULL, which would be misreported as out of memory */ + extradata = malloc(extralen > 0 ? extralen : 1); /* same for an empty extra block */ + if (diffdata == NULL || extradata == NULL) { + fprintf(stderr, "bspatch: Out of memory.\n"); + if (diffdata) free(diffdata); + if (extradata) free(extradata); + return (0); + } + + /* Decompress diffdata and extradata. */ + len = datalen; + if (zero_rle_decode(pbuf + hdrsz + ctrllen, lzdatalen, diffdata, &len) == -1 || + len != datalen) { + fprintf(stderr, "bspatch: Failed to decompress diff data.\n"); + rv = 0; + goto out; + } + datalen = len; + + len = extralen; + if (zero_rle_decode(pbuf + hdrsz + ctrllen + lzdatalen, lzextralen, extradata, &len) == -1 || + len != extralen) { + fprintf(stderr, "bspatch: Failed to decompress extra data.\n"); + rv = 0; + goto out; + } + extralen = len; + BUFOPEN(&cpf, pbuf + hdrsz, ctrllen); + BUFOPEN(&dpf, diffdata, datalen); + BUFOPEN(&epf, extradata, extralen); + + oldpos=0;newpos=0; + while(newposnewsize) { + fprintf(stderr, "3: Corrupt diff data\n"); + rv = 0; + goto out; + } + + /* Read diff string */ + lenread = BUFREAD(&dpf, new + newpos, ctrl[0]); + if (lenread < ctrl[0]) { + fprintf(stderr, "4: Corrupt diff data\n"); + rv = 0; + goto out; + } + + /* Add old data to diff string */ + for(i=0;i=0) && (oldpos+inewsize) { + fprintf(stderr, "5: Corrupt diff data\n"); + rv = 0; + goto out; + } + + /* Read extra string */ + lenread = BUFREAD(&epf, new + newpos, ctrl[1]); + if (lenread < ctrl[1]) { + fprintf(stderr, "6: Corrupt diff data\n"); + rv = 0; + goto out; + } + + /* Adjust pointers */ + newpos+=ctrl[1]; + 
oldpos+=ctrl[2]; + }; + +out: + free(diffdata); + free(extradata); + + return (rv); +} diff --git a/bsdiff/rle_encoder.c b/bsdiff/rle_encoder.c new file mode 100644 index 0000000..fdc5f15 --- /dev/null +++ b/bsdiff/rle_encoder.c @@ -0,0 +1,112 @@ +/* + * This file is a part of Pcompress, a chunked parallel multi- + * algorithm lossless compression and decompression program. + * + * Copyright (C) 2012 Moinak Ghosh. All rights reserved. + * Use is subject to license terms. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * moinakg@belenix.org, http://moinakg.wordpress.com/ + * + * This RLE encoder is a simple approach to encode long runs of '0' + * bytes that typically are found in a bsdiff patch output. This + * does not encode repeating runs of other characters. 
+ */ + +#include +#include + +#define ZERO_MASK (32768) +#define DATA_MASK (32767) +#define COUNT_MAX (32767) + +int +zero_rle_encode(const void *const ibuf, const unsigned int ilen, + void *obuf, unsigned int *const olen) +{ + unsigned int pos1, pos2; + unsigned short count; + const uchar_t *const ib = ibuf; + uchar_t *ob = obuf; + + pos2 = 0; + for (pos1=0; pos1 *olen) break; + + state = 0; + for (;pos1= 4) break; + } + ob[pos2++] = ib[pos1++]; + count++; + } + *((unsigned short *)(ob + pos3)) = htons(count); + } + } + *olen = pos2; + if (pos1 < ilen) { + return (-1); + } else { + return (0); + } +} + +int +zero_rle_decode(const void* ibuf, unsigned int ilen, + void* obuf, unsigned int *olen) +{ + unsigned int pos1, pos2, i; + unsigned short count; + const uchar_t *ib = ibuf; + uchar_t *ob = obuf; + + pos2 = 0; + pos1 = 0; + for (; pos1\n" - "6) Pass '-M' to display memory allocator statistics\n" - "7) Pass '-C' to display compression statistics\n\n", - exec_name, exec_name, exec_name, exec_name); + "5) Perform Delta Encoding in addition to Exact Dedup:\n" + " %s -E ... 
- This also implies '-D'.\n" + "6) Number of threads can optionally be specified: -t <1 - 256 count>\n" + "7) Pass '-M' to display memory allocator statistics\n" + "8) Pass '-C' to display compression statistics\n\n", + exec_name, exec_name, exec_name, exec_name, exec_name, exec_name); } void @@ -436,7 +439,8 @@ start_decompress(const char *filename, const char *to_filename) if (_init_func) _init_func(&(tdat->data), &(tdat->level), chunksize); if (enable_rabin_scan) - tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, algo); + tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, + algo, enable_delta_encode); else tdat->rctx = NULL; if (pthread_create(&(tdat->thr), NULL, perform_decompress, @@ -905,7 +909,8 @@ start_compress(const char *filename, uint64_t chunksize, int level) if (_init_func) _init_func(&(tdat->data), &(tdat->level), chunksize); if (enable_rabin_scan) - tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, algo); + tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, + algo, enable_delta_encode); else tdat->rctx = NULL; @@ -965,7 +970,7 @@ start_compress(const char *filename, uint64_t chunksize, int level) * Read the first chunk into a spare buffer (a simple double-buffering). 
*/ if (enable_rabin_split) { - rctx = create_rabin_context(chunksize, 0, algo); + rctx = create_rabin_context(chunksize, 0, algo, enable_delta_encode); rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx); } else { rbytes = Read(uncompfd, cread_buf, chunksize); @@ -1203,7 +1208,7 @@ main(int argc, char *argv[]) level = 6; slab_init(); - while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDr")) != -1) { + while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDEr")) != -1) { int ovr; switch (opt) { @@ -1259,6 +1264,11 @@ main(int argc, char *argv[]) enable_rabin_scan = 1; break; + case 'E': + enable_rabin_scan = 1; + enable_delta_encode = 1; + break; + case 'r': enable_rabin_split = 0; break; diff --git a/rabin/rabin_polynomial.c b/rabin/rabin_polynomial.c index a275186..ffb9ba5 100755 --- a/rabin/rabin_polynomial.c +++ b/rabin/rabin_polynomial.c @@ -1,9 +1,12 @@ /* * rabin_polynomial.c * - * Created by Joel Lawrence Tucci on 09-March-2011. + * The rabin polynomial computation is derived from: + * http://code.google.com/p/rabin-fingerprint-c/ * - * Copyright (c) 2011 Joel Lawrence Tucci + * originally created by Joel Lawrence Tucci on 09-March-2011. + * + * Rabin polynomial portions Copyright (c) 2011 Joel Lawrence Tucci * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -70,6 +73,11 @@ extern int lzma_compress(void *src, size_t srclen, void *dst, extern int lzma_decompress(void *src, size_t srclen, void *dst, size_t *dstlen, int level, uchar_t chdr, void *data); extern int lzma_deinit(void **data); +extern int bsdiff(u_char *old, bsize_t oldsize, u_char *new, bsize_t newsize, + u_char *diff, u_char *scratch, bsize_t scratchsize); +extern bsize_t get_bsdiff_sz(u_char *pbuf); +extern int bspatch(u_char *pbuf, u_char *old, bsize_t oldsize, u_char *new, + bsize_t *_newsize); uint32_t rabin_polynomial_max_block_size = RAB_POLYNOMIAL_MAX_BLOCK_SIZE; @@ -77,11 +85,10 @@ uint32_t rabin_polynomial_max_block_size = RAB_POLYNOMIAL_MAX_BLOCK_SIZE; * Initialize the algorithm with the default params. */ rabin_context_t * -create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *algo) { +create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *algo, int delta_flag) { rabin_context_t *ctx; unsigned char *current_window_data; uint32_t blknum; - int level = 14; /* * Rabin window size must be power of 2 for optimization. @@ -90,14 +97,22 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *al fprintf(stderr, "Rabin window size must be a power of 2 in range 4 <= x <= 64\n"); return (NULL); } + + if (chunksize < RAB_MIN_CHUNK_SIZE) { + fprintf(stderr, "Minimum chunk size for Dedup must be %l bytes\n", + RAB_MIN_CHUNK_SIZE); + return (NULL); + } + /* - * For LZMA with chunksize <= LZMA Window size we use 4K minimum Rabin - * block size. For everything else it is 1K based on experimentation. + * For LZMA with chunksize <= LZMA Window size and/or Delta enabled we + * use 4K minimum Rabin block size. For everything else it is 2K based + * on experimentation. 
*/ ctx = (rabin_context_t *)slab_alloc(NULL, sizeof (rabin_context_t)); ctx->rabin_poly_max_block_size = RAB_POLYNOMIAL_MAX_BLOCK_SIZE; - if ((memcmp(algo, "lzma", 4) == 0 || memcmp(algo, "adapt", 5) == 0) && - chunksize <= LZMA_WINDOW_MAX) { + if (((memcmp(algo, "lzma", 4) == 0 || memcmp(algo, "adapt", 5) == 0) && + chunksize <= LZMA_WINDOW_MAX) || delta_flag) { ctx->rabin_poly_min_block_size = RAB_POLYNOMIAL_MIN_BLOCK_SIZE; ctx->rabin_avg_block_mask = RAB_POLYNOMIAL_AVG_BLOCK_MASK; ctx->rabin_poly_avg_block_size = RAB_POLYNOMIAL_AVG_BLOCK_SIZE; @@ -132,11 +147,12 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *al } ctx->lzma_data = NULL; + ctx->level = 14; if (real_chunksize > 0) { lzma_init(&(ctx->lzma_data), &(ctx->level), chunksize); if (!(ctx->lzma_data)) { fprintf(stderr, - "Could not allocate rabin polynomial context, out of memory\n"); + "Could not initialize LZMA data for rabin index, out of memory\n"); destroy_rabin_context(ctx); return (NULL); } @@ -154,6 +170,7 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *al ctx->current_window_data = current_window_data; ctx->real_chunksize = real_chunksize; + ctx->delta_flag = delta_flag; reset_rabin_context(ctx); return (ctx); } @@ -185,12 +202,24 @@ cmpblks(const void *a, const void *b) rabin_blockentry_t *a1 = (rabin_blockentry_t *)a; rabin_blockentry_t *b1 = (rabin_blockentry_t *)b; - if (a1->cksum_n_offset < b1->cksum_n_offset) + if (a1->cksum_n_offset < b1->cksum_n_offset) { return (-1); - else if (a1->cksum_n_offset == b1->cksum_n_offset) - return (0); - else if (a1->cksum_n_offset > b1->cksum_n_offset) + } else if (a1->cksum_n_offset == b1->cksum_n_offset) { + /* + * If fingerprints match then compare lengths. Length match makes + * for strong exact detection/ordering during sort while stopping + * short of expensive memcmp(). 
+ */ + if (a1->length < b1->length) { + return (-1); + } else if (a1->length == b1->length) { + return (0); + } else { /* a1->length > b1->length; a plain else so every path returns a value */ + return (1); + } + } else { /* a1->cksum_n_offset > b1->cksum_n_offset; a plain else so the comparator never falls off the end */ return (1); + } } /** @@ -200,19 +229,32 @@ cmpblks(const void *a, const void *b) uint32_t rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, ssize_t *rabin_pos) { - ssize_t i, last_offset, j; + ssize_t i, last_offset, j, fplist_sz; uint32_t blknum; char *buf1 = (char *)buf; uint32_t length; - uint64_t cur_roll_checksum[2]; + uint64_t cur_roll_checksum, cur_sketch; + uint64_t *fplist; + uint32_t len1, fpos; + if (rabin_pos == NULL) { + /* + * Initialize arrays for sketch computation. We re-use memory allocated + * for the compressed chunk temporarily. + */ + fplist_sz = 8 * ctx->rabin_poly_avg_block_size; + fplist = (uint64_t *)(ctx->cbuf + ctx->real_chunksize - fplist_sz); + memset(fplist, 0, fplist_sz); + fpos = 0; + len1 = 0; + } length = offset; last_offset = 0; blknum = 0; ctx->valid = 0; - cur_roll_checksum[0] = 0; - cur_roll_checksum[1] = 0; + cur_roll_checksum = 0; j = 0; + cur_sketch = 0; /* * If rabin_pos is non-zero then we are being asked to scan for the last rabin boundary @@ -234,13 +276,39 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s * * However since RAB_POLYNOMIAL_CONST == 2, we use shifts. */ - cur_roll_checksum[1] = (cur_roll_checksum[1] << 1) + cur_byte; - cur_roll_checksum[1] -= (pushed_out << RAB_POLYNOMIAL_WIN_SIZE); - - // Compute Sum 0 mod 25 Sketch. We are avoiding a branch here. - // See: http://www.armedia.com/wp/SimilarityIndex.pdf - j += cur_roll_checksum[(cur_roll_checksum[1] % 25 == 0)]; + cur_roll_checksum = (cur_roll_checksum << 1) + cur_byte; + cur_roll_checksum -= (pushed_out << RAB_POLYNOMIAL_WIN_SIZE); + /* + * Compute a super sketch value of the block. 
We store a sum of relative + * maximal rabin hash values per 1K(SKETCH_BASIC_BLOCK_SZ) of data. So we + * get upto 128 sums for a max block size of 128K. This is a representative + * fingerprint sketch of the block. Storing and comparing upto 128 fingerprints + * per block is very expensive (compute & RAM) so we eventually sum all the + * fingerprints for the block to create a single super sketch value representing + * maximal features of the block. + * + * This value can be used for similarity detection for delta encoding. Exact + * match for deduplication is additionally detected via a memcmp(). This is a + * variant of some approaches detailed in: + * http://www.armedia.com/wp/SimilarityIndex.pdf + */ + if (rabin_pos == NULL) { + len1++; + j = cur_roll_checksum & ctx->rabin_avg_block_mask; + fplist[j] += cur_roll_checksum; + if (fplist[j] > fplist[fpos]) fpos = j; + if (len1 == SKETCH_BASIC_BLOCK_SZ) { + /* + * Compute the super sketch value by summing all the representative + * fingerprints of the block. + */ + cur_sketch += fplist[fpos]; + memset(fplist, 0, fplist_sz); + fpos = 0; + len1 = 0; + } + } /* * Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1 * We avoid a branch here by masking. 
This requires RAB_POLYNOMIAL_WIN_SIZE @@ -252,14 +320,19 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s if (length < ctx->rabin_poly_min_block_size) continue; // If we hit our special value or reached the max block size update block offset - if ((cur_roll_checksum[1] & ctx->rabin_avg_block_mask) == ctx->rabin_break_patt || + if ((cur_roll_checksum & ctx->rabin_avg_block_mask) == ctx->rabin_break_patt || length >= rabin_polynomial_max_block_size) { if (rabin_pos == NULL) { ctx->blocks[blknum].offset = last_offset; ctx->blocks[blknum].index = blknum; // Need to store for sorting - ctx->blocks[blknum].cksum_n_offset = j; ctx->blocks[blknum].length = length; ctx->blocks[blknum].refcount = 0; + ctx->blocks[blknum].similar = 0; + ctx->blocks[blknum].cksum_n_offset = cur_sketch; + memset(fplist, 0, fplist_sz); + fpos = 0; + len1 = 0; + cur_sketch = 0; blknum++; } last_offset = i+1; @@ -287,9 +360,10 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s if (last_offset < *size) { ctx->blocks[blknum].offset = last_offset; ctx->blocks[blknum].index = blknum; - ctx->blocks[blknum].cksum_n_offset = j; ctx->blocks[blknum].length = *size - last_offset; ctx->blocks[blknum].refcount = 0; + ctx->blocks[blknum].similar = 0; + ctx->blocks[blknum].cksum_n_offset = cur_sketch; blknum++; last_offset = *size; } @@ -302,8 +376,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s /* * Now sort the block array based on checksums. This will bring virtually * all similar block entries together. Effectiveness depends on how strong - * our checksum is. We are using CRC64 here so we should be pretty okay. - * TODO: Test with a heavily optimized MD5 (from OpenSSL?) later. + * our checksum is. We are using a maximal super-sketch value. 
*/ qsort(ctx->blocks, blknum, sizeof (rabin_blockentry_t), cmpblks); rabin_index = (uint32_t *)(ctx->cbuf + RABIN_HDR_SIZE); @@ -332,7 +405,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s if (blk > 0 && ctx->blocks[blk].cksum_n_offset == prev_cksum && ctx->blocks[blk].length == prev_length && memcmp(prev_offset, buf1 + ctx->blocks[blk].offset, prev_length) == 0) { - ctx->blocks[blk].length = 0; + ctx->blocks[blk].similar = SIMILAR_EXACT; ctx->blocks[blk].index = prev_index; (ctx->blocks[prev_blk].refcount)++; matchlen += prev_length; @@ -344,10 +417,32 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s prev_index = ctx->blocks[blk].index; prev_blk = blk; } + + if (ctx->delta_flag) { + for (blk = 0; blk < blknum; blk++) { + if (ctx->blocks[blk].similar) continue; + + if (blk > 0 && ctx->blocks[blk].refcount == 0 && + ctx->blocks[blk].cksum_n_offset == prev_cksum) { + ssize_t sz1, sz2; + ctx->blocks[blk].index = prev_index; + ctx->blocks[blk].similar = SIMILAR_PARTIAL; + (ctx->blocks[prev_blk].refcount)++; + matchlen += prev_length/2; + continue; + } + prev_offset = buf1 + ctx->blocks[blk].offset; + prev_cksum = ctx->blocks[blk].cksum_n_offset; + prev_length = ctx->blocks[blk].length; + prev_index = ctx->blocks[blk].index; + prev_blk = blk; + } + } if (matchlen < rabin_index_sz) { ctx->valid = 0; return; } + /* * Another pass, this time through the block index in the chunk. We insert * block length into unique block entries. For block entries that are @@ -362,11 +457,12 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s rabin_blockentry_t *be; be = &(ctx->blocks[blkarr[blk]]); - if (be->length > 0) { + if (be->similar == 0) { /* * Update Index entry with the length. Also try to merge runs - * of unique (non-duplicate) blocks into a single block entry - * as long as the total length does not exceed max block size. 
+ * of unique (non-duplicate/similar) blocks into a single block + * entry as long as the total length does not exceed max block + * size. */ if (prev_index == 0) { if (be->refcount == 0) { @@ -402,32 +498,63 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s } else { prev_index = 0; prev_length = 0; - rabin_index[pos] = be->index | RABIN_INDEX_FLAG; + ctx->blocks[pos].cksum_n_offset = be->offset; + ctx->blocks[pos].new_length = be->length; trans[blk] = pos; + + if (be->similar == SIMILAR_EXACT) { + rabin_index[pos] = (blkarr[be->index] | RABIN_INDEX_FLAG) & + CLEAR_SIMILARITY_FLAG; + } else { + rabin_index[pos] = blkarr[be->index] | RABIN_INDEX_FLAG | + SET_SIMILARITY_FLAG; + } pos++; } } /* - * Final pass, copy the data. + * Final pass, copy the data and perform delta encoding. */ blknum = pos; rabin_index_sz = (ssize_t)pos * RABIN_ENTRY_SIZE; pos1 = rabin_index_sz + RABIN_HDR_SIZE; for (blk = 0; blk < blknum; blk++) { + uchar_t *old, *new; + int32_t bsz; + + /* + * If blocks are overflowing the allowed chunk size then dedup did not + * help at all. We invalidate the dedup operation. + */ + if (pos1 > last_offset) { + valid = 0; + break; + } if (rabin_index[blk] & RABIN_INDEX_FLAG) { j = rabin_index[blk] & RABIN_INDEX_VALUE; - rabin_index[blk] = htonl(trans[j] | RABIN_INDEX_FLAG); - } else { - /* - * If blocks are overflowing the allowed chunk size then dedup did not - * help at all. We invalidate the dedup operation. 
- */ - if (pos1 > last_offset) { - valid = 0; - break; + i = ctx->blocks[j].index; + + if (rabin_index[blk] & GET_SIMILARITY_FLAG) { + old = buf1 + ctx->blocks[j].offset; + new = buf1 + ctx->blocks[blk].cksum_n_offset; + bsz = bsdiff(old, ctx->blocks[j].length, new, + ctx->blocks[blk].new_length, ctx->cbuf + pos1, 0, 0); + if (bsz == 0) { + memcpy(ctx->cbuf + pos1, new, ctx->blocks[blk].new_length); + rabin_index[blk] = htonl(ctx->blocks[blk].new_length); + pos1 += ctx->blocks[blk].new_length; + } else { + rabin_index[blk] = htonl(trans[i] | + RABIN_INDEX_FLAG | SET_SIMILARITY_FLAG); + pos1 += bsz; + } + } else { + rabin_index[blk] = htonl(trans[i] | RABIN_INDEX_FLAG); } - memcpy(ctx->cbuf + pos1, buf1 + ctx->blocks[blk].cksum_n_offset, rabin_index[blk]); + } else { + memcpy(ctx->cbuf + pos1, buf1 + ctx->blocks[blk].cksum_n_offset, + rabin_index[blk]); pos1 += rabin_index[blk]; rabin_index[blk] = htonl(rabin_index[blk]); } @@ -512,29 +639,66 @@ rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size) ctx->blocks[blk].offset = pos1; pos1 += len; } else { + bsize_t blen; + ctx->blocks[blk].length = 0; - ctx->blocks[blk].index = len & RABIN_INDEX_VALUE; + if (len & GET_SIMILARITY_FLAG) { + ctx->blocks[blk].offset = pos1; + ctx->blocks[blk].index = (len & RABIN_INDEX_VALUE) | SET_SIMILARITY_FLAG; + blen = get_bsdiff_sz(buf + pos1); + pos1 += blen; + } else { + ctx->blocks[blk].index = len & RABIN_INDEX_VALUE; + } } } + for (blk = 0; blk < blknum; blk++) { + int rv; + bsize_t newsz; + if (ctx->blocks[blk].length == 0 && ctx->blocks[blk].index == 0) continue; if (ctx->blocks[blk].length > 0) { len = ctx->blocks[blk].length; pos1 = ctx->blocks[blk].offset; } else { oblk = ctx->blocks[blk].index; - len = ctx->blocks[oblk].length; - pos1 = ctx->blocks[oblk].offset; + + if (oblk & GET_SIMILARITY_FLAG) { + oblk = oblk & CLEAR_SIMILARITY_FLAG; + len = ctx->blocks[oblk].length; + pos1 = ctx->blocks[oblk].offset; + newsz = data_sz - sz; + rv = bspatch(buf + 
ctx->blocks[blk].offset, buf + pos1, len, pos2, &newsz); + if (rv == 0) { + fprintf(stderr, "Failed to bspatch block.\n"); + ctx->valid = 0; + break; + } + pos2 += newsz; + sz += newsz; + if (sz > data_sz) { + fprintf(stderr, "Dedup data overflows chunk.\n"); + ctx->valid = 0; + break; + } + continue; + } else { + len = ctx->blocks[oblk].length; + pos1 = ctx->blocks[oblk].offset; + } } memcpy(pos2, buf + pos1, len); pos2 += len; sz += len; if (sz > data_sz) { + fprintf(stderr, "Dedup data overflows chunk.\n"); ctx->valid = 0; break; } } if (ctx->valid && sz < data_sz) { + fprintf(stderr, "Too little dedup data processed.\n"); ctx->valid = 0; } *size = data_sz; diff --git a/rabin/rabin_polynomial.h b/rabin/rabin_polynomial.h index ffe642f..fe67122 100644 --- a/rabin/rabin_polynomial.h +++ b/rabin/rabin_polynomial.h @@ -84,19 +84,15 @@ #define RAB_POLYNOMIAL_MIN_WIN_SIZE 8 #define RAB_POLYNOMIAL_MAX_WIN_SIZE 64 -typedef struct { - ssize_t offset; - uint64_t cksum_n_offset; // Dual purpose variable - unsigned int index; - unsigned int length; - unsigned short refcount; -} rabin_blockentry_t; +// Minimum practical chunk size when doing dedup +#define RAB_MIN_CHUNK_SIZE (1048576L) + +// Number of bytes to compute one maximal fingerprint value +#define SKETCH_BASIC_BLOCK_SZ (1024) // An entry in the Rabin block array in the chunk. -// It is either a length value <= RAB_POLYNOMIAL_MAX_BLOCK_SIZE or -// if value > RAB_POLYNOMIAL_MAX_BLOCK_SIZE then -// value - RAB_POLYNOMIAL_MAX_BLOCK_SIZE is index of block with which -// this block is a duplicate. +// It is either a length value <= RABIN_MAX_BLOCK_SIZE or an index value with +// which this block is a duplicate/similar. The entries are variable sized. // Offset can be dynamically calculated. 
// #define RABIN_ENTRY_SIZE (sizeof (unsigned int)) @@ -106,20 +102,43 @@ typedef struct { // size of deduped data, size of compressed data #define RABIN_HDR_SIZE (sizeof (unsigned int) + sizeof (ssize_t) + sizeof (ssize_t) + sizeof (ssize_t) + sizeof (ssize_t)) -// Maximum number of dedup blocks supported (2^31 - 1) -#define RABIN_MAX_BLOCKS (0x7fffffff) +// Maximum number of dedup blocks supported (2^30 - 1) +#define RABIN_MAX_BLOCKS (0x3FFFFFFFUL) // Maximum possible block size for a single rabin block. This is a hard limit much // larger than RAB_POLYNOMIAL_MAX_BLOCK_SIZE. Useful when merging non-duplicate blocks. // This is also 2^31 - 1. -#define RABIN_MAX_BLOCK_SIZE (RABIN_MAX_BLOCKS) +#define RABIN_MAX_BLOCK_SIZE (RABIN_MAX_BLOCKS) -// Mask to determine whether Rabin index entry is a length value or index value. +// Masks to determine whether Rabin index entry is a length value, duplicate index value +// or similar index value. // MSB = 1 : Index // MSB = 0 : Length -#define RABIN_INDEX_FLAG (0x80000000) +// MSB-1 = 1: Similarity Index +// MSB-1 = 0: Exact Duplicate Index +#define RABIN_INDEX_FLAG (0x80000000UL) +#define SET_SIMILARITY_FLAG (0x40000000UL) +#define GET_SIMILARITY_FLAG SET_SIMILARITY_FLAG +#define CLEAR_SIMILARITY_FLAG (0xBFFFFFFFUL) + // Mask to extract value from a rabin index entry -#define RABIN_INDEX_VALUE (0x7fffffff) +#define RABIN_INDEX_VALUE (0x3FFFFFFFUL) + +// Tolerance for partial similarity check. We expect 80% similarity for +// delta compression. 
See: http://www.armedia.com/wp/SimilarityIndex.pdf +#define SIMILARITY_TOLERANCE (0.2f) +#define SIMILAR_EXACT 1 +#define SIMILAR_PARTIAL 2 + +typedef struct { + ssize_t offset; + uint64_t cksum_n_offset; // Dual purpose variable + unsigned int index; + unsigned int length; + unsigned int new_length; + unsigned short refcount; + short similar; +} rabin_blockentry_t; typedef struct { unsigned char *current_window_data; @@ -134,11 +153,11 @@ typedef struct { uint64_t real_chunksize; short valid; void *lzma_data; - int level; + int level, delta_flag; } rabin_context_t; extern rabin_context_t *create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, - const char *algo); + const char *algo, int delta_flag); extern void destroy_rabin_context(rabin_context_t *ctx); extern unsigned int rabin_dedup(rabin_context_t *ctx, unsigned char *buf, ssize_t *size, ssize_t offset, ssize_t *rabin_pos); diff --git a/utils.h b/utils.h index 03b0e0b..bbf7e58 100644 --- a/utils.h +++ b/utils.h @@ -52,6 +52,7 @@ extern "C" { # endif #endif typedef unsigned long uintptr_t; +typedef ssize_t bsize_t; #undef WORDS_BIGENDIAN #if BYTE_ORDER == BIG_ENDIAN