Implement Delta Encoding based on modified bsdiff.
Change to more accurate Sketch value computation approach.
This commit is contained in:
parent
1da2c40888
commit
e788eb43b8
9 changed files with 1145 additions and 79 deletions
18
Makefile
18
Makefile
|
@ -31,6 +31,10 @@ RABINSRCS = rabin/rabin_polynomial.c
|
||||||
RABINHDRS = rabin/rabin_polynomial.h utils.h
|
RABINHDRS = rabin/rabin_polynomial.h utils.h
|
||||||
RABINOBJS = $(RABINSRCS:.c=.o)
|
RABINOBJS = $(RABINSRCS:.c=.o)
|
||||||
|
|
||||||
|
BSDIFFSRCS = bsdiff/bsdiff.c bsdiff/bspatch.c bsdiff/rle_encoder.c
|
||||||
|
BSDIFFHDRS = bsdiff/bscommon.h utils.h allocator.h
|
||||||
|
BSDIFFOBJS = $(BSDIFFSRCS:.c=.o)
|
||||||
|
|
||||||
LZMASRCS = lzma/LzmaEnc.c lzma/LzFind.c lzma/LzmaDec.c
|
LZMASRCS = lzma/LzmaEnc.c lzma/LzFind.c lzma/LzmaDec.c
|
||||||
LZMAHDRS = lzma/CpuArch.h lzma/LzFind.h lzma/LzmaEnc.h lzma/Types.h \
|
LZMAHDRS = lzma/CpuArch.h lzma/LzFind.h lzma/LzmaEnc.h lzma/Types.h \
|
||||||
lzma/LzHash.h lzma/LzmaDec.h utils.h
|
lzma/LzHash.h lzma/LzmaDec.h utils.h
|
||||||
|
@ -44,10 +48,10 @@ CRCSRCS = lzma/crc64_fast.c lzma/crc64_table.c
|
||||||
CRCHDRS = lzma/crc64_table_le.h lzma/crc64_table_be.h lzma/crc_macros.h
|
CRCHDRS = lzma/crc64_table_le.h lzma/crc64_table_be.h lzma/crc_macros.h
|
||||||
CRCOBJS = $(CRCSRCS:.c=.o)
|
CRCOBJS = $(CRCSRCS:.c=.o)
|
||||||
|
|
||||||
BAKFILES = *~ lzma/*~ rabin/*~
|
BAKFILES = *~ lzma/*~ rabin/*~ bsdiff/*~
|
||||||
|
|
||||||
RM = rm -f
|
RM = rm -f
|
||||||
CPPFLAGS = -I. -I./lzma -I./rabin -D_7ZIP_ST -DNODEFAULT_PROPS -DFILE_OFFSET_BITS=64 \
|
CPPFLAGS = -I. -I./lzma -I./rabin -I./bsdiff -D_7ZIP_ST -DNODEFAULT_PROPS -DFILE_OFFSET_BITS=64 \
|
||||||
-D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32
|
-D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32
|
||||||
VEC_FLAGS = -ftree-vectorize
|
VEC_FLAGS = -ftree-vectorize
|
||||||
LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block
|
LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block
|
||||||
|
@ -57,6 +61,7 @@ ifdef DEBUG
|
||||||
LINK = g++ -m64 -pthread -msse3
|
LINK = g++ -m64 -pthread -msse3
|
||||||
COMPILE = gcc -m64 -O -g -msse3 -c
|
COMPILE = gcc -m64 -O -g -msse3 -c
|
||||||
COMPILE_cpp = g++ -m64 -O -g -msse3 -c
|
COMPILE_cpp = g++ -m64 -O -g -msse3 -c
|
||||||
|
VEC_FLAGS =
|
||||||
ifdef DEBUG_NO_SLAB
|
ifdef DEBUG_NO_SLAB
|
||||||
CPPFLAGS += -DDEBUG_NO_SLAB
|
CPPFLAGS += -DDEBUG_NO_SLAB
|
||||||
endif
|
endif
|
||||||
|
@ -84,12 +89,15 @@ $(PPMDOBJS): $(PPMDSRCS) $(PPMDHDRS)
|
||||||
$(RABINOBJS): $(RABINSRCS) $(RABINHDRS)
|
$(RABINOBJS): $(RABINSRCS) $(RABINHDRS)
|
||||||
$(COMPILE) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
|
$(COMPILE) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
|
||||||
|
|
||||||
|
# Build each bsdiff object from its own source file only (static pattern
# rule), so editing one .c no longer forces every bsdiff object to rebuild.
# A change to any shared bsdiff header still rebuilds all of them.
$(BSDIFFOBJS): %.o: %.c $(BSDIFFHDRS)
	$(COMPILE) $(VEC_FLAGS) $(CPPFLAGS) $< -o $@
|
||||||
|
|
||||||
$(MAINOBJS): $(MAINSRCS) $(MAINHDRS)
|
$(MAINOBJS): $(MAINSRCS) $(MAINHDRS)
|
||||||
$(COMPILE) $(LOOP_OPTFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
|
$(COMPILE) $(LOOP_OPTFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
|
||||||
|
|
||||||
$(PROG): $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS)
|
$(PROG): $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS) $(BSDIFFOBJS)
|
||||||
$(LINK) -o $@ $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS) $(LDLIBS)
|
$(LINK) -o $@ $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS) $(BSDIFFOBJS) $(LDLIBS)
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
$(RM) $(PROG) $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS) $(BAKFILES)
|
$(RM) $(PROG) $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS) $(BSDIFFOBJS) $(BAKFILES)
|
||||||
|
|
||||||
|
|
132
bsdiff/bscommon.h
Normal file
132
bsdiff/bscommon.h
Normal file
|
@ -0,0 +1,132 @@
|
||||||
|
/*-
|
||||||
|
* Copyright 2012 Moinak Ghosh
|
||||||
|
* All rights reserved
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted providing that the following conditions
|
||||||
|
* are met:
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
||||||
|
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||||
|
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||||
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||||
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||||
|
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
|
||||||
|
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This file is a part of Pcompress, a chunked parallel multi-
|
||||||
|
* algorithm lossless compression and decompression program.
|
||||||
|
*
|
||||||
|
* Copyright (C) 2012 Moinak Ghosh. All rights reserved.
|
||||||
|
* Use is subject to license terms.
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 3 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* moinakg@belenix.org, http://moinakg.wordpress.com/
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef _BS_COMMON_
|
||||||
|
#define _BS_COMMON_
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <utils.h>
|
||||||
|
|
||||||
|
// Simple stream I/O to buffer
|
||||||
|
typedef struct {
|
||||||
|
uchar_t *buf;
|
||||||
|
bsize_t pos;
|
||||||
|
bsize_t buflen;
|
||||||
|
} bufio_t;
|
||||||
|
|
||||||
|
static int
|
||||||
|
BUFOPEN(bufio_t *bio, uchar_t *buf, bsize_t len)
|
||||||
|
{
|
||||||
|
bio->buf = buf; bio->pos = 0; bio->buflen = len;
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
static bsize_t
|
||||||
|
BUFWRITE(bufio_t *bio, uchar_t *buf, bsize_t len)
|
||||||
|
{
|
||||||
|
if (bio->pos + len < bio->buflen) {
|
||||||
|
memcpy(bio->buf + bio->pos, buf, len);
|
||||||
|
bio->pos += len;
|
||||||
|
return (len);
|
||||||
|
} else {
|
||||||
|
return (-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static bsize_t
|
||||||
|
BUFREAD(bufio_t *bio, uchar_t *buf, bsize_t len)
|
||||||
|
{
|
||||||
|
bsize_t actual;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
actual = len;
|
||||||
|
if (bio->pos + len > bio->buflen) {
|
||||||
|
actual = bio->buflen - bio->pos;
|
||||||
|
}
|
||||||
|
if (actual == 0) return (0);
|
||||||
|
memcpy(buf, bio->buf + bio->pos, actual);
|
||||||
|
bio->pos += actual;
|
||||||
|
return (actual);
|
||||||
|
}
|
||||||
|
|
||||||
|
static bsize_t
|
||||||
|
BUFTELL(bufio_t *bio)
|
||||||
|
{
|
||||||
|
return (bio->pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void *
|
||||||
|
BUFPTR(bufio_t *bio)
|
||||||
|
{
|
||||||
|
return (bio->buf + bio->pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
BUFSEEK(bufio_t *bio, bsize_t pos, int typ)
|
||||||
|
{
|
||||||
|
if (typ == SEEK_SET) {
|
||||||
|
bio->pos = pos;
|
||||||
|
|
||||||
|
} else if (typ == SEEK_CUR) {
|
||||||
|
bio->pos += pos;
|
||||||
|
|
||||||
|
} else {
|
||||||
|
if (pos > 0) {
|
||||||
|
fprintf(stderr, "Cannot seek beyond buffer end.\n");
|
||||||
|
return (-1);
|
||||||
|
} else {
|
||||||
|
bio->pos = bio->buflen + pos;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern int zero_rle_encode(const void *const ibuf, const unsigned int ilen,
|
||||||
|
void *obuf, unsigned int *const olen);
|
||||||
|
extern int zero_rle_decode(const void* ibuf, unsigned int ilen,
|
||||||
|
void* obuf, unsigned int *olen);
|
||||||
|
|
||||||
|
#endif
|
402
bsdiff/bsdiff.c
Normal file
402
bsdiff/bsdiff.c
Normal file
|
@ -0,0 +1,402 @@
|
||||||
|
/*-
|
||||||
|
* Copyright 2003-2005 Colin Percival
|
||||||
|
* All rights reserved
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted providing that the following conditions
|
||||||
|
* are met:
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
||||||
|
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||||
|
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||||
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||||
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||||
|
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
|
||||||
|
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This file is a part of Pcompress, a chunked parallel multi-
|
||||||
|
* algorithm lossless compression and decompression program.
|
||||||
|
*
|
||||||
|
* Copyright (C) 2012 Moinak Ghosh. All rights reserved.
|
||||||
|
* Use is subject to license terms.
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 3 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* moinakg@belenix.org, http://moinakg.wordpress.com/
|
||||||
|
*
|
||||||
|
* This is a somewhat modified bsdiff implementation. It has been modified
|
||||||
|
* to do buffer to buffer diffing instead of file to file and also use
|
||||||
|
* a custom RLE encoding rather than Bzip2 on the diff output.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
__FBSDID("$FreeBSD: src/usr.bin/bsdiff/bsdiff/bsdiff.c,v 1.1 2005/08/06 01:59:05 cperciva Exp $");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <err.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <allocator.h>
|
||||||
|
#include <utils.h>
|
||||||
|
#include "bscommon.h"
|
||||||
|
|
||||||
|
#define MIN(x,y) (((x)<(y)) ? (x) : (y))
|
||||||
|
|
||||||
|
/*
 * Sorting step used by qsufsort(): refine the group I[start .. start+len)
 * using the key V[I[i] + h].
 *
 * Small groups (len < 16) are handled by repeatedly selecting all entries
 * with the minimum key; larger groups are three-way partitioned around the
 * key of the middle entry and the outer parts are refined recursively.
 * For every resulting run of equal keys, V[] of its members is set to the
 * index of the run's last slot, and a run of length one is marked by
 * storing -1 in its I[] slot.
 */
static void split(bsize_t *I,bsize_t *V,bsize_t start,bsize_t len,bsize_t h)
{
	const bsize_t end = start + len;
	bsize_t i, j, k, pivot, key, swp;
	bsize_t lt_end, eq_end;

	if (len < 16) {
		k = start;
		while (k < end) {
			/* j counts the entries found so far that share the minimum key. */
			pivot = V[I[k] + h];
			j = 1;
			for (i = 1; k + i < end; i++) {
				key = V[I[k + i] + h];
				if (key < pivot) {
					pivot = key;
					j = 0;
				}
				if (key == pivot) {
					swp = I[k + j];
					I[k + j] = I[k + i];
					I[k + i] = swp;
					j++;
				}
			}
			for (i = 0; i < j; i++)
				V[I[k + i]] = k + j - 1;
			if (j == 1)
				I[k] = -1;
			k += j;
		}
		return;
	}

	/* Count keys below / equal to the pivot to find the partition bounds. */
	pivot = V[I[start + len / 2] + h];
	lt_end = 0;
	eq_end = 0;
	for (i = start; i < end; i++) {
		key = V[I[i] + h];
		if (key < pivot)
			lt_end++;
		if (key == pivot)
			eq_end++;
	}
	lt_end += start;
	eq_end += lt_end;

	/* Fill the "less than" region, pushing other entries to their regions. */
	i = start;
	j = 0;
	k = 0;
	while (i < lt_end) {
		key = V[I[i] + h];
		if (key < pivot) {
			i++;
		} else if (key == pivot) {
			swp = I[i];
			I[i] = I[lt_end + j];
			I[lt_end + j] = swp;
			j++;
		} else {
			swp = I[i];
			I[i] = I[eq_end + k];
			I[eq_end + k] = swp;
			k++;
		}
	}

	/* Finish the "equal" region. */
	while (lt_end + j < eq_end) {
		if (V[I[lt_end + j] + h] == pivot) {
			j++;
		} else {
			swp = I[lt_end + j];
			I[lt_end + j] = I[eq_end + k];
			I[eq_end + k] = swp;
			k++;
		}
	}

	if (lt_end > start)
		split(I, V, start, lt_end - start, h);

	for (i = 0; i < eq_end - lt_end; i++)
		V[I[lt_end + i]] = eq_end - 1;
	if (lt_end == eq_end - 1)
		I[lt_end] = -1;

	if (end > eq_end)
		split(I, V, eq_end, end - eq_end, h);
}
|
||||||
|
|
||||||
|
/*
 * Build the sorted suffix index of old[0 .. oldsize) in I[], using V[] as
 * working storage. Both arrays must hold oldsize + 1 entries.
 *
 * The suffixes are first bucketed by their leading byte, then refined with
 * split() while the comparison depth h doubles on every pass. Negative
 * I[] entries encode the length of an already-sorted run; the passes stop
 * once I[0] == -(oldsize + 1), i.e. the whole array is one sorted run.
 * On return I[] is rebuilt from V[] so that I[V[i]] == i.
 */
static void qsufsort(bsize_t *I,bsize_t *V,u_char *old,bsize_t oldsize)
{
	bsize_t buckets[256];
	bsize_t i, h, run;
	const bsize_t total = oldsize + 1;

	/* Byte histogram, turned into starting offsets for each byte value. */
	for (i = 0; i < 256; i++)
		buckets[i] = 0;
	for (i = 0; i < oldsize; i++)
		buckets[old[i]]++;
	for (i = 1; i < 256; i++)
		buckets[i] += buckets[i - 1];
	for (i = 255; i > 0; i--)
		buckets[i] = buckets[i - 1];
	buckets[0] = 0;

	/* Initial placement by first byte; slot 0 is the empty suffix. */
	for (i = 0; i < oldsize; i++)
		I[++buckets[old[i]]] = i;
	I[0] = oldsize;
	for (i = 0; i < oldsize; i++)
		V[i] = buckets[old[i]];
	V[oldsize] = 0;
	for (i = 1; i < 256; i++) {
		if (buckets[i] == buckets[i - 1] + 1)
			I[buckets[i]] = -1;
	}
	I[0] = -1;

	for (h = 1; I[0] != -total; h += h) {
		run = 0;
		i = 0;
		while (i < total) {
			if (I[i] < 0) {
				/* Already sorted run: skip it and extend the pending run. */
				run -= I[i];
				i -= I[i];
			} else {
				if (run)
					I[i - run] = -run;
				run = V[I[i]] + 1 - i;
				split(I, V, i, run, h);
				i += run;
				run = 0;
			}
		}
		if (run)
			I[i - run] = -run;
	}

	for (i = 0; i < total; i++)
		I[V[i]] = i;
}
|
||||||
|
|
||||||
|
/*
 * Length of the common prefix of old[0 .. oldsize) and new[0 .. newsize).
 */
static bsize_t matchlen(u_char *old,bsize_t oldsize,u_char *new,bsize_t newsize)
{
	bsize_t limit = (oldsize < newsize) ? oldsize : newsize;
	bsize_t n = 0;

	while (n < limit && old[n] == new[n])
		n++;

	return n;
}
|
||||||
|
|
||||||
|
/*
 * Binary-search the sorted suffix index I[st .. en] of `old` for the suffix
 * sharing the longest prefix with new[0 .. newsize).
 *
 * The range is halved by comparing `new` against the middle suffix until at
 * most two candidates remain; the better of I[st] and I[en] wins (I[en] on
 * a tie). The winning offset into `old` is stored in *pos and the match
 * length is returned.
 */
static bsize_t search(bsize_t *I,u_char *old,bsize_t oldsize,
		u_char *new,bsize_t newsize,bsize_t st,bsize_t en,bsize_t *pos)
{
	bsize_t mid, cmplen, lo_len, hi_len;

	while (en - st >= 2) {
		mid = st + (en - st) / 2;

		/* Compare only as many bytes as both strings actually have. */
		cmplen = oldsize - I[mid];
		if (!(cmplen < newsize))
			cmplen = newsize;

		if (memcmp(old + I[mid], new, cmplen) < 0)
			st = mid;
		else
			en = mid;
	}

	lo_len = matchlen(old + I[st], oldsize - I[st], new, newsize);
	hi_len = matchlen(old + I[en], oldsize - I[en], new, newsize);

	if (lo_len > hi_len) {
		*pos = I[st];
		return lo_len;
	}
	*pos = I[en];
	return hi_len;
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
valout(bsize_t x, u_char *buf)
|
||||||
|
{
|
||||||
|
*((bsize_t *)buf) = htonll(x);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
valouti32(bsize_t x, u_char *buf)
|
||||||
|
{
|
||||||
|
int32_t val;
|
||||||
|
val = x;
|
||||||
|
*((int32_t *)buf) = htonl(val);
|
||||||
|
}
|
||||||
|
|
||||||
|
bsize_t
|
||||||
|
bsdiff(u_char *old, bsize_t oldsize, u_char *new, bsize_t newsize,
|
||||||
|
u_char *diff, u_char *scratch, bsize_t scratchsize)
|
||||||
|
{
|
||||||
|
bsize_t *I,*V;
|
||||||
|
bsize_t scan,pos,len;
|
||||||
|
bsize_t lastscan,lastpos,lastoffset;
|
||||||
|
bsize_t oldscore,scsc;
|
||||||
|
bsize_t s,Sf,lenf,Sb,lenb;
|
||||||
|
bsize_t overlap,Ss,lens;
|
||||||
|
bsize_t i, rv;
|
||||||
|
bsize_t dblen,eblen;
|
||||||
|
u_char *db,*eb;
|
||||||
|
u_char buf[sizeof (bsize_t)];
|
||||||
|
u_char header[48];
|
||||||
|
unsigned int sz, hdrsz, ulen;
|
||||||
|
bufio_t pf;
|
||||||
|
|
||||||
|
sz = sizeof (bsize_t);
|
||||||
|
I = slab_alloc(NULL, (oldsize+1)*sz);
|
||||||
|
V = slab_alloc(NULL, (oldsize+1)*sz);
|
||||||
|
if(I == NULL || V == NULL) return (0);
|
||||||
|
|
||||||
|
qsufsort(I,V,old,oldsize);
|
||||||
|
slab_free(NULL, V);
|
||||||
|
|
||||||
|
if(((db=slab_alloc(NULL, newsize+1))==NULL) ||
|
||||||
|
((eb=slab_alloc(NULL, newsize+1))==NULL)) {
|
||||||
|
fprintf(stderr, "bsdiff: Memory allocation error.\n");
|
||||||
|
slab_free(NULL, I);
|
||||||
|
slab_free(NULL, V);
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
dblen=0;
|
||||||
|
eblen=0;
|
||||||
|
BUFOPEN(&pf, diff, newsize);
|
||||||
|
|
||||||
|
/* Header is
|
||||||
|
0 8 length of ctrl block
|
||||||
|
8 8 compressed length of diff block
|
||||||
|
16 8 actual length of diff block
|
||||||
|
24 8 compressed length of extra block
|
||||||
|
32 8 actual length of extra block
|
||||||
|
40 8 length of new file */
|
||||||
|
/* File is
|
||||||
|
0 32 Header
|
||||||
|
32 ?? ctrl block
|
||||||
|
?? ?? diff block
|
||||||
|
?? ?? extra block */
|
||||||
|
valout(0, header);
|
||||||
|
valout(0, header + sz);
|
||||||
|
valout(0, header + sz*2);
|
||||||
|
valout(0, header + sz*3);
|
||||||
|
valout(0, header + sz*4);
|
||||||
|
valout(newsize, header + sz*5);
|
||||||
|
if (BUFWRITE(&pf, header, sz*6) != sz*6) {
|
||||||
|
fprintf(stderr, "bsdiff: Write to compressed buffer failed.\n");
|
||||||
|
rv = 0;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
hdrsz = sz*6;
|
||||||
|
|
||||||
|
/* Compute the differences, writing ctrl as we go */
|
||||||
|
scan=0;len=0;
|
||||||
|
lastscan=0;lastpos=0;lastoffset=0;
|
||||||
|
while(scan<newsize) {
|
||||||
|
oldscore=0;
|
||||||
|
|
||||||
|
for(scsc=scan+=len;scan<newsize;scan++) {
|
||||||
|
len=search(I,old,oldsize,new+scan,newsize-scan,
|
||||||
|
0,oldsize,&pos);
|
||||||
|
|
||||||
|
for(;scsc<scan+len;scsc++)
|
||||||
|
if((scsc+lastoffset<oldsize) &&
|
||||||
|
(old[scsc+lastoffset] == new[scsc]))
|
||||||
|
oldscore++;
|
||||||
|
|
||||||
|
if(((len==oldscore) && (len!=0)) ||
|
||||||
|
(len>oldscore+sz)) break;
|
||||||
|
|
||||||
|
if((scan+lastoffset<oldsize) &&
|
||||||
|
(old[scan+lastoffset] == new[scan]))
|
||||||
|
oldscore--;
|
||||||
|
};
|
||||||
|
|
||||||
|
if((len!=oldscore) || (scan==newsize)) {
|
||||||
|
s=0;Sf=0;lenf=0;
|
||||||
|
for(i=0;(lastscan+i<scan)&&(lastpos+i<oldsize);) {
|
||||||
|
if(old[lastpos+i]==new[lastscan+i]) s++;
|
||||||
|
i++;
|
||||||
|
if(s*2-i>Sf*2-lenf) { Sf=s; lenf=i; };
|
||||||
|
};
|
||||||
|
|
||||||
|
lenb=0;
|
||||||
|
if(scan<newsize) {
|
||||||
|
s=0;Sb=0;
|
||||||
|
for(i=1;(scan>=lastscan+i)&&(pos>=i);i++) {
|
||||||
|
if(old[pos-i]==new[scan-i]) s++;
|
||||||
|
if(s*2-i>Sb*2-lenb) { Sb=s; lenb=i; };
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
if(lastscan+lenf>scan-lenb) {
|
||||||
|
overlap=(lastscan+lenf)-(scan-lenb);
|
||||||
|
s=0;Ss=0;lens=0;
|
||||||
|
for(i=0;i<overlap;i++) {
|
||||||
|
if(new[lastscan+lenf-overlap+i]==
|
||||||
|
old[lastpos+lenf-overlap+i]) s++;
|
||||||
|
if(new[scan-lenb+i]==
|
||||||
|
old[pos-lenb+i]) s--;
|
||||||
|
if(s>Ss) { Ss=s; lens=i+1; };
|
||||||
|
};
|
||||||
|
|
||||||
|
lenf+=lens-overlap;
|
||||||
|
lenb-=lens;
|
||||||
|
};
|
||||||
|
|
||||||
|
for(i=0;i<lenf;i++)
|
||||||
|
db[dblen+i]=new[lastscan+i]-old[lastpos+i];
|
||||||
|
for(i=0;i<(scan-lenb)-(lastscan+lenf);i++)
|
||||||
|
eb[eblen+i]=new[lastscan+lenf+i];
|
||||||
|
|
||||||
|
dblen+=lenf;
|
||||||
|
eblen+=(scan-lenb)-(lastscan+lenf);
|
||||||
|
|
||||||
|
valouti32(lenf, buf);
|
||||||
|
BUFWRITE(&pf, buf, 4);
|
||||||
|
valouti32((scan-lenb)-(lastscan+lenf),buf);
|
||||||
|
BUFWRITE(&pf, buf, 4);
|
||||||
|
valouti32((pos-lenb)-(lastpos+lenf),buf);
|
||||||
|
BUFWRITE(&pf, buf, 4);
|
||||||
|
|
||||||
|
lastscan=scan-lenb;
|
||||||
|
lastpos=pos-lenb;
|
||||||
|
lastoffset=pos-scan;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (eblen > newsize/2) {
|
||||||
|
rv = 0;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Compute size of ctrl data */
|
||||||
|
len = BUFTELL(&pf);
|
||||||
|
valout(len-hdrsz, header);
|
||||||
|
rv = len;
|
||||||
|
|
||||||
|
/* Write diff data */
|
||||||
|
len = newsize - rv;
|
||||||
|
ulen = len;
|
||||||
|
if (zero_rle_encode(db, dblen, BUFPTR(&pf), &ulen) == -1) {
|
||||||
|
rv = 0;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
/* Output size of diff data */
|
||||||
|
len = ulen;
|
||||||
|
valout(len, header + sz);
|
||||||
|
valout(dblen, header + sz*2);
|
||||||
|
rv += len;
|
||||||
|
BUFSEEK(&pf, len, SEEK_CUR);
|
||||||
|
|
||||||
|
/* Write extra data */
|
||||||
|
len = newsize - rv;
|
||||||
|
ulen = len;
|
||||||
|
if (zero_rle_encode(eb, eblen, BUFPTR(&pf), &ulen) == -1) {
|
||||||
|
rv = 0;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
/* Output size of extra data */
|
||||||
|
len = ulen;
|
||||||
|
valout(len, header + sz*3);
|
||||||
|
valout(eblen, header + sz*4);
|
||||||
|
rv += len;
|
||||||
|
|
||||||
|
/* Seek to the beginning, re-write the header.*/
|
||||||
|
BUFSEEK(&pf, 0, SEEK_SET);
|
||||||
|
BUFWRITE(&pf, header, hdrsz);
|
||||||
|
|
||||||
|
out:
|
||||||
|
/* Free the memory we used */
|
||||||
|
slab_free(NULL, db);
|
||||||
|
slab_free(NULL, eb);
|
||||||
|
slab_free(NULL, I);
|
||||||
|
|
||||||
|
return (rv);
|
||||||
|
}
|
218
bsdiff/bspatch.c
Normal file
218
bsdiff/bspatch.c
Normal file
|
@ -0,0 +1,218 @@
|
||||||
|
/*-
|
||||||
|
* Copyright 2003-2005 Colin Percival
|
||||||
|
* All rights reserved
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted providing that the following conditions
|
||||||
|
* are met:
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
||||||
|
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||||
|
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||||
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||||
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||||
|
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
|
||||||
|
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
__FBSDID("$FreeBSD: src/usr.bin/bsdiff/bspatch/bspatch.c,v 1.1 2005/08/06 01:59:06 cperciva Exp $");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <bzlib.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <err.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <allocator.h>
|
||||||
|
#include <utils.h>
|
||||||
|
#include "bscommon.h"
|
||||||
|
|
||||||
|
static bsize_t
|
||||||
|
valin(u_char *buf)
|
||||||
|
{
|
||||||
|
return ntohll(*((bsize_t *)buf));
|
||||||
|
}
|
||||||
|
|
||||||
|
static int32_t
|
||||||
|
valini32(u_char *buf)
|
||||||
|
{
|
||||||
|
return ntohl(*((int32_t *)buf));
|
||||||
|
}
|
||||||
|
|
||||||
|
bsize_t
|
||||||
|
get_bsdiff_sz(u_char *pbuf) {
|
||||||
|
bsize_t newsize;
|
||||||
|
bsize_t ctrllen, lzdatalen, datalen, lzextralen, extralen;
|
||||||
|
int sz, hdrsz, rv;
|
||||||
|
|
||||||
|
sz = sizeof (bsize_t);
|
||||||
|
hdrsz = sz*6;
|
||||||
|
|
||||||
|
ctrllen = valin(pbuf);
|
||||||
|
lzdatalen = valin(pbuf+sz);
|
||||||
|
datalen = valin(pbuf+sz*2);
|
||||||
|
lzextralen = valin(pbuf+sz*3);
|
||||||
|
extralen = valin(pbuf+sz*4);
|
||||||
|
newsize = valin(pbuf+sz*5);
|
||||||
|
return (ctrllen + lzdatalen + lzextralen + hdrsz);
|
||||||
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
bspatch(u_char *pbuf, u_char *old, bsize_t oldsize, u_char *new, bsize_t *_newsize)
|
||||||
|
{
|
||||||
|
bsize_t newsize;
|
||||||
|
bsize_t ctrllen, lzdatalen, datalen, lzextralen, extralen;
|
||||||
|
u_char buf[8];
|
||||||
|
u_char *diffdata, *extradata;
|
||||||
|
bsize_t oldpos,newpos;
|
||||||
|
bsize_t ctrl[3];
|
||||||
|
bsize_t lenread;
|
||||||
|
bsize_t i;
|
||||||
|
bufio_t cpf, dpf, epf;
|
||||||
|
int sz, hdrsz, rv;
|
||||||
|
unsigned int len;
|
||||||
|
|
||||||
|
/*
|
||||||
|
File format:
|
||||||
|
0 8 length of ctrl block (X)
|
||||||
|
8 8 compressed length of diff block (Y)
|
||||||
|
16 8 actual length of diff block
|
||||||
|
24 8 compressed length of extra block (Z)
|
||||||
|
32 8 actual length of extra block
|
||||||
|
40 8 length of new file
|
||||||
|
48 X control block
|
||||||
|
48+X Y lzfx(diff block)
|
||||||
|
48+X+Y Z lzfx(extra block)
|
||||||
|
with control block a set of triples (x,y,z) meaning "add x bytes
|
||||||
|
from oldfile to x bytes from the diff block; copy y bytes from the
|
||||||
|
extra block; seek forwards in oldfile by z bytes".
|
||||||
|
*/
|
||||||
|
sz = sizeof (bsize_t);
|
||||||
|
hdrsz = sz*6;
|
||||||
|
rv = 1;
|
||||||
|
|
||||||
|
/* Read lengths from header first. */
|
||||||
|
ctrllen = valin(pbuf);
|
||||||
|
lzdatalen = valin(pbuf+sz);
|
||||||
|
datalen = valin(pbuf+sz*2);
|
||||||
|
lzextralen = valin(pbuf+sz*3);
|
||||||
|
extralen = valin(pbuf+sz*4);
|
||||||
|
newsize = valin(pbuf+sz*5);
|
||||||
|
|
||||||
|
if((ctrllen<0) || (lzdatalen<0) || (newsize<0) || (lzextralen<0)) {
|
||||||
|
fprintf(stderr, "1: Corrupt patch\n");
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
if (newsize > *_newsize) {
|
||||||
|
fprintf(stderr, "Output buffer too small.\n");
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
*_newsize = newsize;
|
||||||
|
|
||||||
|
/* Allocate buffers. */
|
||||||
|
diffdata = malloc(datalen);
|
||||||
|
extradata = malloc(extralen);
|
||||||
|
if (diffdata == NULL || extradata == NULL) {
|
||||||
|
fprintf(stderr, "bspatch: Out of memory.\n");
|
||||||
|
if (diffdata) free(diffdata);
|
||||||
|
if (extradata) free(extradata);
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Decompress diffdata and extradata. */
|
||||||
|
len = datalen;
|
||||||
|
if (zero_rle_decode(pbuf + hdrsz + ctrllen, lzdatalen, diffdata, &len) == -1 ||
|
||||||
|
len != datalen) {
|
||||||
|
fprintf(stderr, "bspatch: Failed to decompress diff data.\n");
|
||||||
|
rv = 0;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
datalen = len;
|
||||||
|
|
||||||
|
len = extralen;
|
||||||
|
if (zero_rle_decode(pbuf + hdrsz + ctrllen + lzdatalen, lzextralen, extradata, &len) == -1 ||
|
||||||
|
len != extralen) {
|
||||||
|
fprintf(stderr, "bspatch: Failed to decompress extra data.\n");
|
||||||
|
rv = 0;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
extralen = len;
|
||||||
|
BUFOPEN(&cpf, pbuf + hdrsz, ctrllen);
|
||||||
|
BUFOPEN(&dpf, diffdata, datalen);
|
||||||
|
BUFOPEN(&epf, extradata, extralen);
|
||||||
|
|
||||||
|
oldpos=0;newpos=0;
|
||||||
|
while(newpos<newsize) {
|
||||||
|
/* Read control data */
|
||||||
|
for(i=0;i<=2;i++) {
|
||||||
|
lenread = BUFREAD(&cpf, buf, 4);
|
||||||
|
if (lenread < 4) {
|
||||||
|
fprintf(stderr, "2: Corrupt diff data\n");
|
||||||
|
rv = 0;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
ctrl[i]=valini32(buf);
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Sanity-check */
|
||||||
|
if(newpos+ctrl[0]>newsize) {
|
||||||
|
fprintf(stderr, "3: Corrupt diff data\n");
|
||||||
|
rv = 0;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Read diff string */
|
||||||
|
lenread = BUFREAD(&dpf, new + newpos, ctrl[0]);
|
||||||
|
if (lenread < ctrl[0]) {
|
||||||
|
fprintf(stderr, "4: Corrupt diff data\n");
|
||||||
|
rv = 0;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Add old data to diff string */
|
||||||
|
for(i=0;i<ctrl[0];i++)
|
||||||
|
if((oldpos+i>=0) && (oldpos+i<oldsize))
|
||||||
|
new[newpos+i]+=old[oldpos+i];
|
||||||
|
|
||||||
|
/* Adjust pointers */
|
||||||
|
newpos+=ctrl[0];
|
||||||
|
oldpos+=ctrl[0];
|
||||||
|
|
||||||
|
/* Sanity-check */
|
||||||
|
if(newpos+ctrl[1]>newsize) {
|
||||||
|
fprintf(stderr, "5: Corrupt diff data\n");
|
||||||
|
rv = 0;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Read extra string */
|
||||||
|
lenread = BUFREAD(&epf, new + newpos, ctrl[1]);
|
||||||
|
if (lenread < ctrl[1]) {
|
||||||
|
fprintf(stderr, "6: Corrupt diff data\n");
|
||||||
|
rv = 0;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Adjust pointers */
|
||||||
|
newpos+=ctrl[1];
|
||||||
|
oldpos+=ctrl[2];
|
||||||
|
};
|
||||||
|
|
||||||
|
out:
|
||||||
|
free(diffdata);
|
||||||
|
free(extradata);
|
||||||
|
|
||||||
|
return (rv);
|
||||||
|
}
|
112
bsdiff/rle_encoder.c
Normal file
112
bsdiff/rle_encoder.c
Normal file
|
@ -0,0 +1,112 @@
|
||||||
|
/*
|
||||||
|
* This file is a part of Pcompress, a chunked parallel multi-
|
||||||
|
* algorithm lossless compression and decompression program.
|
||||||
|
*
|
||||||
|
* Copyright (C) 2012 Moinak Ghosh. All rights reserved.
|
||||||
|
* Use is subject to license terms.
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 3 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* moinakg@belenix.org, http://moinakg.wordpress.com/
|
||||||
|
*
|
||||||
|
* This RLE encoder is a simple approach to encode long runs of '0'
|
||||||
|
* bytes that typically are found in a bsdiff patch output. This
|
||||||
|
* does not encode repeating runs of other characters.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <utils.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
/*
 * Every run starts with a 16-bit big-endian header. The top bit marks a
 * run of zero bytes (no payload follows); the low 15 bits are the run
 * length, so a single run covers at most COUNT_MAX bytes.
 */
#define	ZERO_MASK	(32768)
#define	DATA_MASK	(32767)
#define	COUNT_MAX	(32767)

/* Store the low 16 bits of val at dst, most significant byte first. */
static void
rle_put_be16(unsigned char *dst, unsigned int val)
{
	dst[0] = (unsigned char)((val >> 8) & 0xff);
	dst[1] = (unsigned char)(val & 0xff);
}

/*
 * Encode ibuf[0 .. ilen) into obuf, collapsing runs of zero bytes.
 *
 * The output is a sequence of runs: a zero run is just a header with
 * ZERO_MASK set; a literal run is a header followed by that many bytes
 * copied verbatim. A literal run absorbs short zero runs and is only
 * interrupted when at least 4 consecutive zeroes follow.
 *
 * On entry *olen is the capacity of obuf; on return it is the number of
 * bytes written. Nothing is ever written beyond the given capacity.
 *
 * Returns 0 when all input was encoded, -1 when obuf was too small.
 */
int
zero_rle_encode(const void *const ibuf, const unsigned int ilen,
	void *obuf, unsigned int *const olen)
{
	const unsigned char *const ib = ibuf;
	unsigned char *ob = obuf;
	const unsigned int cap = *olen;
	unsigned int pos1, pos2;
	unsigned int count;

	pos1 = 0;
	pos2 = 0;
	while (pos1 < ilen && pos2 < cap) {
		/* Every run needs a 2-byte header; stop if it does not fit. */
		if (cap - pos2 < 2)
			break;

		count = 0;
		if (ib[pos1] == 0) {
			while (pos1 < ilen && ib[pos1] == 0 && count < COUNT_MAX) {
				pos1++;
				count++;
			}
			rle_put_be16(ob + pos2, count | ZERO_MASK);
			pos2 += 2;
		} else {
			unsigned int hdr, ahead, zeros, in_zero_run;

			/* Reserve the header; the count is filled in after the copy. */
			hdr = pos2;
			pos2 += 2;

			in_zero_run = 0;
			while (pos1 < ilen && pos2 < cap && count < COUNT_MAX) {
				if (ib[pos1] != 0) {
					in_zero_run = 0;
				} else if (!in_zero_run) {
					/*
					 * First zero of a run: look ahead to see whether
					 * at least 4 consecutive zeroes follow. If so,
					 * end the literal run here so they get their own
					 * zero run; otherwise copy them as literals.
					 */
					in_zero_run = 1;
					zeros = 0;
					for (ahead = pos1;
					    ahead < ilen && ib[ahead] == 0 && zeros < 4;
					    ahead++)
						zeros++;
					if (zeros >= 4)
						break;
				}
				ob[pos2++] = ib[pos1++];
				count++;
			}
			rle_put_be16(ob + hdr, count);
		}
	}
	*olen = pos2;

	return ((pos1 < ilen) ? -1 : 0);
}
|
||||||
|
|
||||||
|
int
|
||||||
|
zero_rle_decode(const void* ibuf, unsigned int ilen,
|
||||||
|
void* obuf, unsigned int *olen)
|
||||||
|
{
|
||||||
|
unsigned int pos1, pos2, i;
|
||||||
|
unsigned short count;
|
||||||
|
const uchar_t *ib = ibuf;
|
||||||
|
uchar_t *ob = obuf;
|
||||||
|
|
||||||
|
pos2 = 0;
|
||||||
|
pos1 = 0;
|
||||||
|
for (; pos1<ilen && pos2<*olen;) {
|
||||||
|
count = ntohs(*((unsigned short *)(ib + pos1)));
|
||||||
|
pos1 += 2;
|
||||||
|
if (count & ZERO_MASK) {
|
||||||
|
count &= DATA_MASK;
|
||||||
|
for (i=0; i<count && pos2<*olen; i++)
|
||||||
|
ob[pos2++] = 0;
|
||||||
|
} else {
|
||||||
|
for (i=0; i<count && pos1<ilen && pos2<*olen; i++)
|
||||||
|
ob[pos2++] = ib[pos1++];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
i = *olen;
|
||||||
|
*olen = pos2;
|
||||||
|
if (pos1 < ilen || pos2 < i) {
|
||||||
|
return (-1);
|
||||||
|
} else {
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
26
main.c
26
main.c
|
@ -78,6 +78,7 @@ static int nthreads = 0;
|
||||||
static int hide_mem_stats = 1;
|
static int hide_mem_stats = 1;
|
||||||
static int hide_cmp_stats = 1;
|
static int hide_cmp_stats = 1;
|
||||||
static int enable_rabin_scan = 0;
|
static int enable_rabin_scan = 0;
|
||||||
|
static int enable_delta_encode = 0;
|
||||||
static int enable_rabin_split = 1;
|
static int enable_rabin_split = 1;
|
||||||
static unsigned int chunk_num;
|
static unsigned int chunk_num;
|
||||||
static uint64_t largest_chunk, smallest_chunk, avg_chunk;
|
static uint64_t largest_chunk, smallest_chunk, avg_chunk;
|
||||||
|
@ -118,10 +119,12 @@ usage(void)
|
||||||
"4) Attempt Rabin fingerprinting based deduplication on chunks:\n"
|
"4) Attempt Rabin fingerprinting based deduplication on chunks:\n"
|
||||||
" %s -D ...\n"
|
" %s -D ...\n"
|
||||||
" %s -D -r ... - Do NOT split chunks at a rabin boundary. Default is to split.\n"
|
" %s -D -r ... - Do NOT split chunks at a rabin boundary. Default is to split.\n"
|
||||||
"5) Number of threads can optionally be specified: -t <1 - 256 count>\n"
|
"5) Perform Delta Encoding in addition to Exact Dedup:\n"
|
||||||
"6) Pass '-M' to display memory allocator statistics\n"
|
" %s -E ... - This also implies '-D'.\n"
|
||||||
"7) Pass '-C' to display compression statistics\n\n",
|
"6) Number of threads can optionally be specified: -t <1 - 256 count>\n"
|
||||||
exec_name, exec_name, exec_name, exec_name);
|
"7) Pass '-M' to display memory allocator statistics\n"
|
||||||
|
"8) Pass '-C' to display compression statistics\n\n",
|
||||||
|
exec_name, exec_name, exec_name, exec_name, exec_name, exec_name);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
@ -436,7 +439,8 @@ start_decompress(const char *filename, const char *to_filename)
|
||||||
if (_init_func)
|
if (_init_func)
|
||||||
_init_func(&(tdat->data), &(tdat->level), chunksize);
|
_init_func(&(tdat->data), &(tdat->level), chunksize);
|
||||||
if (enable_rabin_scan)
|
if (enable_rabin_scan)
|
||||||
tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, algo);
|
tdat->rctx = create_rabin_context(chunksize, compressed_chunksize,
|
||||||
|
algo, enable_delta_encode);
|
||||||
else
|
else
|
||||||
tdat->rctx = NULL;
|
tdat->rctx = NULL;
|
||||||
if (pthread_create(&(tdat->thr), NULL, perform_decompress,
|
if (pthread_create(&(tdat->thr), NULL, perform_decompress,
|
||||||
|
@ -905,7 +909,8 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
||||||
if (_init_func)
|
if (_init_func)
|
||||||
_init_func(&(tdat->data), &(tdat->level), chunksize);
|
_init_func(&(tdat->data), &(tdat->level), chunksize);
|
||||||
if (enable_rabin_scan)
|
if (enable_rabin_scan)
|
||||||
tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, algo);
|
tdat->rctx = create_rabin_context(chunksize, compressed_chunksize,
|
||||||
|
algo, enable_delta_encode);
|
||||||
else
|
else
|
||||||
tdat->rctx = NULL;
|
tdat->rctx = NULL;
|
||||||
|
|
||||||
|
@ -965,7 +970,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
||||||
* Read the first chunk into a spare buffer (a simple double-buffering).
|
* Read the first chunk into a spare buffer (a simple double-buffering).
|
||||||
*/
|
*/
|
||||||
if (enable_rabin_split) {
|
if (enable_rabin_split) {
|
||||||
rctx = create_rabin_context(chunksize, 0, algo);
|
rctx = create_rabin_context(chunksize, 0, algo, enable_delta_encode);
|
||||||
rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx);
|
rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx);
|
||||||
} else {
|
} else {
|
||||||
rbytes = Read(uncompfd, cread_buf, chunksize);
|
rbytes = Read(uncompfd, cread_buf, chunksize);
|
||||||
|
@ -1203,7 +1208,7 @@ main(int argc, char *argv[])
|
||||||
level = 6;
|
level = 6;
|
||||||
slab_init();
|
slab_init();
|
||||||
|
|
||||||
while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDr")) != -1) {
|
while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDEr")) != -1) {
|
||||||
int ovr;
|
int ovr;
|
||||||
|
|
||||||
switch (opt) {
|
switch (opt) {
|
||||||
|
@ -1259,6 +1264,11 @@ main(int argc, char *argv[])
|
||||||
enable_rabin_scan = 1;
|
enable_rabin_scan = 1;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case 'E':
|
||||||
|
enable_rabin_scan = 1;
|
||||||
|
enable_delta_encode = 1;
|
||||||
|
break;
|
||||||
|
|
||||||
case 'r':
|
case 'r':
|
||||||
enable_rabin_split = 0;
|
enable_rabin_split = 0;
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -1,9 +1,12 @@
|
||||||
/*
|
/*
|
||||||
* rabin_polynomial.c
|
* rabin_polynomial.c
|
||||||
*
|
*
|
||||||
* Created by Joel Lawrence Tucci on 09-March-2011.
|
* The rabin polynomial computation is derived from:
|
||||||
|
* http://code.google.com/p/rabin-fingerprint-c/
|
||||||
*
|
*
|
||||||
* Copyright (c) 2011 Joel Lawrence Tucci
|
* originally created by Joel Lawrence Tucci on 09-March-2011.
|
||||||
|
*
|
||||||
|
* Rabin polynomial portions Copyright (c) 2011 Joel Lawrence Tucci
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
@ -70,6 +73,11 @@ extern int lzma_compress(void *src, size_t srclen, void *dst,
|
||||||
extern int lzma_decompress(void *src, size_t srclen, void *dst,
|
extern int lzma_decompress(void *src, size_t srclen, void *dst,
|
||||||
size_t *dstlen, int level, uchar_t chdr, void *data);
|
size_t *dstlen, int level, uchar_t chdr, void *data);
|
||||||
extern int lzma_deinit(void **data);
|
extern int lzma_deinit(void **data);
|
||||||
|
extern int bsdiff(u_char *old, bsize_t oldsize, u_char *new, bsize_t newsize,
|
||||||
|
u_char *diff, u_char *scratch, bsize_t scratchsize);
|
||||||
|
extern bsize_t get_bsdiff_sz(u_char *pbuf);
|
||||||
|
extern int bspatch(u_char *pbuf, u_char *old, bsize_t oldsize, u_char *new,
|
||||||
|
bsize_t *_newsize);
|
||||||
|
|
||||||
uint32_t rabin_polynomial_max_block_size = RAB_POLYNOMIAL_MAX_BLOCK_SIZE;
|
uint32_t rabin_polynomial_max_block_size = RAB_POLYNOMIAL_MAX_BLOCK_SIZE;
|
||||||
|
|
||||||
|
@ -77,11 +85,10 @@ uint32_t rabin_polynomial_max_block_size = RAB_POLYNOMIAL_MAX_BLOCK_SIZE;
|
||||||
* Initialize the algorithm with the default params.
|
* Initialize the algorithm with the default params.
|
||||||
*/
|
*/
|
||||||
rabin_context_t *
|
rabin_context_t *
|
||||||
create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *algo) {
|
create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *algo, int delta_flag) {
|
||||||
rabin_context_t *ctx;
|
rabin_context_t *ctx;
|
||||||
unsigned char *current_window_data;
|
unsigned char *current_window_data;
|
||||||
uint32_t blknum;
|
uint32_t blknum;
|
||||||
int level = 14;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Rabin window size must be power of 2 for optimization.
|
* Rabin window size must be power of 2 for optimization.
|
||||||
|
@ -90,14 +97,22 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *al
|
||||||
fprintf(stderr, "Rabin window size must be a power of 2 in range 4 <= x <= 64\n");
|
fprintf(stderr, "Rabin window size must be a power of 2 in range 4 <= x <= 64\n");
|
||||||
return (NULL);
|
return (NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (chunksize < RAB_MIN_CHUNK_SIZE) {
|
||||||
|
fprintf(stderr, "Minimum chunk size for Dedup must be %l bytes\n",
|
||||||
|
RAB_MIN_CHUNK_SIZE);
|
||||||
|
return (NULL);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* For LZMA with chunksize <= LZMA Window size we use 4K minimum Rabin
|
* For LZMA with chunksize <= LZMA Window size and/or Delta enabled we
|
||||||
* block size. For everything else it is 1K based on experimentation.
|
* use 4K minimum Rabin block size. For everything else it is 2K based
|
||||||
|
* on experimentation.
|
||||||
*/
|
*/
|
||||||
ctx = (rabin_context_t *)slab_alloc(NULL, sizeof (rabin_context_t));
|
ctx = (rabin_context_t *)slab_alloc(NULL, sizeof (rabin_context_t));
|
||||||
ctx->rabin_poly_max_block_size = RAB_POLYNOMIAL_MAX_BLOCK_SIZE;
|
ctx->rabin_poly_max_block_size = RAB_POLYNOMIAL_MAX_BLOCK_SIZE;
|
||||||
if ((memcmp(algo, "lzma", 4) == 0 || memcmp(algo, "adapt", 5) == 0) &&
|
if (((memcmp(algo, "lzma", 4) == 0 || memcmp(algo, "adapt", 5) == 0) &&
|
||||||
chunksize <= LZMA_WINDOW_MAX) {
|
chunksize <= LZMA_WINDOW_MAX) || delta_flag) {
|
||||||
ctx->rabin_poly_min_block_size = RAB_POLYNOMIAL_MIN_BLOCK_SIZE;
|
ctx->rabin_poly_min_block_size = RAB_POLYNOMIAL_MIN_BLOCK_SIZE;
|
||||||
ctx->rabin_avg_block_mask = RAB_POLYNOMIAL_AVG_BLOCK_MASK;
|
ctx->rabin_avg_block_mask = RAB_POLYNOMIAL_AVG_BLOCK_MASK;
|
||||||
ctx->rabin_poly_avg_block_size = RAB_POLYNOMIAL_AVG_BLOCK_SIZE;
|
ctx->rabin_poly_avg_block_size = RAB_POLYNOMIAL_AVG_BLOCK_SIZE;
|
||||||
|
@ -132,11 +147,12 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *al
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx->lzma_data = NULL;
|
ctx->lzma_data = NULL;
|
||||||
|
ctx->level = 14;
|
||||||
if (real_chunksize > 0) {
|
if (real_chunksize > 0) {
|
||||||
lzma_init(&(ctx->lzma_data), &(ctx->level), chunksize);
|
lzma_init(&(ctx->lzma_data), &(ctx->level), chunksize);
|
||||||
if (!(ctx->lzma_data)) {
|
if (!(ctx->lzma_data)) {
|
||||||
fprintf(stderr,
|
fprintf(stderr,
|
||||||
"Could not allocate rabin polynomial context, out of memory\n");
|
"Could not initialize LZMA data for rabin index, out of memory\n");
|
||||||
destroy_rabin_context(ctx);
|
destroy_rabin_context(ctx);
|
||||||
return (NULL);
|
return (NULL);
|
||||||
}
|
}
|
||||||
|
@ -154,6 +170,7 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *al
|
||||||
|
|
||||||
ctx->current_window_data = current_window_data;
|
ctx->current_window_data = current_window_data;
|
||||||
ctx->real_chunksize = real_chunksize;
|
ctx->real_chunksize = real_chunksize;
|
||||||
|
ctx->delta_flag = delta_flag;
|
||||||
reset_rabin_context(ctx);
|
reset_rabin_context(ctx);
|
||||||
return (ctx);
|
return (ctx);
|
||||||
}
|
}
|
||||||
|
@ -185,12 +202,24 @@ cmpblks(const void *a, const void *b)
|
||||||
rabin_blockentry_t *a1 = (rabin_blockentry_t *)a;
|
rabin_blockentry_t *a1 = (rabin_blockentry_t *)a;
|
||||||
rabin_blockentry_t *b1 = (rabin_blockentry_t *)b;
|
rabin_blockentry_t *b1 = (rabin_blockentry_t *)b;
|
||||||
|
|
||||||
if (a1->cksum_n_offset < b1->cksum_n_offset)
|
if (a1->cksum_n_offset < b1->cksum_n_offset) {
|
||||||
return (-1);
|
return (-1);
|
||||||
else if (a1->cksum_n_offset == b1->cksum_n_offset)
|
} else if (a1->cksum_n_offset == b1->cksum_n_offset) {
|
||||||
|
/*
|
||||||
|
* If fingerprints match then compare lengths. Length match makes
|
||||||
|
* for strong exact detection/ordering during sort while stopping
|
||||||
|
* short of expensive memcmp().
|
||||||
|
*/
|
||||||
|
if (a1->length < b1->length) {
|
||||||
|
return (-1);
|
||||||
|
} else if (a1->length == b1->length) {
|
||||||
return (0);
|
return (0);
|
||||||
else if (a1->cksum_n_offset > b1->cksum_n_offset)
|
} else if (a1->length > b1->length) {
|
||||||
return (1);
|
return (1);
|
||||||
|
}
|
||||||
|
} else if (a1->cksum_n_offset > b1->cksum_n_offset) {
|
||||||
|
return (1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -200,19 +229,32 @@ cmpblks(const void *a, const void *b)
|
||||||
uint32_t
|
uint32_t
|
||||||
rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, ssize_t *rabin_pos)
|
rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, ssize_t *rabin_pos)
|
||||||
{
|
{
|
||||||
ssize_t i, last_offset, j;
|
ssize_t i, last_offset, j, fplist_sz;
|
||||||
uint32_t blknum;
|
uint32_t blknum;
|
||||||
char *buf1 = (char *)buf;
|
char *buf1 = (char *)buf;
|
||||||
uint32_t length;
|
uint32_t length;
|
||||||
uint64_t cur_roll_checksum[2];
|
uint64_t cur_roll_checksum, cur_sketch;
|
||||||
|
uint64_t *fplist;
|
||||||
|
uint32_t len1, fpos;
|
||||||
|
|
||||||
|
if (rabin_pos == NULL) {
|
||||||
|
/*
|
||||||
|
* Initialize arrays for sketch computation. We re-use memory allocated
|
||||||
|
* for the compressed chunk temporarily.
|
||||||
|
*/
|
||||||
|
fplist_sz = 8 * ctx->rabin_poly_avg_block_size;
|
||||||
|
fplist = (uint64_t *)(ctx->cbuf + ctx->real_chunksize - fplist_sz);
|
||||||
|
memset(fplist, 0, fplist_sz);
|
||||||
|
fpos = 0;
|
||||||
|
len1 = 0;
|
||||||
|
}
|
||||||
length = offset;
|
length = offset;
|
||||||
last_offset = 0;
|
last_offset = 0;
|
||||||
blknum = 0;
|
blknum = 0;
|
||||||
ctx->valid = 0;
|
ctx->valid = 0;
|
||||||
cur_roll_checksum[0] = 0;
|
cur_roll_checksum = 0;
|
||||||
cur_roll_checksum[1] = 0;
|
|
||||||
j = 0;
|
j = 0;
|
||||||
|
cur_sketch = 0;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If rabin_pos is non-zero then we are being asked to scan for the last rabin boundary
|
* If rabin_pos is non-zero then we are being asked to scan for the last rabin boundary
|
||||||
|
@ -234,13 +276,39 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
||||||
*
|
*
|
||||||
* However since RAB_POLYNOMIAL_CONST == 2, we use shifts.
|
* However since RAB_POLYNOMIAL_CONST == 2, we use shifts.
|
||||||
*/
|
*/
|
||||||
cur_roll_checksum[1] = (cur_roll_checksum[1] << 1) + cur_byte;
|
cur_roll_checksum = (cur_roll_checksum << 1) + cur_byte;
|
||||||
cur_roll_checksum[1] -= (pushed_out << RAB_POLYNOMIAL_WIN_SIZE);
|
cur_roll_checksum -= (pushed_out << RAB_POLYNOMIAL_WIN_SIZE);
|
||||||
|
|
||||||
// Compute Sum 0 mod 25 Sketch. We are avoiding a branch here.
|
|
||||||
// See: http://www.armedia.com/wp/SimilarityIndex.pdf
|
|
||||||
j += cur_roll_checksum[(cur_roll_checksum[1] % 25 == 0)];
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Compute a super sketch value of the block. We store a sum of relative
|
||||||
|
* maximal rabin hash values per 1K(SKETCH_BASIC_BLOCK_SZ) of data. So we
|
||||||
|
* get up to 128 sums for a max block size of 128K. This is a representative
|
||||||
|
* fingerprint sketch of the block. Storing and comparing up to 128 fingerprints
|
||||||
|
* per block is very expensive (compute & RAM) so we eventually sum all the
|
||||||
|
* fingerprints for the block to create a single super sketch value representing
|
||||||
|
* maximal features of the block.
|
||||||
|
*
|
||||||
|
* This value can be used for similarity detection for delta encoding. Exact
|
||||||
|
* match for deduplication is additionally detected via a memcmp(). This is a
|
||||||
|
* variant of some approaches detailed in:
|
||||||
|
* http://www.armedia.com/wp/SimilarityIndex.pdf
|
||||||
|
*/
|
||||||
|
if (rabin_pos == NULL) {
|
||||||
|
len1++;
|
||||||
|
j = cur_roll_checksum & ctx->rabin_avg_block_mask;
|
||||||
|
fplist[j] += cur_roll_checksum;
|
||||||
|
if (fplist[j] > fplist[fpos]) fpos = j;
|
||||||
|
if (len1 == SKETCH_BASIC_BLOCK_SZ) {
|
||||||
|
/*
|
||||||
|
* Compute the super sketch value by summing all the representative
|
||||||
|
* fingerprints of the block.
|
||||||
|
*/
|
||||||
|
cur_sketch += fplist[fpos];
|
||||||
|
memset(fplist, 0, fplist_sz);
|
||||||
|
fpos = 0;
|
||||||
|
len1 = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
/*
|
/*
|
||||||
* Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1
|
* Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1
|
||||||
* We avoid a branch here by masking. This requires RAB_POLYNOMIAL_WIN_SIZE
|
* We avoid a branch here by masking. This requires RAB_POLYNOMIAL_WIN_SIZE
|
||||||
|
@ -252,14 +320,19 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
||||||
if (length < ctx->rabin_poly_min_block_size) continue;
|
if (length < ctx->rabin_poly_min_block_size) continue;
|
||||||
|
|
||||||
// If we hit our special value or reached the max block size update block offset
|
// If we hit our special value or reached the max block size update block offset
|
||||||
if ((cur_roll_checksum[1] & ctx->rabin_avg_block_mask) == ctx->rabin_break_patt ||
|
if ((cur_roll_checksum & ctx->rabin_avg_block_mask) == ctx->rabin_break_patt ||
|
||||||
length >= rabin_polynomial_max_block_size) {
|
length >= rabin_polynomial_max_block_size) {
|
||||||
if (rabin_pos == NULL) {
|
if (rabin_pos == NULL) {
|
||||||
ctx->blocks[blknum].offset = last_offset;
|
ctx->blocks[blknum].offset = last_offset;
|
||||||
ctx->blocks[blknum].index = blknum; // Need to store for sorting
|
ctx->blocks[blknum].index = blknum; // Need to store for sorting
|
||||||
ctx->blocks[blknum].cksum_n_offset = j;
|
|
||||||
ctx->blocks[blknum].length = length;
|
ctx->blocks[blknum].length = length;
|
||||||
ctx->blocks[blknum].refcount = 0;
|
ctx->blocks[blknum].refcount = 0;
|
||||||
|
ctx->blocks[blknum].similar = 0;
|
||||||
|
ctx->blocks[blknum].cksum_n_offset = cur_sketch;
|
||||||
|
memset(fplist, 0, fplist_sz);
|
||||||
|
fpos = 0;
|
||||||
|
len1 = 0;
|
||||||
|
cur_sketch = 0;
|
||||||
blknum++;
|
blknum++;
|
||||||
}
|
}
|
||||||
last_offset = i+1;
|
last_offset = i+1;
|
||||||
|
@ -287,9 +360,10 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
||||||
if (last_offset < *size) {
|
if (last_offset < *size) {
|
||||||
ctx->blocks[blknum].offset = last_offset;
|
ctx->blocks[blknum].offset = last_offset;
|
||||||
ctx->blocks[blknum].index = blknum;
|
ctx->blocks[blknum].index = blknum;
|
||||||
ctx->blocks[blknum].cksum_n_offset = j;
|
|
||||||
ctx->blocks[blknum].length = *size - last_offset;
|
ctx->blocks[blknum].length = *size - last_offset;
|
||||||
ctx->blocks[blknum].refcount = 0;
|
ctx->blocks[blknum].refcount = 0;
|
||||||
|
ctx->blocks[blknum].similar = 0;
|
||||||
|
ctx->blocks[blknum].cksum_n_offset = cur_sketch;
|
||||||
blknum++;
|
blknum++;
|
||||||
last_offset = *size;
|
last_offset = *size;
|
||||||
}
|
}
|
||||||
|
@ -302,8 +376,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
||||||
/*
|
/*
|
||||||
* Now sort the block array based on checksums. This will bring virtually
|
* Now sort the block array based on checksums. This will bring virtually
|
||||||
* all similar block entries together. Effectiveness depends on how strong
|
* all similar block entries together. Effectiveness depends on how strong
|
||||||
* our checksum is. We are using CRC64 here so we should be pretty okay.
|
* our checksum is. We are using a maximal super-sketch value.
|
||||||
* TODO: Test with a heavily optimized MD5 (from OpenSSL?) later.
|
|
||||||
*/
|
*/
|
||||||
qsort(ctx->blocks, blknum, sizeof (rabin_blockentry_t), cmpblks);
|
qsort(ctx->blocks, blknum, sizeof (rabin_blockentry_t), cmpblks);
|
||||||
rabin_index = (uint32_t *)(ctx->cbuf + RABIN_HDR_SIZE);
|
rabin_index = (uint32_t *)(ctx->cbuf + RABIN_HDR_SIZE);
|
||||||
|
@ -332,7 +405,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
||||||
if (blk > 0 && ctx->blocks[blk].cksum_n_offset == prev_cksum &&
|
if (blk > 0 && ctx->blocks[blk].cksum_n_offset == prev_cksum &&
|
||||||
ctx->blocks[blk].length == prev_length &&
|
ctx->blocks[blk].length == prev_length &&
|
||||||
memcmp(prev_offset, buf1 + ctx->blocks[blk].offset, prev_length) == 0) {
|
memcmp(prev_offset, buf1 + ctx->blocks[blk].offset, prev_length) == 0) {
|
||||||
ctx->blocks[blk].length = 0;
|
ctx->blocks[blk].similar = SIMILAR_EXACT;
|
||||||
ctx->blocks[blk].index = prev_index;
|
ctx->blocks[blk].index = prev_index;
|
||||||
(ctx->blocks[prev_blk].refcount)++;
|
(ctx->blocks[prev_blk].refcount)++;
|
||||||
matchlen += prev_length;
|
matchlen += prev_length;
|
||||||
|
@ -344,10 +417,32 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
||||||
prev_index = ctx->blocks[blk].index;
|
prev_index = ctx->blocks[blk].index;
|
||||||
prev_blk = blk;
|
prev_blk = blk;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (ctx->delta_flag) {
|
||||||
|
for (blk = 0; blk < blknum; blk++) {
|
||||||
|
if (ctx->blocks[blk].similar) continue;
|
||||||
|
|
||||||
|
if (blk > 0 && ctx->blocks[blk].refcount == 0 &&
|
||||||
|
ctx->blocks[blk].cksum_n_offset == prev_cksum) {
|
||||||
|
ssize_t sz1, sz2;
|
||||||
|
ctx->blocks[blk].index = prev_index;
|
||||||
|
ctx->blocks[blk].similar = SIMILAR_PARTIAL;
|
||||||
|
(ctx->blocks[prev_blk].refcount)++;
|
||||||
|
matchlen += prev_length/2;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
prev_offset = buf1 + ctx->blocks[blk].offset;
|
||||||
|
prev_cksum = ctx->blocks[blk].cksum_n_offset;
|
||||||
|
prev_length = ctx->blocks[blk].length;
|
||||||
|
prev_index = ctx->blocks[blk].index;
|
||||||
|
prev_blk = blk;
|
||||||
|
}
|
||||||
|
}
|
||||||
if (matchlen < rabin_index_sz) {
|
if (matchlen < rabin_index_sz) {
|
||||||
ctx->valid = 0;
|
ctx->valid = 0;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Another pass, this time through the block index in the chunk. We insert
|
* Another pass, this time through the block index in the chunk. We insert
|
||||||
* block length into unique block entries. For block entries that are
|
* block length into unique block entries. For block entries that are
|
||||||
|
@ -362,11 +457,12 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
||||||
rabin_blockentry_t *be;
|
rabin_blockentry_t *be;
|
||||||
|
|
||||||
be = &(ctx->blocks[blkarr[blk]]);
|
be = &(ctx->blocks[blkarr[blk]]);
|
||||||
if (be->length > 0) {
|
if (be->similar == 0) {
|
||||||
/*
|
/*
|
||||||
* Update Index entry with the length. Also try to merge runs
|
* Update Index entry with the length. Also try to merge runs
|
||||||
* of unique (non-duplicate) blocks into a single block entry
|
* of unique (non-duplicate/similar) blocks into a single block
|
||||||
* as long as the total length does not exceed max block size.
|
* entry as long as the total length does not exceed max block
|
||||||
|
* size.
|
||||||
*/
|
*/
|
||||||
if (prev_index == 0) {
|
if (prev_index == 0) {
|
||||||
if (be->refcount == 0) {
|
if (be->refcount == 0) {
|
||||||
|
@ -402,23 +498,31 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
||||||
} else {
|
} else {
|
||||||
prev_index = 0;
|
prev_index = 0;
|
||||||
prev_length = 0;
|
prev_length = 0;
|
||||||
rabin_index[pos] = be->index | RABIN_INDEX_FLAG;
|
ctx->blocks[pos].cksum_n_offset = be->offset;
|
||||||
|
ctx->blocks[pos].new_length = be->length;
|
||||||
trans[blk] = pos;
|
trans[blk] = pos;
|
||||||
|
|
||||||
|
if (be->similar == SIMILAR_EXACT) {
|
||||||
|
rabin_index[pos] = (blkarr[be->index] | RABIN_INDEX_FLAG) &
|
||||||
|
CLEAR_SIMILARITY_FLAG;
|
||||||
|
} else {
|
||||||
|
rabin_index[pos] = blkarr[be->index] | RABIN_INDEX_FLAG |
|
||||||
|
SET_SIMILARITY_FLAG;
|
||||||
|
}
|
||||||
pos++;
|
pos++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Final pass, copy the data.
|
* Final pass, copy the data and perform delta encoding.
|
||||||
*/
|
*/
|
||||||
blknum = pos;
|
blknum = pos;
|
||||||
rabin_index_sz = (ssize_t)pos * RABIN_ENTRY_SIZE;
|
rabin_index_sz = (ssize_t)pos * RABIN_ENTRY_SIZE;
|
||||||
pos1 = rabin_index_sz + RABIN_HDR_SIZE;
|
pos1 = rabin_index_sz + RABIN_HDR_SIZE;
|
||||||
for (blk = 0; blk < blknum; blk++) {
|
for (blk = 0; blk < blknum; blk++) {
|
||||||
if (rabin_index[blk] & RABIN_INDEX_FLAG) {
|
uchar_t *old, *new;
|
||||||
j = rabin_index[blk] & RABIN_INDEX_VALUE;
|
int32_t bsz;
|
||||||
rabin_index[blk] = htonl(trans[j] | RABIN_INDEX_FLAG);
|
|
||||||
} else {
|
|
||||||
/*
|
/*
|
||||||
* If blocks are overflowing the allowed chunk size then dedup did not
|
* If blocks are overflowing the allowed chunk size then dedup did not
|
||||||
* help at all. We invalidate the dedup operation.
|
* help at all. We invalidate the dedup operation.
|
||||||
|
@ -427,7 +531,30 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
||||||
valid = 0;
|
valid = 0;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
memcpy(ctx->cbuf + pos1, buf1 + ctx->blocks[blk].cksum_n_offset, rabin_index[blk]);
|
if (rabin_index[blk] & RABIN_INDEX_FLAG) {
|
||||||
|
j = rabin_index[blk] & RABIN_INDEX_VALUE;
|
||||||
|
i = ctx->blocks[j].index;
|
||||||
|
|
||||||
|
if (rabin_index[blk] & GET_SIMILARITY_FLAG) {
|
||||||
|
old = buf1 + ctx->blocks[j].offset;
|
||||||
|
new = buf1 + ctx->blocks[blk].cksum_n_offset;
|
||||||
|
bsz = bsdiff(old, ctx->blocks[j].length, new,
|
||||||
|
ctx->blocks[blk].new_length, ctx->cbuf + pos1, 0, 0);
|
||||||
|
if (bsz == 0) {
|
||||||
|
memcpy(ctx->cbuf + pos1, new, ctx->blocks[blk].new_length);
|
||||||
|
rabin_index[blk] = htonl(ctx->blocks[blk].new_length);
|
||||||
|
pos1 += ctx->blocks[blk].new_length;
|
||||||
|
} else {
|
||||||
|
rabin_index[blk] = htonl(trans[i] |
|
||||||
|
RABIN_INDEX_FLAG | SET_SIMILARITY_FLAG);
|
||||||
|
pos1 += bsz;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
rabin_index[blk] = htonl(trans[i] | RABIN_INDEX_FLAG);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
memcpy(ctx->cbuf + pos1, buf1 + ctx->blocks[blk].cksum_n_offset,
|
||||||
|
rabin_index[blk]);
|
||||||
pos1 += rabin_index[blk];
|
pos1 += rabin_index[blk];
|
||||||
rabin_index[blk] = htonl(rabin_index[blk]);
|
rabin_index[blk] = htonl(rabin_index[blk]);
|
||||||
}
|
}
|
||||||
|
@ -512,29 +639,66 @@ rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size)
|
||||||
ctx->blocks[blk].offset = pos1;
|
ctx->blocks[blk].offset = pos1;
|
||||||
pos1 += len;
|
pos1 += len;
|
||||||
} else {
|
} else {
|
||||||
|
bsize_t blen;
|
||||||
|
|
||||||
ctx->blocks[blk].length = 0;
|
ctx->blocks[blk].length = 0;
|
||||||
|
if (len & GET_SIMILARITY_FLAG) {
|
||||||
|
ctx->blocks[blk].offset = pos1;
|
||||||
|
ctx->blocks[blk].index = (len & RABIN_INDEX_VALUE) | SET_SIMILARITY_FLAG;
|
||||||
|
blen = get_bsdiff_sz(buf + pos1);
|
||||||
|
pos1 += blen;
|
||||||
|
} else {
|
||||||
ctx->blocks[blk].index = len & RABIN_INDEX_VALUE;
|
ctx->blocks[blk].index = len & RABIN_INDEX_VALUE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for (blk = 0; blk < blknum; blk++) {
|
for (blk = 0; blk < blknum; blk++) {
|
||||||
|
int rv;
|
||||||
|
bsize_t newsz;
|
||||||
|
|
||||||
if (ctx->blocks[blk].length == 0 && ctx->blocks[blk].index == 0) continue;
|
if (ctx->blocks[blk].length == 0 && ctx->blocks[blk].index == 0) continue;
|
||||||
if (ctx->blocks[blk].length > 0) {
|
if (ctx->blocks[blk].length > 0) {
|
||||||
len = ctx->blocks[blk].length;
|
len = ctx->blocks[blk].length;
|
||||||
pos1 = ctx->blocks[blk].offset;
|
pos1 = ctx->blocks[blk].offset;
|
||||||
} else {
|
} else {
|
||||||
oblk = ctx->blocks[blk].index;
|
oblk = ctx->blocks[blk].index;
|
||||||
|
|
||||||
|
if (oblk & GET_SIMILARITY_FLAG) {
|
||||||
|
oblk = oblk & CLEAR_SIMILARITY_FLAG;
|
||||||
len = ctx->blocks[oblk].length;
|
len = ctx->blocks[oblk].length;
|
||||||
pos1 = ctx->blocks[oblk].offset;
|
pos1 = ctx->blocks[oblk].offset;
|
||||||
|
newsz = data_sz - sz;
|
||||||
|
rv = bspatch(buf + ctx->blocks[blk].offset, buf + pos1, len, pos2, &newsz);
|
||||||
|
if (rv == 0) {
|
||||||
|
fprintf(stderr, "Failed to bspatch block.\n");
|
||||||
|
ctx->valid = 0;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
pos2 += newsz;
|
||||||
|
sz += newsz;
|
||||||
|
if (sz > data_sz) {
|
||||||
|
fprintf(stderr, "Dedup data overflows chunk.\n");
|
||||||
|
ctx->valid = 0;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
} else {
|
||||||
|
len = ctx->blocks[oblk].length;
|
||||||
|
pos1 = ctx->blocks[oblk].offset;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
memcpy(pos2, buf + pos1, len);
|
memcpy(pos2, buf + pos1, len);
|
||||||
pos2 += len;
|
pos2 += len;
|
||||||
sz += len;
|
sz += len;
|
||||||
if (sz > data_sz) {
|
if (sz > data_sz) {
|
||||||
|
fprintf(stderr, "Dedup data overflows chunk.\n");
|
||||||
ctx->valid = 0;
|
ctx->valid = 0;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (ctx->valid && sz < data_sz) {
|
if (ctx->valid && sz < data_sz) {
|
||||||
|
fprintf(stderr, "Too little dedup data processed.\n");
|
||||||
ctx->valid = 0;
|
ctx->valid = 0;
|
||||||
}
|
}
|
||||||
*size = data_sz;
|
*size = data_sz;
|
||||||
|
|
|
@ -84,19 +84,15 @@
|
||||||
#define RAB_POLYNOMIAL_MIN_WIN_SIZE 8
|
#define RAB_POLYNOMIAL_MIN_WIN_SIZE 8
|
||||||
#define RAB_POLYNOMIAL_MAX_WIN_SIZE 64
|
#define RAB_POLYNOMIAL_MAX_WIN_SIZE 64
|
||||||
|
|
||||||
typedef struct {
|
// Minimum practical chunk size when doing dedup
|
||||||
ssize_t offset;
|
#define RAB_MIN_CHUNK_SIZE (1048576L)
|
||||||
uint64_t cksum_n_offset; // Dual purpose variable
|
|
||||||
unsigned int index;
|
// Number of bytes to compute one maximal fingerprint value
|
||||||
unsigned int length;
|
#define SKETCH_BASIC_BLOCK_SZ (1024)
|
||||||
unsigned short refcount;
|
|
||||||
} rabin_blockentry_t;
|
|
||||||
|
|
||||||
// An entry in the Rabin block array in the chunk.
|
// An entry in the Rabin block array in the chunk.
|
||||||
// It is either a length value <= RAB_POLYNOMIAL_MAX_BLOCK_SIZE or
|
// It is either a length value <= RABIN_MAX_BLOCK_SIZE or an index value with
|
||||||
// if value > RAB_POLYNOMIAL_MAX_BLOCK_SIZE then
|
// which this block is a duplicate/similar. The entries are variable sized.
|
||||||
// value - RAB_POLYNOMIAL_MAX_BLOCK_SIZE is index of block with which
|
|
||||||
// this block is a duplicate.
|
|
||||||
// Offset can be dynamically calculated.
|
// Offset can be dynamically calculated.
|
||||||
//
|
//
|
||||||
#define RABIN_ENTRY_SIZE (sizeof (unsigned int))
|
#define RABIN_ENTRY_SIZE (sizeof (unsigned int))
|
||||||
|
@ -106,20 +102,43 @@ typedef struct {
|
||||||
// size of deduped data, size of compressed data
|
// size of deduped data, size of compressed data
|
||||||
#define RABIN_HDR_SIZE (sizeof (unsigned int) + sizeof (ssize_t) + sizeof (ssize_t) + sizeof (ssize_t) + sizeof (ssize_t))
|
#define RABIN_HDR_SIZE (sizeof (unsigned int) + sizeof (ssize_t) + sizeof (ssize_t) + sizeof (ssize_t) + sizeof (ssize_t))
|
||||||
|
|
||||||
// Maximum number of dedup blocks supported (2^31 - 1)
|
// Maximum number of dedup blocks supported (2^30 - 1)
|
||||||
#define RABIN_MAX_BLOCKS (0x7fffffff)
|
#define RABIN_MAX_BLOCKS (0x3FFFFFFFUL)
|
||||||
|
|
||||||
// Maximum possible block size for a single rabin block. This is a hard limit much
|
// Maximum possible block size for a single rabin block. This is a hard limit much
|
||||||
// larger than RAB_POLYNOMIAL_MAX_BLOCK_SIZE. Useful when merging non-duplicate blocks.
|
// larger than RAB_POLYNOMIAL_MAX_BLOCK_SIZE. Useful when merging non-duplicate blocks.
|
||||||
// This is also 2^31 - 1.
|
// This is also 2^30 - 1.
|
||||||
#define RABIN_MAX_BLOCK_SIZE (RABIN_MAX_BLOCKS)
|
#define RABIN_MAX_BLOCK_SIZE (RABIN_MAX_BLOCKS)
|
||||||
|
|
||||||
// Mask to determine whether Rabin index entry is a length value or index value.
|
// Masks to determine whether Rabin index entry is a length value, duplicate index value
|
||||||
|
// or similar index value.
|
||||||
// MSB = 1 : Index
|
// MSB = 1 : Index
|
||||||
// MSB = 0 : Length
|
// MSB = 0 : Length
|
||||||
#define RABIN_INDEX_FLAG (0x80000000)
|
// MSB-1 = 1: Similarity Index
|
||||||
|
// MSB-1 = 0: Exact Duplicate Index
|
||||||
|
#define RABIN_INDEX_FLAG (0x80000000UL)
|
||||||
|
#define SET_SIMILARITY_FLAG (0x40000000UL)
|
||||||
|
#define GET_SIMILARITY_FLAG SET_SIMILARITY_FLAG
|
||||||
|
#define CLEAR_SIMILARITY_FLAG (0xBFFFFFFFUL)
|
||||||
|
|
||||||
// Mask to extract value from a rabin index entry
|
// Mask to extract value from a rabin index entry
|
||||||
#define RABIN_INDEX_VALUE (0x7fffffff)
|
#define RABIN_INDEX_VALUE (0x3FFFFFFFUL)
|
||||||
|
|
||||||
|
// Tolerance for partial similarity check. We expect 80% similarity for
|
||||||
|
// delta compression. See: http://www.armedia.com/wp/SimilarityIndex.pdf
|
||||||
|
#define SIMILARITY_TOLERANCE (0.2f)
|
||||||
|
#define SIMILAR_EXACT 1
|
||||||
|
#define SIMILAR_PARTIAL 2
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
ssize_t offset;
|
||||||
|
uint64_t cksum_n_offset; // Dual purpose variable
|
||||||
|
unsigned int index;
|
||||||
|
unsigned int length;
|
||||||
|
unsigned int new_length;
|
||||||
|
unsigned short refcount;
|
||||||
|
short similar;
|
||||||
|
} rabin_blockentry_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
unsigned char *current_window_data;
|
unsigned char *current_window_data;
|
||||||
|
@ -134,11 +153,11 @@ typedef struct {
|
||||||
uint64_t real_chunksize;
|
uint64_t real_chunksize;
|
||||||
short valid;
|
short valid;
|
||||||
void *lzma_data;
|
void *lzma_data;
|
||||||
int level;
|
int level, delta_flag;
|
||||||
} rabin_context_t;
|
} rabin_context_t;
|
||||||
|
|
||||||
extern rabin_context_t *create_rabin_context(uint64_t chunksize, uint64_t real_chunksize,
|
extern rabin_context_t *create_rabin_context(uint64_t chunksize, uint64_t real_chunksize,
|
||||||
const char *algo);
|
const char *algo, int delta_flag);
|
||||||
extern void destroy_rabin_context(rabin_context_t *ctx);
|
extern void destroy_rabin_context(rabin_context_t *ctx);
|
||||||
extern unsigned int rabin_dedup(rabin_context_t *ctx, unsigned char *buf,
|
extern unsigned int rabin_dedup(rabin_context_t *ctx, unsigned char *buf,
|
||||||
ssize_t *size, ssize_t offset, ssize_t *rabin_pos);
|
ssize_t *size, ssize_t offset, ssize_t *rabin_pos);
|
||||||
|
|
1
utils.h
1
utils.h
|
@ -52,6 +52,7 @@ extern "C" {
|
||||||
# endif
|
# endif
|
||||||
#endif
|
#endif
|
||||||
typedef unsigned long uintptr_t;
|
typedef unsigned long uintptr_t;
|
||||||
|
typedef ssize_t bsize_t;
|
||||||
|
|
||||||
#undef WORDS_BIGENDIAN
|
#undef WORDS_BIGENDIAN
|
||||||
#if BYTE_ORDER == BIG_ENDIAN
|
#if BYTE_ORDER == BIG_ENDIAN
|
||||||
|
|
Loading…
Reference in a new issue