diff --git a/Makefile b/Makefile
index d02699f..4b092db 100644
--- a/Makefile
+++ b/Makefile
@@ -31,6 +31,10 @@ RABINSRCS = rabin/rabin_polynomial.c
 RABINHDRS = rabin/rabin_polynomial.h utils.h
 RABINOBJS = $(RABINSRCS:.c=.o)
 
+BSDIFFSRCS = bsdiff/bsdiff.c bsdiff/bspatch.c bsdiff/rle_encoder.c
+BSDIFFHDRS = bsdiff/bscommon.h utils.h allocator.h
+BSDIFFOBJS = $(BSDIFFSRCS:.c=.o)
+
 LZMASRCS = lzma/LzmaEnc.c lzma/LzFind.c lzma/LzmaDec.c
 LZMAHDRS = lzma/CpuArch.h lzma/LzFind.h lzma/LzmaEnc.h lzma/Types.h \
 	lzma/LzHash.h lzma/LzmaDec.h utils.h
@@ -44,10 +48,10 @@ CRCSRCS = lzma/crc64_fast.c lzma/crc64_table.c
 CRCHDRS = lzma/crc64_table_le.h lzma/crc64_table_be.h lzma/crc_macros.h
 CRCOBJS = $(CRCSRCS:.c=.o)
 
-BAKFILES = *~ lzma/*~ rabin/*~
+BAKFILES = *~ lzma/*~ rabin/*~ bsdiff/*~
 
 RM = rm -f
-CPPFLAGS = -I. -I./lzma -I./rabin -D_7ZIP_ST -DNODEFAULT_PROPS -DFILE_OFFSET_BITS=64 \
+CPPFLAGS = -I. -I./lzma -I./rabin -I./bsdiff -D_7ZIP_ST -DNODEFAULT_PROPS -DFILE_OFFSET_BITS=64 \
 	-D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32
 VEC_FLAGS = -ftree-vectorize
 LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block
@@ -57,6 +61,7 @@ ifdef DEBUG
 LINK = g++ -m64 -pthread -msse3
 COMPILE = gcc -m64 -O -g -msse3 -c
 COMPILE_cpp = g++ -m64 -O -g -msse3 -c
+VEC_FLAGS = 
 ifdef DEBUG_NO_SLAB
 CPPFLAGS += -DDEBUG_NO_SLAB
 endif
@@ -84,12 +89,15 @@ $(PPMDOBJS): $(PPMDSRCS) $(PPMDHDRS)
 $(RABINOBJS): $(RABINSRCS) $(RABINHDRS)
 	$(COMPILE) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
 
+$(BSDIFFOBJS): $(BSDIFFSRCS) $(BSDIFFHDRS)
+	$(COMPILE) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
+
 $(MAINOBJS): $(MAINSRCS) $(MAINHDRS)
 	$(COMPILE) $(LOOP_OPTFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
 
-$(PROG): $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS)
-	$(LINK) -o $@ $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS) $(LDLIBS)
+$(PROG): $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS) $(BSDIFFOBJS)
+	$(LINK) -o $@ $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS) $(BSDIFFOBJS) $(LDLIBS)
 
 clean:
-	$(RM) $(PROG) $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS) $(BAKFILES)
+	$(RM) $(PROG) $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(CRCOBJS) $(RABINOBJS) $(BSDIFFOBJS) $(BAKFILES)
 
diff --git a/bsdiff/bscommon.h b/bsdiff/bscommon.h
new file mode 100644
index 0000000..d5a6dfe
--- /dev/null
+++ b/bsdiff/bscommon.h
@@ -0,0 +1,132 @@
+/*-
+ * Copyright 2012 Moinak Ghosh
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions 
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This file is a part of Pcompress, a chunked parallel multi-
+ * algorithm lossless compression and decompression program.
+ *
+ * Copyright (C) 2012 Moinak Ghosh. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * moinakg@belenix.org, http://moinakg.wordpress.com/
+ * 
+ */
+
+#ifndef _BS_COMMON_
+#define _BS_COMMON_
+
+#include <stdio.h>
+#include <utils.h>
+
+// Simple stream-style I/O on top of a caller-supplied memory buffer
+typedef struct {
+	uchar_t *buf;	/* Start of the underlying buffer */
+	bsize_t pos;	/* Current read/write offset into buf */
+	bsize_t buflen;	/* Size of buf as passed to BUFOPEN() */
+} bufio_t;
+
+static int	/* Always returns 0 */
+BUFOPEN(bufio_t *bio, uchar_t *buf, bsize_t len)	/* Attach bio to buf (len bytes) and rewind to offset 0 */
+{
+	bio->buf = buf; bio->pos = 0; bio->buflen = len;
+	return (0);
+}
+static bsize_t	/* Returns len on success, -1 if the data does not fit */
+BUFWRITE(bufio_t *bio, uchar_t *buf, bsize_t len)
+{
+	/* A write that ends exactly at buflen fits; reject only an overrun. */
+	if (bio->pos + len > bio->buflen)
+		return (-1);
+	/* Copy len bytes from buf to the current offset, then advance it. */
+	memcpy(bio->buf + bio->pos, buf, len);
+	bio->pos += len;
+	return (len);
+}
+
+static bsize_t	/* Returns the number of bytes actually copied (may be less than len) */
+BUFREAD(bufio_t *bio, uchar_t *buf, bsize_t len)
+{
+	bsize_t actual;
+
+	actual = len;	/* Clamp the request to what is left between pos and buflen */
+	if (bio->pos + len > bio->buflen) {
+		actual = bio->buflen - bio->pos;
+	}
+	/* Nothing left, or (with a signed bsize_t) pos lies past the end. */
+	if (actual <= 0) return (0);
+	memcpy(buf, bio->buf + bio->pos, actual);
+	bio->pos += actual;
+	return (actual);
+}
+
+static bsize_t
+BUFTELL(bufio_t *bio)	/* Current offset within the buffer */
+{
+	return (bio->pos);
+}
+
+static void *
+BUFPTR(bufio_t *bio)	/* Address of the byte at the current offset */
+{
+	return (bio->buf + bio->pos);
+}
+
+static int	/* Returns 0 on success, -1 when asked to seek past the end */
+BUFSEEK(bufio_t *bio, bsize_t pos, int typ)
+{
+	if (typ == SEEK_SET) {
+		bio->pos = pos;	/* Absolute offset; not range-checked */
+
+	} else if (typ == SEEK_CUR) {
+		bio->pos += pos;	/* Relative to the current offset; not range-checked */
+
+	} else {	/* Any other typ is treated as a seek relative to buflen */
+		if (pos > 0) {
+			fprintf(stderr, "Cannot seek beyond buffer end.\n");
+			return (-1);
+		} else {
+			bio->pos = bio->buflen + pos;
+		}
+	}
+	return (0);
+}
+
+extern int zero_rle_encode(const void *const ibuf, const unsigned int ilen,
+	void *obuf, unsigned int *const olen);	/* *olen: in = capacity of obuf, out = bytes produced */
+extern int zero_rle_decode(const void* ibuf, unsigned int ilen,
+	void* obuf, unsigned int *olen);	/* *olen: in = expected output size, out = bytes produced */
+
+#endif
diff --git a/bsdiff/bsdiff.c b/bsdiff/bsdiff.c
new file mode 100644
index 0000000..ebc8148
--- /dev/null
+++ b/bsdiff/bsdiff.c
@@ -0,0 +1,402 @@
+/*-
+ * Copyright 2003-2005 Colin Percival
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions 
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This file is a part of Pcompress, a chunked parallel multi-
+ * algorithm lossless compression and decompression program.
+ *
+ * Copyright (C) 2012 Moinak Ghosh. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * moinakg@belenix.org, http://moinakg.wordpress.com/
+ *
+ * This is a somewhat modified bsdiff implementation. It has been modified
+ * to do buffer to buffer diffing instead of file to file and also use
+ * a custom RLE encoding rather than Bzip2 on the diff output.
+ */
+
+#if 0
+__FBSDID("$FreeBSD: src/usr.bin/bsdiff/bsdiff/bsdiff.c,v 1.1 2005/08/06 01:59:05 cperciva Exp $");
+#endif
+
+#include <sys/types.h>
+#include <err.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <allocator.h>
+#include <utils.h>
+#include "bscommon.h"
+
+#define MIN(x,y) (((x)<(y)) ? (x) : (y))	/* NB: the selected argument is evaluated twice */
+
+static void split(bsize_t *I,bsize_t *V,bsize_t start,bsize_t len,bsize_t h)	/* Order I[start..start+len) by key V[I[i]+h] and update group numbers in V */
+{
+	bsize_t i,j,k,x,tmp,jj,kk;
+
+	if(len<16) {	/* Short range: repeatedly pull the entries with the smallest key to the front */
+		for(k=start;k<start+len;k+=j) {
+			j=1;x=V[I[k]+h];	/* x = smallest key seen so far, j = number of entries sharing it */
+			for(i=1;k+i<start+len;i++) {
+				if(V[I[k+i]+h]<x) {
+					x=V[I[k+i]+h];
+					j=0;
+				};
+				if(V[I[k+i]+h]==x) {
+					tmp=I[k+j];I[k+j]=I[k+i];I[k+i]=tmp;
+					j++;
+				};
+			};
+			for(i=0;i<j;i++) V[I[k+i]]=k+j-1;	/* Group number = index of the group's last slot */
+			if(j==1) I[k]=-1;	/* A group of one is marked with -1 in I */
+		};
+		return;
+	};
+
+	x=V[I[start+len/2]+h];	/* Pivot key taken from the middle entry */
+	jj=0;kk=0;
+	for(i=start;i<start+len;i++) {	/* Count keys below the pivot (jj) and equal to it (kk) */
+		if(V[I[i]+h]<x) jj++;
+		if(V[I[i]+h]==x) kk++;
+	};
+	jj+=start;kk+=jj;	/* jj = first index of the "equal" zone, kk = first index of the "greater" zone */
+
+	i=start;j=0;k=0;
+	while(i<jj) {	/* Fill the "less" zone, swapping other entries out into their own zones */
+		if(V[I[i]+h]<x) {
+			i++;
+		} else if(V[I[i]+h]==x) {
+			tmp=I[i];I[i]=I[jj+j];I[jj+j]=tmp;
+			j++;
+		} else {
+			tmp=I[i];I[i]=I[kk+k];I[kk+k]=tmp;
+			k++;
+		};
+	};
+
+	while(jj+j<kk) {	/* Settle the rest of the "equal" zone against the "greater" zone */
+		if(V[I[jj+j]+h]==x) {
+			j++;
+		} else {
+			tmp=I[jj+j];I[jj+j]=I[kk+k];I[kk+k]=tmp;
+			k++;
+		};
+	};
+
+	if(jj>start) split(I,V,start,jj-start,h);	/* Recurse on the "less" zone */
+
+	for(i=0;i<kk-jj;i++) V[I[jj+i]]=kk-1;	/* The "equal" zone becomes one group numbered by its last index */
+	if(jj==kk-1) I[jj]=-1;	/* A group of one is marked with -1 in I */
+
+	if(start+len>kk) split(I,V,kk,start+len-kk,h);	/* Recurse on the "greater" zone */
+}
+
+static void qsufsort(bsize_t *I,bsize_t *V,u_char *old,bsize_t oldsize)	/* Suffix sort of old[0..oldsize); I and V are both indexed 0..oldsize */
+{
+	bsize_t buckets[256];
+	bsize_t i,h,len;
+
+	for(i=0;i<256;i++) buckets[i]=0;
+	for(i=0;i<oldsize;i++) buckets[old[i]]++;	/* Histogram of byte values */
+	for(i=1;i<256;i++) buckets[i]+=buckets[i-1];	/* Running totals */
+	for(i=255;i>0;i--) buckets[i]=buckets[i-1];	/* Shift up by one: buckets[c] = number of bytes smaller than c */
+	buckets[0]=0;
+
+	for(i=0;i<oldsize;i++) I[++buckets[old[i]]]=i;	/* Place positions into I grouped by their byte; slot 0 is kept free */
+	I[0]=oldsize;
+	for(i=0;i<oldsize;i++) V[i]=buckets[old[i]];	/* Initial group number of each position */
+	V[oldsize]=0;
+	for(i=1;i<256;i++) if(buckets[i]==buckets[i-1]+1) I[buckets[i]]=-1;	/* Mark byte values that occur exactly once */
+	I[0]=-1;
+
+	for(h=1;I[0]!=-(oldsize+1);h+=h) {	/* h doubles each pass; done once I[0] records a single run over all oldsize+1 slots */
+		len=0;
+		for(i=0;i<oldsize+1;) {
+			if(I[i]<0) {	/* Negative entry -n: skip the next n slots and remember the skipped length */
+				len-=I[i];
+				i-=I[i];
+			} else {
+				if(len) I[i-len]=-len;	/* Store the combined length of the slots just skipped */
+				len=V[I[i]]+1-i;	/* Size of the group that starts at i */
+				split(I,V,i,len,h);
+				i+=len;
+				len=0;
+			};
+		};
+		if(len) I[i-len]=-len;
+	};
+
+	for(i=0;i<oldsize+1;i++) I[V[i]]=i;	/* Rebuild I as the inverse of V */
+}
+
+static bsize_t matchlen(u_char *old,bsize_t oldsize,u_char *new,bsize_t newsize)	/* Number of leading bytes on which old and new agree */
+{
+	bsize_t i;
+
+	for(i=0;(i<oldsize)&&(i<newsize);i++)	/* Never look past the end of either buffer */
+		if(old[i]!=new[i]) break;
+
+	return i;
+}
+
+static bsize_t search(bsize_t *I,u_char *old,bsize_t oldsize,
+		u_char *new,bsize_t newsize,bsize_t st,bsize_t en,bsize_t *pos)	/* Returns the match length; *pos receives the matching offset in old */
+{
+	bsize_t x,y;
+
+	if(en-st<2) {	/* Down to the two ends of the range: keep the one sharing the longer prefix with new */
+		x=matchlen(old+I[st],oldsize-I[st],new,newsize);
+		y=matchlen(old+I[en],oldsize-I[en],new,newsize);
+
+		if(x>y) {
+			*pos=I[st];
+			return x;
+		} else {
+			*pos=I[en];
+			return y;
+		}
+	};
+
+	x=st+(en-st)/2;	/* Midpoint of the index range [st, en] */
+	if(memcmp(old+I[x],new,MIN(oldsize-I[x],newsize))<0) {	/* Midpoint suffix compares below new: continue in the upper half */
+		return search(I,old,oldsize,new,newsize,x,en,pos);
+	} else {
+		return search(I,old,oldsize,new,newsize,st,x,pos);
+	};
+}
+
+static void
+valout(bsize_t x, u_char *buf)	/* Store x at buf after conversion with htonll() */
+{
+	*((bsize_t *)buf) = htonll(x);	/* NOTE(review): store through a cast u_char pointer; assumes buf is suitably aligned - confirm */
+}
+
+static void
+valouti32(bsize_t x, u_char *buf)	/* Store x at buf as a 4-byte value converted with htonl() */
+{
+	int32_t val;
+	val = x;	/* Narrowing assignment: values outside the int32_t range are not preserved */
+	*((int32_t *)buf) = htonl(val);	/* NOTE(review): store through a cast u_char pointer; assumes buf is suitably aligned - confirm */
+}
+
+bsize_t	/* Returns the number of bytes written to diff, or 0 if no patch was produced */
+bsdiff(u_char *old, bsize_t oldsize, u_char *new, bsize_t newsize,
+       u_char *diff, u_char *scratch, bsize_t scratchsize)	/* scratch and scratchsize are not used by this implementation */
+{
+	bsize_t *I,*V;
+	bsize_t scan,pos,len;
+	bsize_t lastscan,lastpos,lastoffset;
+	bsize_t oldscore,scsc;
+	bsize_t s,Sf,lenf,Sb,lenb;
+	bsize_t overlap,Ss,lens;
+	bsize_t i, rv;
+	bsize_t dblen,eblen;
+	u_char *db,*eb;
+	u_char buf[sizeof (bsize_t)];
+	u_char header[48];
+	unsigned int sz, hdrsz, ulen;
+	bufio_t pf;
+
+	sz = sizeof (bsize_t);
+	I = slab_alloc(NULL, (oldsize+1)*sz);
+	V = slab_alloc(NULL, (oldsize+1)*sz);
+	if(I == NULL || V == NULL) {
+		/* Release whichever of the two was obtained. */
+		if (I != NULL) slab_free(NULL, I);
+		if (V != NULL) slab_free(NULL, V);
+		return (0);
+	}
+	qsufsort(I,V,old,oldsize);
+	slab_free(NULL, V);
+
+	if(((db=slab_alloc(NULL, newsize+1))==NULL) ||
+		((eb=slab_alloc(NULL, newsize+1))==NULL)) {
+		fprintf(stderr, "bsdiff: Memory allocation error.\n");
+		/* V was freed above; db is non-NULL only when eb failed. */
+		if (db != NULL) slab_free(NULL, db);
+		slab_free(NULL, I);
+		return (0);
+	}
+	dblen=0;eblen=0;
+	BUFOPEN(&pf, diff, newsize);	/* The patch may occupy at most newsize bytes of diff */
+
+	/* Header is
+		0	8	length of ctrl block
+		8	8	compressed length of diff block
+		16	8	actual length of diff block
+		24	8	compressed length of extra block
+		32	8	actual length of extra block
+		40	8	length of new file */
+	/* File is
+		0	48	Header
+		48	??	ctrl block
+		??	??	diff block
+		??	??	extra block */
+	memset(header, 0, sizeof (header));	/* Block lengths start as 0 and are back-filled below */
+	valout(newsize, header + sz*5);
+	if (BUFWRITE(&pf, header, sz*6) != sz*6) {
+		fprintf(stderr, "bsdiff: Write to compressed buffer failed.\n");
+		rv = 0;
+		goto out;
+	}
+	hdrsz = sz*6;
+
+	/* Compute the differences, writing ctrl as we go */
+	scan=0;len=0;
+	lastscan=0;lastpos=0;lastoffset=0;
+	while(scan<newsize) {
+		oldscore=0;
+
+		for(scsc=scan+=len;scan<newsize;scan++) {
+			len=search(I,old,oldsize,new+scan,newsize-scan,
+					0,oldsize,&pos);
+
+			for(;scsc<scan+len;scsc++)
+			if((scsc+lastoffset<oldsize) &&
+				(old[scsc+lastoffset] == new[scsc]))
+				oldscore++;
+
+			if(((len==oldscore) && (len!=0)) ||
+				(len>oldscore+sz)) break;
+
+			if((scan+lastoffset<oldsize) &&
+				(old[scan+lastoffset] == new[scan]))
+				oldscore--;
+		};
+
+		if((len!=oldscore) || (scan==newsize)) {
+			s=0;Sf=0;lenf=0;
+			for(i=0;(lastscan+i<scan)&&(lastpos+i<oldsize);) {
+				if(old[lastpos+i]==new[lastscan+i]) s++;
+				i++;
+				if(s*2-i>Sf*2-lenf) { Sf=s; lenf=i; };
+			};
+
+			lenb=0;
+			if(scan<newsize) {
+				s=0;Sb=0;
+				for(i=1;(scan>=lastscan+i)&&(pos>=i);i++) {
+					if(old[pos-i]==new[scan-i]) s++;
+					if(s*2-i>Sb*2-lenb) { Sb=s; lenb=i; };
+				};
+			};
+
+			if(lastscan+lenf>scan-lenb) {
+				overlap=(lastscan+lenf)-(scan-lenb);
+				s=0;Ss=0;lens=0;
+				for(i=0;i<overlap;i++) {
+					if(new[lastscan+lenf-overlap+i]==
+					   old[lastpos+lenf-overlap+i]) s++;
+					if(new[scan-lenb+i]==
+					   old[pos-lenb+i]) s--;
+					if(s>Ss) { Ss=s; lens=i+1; };
+				};
+
+				lenf+=lens-overlap;
+				lenb-=lens;
+			};
+
+			for(i=0;i<lenf;i++)
+				db[dblen+i]=new[lastscan+i]-old[lastpos+i];
+			for(i=0;i<(scan-lenb)-(lastscan+lenf);i++)
+				eb[eblen+i]=new[lastscan+lenf+i];
+
+			dblen+=lenf;
+			eblen+=(scan-lenb)-(lastscan+lenf);
+
+			valouti32(lenf, buf);	/* A dropped ctrl value would corrupt the patch, so bail out on a failed write */
+			if (BUFWRITE(&pf, buf, 4) != 4) { rv = 0; goto out; }
+			valouti32((scan-lenb)-(lastscan+lenf),buf);
+			if (BUFWRITE(&pf, buf, 4) != 4) { rv = 0; goto out; }
+			valouti32((pos-lenb)-(lastpos+lenf),buf);
+			if (BUFWRITE(&pf, buf, 4) != 4) { rv = 0; goto out; }
+
+			lastscan=scan-lenb;
+			lastpos=pos-lenb;
+			lastoffset=pos-scan;
+		}
+	}
+	if (eblen > newsize/2) {	/* More than half of new ended up as extra data: give up */
+		rv = 0;
+		goto out;
+	}
+
+	/* Compute size of ctrl data */
+	len = BUFTELL(&pf);
+	valout(len-hdrsz, header);
+	rv = len;
+
+	/* Write diff data */
+	len = newsize - rv;	/* Space still available in the output buffer */
+	ulen = len;
+	if (zero_rle_encode(db, dblen, BUFPTR(&pf), &ulen) == -1) {
+		rv = 0;
+		goto out;
+	}
+	/* Output size of diff data */
+	len = ulen;
+	valout(len, header + sz);
+	valout(dblen, header + sz*2);
+	rv += len;
+	BUFSEEK(&pf, len, SEEK_CUR);
+
+	/* Write extra data */
+	len = newsize - rv;
+	ulen = len;
+	if (zero_rle_encode(eb, eblen, BUFPTR(&pf), &ulen) == -1) {
+		rv = 0;
+		goto out;
+	}
+	/* Output size of extra data */
+	len = ulen;
+	valout(len, header + sz*3);
+	valout(eblen, header + sz*4);
+	rv += len;
+
+	/* Seek to the beginning, re-write the header.*/
+	BUFSEEK(&pf, 0, SEEK_SET);
+	BUFWRITE(&pf, header, hdrsz);
+
+out:
+	/* Free the memory we used */
+	slab_free(NULL, db);
+	slab_free(NULL, eb);
+	slab_free(NULL, I);
+
+	return (rv);
+}
diff --git a/bsdiff/bspatch.c b/bsdiff/bspatch.c
new file mode 100644
index 0000000..7c9f53e
--- /dev/null
+++ b/bsdiff/bspatch.c
@@ -0,0 +1,218 @@
+/*-
+ * Copyright 2003-2005 Colin Percival
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions 
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#if 0
+__FBSDID("$FreeBSD: src/usr.bin/bsdiff/bspatch/bspatch.c,v 1.1 2005/08/06 01:59:06 cperciva Exp $");
+#endif
+
+#include <bzlib.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <err.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <allocator.h>
+#include <utils.h>
+#include "bscommon.h"
+
+static bsize_t
+valin(u_char *buf)	/* Read a bsize_t stored at buf, converting it with ntohll() */
+{
+	return ntohll(*((bsize_t *)buf));	/* NOTE(review): load through a cast u_char pointer; assumes buf is suitably aligned - confirm */
+}
+
+static int32_t
+valini32(u_char *buf)	/* Read a 4-byte value stored at buf, converting it with ntohl() */
+{
+	return ntohl(*((int32_t *)buf));	/* NOTE(review): load through a cast u_char pointer; assumes buf is suitably aligned - confirm */
+}
+
+bsize_t
+get_bsdiff_sz(u_char *pbuf) {
+	bsize_t ctrllen, lzdatalen, lzextralen;
+	int sz, hdrsz;
+
+	/*
+	 * Total patch size = header + ctrl block + encoded diff block + encoded
+	 * extra block. The other three header fields are not needed for this.
+	 */
+	sz = sizeof (bsize_t);
+	hdrsz = sz*6;
+
+	ctrllen = valin(pbuf);
+	lzdatalen = valin(pbuf+sz);
+	lzextralen = valin(pbuf+sz*3);
+	return (ctrllen + lzdatalen + lzextralen + hdrsz);
+}
+
+int	/* Returns 1 on success, 0 on a corrupt patch, short output buffer or allocation failure */
+bspatch(u_char *pbuf, u_char *old, bsize_t oldsize, u_char *new, bsize_t *_newsize)	/* *_newsize: in = capacity of new, out = size of the rebuilt data */
+{
+	bsize_t newsize;
+	bsize_t ctrllen, lzdatalen, datalen, lzextralen, extralen;
+	u_char buf[8];
+	u_char *diffdata, *extradata;
+	bsize_t oldpos,newpos;
+	bsize_t ctrl[3];
+	bsize_t lenread;
+	bsize_t i;
+	bufio_t cpf, dpf, epf;
+	int sz, hdrsz, rv;
+	unsigned int len;
+
+	/*
+	File format:
+		0	8	length of ctrl block (X)
+		8	8	compressed length of diff block (Y)
+		16	8	actual length of diff block
+		24	8	compressed length of extra block (Z)
+		32	8	actual length of extra block
+		40	8	length of new file
+		48	X	control block
+		48+X	Y	zero-RLE encoded diff block
+		48+X+Y	Z	zero-RLE encoded extra block
+	with control block a set of triples (x,y,z) meaning "add x bytes
+	from oldfile to x bytes from the diff block; copy y bytes from the
+	extra block; seek forwards in oldfile by z bytes".
+	*/
+	sz = sizeof (bsize_t);
+	hdrsz = sz*6;
+	rv = 1;
+
+	/* Read lengths from header first. */
+	ctrllen = valin(pbuf);
+	lzdatalen = valin(pbuf+sz);
+	datalen = valin(pbuf+sz*2);
+	lzextralen = valin(pbuf+sz*3);
+	extralen = valin(pbuf+sz*4);
+	newsize = valin(pbuf+sz*5);
+
+	if((ctrllen<0) || (lzdatalen<0) || (datalen<0) ||
+	    (lzextralen<0) || (extralen<0) || (newsize<0)) {
+		fprintf(stderr, "1: Corrupt patch\n");
+		return (0);
+	}
+	if (newsize > *_newsize) {
+		fprintf(stderr, "Output buffer too small.\n");
+		return (0);
+	}
+	*_newsize = newsize;
+
+	/* Allocate buffers. */
+	diffdata = malloc(datalen + 1);	/* +1 so an empty block is never a zero-size request */
+	extradata = malloc(extralen + 1);
+	if (diffdata == NULL || extradata == NULL) {
+		fprintf(stderr, "bspatch: Out of memory.\n");
+		rv = 0;
+		goto out;
+	}
+
+	/* Decompress diffdata and extradata. */
+	len = datalen;
+	if (zero_rle_decode(pbuf + hdrsz + ctrllen, lzdatalen, diffdata, &len) == -1 ||
+	    len != datalen) {
+		fprintf(stderr, "bspatch: Failed to decompress diff data.\n");
+		rv = 0;
+		goto out;
+	}
+	datalen = len;
+
+	len = extralen;
+	if (zero_rle_decode(pbuf + hdrsz + ctrllen + lzdatalen, lzextralen, extradata, &len) == -1 ||
+	    len != extralen) {
+		fprintf(stderr, "bspatch: Failed to decompress extra data.\n");
+		rv = 0;
+		goto out;
+	}
+	extralen = len;
+	BUFOPEN(&cpf, pbuf + hdrsz, ctrllen);
+	BUFOPEN(&dpf, diffdata, datalen);
+	BUFOPEN(&epf, extradata, extralen);
+
+	oldpos=0;newpos=0;
+	while(newpos<newsize) {
+		/* Read control data */
+		for(i=0;i<=2;i++) {
+			lenread = BUFREAD(&cpf, buf, 4);
+			if (lenread < 4) {
+				fprintf(stderr, "2: Corrupt diff data\n");
+				rv = 0;
+				goto out;
+			}
+			ctrl[i]=valini32(buf);
+		};
+
+		/* Sanity-check: a negative length must never reach the copy below */
+		if((ctrl[0]<0) || (newpos+ctrl[0]>newsize)) {
+			fprintf(stderr, "3: Corrupt diff data\n");
+			rv = 0;
+			goto out;
+		}
+
+		/* Read diff string */
+		lenread = BUFREAD(&dpf, new + newpos, ctrl[0]);
+		if (lenread < ctrl[0]) {
+			fprintf(stderr, "4: Corrupt diff data\n");
+			rv = 0;
+			goto out;
+		}
+
+		/* Add old data to diff string */
+		for(i=0;i<ctrl[0];i++)
+			if((oldpos+i>=0) && (oldpos+i<oldsize))
+				new[newpos+i]+=old[oldpos+i];
+
+		/* Adjust pointers */
+		newpos+=ctrl[0];
+		oldpos+=ctrl[0];
+
+		/* Sanity-check: a negative length must never reach the copy below */
+		if((ctrl[1]<0) || (newpos+ctrl[1]>newsize)) {
+			fprintf(stderr, "5: Corrupt diff data\n");
+			rv = 0;
+			goto out;
+		}
+
+		/* Read extra string */
+		lenread = BUFREAD(&epf, new + newpos, ctrl[1]);
+		if (lenread < ctrl[1]) {
+			fprintf(stderr, "6: Corrupt diff data\n");
+			rv = 0;
+			goto out;
+		}
+
+		/* Adjust pointers */
+		newpos+=ctrl[1];
+		oldpos+=ctrl[2];
+	};
+
+out:
+	free(diffdata);
+	free(extradata);
+
+	return (rv);
+}
diff --git a/bsdiff/rle_encoder.c b/bsdiff/rle_encoder.c
new file mode 100644
index 0000000..fdc5f15
--- /dev/null
+++ b/bsdiff/rle_encoder.c
@@ -0,0 +1,112 @@
+/*
+ * This file is a part of Pcompress, a chunked parallel multi-
+ * algorithm lossless compression and decompression program.
+ *
+ * Copyright (C) 2012 Moinak Ghosh. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * moinakg@belenix.org, http://moinakg.wordpress.com/
+ *
+ * This RLE encoder is a simple approach to encode long runs of '0'
+ * bytes that typically are found in a bsdiff patch output. This
+ * does not encode repeating runs of other characters.
+ */
+
+#include <utils.h>
+#include <stdio.h>
+
+#define ZERO_MASK (32768)	/* Top bit of the 16-bit run header: set for a run of zero bytes */
+#define DATA_MASK (32767)	/* Low 15 bits of the run header: the run length */
+#define COUNT_MAX (32767)	/* Longest run a single header can describe */
+
+int	/* Returns 0 when all of ibuf was encoded, -1 when obuf ran out of room */
+zero_rle_encode(const void *const ibuf, const unsigned int ilen,
+	void *obuf, unsigned int *const olen)	/* *olen: in = capacity of obuf, out = bytes produced */
+{
+	unsigned int pos1, pos2;
+	unsigned short count;
+	const uchar_t *const ib = ibuf;
+	uchar_t *ob = obuf;
+
+	pos2 = 0;
+	for (pos1=0; pos1<ilen && pos2<*olen;) {
+		/* Every run starts with a 2-byte header; stop if it cannot fit. */
+		if (*olen - pos2 < 2) break;
+		count = 0;
+		if (ib[pos1] == 0) {
+			for (;pos1<ilen && ib[pos1]==0 && count<COUNT_MAX; pos1++) count++;
+			*((unsigned short *)(ob + pos2)) = htons(count | ZERO_MASK);	/* Zero run: header only, no payload */
+			pos2 += 2;
+		} else {
+			unsigned int pos3, pos4, cnt, state;
+			pos3 = pos2;	/* Header slot, filled in once the literal run length is known */
+			pos2 += 2;
+
+			state = 0;	/* 1 while inside a zero stretch that was already measured */
+			for (;pos1<ilen && pos2<*olen && count<COUNT_MAX;) {
+				if (ib[pos1] != 0) state = 0;
+				if (ib[pos1] == 0 && !state) {
+					cnt = 0;
+					pos4 = pos1;
+					state = 1;
+					// Look ahead: at least 4 consecutive zeroes end this literal run
+					for (;pos4<ilen && ib[pos4] == 0; pos4++) cnt++;
+					if (cnt >= 4) break;
+				}
+				ob[pos2++] = ib[pos1++];
+				count++;
+			}
+			*((unsigned short *)(ob + pos3)) = htons(count);
+		}
+	}
+	*olen = pos2;
+	if (pos1 < ilen) {
+		return (-1);
+	} else {
+		return (0);
+	}
+}
+
+int	/* Returns 0 only if all input was consumed and exactly the expected *olen bytes were produced */
+zero_rle_decode(const void* ibuf, unsigned int ilen,
+	void* obuf, unsigned int *olen)	/* *olen: in = expected output size, out = bytes produced */
+{
+	unsigned int pos1, pos2, i;
+	unsigned short count;
+	const uchar_t *ib = ibuf;
+	uchar_t *ob = obuf;
+
+	pos1 = pos2 = 0;
+	for (; pos1<ilen && pos2<*olen;) {
+		if (ilen - pos1 < 2) break;	/* Truncated run header: leave pos1 short so -1 is returned */
+		count = ntohs(*((unsigned short *)(ib + pos1)));
+		pos1 += 2;
+		if (count & ZERO_MASK) {
+			count &= DATA_MASK;	/* Zero run: emit count zero bytes */
+			for (i=0; i<count && pos2<*olen; i++)
+				ob[pos2++] = 0;
+		} else {
+			for (i=0; i<count && pos1<ilen && pos2<*olen; i++)	/* Literal run: copy count bytes through */
+				ob[pos2++] = ib[pos1++];
+		}
+	}
+	i = *olen;
+	*olen = pos2;
+	if (pos1 < ilen || pos2 < i) {
+		return (-1);
+	} else {
+		return (0);
+	}
+}
+
+
diff --git a/main.c b/main.c
index 6183bab..3471ba3 100644
--- a/main.c
+++ b/main.c
@@ -78,6 +78,7 @@ static int nthreads = 0;
 static int hide_mem_stats = 1;
 static int hide_cmp_stats = 1;
 static int enable_rabin_scan = 0;
+static int enable_delta_encode = 0;
 static int enable_rabin_split = 1;
 static unsigned int chunk_num;
 static uint64_t largest_chunk, smallest_chunk, avg_chunk;
@@ -118,10 +119,12 @@ usage(void)
 	    "4) Attempt Rabin fingerprinting based deduplication on chunks:\n"
 	    "   %s -D ...\n"
 	    "   %s -D -r ... - Do NOT split chunks at a rabin boundary. Default is to split.\n"
-	    "5) Number of threads can optionally be specified: -t <1 - 256 count>\n"
-	    "6) Pass '-M' to display memory allocator statistics\n"
-	    "7) Pass '-C' to display compression statistics\n\n",
-	    exec_name, exec_name, exec_name, exec_name);
+	    "5) Perform Delta Encoding in addition to Exact Dedup:\n"
+	    "   %s -E ... - This also implies '-D'.\n"
+	    "6) Number of threads can optionally be specified: -t <1 - 256 count>\n"
+	    "7) Pass '-M' to display memory allocator statistics\n"
+	    "8) Pass '-C' to display compression statistics\n\n",
+	    exec_name, exec_name, exec_name, exec_name, exec_name, exec_name);
 }
 
 void
@@ -436,7 +439,8 @@ start_decompress(const char *filename, const char *to_filename)
 		if (_init_func)
 			_init_func(&(tdat->data), &(tdat->level), chunksize);
 		if (enable_rabin_scan)
-			tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, algo);
+			tdat->rctx = create_rabin_context(chunksize, compressed_chunksize,
+			    algo, enable_delta_encode);
 		else
 			tdat->rctx = NULL;
 		if (pthread_create(&(tdat->thr), NULL, perform_decompress,
@@ -905,7 +909,8 @@ start_compress(const char *filename, uint64_t chunksize, int level)
 		if (_init_func)
 			_init_func(&(tdat->data), &(tdat->level), chunksize);
 		if (enable_rabin_scan)
-			tdat->rctx = create_rabin_context(chunksize, compressed_chunksize, algo);
+			tdat->rctx = create_rabin_context(chunksize, compressed_chunksize,
+			    algo, enable_delta_encode);
 		else
 			tdat->rctx = NULL;
 
@@ -965,7 +970,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
 	 * Read the first chunk into a spare buffer (a simple double-buffering).
 	 */
 	if (enable_rabin_split) {
-		rctx = create_rabin_context(chunksize, 0, algo);
+		rctx = create_rabin_context(chunksize, 0, algo, enable_delta_encode);
 		rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx);
 	} else {
 		rbytes = Read(uncompfd, cread_buf, chunksize);
@@ -1203,7 +1208,7 @@ main(int argc, char *argv[])
 	level = 6;
 	slab_init();
 
-	while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDr")) != -1) {
+	while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDEr")) != -1) {
 		int ovr;
 
 		switch (opt) {
@@ -1259,6 +1264,11 @@ main(int argc, char *argv[])
 			enable_rabin_scan = 1;
 			break;
 
+		    case 'E':
+			enable_rabin_scan = 1;
+			enable_delta_encode = 1;
+			break;
+
 		    case 'r':
 			enable_rabin_split = 0;
 			break;
diff --git a/rabin/rabin_polynomial.c b/rabin/rabin_polynomial.c
index a275186..ffb9ba5 100755
--- a/rabin/rabin_polynomial.c
+++ b/rabin/rabin_polynomial.c
@@ -1,9 +1,12 @@
 /*
  * rabin_polynomial.c
  * 
- * Created by Joel Lawrence Tucci on 09-March-2011.
+ * The rabin polynomial computation is derived from:
+ * http://code.google.com/p/rabin-fingerprint-c/
  * 
- * Copyright (c) 2011 Joel Lawrence Tucci
+ * originally created by Joel Lawrence Tucci on 09-March-2011.
+ * 
+ * Rabin polynomial portions Copyright (c) 2011 Joel Lawrence Tucci
  * All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
@@ -70,6 +73,11 @@ extern int lzma_compress(void *src, size_t srclen, void *dst,
 extern int lzma_decompress(void *src, size_t srclen, void *dst,
 	size_t *dstlen, int level, uchar_t chdr, void *data);
 extern int lzma_deinit(void **data);
+extern bsize_t bsdiff(u_char *old, bsize_t oldsize, u_char *new, bsize_t newsize,
+       u_char *diff, u_char *scratch, bsize_t scratchsize);	/* Return type must match the bsize_t definition in bsdiff/bsdiff.c */
+extern bsize_t get_bsdiff_sz(u_char *pbuf);
+extern int bspatch(u_char *pbuf, u_char *old, bsize_t oldsize, u_char *new,
+	bsize_t *_newsize);
 
 uint32_t rabin_polynomial_max_block_size = RAB_POLYNOMIAL_MAX_BLOCK_SIZE;
 
@@ -77,11 +85,10 @@ uint32_t rabin_polynomial_max_block_size = RAB_POLYNOMIAL_MAX_BLOCK_SIZE;
  * Initialize the algorithm with the default params.
  */
 rabin_context_t *
-create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *algo) {
+create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *algo, int delta_flag) {
 	rabin_context_t *ctx;
 	unsigned char *current_window_data;
 	uint32_t blknum;
-	int level = 14;
 
 	/*
 	 * Rabin window size must be power of 2 for optimization.
@@ -90,14 +97,22 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *al
 		fprintf(stderr, "Rabin window size must be a power of 2 in range 4 <= x <= 64\n");
 		return (NULL);
 	}
+
+	if (chunksize < RAB_MIN_CHUNK_SIZE) {
+		fprintf(stderr, "Minimum chunk size for Dedup must be %l bytes\n",
+		    RAB_MIN_CHUNK_SIZE);
+		return (NULL);
+	}
+
 	/*
-	 * For LZMA with chunksize <= LZMA Window size we use 4K minimum Rabin
-	 * block size. For everything else it is 1K based on experimentation.
+	 * For LZMA with chunksize <= LZMA Window size and/or Delta enabled we
+	 * use 4K minimum Rabin block size. For everything else it is 2K based
+	 * on experimentation.
 	 */
 	ctx = (rabin_context_t *)slab_alloc(NULL, sizeof (rabin_context_t));
 	ctx->rabin_poly_max_block_size = RAB_POLYNOMIAL_MAX_BLOCK_SIZE;
-	if ((memcmp(algo, "lzma", 4) == 0 || memcmp(algo, "adapt", 5) == 0) &&
-	    chunksize <= LZMA_WINDOW_MAX) {
+	if (((memcmp(algo, "lzma", 4) == 0 || memcmp(algo, "adapt", 5) == 0) &&
+	      chunksize <= LZMA_WINDOW_MAX) || delta_flag) {
 		ctx->rabin_poly_min_block_size = RAB_POLYNOMIAL_MIN_BLOCK_SIZE;
 		ctx->rabin_avg_block_mask = RAB_POLYNOMIAL_AVG_BLOCK_MASK;
 		ctx->rabin_poly_avg_block_size = RAB_POLYNOMIAL_AVG_BLOCK_SIZE;
@@ -132,11 +147,12 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *al
 	}
 
 	ctx->lzma_data = NULL;
+	ctx->level = 14;
 	if (real_chunksize > 0) {
 		lzma_init(&(ctx->lzma_data), &(ctx->level), chunksize);
 		if (!(ctx->lzma_data)) {
 			fprintf(stderr,
-			    "Could not allocate rabin polynomial context, out of memory\n");
+			    "Could not initialize LZMA data for rabin index, out of memory\n");
 			destroy_rabin_context(ctx);
 			return (NULL);
 		}
@@ -154,6 +170,7 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *al
 
 	ctx->current_window_data = current_window_data;
 	ctx->real_chunksize = real_chunksize;
+	ctx->delta_flag = delta_flag;
 	reset_rabin_context(ctx);
 	return (ctx);
 }
@@ -185,12 +202,24 @@ cmpblks(const void *a, const void *b)
 	rabin_blockentry_t *a1 = (rabin_blockentry_t *)a;
 	rabin_blockentry_t *b1 = (rabin_blockentry_t *)b;
 
-	if (a1->cksum_n_offset < b1->cksum_n_offset)
+	if (a1->cksum_n_offset < b1->cksum_n_offset) {
 		return (-1);
-	else if (a1->cksum_n_offset == b1->cksum_n_offset)
-		return (0);
-	else if (a1->cksum_n_offset > b1->cksum_n_offset)
+	} else if (a1->cksum_n_offset == b1->cksum_n_offset) {
+		/*
+		 * If fingerprints match then compare lengths. Length match makes
+		 * for strong exact detection/ordering during sort while stopping
+		 * short of expensive memcmp().
+		 */
+		if (a1->length < b1->length) {
+			return (-1);
+		} else if (a1->length == b1->length) {
+			return (0);
+		} else { /* a1->length > b1->length */
+			return (1);
+		}
+	} else { /* a1->cksum_n_offset > b1->cksum_n_offset; plain else so every path returns */
 		return (1);
+	}
 }
 
 /**
@@ -200,19 +229,32 @@ cmpblks(const void *a, const void *b)
 uint32_t
 rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, ssize_t *rabin_pos)
 {
-	ssize_t i, last_offset, j;
+	ssize_t i, last_offset, j, fplist_sz;
 	uint32_t blknum;
 	char *buf1 = (char *)buf;
 	uint32_t length;
-	uint64_t cur_roll_checksum[2];
+	uint64_t cur_roll_checksum, cur_sketch;
+	uint64_t *fplist;
+	uint32_t len1, fpos;
 
+	if (rabin_pos == NULL) {
+		/*
+		 * Initialize arrays for sketch computation. We re-use memory allocated
+		 * for the compressed chunk temporarily.
+		 */
+		fplist_sz = 8 * ctx->rabin_poly_avg_block_size;
+		fplist = (uint64_t *)(ctx->cbuf + ctx->real_chunksize - fplist_sz);
+		memset(fplist, 0, fplist_sz);
+		fpos = 0;
+		len1 = 0;
+	}
 	length = offset;
 	last_offset = 0;
 	blknum = 0;
 	ctx->valid = 0;
-	cur_roll_checksum[0] = 0;
-	cur_roll_checksum[1] = 0;
+	cur_roll_checksum = 0;
 	j = 0;
+	cur_sketch = 0;
 
 	/* 
 	 * If rabin_pos is non-zero then we are being asked to scan for the last rabin boundary
@@ -234,13 +276,39 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
 		 *
 		 * However since RAB_POLYNOMIAL_CONST == 2, we use shifts.
 		 */
-		cur_roll_checksum[1] = (cur_roll_checksum[1] << 1) + cur_byte;
-		cur_roll_checksum[1] -= (pushed_out << RAB_POLYNOMIAL_WIN_SIZE);
-
-		// Compute Sum 0 mod 25 Sketch. We are avoiding a branch here.
-		// See: http://www.armedia.com/wp/SimilarityIndex.pdf
-		j += cur_roll_checksum[(cur_roll_checksum[1] % 25 == 0)];
+		cur_roll_checksum = (cur_roll_checksum << 1) + cur_byte;
+		cur_roll_checksum -= (pushed_out << RAB_POLYNOMIAL_WIN_SIZE);
 
+		/*
+		 * Compute a super sketch value of the block. We store a sum of relative
+		 * maximal rabin hash values per 1K(SKETCH_BASIC_BLOCK_SZ) of data. So we
+		 * get up to 128 sums for a max block size of 128K. This is a representative
+		 * fingerprint sketch of the block. Storing and comparing up to 128 fingerprints
+		 * per block is very expensive (compute & RAM) so we eventually sum all the
+		 * fingerprints for the block to create a single super sketch value representing
+		 * maximal features of the block.
+		 * 
+		 * This value can be used for similarity detection for delta encoding. Exact
+		 * match for deduplication is additionally detected via a memcmp(). This is a
+		 * variant of some approaches detailed in:
+		 * http://www.armedia.com/wp/SimilarityIndex.pdf
+		 */
+		if (rabin_pos == NULL) {
+			len1++;
+			j = cur_roll_checksum & ctx->rabin_avg_block_mask;
+			fplist[j] += cur_roll_checksum;
+			if (fplist[j] > fplist[fpos]) fpos = j;
+			if (len1 == SKETCH_BASIC_BLOCK_SZ) {
+				/*
+				 * Compute the super sketch value by summing all the representative
+				 * fingerprints of the block.
+				 */
+				cur_sketch += fplist[fpos];
+				memset(fplist, 0, fplist_sz);
+				fpos = 0;
+				len1 = 0;
+			}
+		}
 		/*
 		 * Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1
 		 * We avoid a branch here by masking. This requires RAB_POLYNOMIAL_WIN_SIZE
@@ -252,14 +320,19 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
 		if (length < ctx->rabin_poly_min_block_size) continue;
 
 		// If we hit our special value or reached the max block size update block offset
-		if ((cur_roll_checksum[1] & ctx->rabin_avg_block_mask) == ctx->rabin_break_patt ||
+		if ((cur_roll_checksum & ctx->rabin_avg_block_mask) == ctx->rabin_break_patt ||
 		    length >= rabin_polynomial_max_block_size) {
 			if (rabin_pos == NULL) {
 				ctx->blocks[blknum].offset = last_offset;
 				ctx->blocks[blknum].index = blknum; // Need to store for sorting
-				ctx->blocks[blknum].cksum_n_offset = j;
 				ctx->blocks[blknum].length = length;
 				ctx->blocks[blknum].refcount = 0;
+				ctx->blocks[blknum].similar = 0;
+				ctx->blocks[blknum].cksum_n_offset = cur_sketch;
+				memset(fplist, 0, fplist_sz);
+				fpos = 0;
+				len1 = 0;
+				cur_sketch = 0;
 				blknum++;
 			}
 			last_offset = i+1;
@@ -287,9 +360,10 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
 		if (last_offset < *size) {
 			ctx->blocks[blknum].offset = last_offset;
 			ctx->blocks[blknum].index = blknum;
-			ctx->blocks[blknum].cksum_n_offset = j;
 			ctx->blocks[blknum].length = *size - last_offset;
 			ctx->blocks[blknum].refcount = 0;
+			ctx->blocks[blknum].similar = 0;
+			ctx->blocks[blknum].cksum_n_offset = cur_sketch;
 			blknum++;
 			last_offset = *size;
 		}
@@ -302,8 +376,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
 		/*
 		 * Now sort the block array based on checksums. This will bring virtually 
 		 * all similar block entries together. Effectiveness depends on how strong
-		 * our checksum is. We are using CRC64 here so we should be pretty okay.
-		 * TODO: Test with a heavily optimized MD5 (from OpenSSL?) later.
+		 * our checksum is. We are using a maximal super-sketch value.
 		 */
 		qsort(ctx->blocks, blknum, sizeof (rabin_blockentry_t), cmpblks);
 		rabin_index = (uint32_t *)(ctx->cbuf + RABIN_HDR_SIZE);
@@ -332,7 +405,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
 			if (blk > 0 && ctx->blocks[blk].cksum_n_offset == prev_cksum &&
 			    ctx->blocks[blk].length == prev_length &&
 			    memcmp(prev_offset, buf1 + ctx->blocks[blk].offset, prev_length) == 0) {
-				ctx->blocks[blk].length = 0;
+				ctx->blocks[blk].similar = SIMILAR_EXACT;
 				ctx->blocks[blk].index = prev_index;
 				(ctx->blocks[prev_blk].refcount)++;
 				matchlen += prev_length;
@@ -344,10 +417,32 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
 			prev_index = ctx->blocks[blk].index;
 			prev_blk = blk;
 		}
+
+		if (ctx->delta_flag) { /* Delta pass over blocks not already marked as exact duplicates */
+			for (blk = 0; blk < blknum; blk++) {
+				if (ctx->blocks[blk].similar) continue;
+
+				if (blk > 0 && ctx->blocks[blk].refcount == 0 &&
+				    ctx->blocks[blk].cksum_n_offset == prev_cksum) {
+					/* Same sketch as the previous candidate: mark as similar to it. */
+					ctx->blocks[blk].index = prev_index;
+					ctx->blocks[blk].similar = SIMILAR_PARTIAL;
+					(ctx->blocks[prev_blk].refcount)++;
+					matchlen += prev_length/2;
+					continue;
+				}
+				prev_offset = buf1 + ctx->blocks[blk].offset;
+				prev_cksum = ctx->blocks[blk].cksum_n_offset;
+				prev_length = ctx->blocks[blk].length;
+				prev_index = ctx->blocks[blk].index;
+				prev_blk = blk;
+			}
+		}
 		if (matchlen < rabin_index_sz) {
 			ctx->valid = 0;
 			return;
 		}
+
 		/*
 		 * Another pass, this time through the block index in the chunk. We insert
 		 * block length into unique block entries. For block entries that are
@@ -362,11 +457,12 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
 			rabin_blockentry_t *be;
 
 			be = &(ctx->blocks[blkarr[blk]]);
-			if (be->length > 0) {
+			if (be->similar == 0) {
 				/*
 				 * Update Index entry with the length. Also try to merge runs
-				 * of unique (non-duplicate) blocks into a single block entry
-				 * as long as the total length does not exceed max block size.
+				 * of unique (non-duplicate/similar) blocks into a single block
+				 * entry as long as the total length does not exceed max block
+				 * size.
 				 */
 				if (prev_index == 0) {
 					if (be->refcount == 0) {
@@ -402,32 +498,63 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
 			} else {
 				prev_index = 0;
 				prev_length = 0;
-				rabin_index[pos] = be->index | RABIN_INDEX_FLAG;
+				ctx->blocks[pos].cksum_n_offset = be->offset;
+				ctx->blocks[pos].new_length = be->length;
 				trans[blk] = pos;
+
+				if (be->similar == SIMILAR_EXACT) {
+					rabin_index[pos] = (blkarr[be->index] | RABIN_INDEX_FLAG) &
+					    CLEAR_SIMILARITY_FLAG;
+				} else {
+					rabin_index[pos] = blkarr[be->index] | RABIN_INDEX_FLAG |
+					    SET_SIMILARITY_FLAG;
+				}
 				pos++;
 			}
 		}
 
 		/*
-		 * Final pass, copy the data.
+		 * Final pass, copy the data and perform delta encoding.
 		 */
 		blknum = pos;
 		rabin_index_sz = (ssize_t)pos * RABIN_ENTRY_SIZE;
 		pos1 = rabin_index_sz + RABIN_HDR_SIZE;
 		for (blk = 0; blk < blknum; blk++) {
+			uchar_t *old, *new;
+			int32_t bsz;
+
+			/*
+			 * If blocks are overflowing the allowed chunk size then dedup did not
+			 * help at all. We invalidate the dedup operation.
+			 */
+			if (pos1 > last_offset) {
+				valid = 0;
+				break;
+			}
 			if (rabin_index[blk] & RABIN_INDEX_FLAG) {
 				j = rabin_index[blk] & RABIN_INDEX_VALUE;
-				rabin_index[blk] = htonl(trans[j] | RABIN_INDEX_FLAG);
-			} else {
-				/*
-				 * If blocks are overflowing the allowed chunk size then dedup did not
-				 * help at all. We invalidate the dedup operation.
-				 */
-				if (pos1 > last_offset) {
-					valid = 0;
-					break;
+				i = ctx->blocks[j].index;
+
+				if (rabin_index[blk] & GET_SIMILARITY_FLAG) {
+					old = buf1 + ctx->blocks[j].offset;
+					new = buf1 + ctx->blocks[blk].cksum_n_offset;
+					bsz = bsdiff(old, ctx->blocks[j].length, new,
+					    ctx->blocks[blk].new_length, ctx->cbuf + pos1, 0, 0);
+					if (bsz == 0) {
+						memcpy(ctx->cbuf + pos1, new, ctx->blocks[blk].new_length);
+						rabin_index[blk] = htonl(ctx->blocks[blk].new_length);
+						pos1 += ctx->blocks[blk].new_length;
+					} else {
+						rabin_index[blk] = htonl(trans[i] |
+						    RABIN_INDEX_FLAG | SET_SIMILARITY_FLAG);
+						pos1 += bsz;
+					}
+				} else {
+					rabin_index[blk] = htonl(trans[i] | RABIN_INDEX_FLAG);
 				}
-				memcpy(ctx->cbuf + pos1, buf1 + ctx->blocks[blk].cksum_n_offset, rabin_index[blk]);
+			} else {
+				memcpy(ctx->cbuf + pos1, buf1 + ctx->blocks[blk].cksum_n_offset,
+				    rabin_index[blk]);
 				pos1 += rabin_index[blk];
 				rabin_index[blk] = htonl(rabin_index[blk]);
 			}
@@ -512,29 +639,66 @@ rabin_inverse_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size)
 			ctx->blocks[blk].offset = pos1;
 			pos1 += len;
 		} else {
+			bsize_t blen;
+
 			ctx->blocks[blk].length = 0;
-			ctx->blocks[blk].index = len & RABIN_INDEX_VALUE;
+			if (len & GET_SIMILARITY_FLAG) {
+				ctx->blocks[blk].offset = pos1;
+				ctx->blocks[blk].index = (len & RABIN_INDEX_VALUE) | SET_SIMILARITY_FLAG;
+				blen = get_bsdiff_sz(buf + pos1);
+				pos1 += blen;
+			} else {
+				ctx->blocks[blk].index = len & RABIN_INDEX_VALUE;
+			}
 		}
 	}
+
 	for (blk = 0; blk < blknum; blk++) {
+		int rv;
+		bsize_t newsz;
+
 		if (ctx->blocks[blk].length == 0 && ctx->blocks[blk].index == 0) continue;
 		if (ctx->blocks[blk].length > 0) {
 			len = ctx->blocks[blk].length;
 			pos1 = ctx->blocks[blk].offset;
 		} else {
 			oblk = ctx->blocks[blk].index;
-			len = ctx->blocks[oblk].length;
-			pos1 = ctx->blocks[oblk].offset;
+
+			if (oblk & GET_SIMILARITY_FLAG) {
+				oblk = oblk & CLEAR_SIMILARITY_FLAG;
+				len = ctx->blocks[oblk].length;
+				pos1 = ctx->blocks[oblk].offset;
+				newsz = data_sz - sz;
+				rv = bspatch(buf + ctx->blocks[blk].offset, buf + pos1, len, pos2, &newsz);
+				if (rv == 0) {
+					fprintf(stderr, "Failed to bspatch block.\n");
+					ctx->valid = 0;
+					break;
+				}
+				pos2 += newsz;
+				sz += newsz;
+				if (sz > data_sz) {
+					fprintf(stderr, "Dedup data overflows chunk.\n");
+					ctx->valid = 0;
+					break;
+				}
+				continue;
+			} else {
+				len = ctx->blocks[oblk].length;
+				pos1 = ctx->blocks[oblk].offset;
+			}
 		}
 		memcpy(pos2, buf + pos1, len);
 		pos2 += len;
 		sz += len;
 		if (sz > data_sz) {
+			fprintf(stderr, "Dedup data overflows chunk.\n");
 			ctx->valid = 0;
 			break;
 		}
 	}
 	if (ctx->valid && sz < data_sz) {
+		fprintf(stderr, "Too little dedup data processed.\n");
 		ctx->valid = 0;
 	}
 	*size = data_sz;
diff --git a/rabin/rabin_polynomial.h b/rabin/rabin_polynomial.h
index ffe642f..fe67122 100644
--- a/rabin/rabin_polynomial.h
+++ b/rabin/rabin_polynomial.h
@@ -84,19 +84,15 @@
 #define	RAB_POLYNOMIAL_MIN_WIN_SIZE 8
 #define	RAB_POLYNOMIAL_MAX_WIN_SIZE 64
 
-typedef struct {
-	ssize_t offset;
-	uint64_t cksum_n_offset; // Dual purpose variable
-	unsigned int index;
-	unsigned int length;
-	unsigned short refcount;
-} rabin_blockentry_t;
+// Minimum practical chunk size when doing dedup
+#define	RAB_MIN_CHUNK_SIZE (1048576L)
+
+// Number of bytes to compute one maximal fingerprint value
+#define	SKETCH_BASIC_BLOCK_SZ (1024)
 
 // An entry in the Rabin block array in the chunk.
-// It is either a length value <= RAB_POLYNOMIAL_MAX_BLOCK_SIZE or
-// if value > RAB_POLYNOMIAL_MAX_BLOCK_SIZE then
-// value - RAB_POLYNOMIAL_MAX_BLOCK_SIZE is index of block with which
-// this block is a duplicate.
+// It is either a length value <= RABIN_MAX_BLOCK_SIZE or an index value with
+// which this block is a duplicate/similar. The entries are variable sized.
 // Offset can be dynamically calculated.
 //
 #define	RABIN_ENTRY_SIZE (sizeof (unsigned int))
@@ -106,20 +102,43 @@ typedef struct {
 // size of deduped data, size of compressed data
 #define	RABIN_HDR_SIZE (sizeof (unsigned int) + sizeof (ssize_t) + sizeof (ssize_t) + sizeof (ssize_t) + sizeof (ssize_t))
 
-// Maximum number of dedup blocks supported (2^31 - 1)
-#define	RABIN_MAX_BLOCKS (0x7fffffff)
+// Maximum number of dedup blocks supported (2^30 - 1)
+#define	RABIN_MAX_BLOCKS (0x3FFFFFFFUL)
 
 // Maximum possible block size for a single rabin block. This is a hard limit much
 // larger than RAB_POLYNOMIAL_MAX_BLOCK_SIZE. Useful when merging non-duplicate blocks.
-// This is also 2^31 - 1.
+// This is also 2^30 - 1.
-#define RABIN_MAX_BLOCK_SIZE (RABIN_MAX_BLOCKS)
+#define	RABIN_MAX_BLOCK_SIZE (RABIN_MAX_BLOCKS)
 
-// Mask to determine whether Rabin index entry is a length value or index value.
+// Masks to determine whether Rabin index entry is a length value, duplicate index value
+// or similar index value.
 // MSB = 1 : Index
 // MSB = 0 : Length
-#define RABIN_INDEX_FLAG (0x80000000)
+// MSB-1 = 1: Similarity Index
+// MSB-1 = 0: Exact Duplicate Index
+#define	RABIN_INDEX_FLAG (0x80000000UL)
+#define	SET_SIMILARITY_FLAG (0x40000000UL)
+#define	GET_SIMILARITY_FLAG SET_SIMILARITY_FLAG
+#define	CLEAR_SIMILARITY_FLAG (0xBFFFFFFFUL)
+
 // Mask to extract value from a rabin index entry
-#define RABIN_INDEX_VALUE (0x7fffffff)
+#define	RABIN_INDEX_VALUE (0x3FFFFFFFUL)
+
+// Tolerance for partial similarity check. We expect 80% similarity for
+// delta compression. See: http://www.armedia.com/wp/SimilarityIndex.pdf
+#define	SIMILARITY_TOLERANCE (0.2f)
+#define	SIMILAR_EXACT 1
+#define	SIMILAR_PARTIAL 2
+
+typedef struct { // One entry per Rabin block in ctx->blocks
+	ssize_t offset; // Start offset of the block within the chunk buffer
+	uint64_t cksum_n_offset; // Dual purpose variable: sketch value used for sorting, later reused as a data offset
+	unsigned int index; // Block number; for a duplicate/similar block, the index of the block it refers to
+	unsigned int length; // Block length in bytes
+	unsigned int new_length; // Length of a duplicate/similar block, saved for the final copy/delta pass
+	unsigned short refcount; // Number of duplicate/similar blocks referring to this block
+	short similar; // 0 (unique), SIMILAR_EXACT or SIMILAR_PARTIAL
+} rabin_blockentry_t;
 
 typedef struct {
 	unsigned char *current_window_data;
@@ -134,11 +153,11 @@ typedef struct {
 	uint64_t real_chunksize;
 	short valid;
 	void *lzma_data;
-	int level;
+	int level, delta_flag;
 } rabin_context_t;
 
 extern rabin_context_t *create_rabin_context(uint64_t chunksize, uint64_t real_chunksize,
-	const char *algo);
+	const char *algo, int delta_flag);
 extern void destroy_rabin_context(rabin_context_t *ctx);
 extern unsigned int rabin_dedup(rabin_context_t *ctx, unsigned char *buf, 
 	ssize_t *size, ssize_t offset, ssize_t *rabin_pos);
diff --git a/utils.h b/utils.h
index 03b0e0b..bbf7e58 100644
--- a/utils.h
+++ b/utils.h
@@ -52,6 +52,7 @@ extern "C" {
 #       endif
 #endif
 typedef unsigned long uintptr_t;
+typedef ssize_t bsize_t;
 
 #undef WORDS_BIGENDIAN
 #if BYTE_ORDER == BIG_ENDIAN