Enable building with alternate Zlib and Bzlib.

Update README and comments.
Fix setting of output size when using Delta2 without LZP.
This commit is contained in:
Moinak Ghosh 2012-12-16 23:17:04 +05:30
parent 5ac47db6d5
commit fb30b5c295
6 changed files with 132 additions and 45 deletions

10
INSTALL
View file

@ -76,6 +76,13 @@ not the usual GNU Autoconf script.
path to the libbsc source tree must be provided. It path to the libbsc source tree must be provided. It
links the library statically. links the library statically.
--with-zlib=<path to zlib installation tree> (Default: System)
Enable building against an alternate Zlib installation.
--with-bzlib=<path to Bzip2 library installation tree> (Default: System)
Enable building against an alternate Bzip2 library
installation.
--help Display the help message. --help Display the help message.
Steps for building with libbsc support Steps for building with libbsc support
@ -95,4 +102,7 @@ Steps for building with libbsc support
4) Now run make in the pcompress directory. This will also run make in 4) Now run make in the pcompress directory. This will also run make in
the libbsc source directory to build it. the libbsc source directory to build it.
5) Additional compilation flags can be passed to make like this:
make EXTRA_CPPFLAGS=<...> EXTRA_LDFLAGS=<...>

View file

@ -22,16 +22,31 @@
# #
PROG= pcompress PROG= pcompress
MAINSRCS = main.c utils/utils.c allocator.c zlib_compress.c bzip2_compress.c \ MAINSRCS = main.c utils/utils.c allocator.c lzma_compress.c ppmd_compress.c \
lzma_compress.c ppmd_compress.c adaptive_compress.c lzfx_compress.c \ adaptive_compress.c lzfx_compress.c lz4_compress.c none_compress.c \
lz4_compress.c none_compress.c utils/xxhash.c utils/heapq.c utils/cpuid.c \ utils/xxhash.c utils/heapq.c utils/cpuid.c
crypto/aes/crypto_aes.c crypto/scrypt/crypto_scrypt-nosse.c \
crypto/scrypt/sha256.c crypto/scrypt/crypto_aesctr.c crypto/crypto_utils.c
MAINHDRS = allocator.h pcompress.h utils/utils.h utils/xxhash.h utils/heapq.h \ MAINHDRS = allocator.h pcompress.h utils/utils.h utils/xxhash.h utils/heapq.h \
utils/cpuid.h crypto/crypto_utils.h crypto/scrypt/crypto_scrypt.h \ utils/cpuid.h
crypto/scrypt/sha256.h crypto/scrypt/crypto_aesctr.h crypto/aes/crypto_aes.h
MAINOBJS = $(MAINSRCS:.c=.o) MAINOBJS = $(MAINSRCS:.c=.o)
CRYPTO_SRCS = crypto/aes/crypto_aes.c crypto/scrypt/crypto_scrypt-nosse.c \
crypto/scrypt/sha256.c crypto/scrypt/crypto_aesctr.c crypto/crypto_utils.c
CRYPTO_HDRS = crypto/crypto_utils.h crypto/scrypt/crypto_scrypt.h \
crypto/scrypt/sha256.h crypto/scrypt/crypto_aesctr.h crypto/aes/crypto_aes.h \
$(MAINHDRS)
CRYPTO_OBJS = $(CRYPTO_SRCS:.c=.o)
CRYPTO_CPPFLAGS=-I@OPENSSL_INCDIR@
ZLIB_SRCS = zlib_compress.c
ZLIB_HDRS = $(MAINHDRS)
ZLIB_OBJS = $(ZLIB_SRCS:.c=.o)
ZLIB_CPPFLAGS = @LIBZ_INC@
BZLIB_SRCS = bzip2_compress.c
BZLIB_HDRS = $(MAINHDRS)
BZLIB_OBJS = $(BZLIB_SRCS:.c=.o)
BZLIB_CPPFLAGS = @LIBBZ2_INC@
RABINSRCS = rabin/rabin_dedup.c RABINSRCS = rabin/rabin_dedup.c
RABINHDRS = rabin/rabin_dedup.h utils/utils.h RABINHDRS = rabin/rabin_dedup.h utils/utils.h
RABINOBJS = $(RABINSRCS:.c=.o) RABINOBJS = $(RABINSRCS:.c=.o)
@ -130,17 +145,17 @@ RM = rm -f
RM_RF = rm -rf RM_RF = rm -rf
COMMON_CPPFLAGS = -I. -I./lzma -I./lzfx -I./lz4 -I./rabin -I./bsdiff -DNODEFAULT_PROPS \ COMMON_CPPFLAGS = -I. -I./lzma -I./lzfx -I./lz4 -I./rabin -I./bsdiff -DNODEFAULT_PROPS \
-DFILE_OFFSET_BITS=64 -D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32 \ -DFILE_OFFSET_BITS=64 -D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32 \
-I./lzp @LIBBSCCPPFLAGS@ -I./crypto/skein -I./utils -I@OPENSSL_INCDIR@ \ -I./lzp @LIBBSCCPPFLAGS@ -I./crypto/skein -I./utils -I./crypto/sha2 \
-I./crypto/sha2 -I./crypto/scrypt -I./crypto/aes -I./crypto @KEYLEN@ \ -I./crypto/scrypt -I./crypto/aes -I./crypto @KEYLEN@ \
@LIBBZ2_INC@ @LIBZ_INC@ -I./crypto/keccak -I./transpose -I./crypto/keccak -I./transpose $(EXTRA_CPPFLAGS)
COMMON_VEC_FLAGS = -ftree-vectorize COMMON_VEC_FLAGS = -ftree-vectorize
COMMON_LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block COMMON_LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block
LDLIBS = -ldl -L@LIBBZ2_DIR@ -lbz2 -L@LIBZ_DIR@ -lz -lm @LIBBSCLFLAGS@ \ LDLIBS = -ldl -L@LIBBZ2_DIR@ -lbz2 -L@LIBZ_DIR@ -lz -lm @LIBBSCLFLAGS@ \
-L@OPENSSL_LIBDIR@ -lcrypto -lrt -L@OPENSSL_LIBDIR@ -lcrypto -lrt $(EXTRA_LDFLAGS)
OBJS = $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(LZFXOBJS) $(LZ4OBJS) $(CRCOBJS) \ OBJS = $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(LZFXOBJS) $(LZ4OBJS) $(CRCOBJS) \
$(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) $(DELTA2OBJS) @LIBBSCWRAPOBJ@ $(SKEINOBJS) \ $(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) $(DELTA2OBJS) @LIBBSCWRAPOBJ@ $(SKEINOBJS) \
$(SKEIN_BLOCK_OBJ) @SHA256ASM_OBJS@ @SHA256_OBJS@ $(KECCAK_OBJS) $(KECCAK_OBJS_ASM) \ $(SKEIN_BLOCK_OBJ) @SHA256ASM_OBJS@ @SHA256_OBJS@ $(KECCAK_OBJS) $(KECCAK_OBJS_ASM) \
$(TRANSP_OBJS) $(TRANSP_OBJS) $(CRYPTO_OBJS) $(ZLIB_OBJS) $(BZLIB_OBJS)
DEBUG_LINK = g++ -m64 -pthread -msse3 @LIBBSCGEN_OPT@ DEBUG_LINK = g++ -m64 -pthread -msse3 @LIBBSCGEN_OPT@
DEBUG_COMPILE = gcc -m64 -g -msse3 -c DEBUG_COMPILE = gcc -m64 -g -msse3 -c
@ -235,6 +250,15 @@ $(LIBBSCWRAPOBJ): $(LIBBSCWRAP) $(LIBBSCLIB)
$(TRANSP_OBJS): $(TRANSP_SRCS) $(TRANSP_HDRS) $(TRANSP_OBJS): $(TRANSP_SRCS) $(TRANSP_HDRS)
$(COMPILE) $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@ $(COMPILE) $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
$(CRYPTO_OBJS): $(CRYPTO_SRCS) $(CRYPTO_HDRS)
$(COMPILE) $(GEN_OPT) $(CRYPTO_CPPFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
$(ZLIB_OBJS): $(ZLIB_SRCS) $(ZLIB_HDRS)
$(COMPILE) $(GEN_OPT) $(ZLIB_CPPFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
$(BZLIB_OBJS): $(BZLIB_SRCS) $(BZLIB_HDRS)
$(COMPILE) $(GEN_OPT) $(BZLIB_CPPFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
$(MAINOBJS): $(MAINSRCS) $(MAINHDRS) $(MAINOBJS): $(MAINSRCS) $(MAINHDRS)
$(COMPILE) $(GEN_OPT) $(LOOP_OPTFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@ $(COMPILE) $(GEN_OPT) $(LOOP_OPTFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
@ -253,13 +277,13 @@ distclean: clean
$(RM) Makefile $(RM) Makefile
install: $(PROG) install: $(PROG)
@mkdir -p $(PREFIX)/bin @mkdir -p $(DESTDIR)$(PREFIX)/bin
@chmod 0755 $(PREFIX)/bin @chmod 0755 $(DESTDIR)$(PREFIX)/bin
@cp $(PROG) $(PREFIX)/bin @cp $(PROG) $(DESTDIR)$(PREFIX)/bin
@chmod 0555 $(PREFIX)/bin/$(PROG) @chmod 0555 $(DESTDIR)$(PREFIX)/bin/$(PROG)
@mkdir -p $(PREFIX)/share/doc/$(PROG) @mkdir -p $(DESTDIR)$(PREFIX)/share/doc/$(PROG)
@chmod 0755 $(PREFIX)/share $(PREFIX)/share/doc $(PREFIX)/share/doc/$(PROG) @chmod 0755 $(DESTDIR)$(PREFIX)/share $(DESTDIR)$(PREFIX)/share/doc $(DESTDIR)$(PREFIX)/share/doc/$(PROG)
@cp README.md $(PREFIX)/share/doc/$(PROG)/README @cp README.md $(DESTDIR)$(PREFIX)/share/doc/$(PROG)/README
@chmod 0444 $(PREFIX)/share/doc/$(PROG)/README @chmod 0444 $(DESTDIR)$(PREFIX)/share/doc/$(PROG)/README

View file

@ -221,22 +221,28 @@ algorithm can be selected for textual and binary portions.
Pre-Processing Algorithms Pre-Processing Algorithms
========================= =========================
As can be seen above a multitude of pre-processing algorithms are available that provide As can be seen above a multitude of pre-processing algorithms are available that
further compression effectiveness beyond what the usual compression algorithms can provide further compression effectiveness beyond what the usual compression
achieve by themselves. These are summarized below: algorithms can achieve by themselves. These are summarized below:
1) Deduplication : Per-Chunk (or per-segment) deduplication based on Rabin 1) Deduplication : Per-Chunk (or per-segment) deduplication based on Rabin
fingerprinting. fingerprinting.
2) LZP : LZ Prediction is a variant of LZ77 that replaces repeating runs of 2) Delta Compression : A similarity based (minhash) comparison of Rabin blocks. Two
text with shorter codes. blocks at least 60% similar with each other are diffed using
bsdiff.
3) Adaptive Delta : This is a simple form of Delta Encoding where arithmetic progressions 3) LZP : LZ Prediction is a variant of LZ77 that replaces repeating
are detected in the data stream and collapsed via Run-Length encoding. runs of text with shorter codes.
4) Matrix Transpose: This is used automatically in Delta Encoding and Deduplication. This 4) Adaptive Delta : This is a simple form of Delta Encoding where arithmetic
attempts to transpose columnar repeating sequences of bytes into progressions are detected in the data stream and collapsed
row-wise sequences so that compression algorithms can work better. via Run-Length encoding.
5) Matrix Transpose : This is used automatically in Delta Encoding and Deduplication.
This attempts to transpose columnar repeating sequences of
bytes into row-wise sequences so that compression algorithms
can work better.
Memory Usage Memory Usage
============ ============

57
config
View file

@ -17,6 +17,10 @@ ${prog} [<options>]
--with-openssl=<path to OpenSSL installation tree> (Default: System) --with-openssl=<path to OpenSSL installation tree> (Default: System)
This defaults to the system's OpenSSL library. You can use this option This defaults to the system's OpenSSL library. You can use this option
if you want to use an alternate OpenSSL installation. if you want to use an alternate OpenSSL installation.
--with-zlib=<path to zlib installation tree> (Default: System)
Enable building against an alternate Zlib installation.
--with-bzlib=<path to Bzip2 library installation tree> (Default: System)
Enable building against an alternate Bzip2 library installation.
--use-key256 Use 256-bit encryption keys. Default key length is 128-bit. --use-key256 Use 256-bit encryption keys. Default key length is 128-bit.
--help Display this help message. --help Display this help message.
@ -47,6 +51,8 @@ keccak_srcs=
keccak_hdrs= keccak_hdrs=
keccak_srcs_asm= keccak_srcs_asm=
lto_flag= lto_flag=
zlib_prefix=
bzlib_prefix=
# Try a simple compilation # Try a simple compilation
cat << _EOF > tst.c cat << _EOF > tst.c
@ -114,6 +120,12 @@ do
--with-openssl=*) --with-openssl=*)
openssl_prefix=`echo ${arg1} | cut -f2 -d"="` openssl_prefix=`echo ${arg1} | cut -f2 -d"="`
;; ;;
--with-zlib=*)
zlib_prefix=`echo ${arg1} | cut -f2 -d"="`
;;
--with-bzlib=*)
bzlib_prefix=`echo ${arg1} | cut -f2 -d"="`
;;
--use-key256) --use-key256)
keylen='-DKEYLEN=32' keylen='-DKEYLEN=32'
;; ;;
@ -263,11 +275,26 @@ fi
# Detect other library packages # Detect other library packages
for libname in "libbz2" "libz" for libspec in "libbz2:${bzlib_prefix}" "libz:${zlib_prefix}"
do do
for lib in "/lib64" "/usr/lib64" "/lib" "/usr/lib" "/lib/x86_64-linux-gnu" \ _OIFS="$IFS"
"/usr/lib/x86_64-linux-gnu" \ IFS=":"
"${prefix}/lib64" "${prefix}/lib" "${prefix}/lib/x86_64-linux-gnu" set -- ${libspec}
libname=$1
pref=$2
IFS="$_OIFS"
use_prefix="${pref}"
if [ "x${pref}" = "x" ]
then
use_prefix="$prefix"
fi
for lib in "${pref}/lib64" "${pref}/usr/lib64" "${pref}/lib" "${pref}/usr/lib" \
"${pref}/lib/x86_64-linux-gnu" "${pref}/usr/lib/x86_64-linux-gnu" \
"${pref}/local/lib64" "${pref}/usr/local/lib64" "${pref}/local/lib" "${pref}/usr/local/lib" \
"${pref}/local/lib/x86_64-linux-gnu" "${pref}/usr/local/lib/x86_64-linux-gnu" \
"${use_prefix}/lib64" "${use_prefix}/lib" "${use_prefix}/lib/x86_64-linux-gnu" \
"${use_prefix}/usr/lib/x86_64-linux-gnu"
do do
if [ -d ${lib} ] if [ -d ${lib} ]
then then
@ -287,32 +314,50 @@ do
done done
if [ "x${libbz2_libdir}" = "x" ] if [ "x${libbz2_libdir}" = "x" ]
then
if [ "x$bzlib_prefix" = "x" ]
then then
echo "ERROR: Libbz2 not detected." echo "ERROR: Libbz2 not detected."
echo " You may have to install libbz2-devel or libbz2-dev" echo " You may have to install libbz2-devel or libbz2-dev"
else
echo "ERROR: Bzip2 library not detected in given prefix."
fi
exit 1 exit 1
fi fi
if [ "x${libz_libdir}" = "x" ] if [ "x${libz_libdir}" = "x" ]
then
if [ "x$zlib_prefix" = "x" ]
then then
echo "ERROR: Zlib not detected." echo "ERROR: Zlib not detected."
echo " You may have to install libz-devel or libz-dev" echo " You may have to install libz-devel or libz-dev"
else
echo "ERROR: Zlib not detected in given prefix."
fi
exit 1 exit 1
fi fi
libbz2_inc= libbz2_inc=
libz_inc= libz_inc=
# Detect other library headers # Detect other library headers
for hdr in "libbz2_inc:bzlib.h" "libz_inc:zlib.h" for hdr in "libbz2_inc:bzlib.h:${bzlib_prefix}" "libz_inc:zlib.h:${zlib_prefix}"
do do
_OIFS="$IFS" _OIFS="$IFS"
IFS=":" IFS=":"
set -- ${hdr} set -- ${hdr}
var=$1 var=$1
hdrf=$2 hdrf=$2
pref=$3
IFS="$_OIFS" IFS="$_OIFS"
for inc in "${prefix}/include" "/usr/include" use_prefix="${pref}"
if [ "x${pref}" = "x" ]
then
use_prefix="$prefix"
fi
for inc in "${pref}/include" "${pref}/usr/include" \
"${pref}/local/include" "${pref}/usr/local/include" \
"${use_prefix}/include" "${use_prefix}/usr/include"
do do
if [ -d ${inc} ] if [ -d ${inc} ]
then then

View file

@ -34,8 +34,9 @@
* objects are output by the encoder: * objects are output by the encoder:
* 1) A literal run of unmodified bytes. Header: 1 zero byte followed * 1) A literal run of unmodified bytes. Header: 1 zero byte followed
* by a 64bit length in bytes. * by a 64bit length in bytes.
* 2) A literal run of transposed bytes containing at least 87% below * 2) A literal run of transposed bytes containing sequences that are
* threshold sequences. * below threshold and the total span of those sequences is at least
* 87% of the entire run.
* Header: 1 byte stride length with high bit set. * Header: 1 byte stride length with high bit set.
* 64bit length of span in bytes. * 64bit length of span in bytes.
* 3) An encoded run length of a series in arithmetic progression. * 3) An encoded run length of a series in arithmetic progression.
@ -175,7 +176,7 @@ delta2_encode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen, int
if (gtot1 > 0) { if (gtot1 > 0) {
/* /*
* Encode previous literal run, if any. If the literal run * Encode previous literal run, if any. If the literal run
* has enough (90%+) large sequences just below threshold, * has enough (87%+) large sequences just below threshold,
* do a matrix transpose on the range in the hope of achieving * do a matrix transpose on the range in the hope of achieving
* a better compression ratio. * a better compression ratio.
*/ */

1
main.c
View file

@ -272,6 +272,7 @@ preproc_decompress(compress_func_ptr dec_func, void *src, uint64_t srclen, void
if (result != -1) { if (result != -1) {
memcpy(src, dst, _dstlen); memcpy(src, dst, _dstlen);
srclen = _dstlen; srclen = _dstlen;
*dstlen = _dstlen;
} else { } else {
return (result); return (result);
} }