Enable building with alternate Zlib and Bzlib.

Update README and comments.
Fix setting of the output size when using Delta2 without LZP.
This commit is contained in:
Moinak Ghosh 2012-12-16 23:17:04 +05:30
parent 5ac47db6d5
commit fb30b5c295
6 changed files with 132 additions and 45 deletions

10
INSTALL
View file

@ -76,6 +76,13 @@ not the usual GNU Autoconf script.
path to the libbsc source tree must be provided. It
links the library statically.
--with-zlib=<path to zlib installation tree> (Default: System)
Enable building against an alternate Zlib installation.
--with-bzlib=<path to Bzip2 library installation tree> (Default: System)
Enable building against an alternate Bzip2 library
installation.
--help Display the help message.
Steps for building with libbsc support
@ -95,4 +102,7 @@ Steps for building with libbsc support
4) Now run make in the pcompress directory. This will also run make in
the libbsc source directory to build it.
5) Additional compilation flags can be passed to make like this:
make EXTRA_CPPFLAGS=<...> EXTRA_LDFLAGS=<...>

View file

@ -22,16 +22,31 @@
#
PROG= pcompress
MAINSRCS = main.c utils/utils.c allocator.c zlib_compress.c bzip2_compress.c \
lzma_compress.c ppmd_compress.c adaptive_compress.c lzfx_compress.c \
lz4_compress.c none_compress.c utils/xxhash.c utils/heapq.c utils/cpuid.c \
crypto/aes/crypto_aes.c crypto/scrypt/crypto_scrypt-nosse.c \
crypto/scrypt/sha256.c crypto/scrypt/crypto_aesctr.c crypto/crypto_utils.c
MAINSRCS = main.c utils/utils.c allocator.c lzma_compress.c ppmd_compress.c \
adaptive_compress.c lzfx_compress.c lz4_compress.c none_compress.c \
utils/xxhash.c utils/heapq.c utils/cpuid.c
MAINHDRS = allocator.h pcompress.h utils/utils.h utils/xxhash.h utils/heapq.h \
utils/cpuid.h crypto/crypto_utils.h crypto/scrypt/crypto_scrypt.h \
crypto/scrypt/sha256.h crypto/scrypt/crypto_aesctr.h crypto/aes/crypto_aes.h
utils/cpuid.h
MAINOBJS = $(MAINSRCS:.c=.o)
CRYPTO_SRCS = crypto/aes/crypto_aes.c crypto/scrypt/crypto_scrypt-nosse.c \
crypto/scrypt/sha256.c crypto/scrypt/crypto_aesctr.c crypto/crypto_utils.c
CRYPTO_HDRS = crypto/crypto_utils.h crypto/scrypt/crypto_scrypt.h \
crypto/scrypt/sha256.h crypto/scrypt/crypto_aesctr.h crypto/aes/crypto_aes.h \
$(MAINHDRS)
CRYPTO_OBJS = $(CRYPTO_SRCS:.c=.o)
CRYPTO_CPPFLAGS=-I@OPENSSL_INCDIR@
ZLIB_SRCS = zlib_compress.c
ZLIB_HDRS = $(MAINHDRS)
ZLIB_OBJS = $(ZLIB_SRCS:.c=.o)
ZLIB_CPPFLAGS = @LIBZ_INC@
BZLIB_SRCS = bzip2_compress.c
BZLIB_HDRS = $(MAINHDRS)
BZLIB_OBJS = $(BZLIB_SRCS:.c=.o)
BZLIB_CPPFLAGS = @LIBBZ2_INC@
RABINSRCS = rabin/rabin_dedup.c
RABINHDRS = rabin/rabin_dedup.h utils/utils.h
RABINOBJS = $(RABINSRCS:.c=.o)
@ -130,17 +145,17 @@ RM = rm -f
RM_RF = rm -rf
COMMON_CPPFLAGS = -I. -I./lzma -I./lzfx -I./lz4 -I./rabin -I./bsdiff -DNODEFAULT_PROPS \
-DFILE_OFFSET_BITS=64 -D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32 \
-I./lzp @LIBBSCCPPFLAGS@ -I./crypto/skein -I./utils -I@OPENSSL_INCDIR@ \
-I./crypto/sha2 -I./crypto/scrypt -I./crypto/aes -I./crypto @KEYLEN@ \
@LIBBZ2_INC@ @LIBZ_INC@ -I./crypto/keccak -I./transpose
-I./lzp @LIBBSCCPPFLAGS@ -I./crypto/skein -I./utils -I./crypto/sha2 \
-I./crypto/scrypt -I./crypto/aes -I./crypto @KEYLEN@ \
-I./crypto/keccak -I./transpose $(EXTRA_CPPFLAGS)
COMMON_VEC_FLAGS = -ftree-vectorize
COMMON_LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block
LDLIBS = -ldl -L@LIBBZ2_DIR@ -lbz2 -L@LIBZ_DIR@ -lz -lm @LIBBSCLFLAGS@ \
-L@OPENSSL_LIBDIR@ -lcrypto -lrt
-L@OPENSSL_LIBDIR@ -lcrypto -lrt $(EXTRA_LDFLAGS)
OBJS = $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(LZFXOBJS) $(LZ4OBJS) $(CRCOBJS) \
$(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) $(DELTA2OBJS) @LIBBSCWRAPOBJ@ $(SKEINOBJS) \
$(SKEIN_BLOCK_OBJ) @SHA256ASM_OBJS@ @SHA256_OBJS@ $(KECCAK_OBJS) $(KECCAK_OBJS_ASM) \
$(TRANSP_OBJS)
$(TRANSP_OBJS) $(CRYPTO_OBJS) $(ZLIB_OBJS) $(BZLIB_OBJS)
DEBUG_LINK = g++ -m64 -pthread -msse3 @LIBBSCGEN_OPT@
DEBUG_COMPILE = gcc -m64 -g -msse3 -c
@ -235,6 +250,15 @@ $(LIBBSCWRAPOBJ): $(LIBBSCWRAP) $(LIBBSCLIB)
$(TRANSP_OBJS): $(TRANSP_SRCS) $(TRANSP_HDRS)
$(COMPILE) $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
$(CRYPTO_OBJS): $(CRYPTO_SRCS) $(CRYPTO_HDRS)
$(COMPILE) $(GEN_OPT) $(CRYPTO_CPPFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
$(ZLIB_OBJS): $(ZLIB_SRCS) $(ZLIB_HDRS)
$(COMPILE) $(GEN_OPT) $(ZLIB_CPPFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
$(BZLIB_OBJS): $(BZLIB_SRCS) $(BZLIB_HDRS)
$(COMPILE) $(GEN_OPT) $(BZLIB_CPPFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
$(MAINOBJS): $(MAINSRCS) $(MAINHDRS)
$(COMPILE) $(GEN_OPT) $(LOOP_OPTFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
@ -253,13 +277,13 @@ distclean: clean
$(RM) Makefile
install: $(PROG)
@mkdir -p $(PREFIX)/bin
@chmod 0755 $(PREFIX)/bin
@cp $(PROG) $(PREFIX)/bin
@chmod 0555 $(PREFIX)/bin/$(PROG)
@mkdir -p $(PREFIX)/share/doc/$(PROG)
@chmod 0755 $(PREFIX)/share $(PREFIX)/share/doc $(PREFIX)/share/doc/$(PROG)
@cp README.md $(PREFIX)/share/doc/$(PROG)/README
@chmod 0444 $(PREFIX)/share/doc/$(PROG)/README
@mkdir -p $(DESTDIR)$(PREFIX)/bin
@chmod 0755 $(DESTDIR)$(PREFIX)/bin
@cp $(PROG) $(DESTDIR)$(PREFIX)/bin
@chmod 0555 $(DESTDIR)$(PREFIX)/bin/$(PROG)
@mkdir -p $(DESTDIR)$(PREFIX)/share/doc/$(PROG)
# Every path needs the $(DESTDIR) staging prefix; without it a staged install
# would chmod (or fail on) directories of the live system under $(PREFIX).
@chmod 0755 $(DESTDIR)$(PREFIX)/share $(DESTDIR)$(PREFIX)/share/doc $(DESTDIR)$(PREFIX)/share/doc/$(PROG)
@cp README.md $(DESTDIR)$(PREFIX)/share/doc/$(PROG)/README
@chmod 0444 $(DESTDIR)$(PREFIX)/share/doc/$(PROG)/README

View file

@ -221,22 +221,28 @@ algorithm can be selected for textual and binary portions.
Pre-Processing Algorithms
=========================
As can be seen above a multitude of pre-processing algorithms are available that provide
further compression effectiveness beyond what the usual compression algorithms can
achieve by themselves. These are summarized below:
As can be seen above a multitude of pre-processing algorithms are available that
provide further compression effectiveness beyond what the usual compression
algorithms can achieve by themselves. These are summarized below:
1) Deduplication : Per-Chunk (or per-segment) deduplication based on Rabin
fingerprinting.
2) LZP : LZ Prediction is a variant of LZ77 that replaces repeating runs of
text with shorter codes.
2) Delta Compression : A similarity based (minhash) comparison of Rabin blocks. Two
blocks at least 60% similar to each other are diffed using
bsdiff.
3) Adaptive Delta : This is a simple form of Delta Encoding where arithmetic progressions
are detected in the data stream and collapsed via Run-Length encoding.
3) LZP : LZ Prediction is a variant of LZ77 that replaces repeating
runs of text with shorter codes.
4) Matrix Transpose: This is used automatically in Delta Encoding and Deduplication. This
attempts to transpose columnar repeating sequences of bytes into
row-wise sequences so that compression algorithms can work better.
4) Adaptive Delta : This is a simple form of Delta Encoding where arithmetic
progressions are detected in the data stream and collapsed
via Run-Length encoding.
5) Matrix Transpose : This is used automatically in Delta Encoding and Deduplication.
This attempts to transpose columnar repeating sequences of
bytes into row-wise sequences so that compression algorithms
can work better.
Memory Usage
============

57
config
View file

@ -17,6 +17,10 @@ ${prog} [<options>]
--with-openssl=<path to OpenSSL installation tree> (Default: System)
This defaults to the system's OpenSSL library. You can use this option
if you want to use an alternate OpenSSL installation.
--with-zlib=<path to zlib installation tree> (Default: System)
Enable building against an alternate Zlib installation.
--with-bzlib=<path to Bzip2 library installation tree> (Default: System)
Enable building against an alternate Bzip2 and library installation.
--use-key256 Use 256-bit encryption keys. Default key length is 128-bit.
--help Display this help message.
@ -47,6 +51,8 @@ keccak_srcs=
keccak_hdrs=
keccak_srcs_asm=
lto_flag=
zlib_prefix=
bzlib_prefix=
# Try a simple compilation
cat << _EOF > tst.c
@ -114,6 +120,12 @@ do
--with-openssl=*)
openssl_prefix=`echo ${arg1} | cut -f2 -d"="`
;;
--with-zlib=*)
zlib_prefix=`echo ${arg1} | cut -f2 -d"="`
;;
--with-bzlib=*)
bzlib_prefix=`echo ${arg1} | cut -f2 -d"="`
;;
--use-key256)
keylen='-DKEYLEN=32'
;;
@ -263,11 +275,26 @@ fi
# Detect other library packages
for libname in "libbz2" "libz"
for libspec in "libbz2:${bzlib_prefix}" "libz:${zlib_prefix}"
do
for lib in "/lib64" "/usr/lib64" "/lib" "/usr/lib" "/lib/x86_64-linux-gnu" \
"/usr/lib/x86_64-linux-gnu" \
"${prefix}/lib64" "${prefix}/lib" "${prefix}/lib/x86_64-linux-gnu"
_OIFS="$IFS"
IFS=":"
set -- ${libspec}
libname=$1
pref=$2
IFS="$_OIFS"
use_prefix="${pref}"
if [ "x${pref}" = "x" ]
then
use_prefix="$prefix"
fi
for lib in "${pref}/lib64" "${pref}/usr/lib64" "${pref}/lib" "${pref}/usr/lib" \
"${pref}/lib/x86_64-linux-gnu" "${pref}/usr/lib/x86_64-linux-gnu" \
"${pref}/local/lib64" "${pref}/usr/local/lib64" "${pref}/local/lib" "${pref}/usr/local/lib" \
"${pref}/local/lib/x86_64-linux-gnu" "${pref}/usr/local/lib/x86_64-linux-gnu" \
"${use_prefix}/lib64" "${use_prefix}/lib" "${use_prefix}/lib/x86_64-linux-gnu" \
"${use_prefix}/usr/lib/x86_64-linux-gnu"
do
if [ -d ${lib} ]
then
@ -288,31 +315,49 @@ done
if [ "x${libbz2_libdir}" = "x" ]
then
if [ "x$bzlib_prefix" = "x" ]
then
echo "ERROR: Libbz2 not detected."
echo " You may have to install libbz2-devel or libbz2-dev"
else
echo "ERROR: Bzip2 library not detected in given prefix."
fi
exit 1
fi
if [ "x${libz_libdir}" = "x" ]
then
if [ "x$zlib_prefix" = "x" ]
then
echo "ERROR: Zlib not detected."
echo " You may have to install libz-devel or libz-dev"
else
echo "ERROR: Zlib not detected in given prefix."
fi
exit 1
fi
libbz2_inc=
libz_inc=
# Detect other library headers
for hdr in "libbz2_inc:bzlib.h" "libz_inc:zlib.h"
for hdr in "libbz2_inc:bzlib.h:${bzlib_prefix}" "libz_inc:zlib.h:${zlib_prefix}"
do
_OIFS="$IFS"
IFS=":"
set -- ${hdr}
var=$1
hdrf=$2
pref=$3
IFS="$_OIFS"
for inc in "${prefix}/include" "/usr/include"
use_prefix="${pref}"
if [ "x${pref}" = "x" ]
then
use_prefix="$prefix"
fi
for inc in "${pref}/include" "${pref}/usr/include" \
"${pref}/local/include" "${pref}/usr/local/include" \
"${use_prefix}/include" "${use_prefix}/usr/include"
do
if [ -d ${inc} ]
then

View file

@ -34,8 +34,9 @@
* objects are output by the encoder:
* 1) A literal run of unmodified bytes. Header: 1 zero byte followed
* by a 64bit length in bytes.
* 2) A literal run of transposed bytes containing at least 87% below
* threshold sequences.
* 2) A literal run of transposed bytes containing sequences that are
* below threshold and the total span of those sequences is at least
* 87% of the entire run.
* Header: 1 byte stride length with high bit set.
* 64bit length of span in bytes.
* 3) An encoded run length of a series in arithmetic progression.
@ -175,7 +176,7 @@ delta2_encode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen, int
if (gtot1 > 0) {
/*
* Encode previous literal run, if any. If the literal run
* has enough (90%+) large sequences just below threshold,
* has enough (87%+) large sequences just below threshold,
* do a matrix transpose on the range in the hope of achieving
* a better compression ratio.
*/

1
main.c
View file

@ -272,6 +272,7 @@ preproc_decompress(compress_func_ptr dec_func, void *src, uint64_t srclen, void
if (result != -1) {
memcpy(src, dst, _dstlen);
srclen = _dstlen;
*dstlen = _dstlen;
} else {
return (result);
}