diff --git a/INSTALL b/INSTALL index 1b6a0fa..6b98b8e 100644 --- a/INSTALL +++ b/INSTALL @@ -76,6 +76,13 @@ not the usual GNU Autoconf script. path to the libbsc source tree must be provided. It links the library statically. +--with-zlib= (Default: System) + Enable building against an alternate Zlib installation. + +--with-bzlib= (Default: System) + Enable building against an alternate Bzip2 library + installation. + --help Display the help message. Steps for building with libbsc support @@ -95,4 +102,7 @@ Steps for building with libbsc support 4) Now run make in the pcompress directory. This will also run make in the libbsc source directory to build it. +5) Additional compilation flags can be passed to make like this: + make EXTRA_CPPFLAGS=<...> EXTRA_LDFLAGS=<...> + diff --git a/Makefile.in b/Makefile.in index f279b31..e7a7a46 100644 --- a/Makefile.in +++ b/Makefile.in @@ -22,16 +22,31 @@ # PROG= pcompress -MAINSRCS = main.c utils/utils.c allocator.c zlib_compress.c bzip2_compress.c \ - lzma_compress.c ppmd_compress.c adaptive_compress.c lzfx_compress.c \ - lz4_compress.c none_compress.c utils/xxhash.c utils/heapq.c utils/cpuid.c \ - crypto/aes/crypto_aes.c crypto/scrypt/crypto_scrypt-nosse.c \ - crypto/scrypt/sha256.c crypto/scrypt/crypto_aesctr.c crypto/crypto_utils.c +MAINSRCS = main.c utils/utils.c allocator.c lzma_compress.c ppmd_compress.c \ + adaptive_compress.c lzfx_compress.c lz4_compress.c none_compress.c \ + utils/xxhash.c utils/heapq.c utils/cpuid.c MAINHDRS = allocator.h pcompress.h utils/utils.h utils/xxhash.h utils/heapq.h \ - utils/cpuid.h crypto/crypto_utils.h crypto/scrypt/crypto_scrypt.h \ - crypto/scrypt/sha256.h crypto/scrypt/crypto_aesctr.h crypto/aes/crypto_aes.h + utils/cpuid.h MAINOBJS = $(MAINSRCS:.c=.o) +CRYPTO_SRCS = crypto/aes/crypto_aes.c crypto/scrypt/crypto_scrypt-nosse.c \ + crypto/scrypt/sha256.c crypto/scrypt/crypto_aesctr.c crypto/crypto_utils.c +CRYPTO_HDRS = crypto/crypto_utils.h crypto/scrypt/crypto_scrypt.h \ +
crypto/scrypt/sha256.h crypto/scrypt/crypto_aesctr.h crypto/aes/crypto_aes.h \ + $(MAINHDRS) +CRYPTO_OBJS = $(CRYPTO_SRCS:.c=.o) +CRYPTO_CPPFLAGS=-I@OPENSSL_INCDIR@ + +ZLIB_SRCS = zlib_compress.c +ZLIB_HDRS = $(MAINHDRS) +ZLIB_OBJS = $(ZLIB_SRCS:.c=.o) +ZLIB_CPPFLAGS = @LIBZ_INC@ + +BZLIB_SRCS = bzip2_compress.c +BZLIB_HDRS = $(MAINHDRS) +BZLIB_OBJS = $(BZLIB_SRCS:.c=.o) +BZLIB_CPPFLAGS = @LIBBZ2_INC@ + RABINSRCS = rabin/rabin_dedup.c RABINHDRS = rabin/rabin_dedup.h utils/utils.h RABINOBJS = $(RABINSRCS:.c=.o) @@ -130,17 +145,17 @@ RM = rm -f RM_RF = rm -rf COMMON_CPPFLAGS = -I. -I./lzma -I./lzfx -I./lz4 -I./rabin -I./bsdiff -DNODEFAULT_PROPS \ -DFILE_OFFSET_BITS=64 -D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32 \ - -I./lzp @LIBBSCCPPFLAGS@ -I./crypto/skein -I./utils -I@OPENSSL_INCDIR@ \ - -I./crypto/sha2 -I./crypto/scrypt -I./crypto/aes -I./crypto @KEYLEN@ \ - @LIBBZ2_INC@ @LIBZ_INC@ -I./crypto/keccak -I./transpose + -I./lzp @LIBBSCCPPFLAGS@ -I./crypto/skein -I./utils -I./crypto/sha2 \ + -I./crypto/scrypt -I./crypto/aes -I./crypto @KEYLEN@ \ + -I./crypto/keccak -I./transpose $(EXTRA_CPPFLAGS) COMMON_VEC_FLAGS = -ftree-vectorize COMMON_LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block LDLIBS = -ldl -L@LIBBZ2_DIR@ -lbz2 -L@LIBZ_DIR@ -lz -lm @LIBBSCLFLAGS@ \ - -L@OPENSSL_LIBDIR@ -lcrypto -lrt + -L@OPENSSL_LIBDIR@ -lcrypto -lrt $(EXTRA_LDFLAGS) OBJS = $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(LZFXOBJS) $(LZ4OBJS) $(CRCOBJS) \ $(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) $(DELTA2OBJS) @LIBBSCWRAPOBJ@ $(SKEINOBJS) \ $(SKEIN_BLOCK_OBJ) @SHA256ASM_OBJS@ @SHA256_OBJS@ $(KECCAK_OBJS) $(KECCAK_OBJS_ASM) \ -$(TRANSP_OBJS) +$(TRANSP_OBJS) $(CRYPTO_OBJS) $(ZLIB_OBJS) $(BZLIB_OBJS) DEBUG_LINK = g++ -m64 -pthread -msse3 @LIBBSCGEN_OPT@ DEBUG_COMPILE = gcc -m64 -g -msse3 -c @@ -235,6 +250,15 @@ $(LIBBSCWRAPOBJ): $(LIBBSCWRAP) $(LIBBSCLIB) $(TRANSP_OBJS): $(TRANSP_SRCS) $(TRANSP_HDRS) $(COMPILE) $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@ +$(CRYPTO_OBJS): $(CRYPTO_SRCS) 
$(CRYPTO_HDRS) + $(COMPILE) $(GEN_OPT) $(CRYPTO_CPPFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@ + +$(ZLIB_OBJS): $(ZLIB_SRCS) $(ZLIB_HDRS) + $(COMPILE) $(GEN_OPT) $(ZLIB_CPPFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@ + +$(BZLIB_OBJS): $(BZLIB_SRCS) $(BZLIB_HDRS) + $(COMPILE) $(GEN_OPT) $(BZLIB_CPPFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@ + $(MAINOBJS): $(MAINSRCS) $(MAINHDRS) $(COMPILE) $(GEN_OPT) $(LOOP_OPTFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@ @@ -253,13 +277,13 @@ distclean: clean $(RM) Makefile install: $(PROG) - @mkdir -p $(PREFIX)/bin - @chmod 0755 $(PREFIX)/bin - @cp $(PROG) $(PREFIX)/bin - @chmod 0555 $(PREFIX)/bin/$(PROG) - @mkdir -p $(PREFIX)/share/doc/$(PROG) - @chmod 0755 $(PREFIX)/share $(PREFIX)/share/doc $(PREFIX)/share/doc/$(PROG) - @cp README.md $(PREFIX)/share/doc/$(PROG)/README - @chmod 0444 $(PREFIX)/share/doc/$(PROG)/README + @mkdir -p $(DESTDIR)$(PREFIX)/bin + @chmod 0755 $(DESTDIR)$(PREFIX)/bin + @cp $(PROG) $(DESTDIR)$(PREFIX)/bin + @chmod 0555 $(DESTDIR)$(PREFIX)/bin/$(PROG) + @mkdir -p $(DESTDIR)$(PREFIX)/share/doc/$(PROG) + @chmod 0755 $(DESTDIR)$(PREFIX)/share $(DESTDIR)$(PREFIX)/share/doc $(DESTDIR)$(PREFIX)/share/doc/$(PROG) + @cp README.md $(DESTDIR)$(PREFIX)/share/doc/$(PROG)/README + @chmod 0444 $(DESTDIR)$(PREFIX)/share/doc/$(PROG)/README diff --git a/README.md b/README.md index 74f1288..4fb26ba 100644 --- a/README.md +++ b/README.md @@ -221,22 +221,28 @@ algorithm can be selected for textual and binary portions. Pre-Processing Algorithms ========================= -As can be seen above a multitude of pre-processing algorithms are available that provide -further compression effectiveness beyond what the usual compression algorithms can -achieve by themselves. These are summarized below: +As can be seen above a multitude of pre-processing algorithms are available that +provide further compression effectiveness beyond what the usual compression +algorithms can achieve by themselves.
These are summarized below: -1) Deduplication : Per-Chunk (or per-segment) deduplication based on Rabin - fingerprinting. +1) Deduplication : Per-Chunk (or per-segment) deduplication based on Rabin + fingerprinting. -2) LZP : LZ Prediction is a variant of LZ77 that replaces repeating runs of - text with shorter codes. +2) Delta Compression : A similarity based (minhash) comparison of Rabin blocks. Two + blocks at least 60% similar to each other are diffed using + bsdiff. -3) Adaptive Delta : This is a simple form of Delta Encoding where arithmetic progressions - are detected in the data stream and collapsed via Run-Length encoding. +3) LZP : LZ Prediction is a variant of LZ77 that replaces repeating + runs of text with shorter codes. -4) Matrix Transpose: This is used automatically in Delta Encoding and Deduplication. This - attempts to transpose columnar repeating sequences of bytes into - row-wise sequences so that compression algorithms can work better. +4) Adaptive Delta : This is a simple form of Delta Encoding where arithmetic + progressions are detected in the data stream and collapsed + via Run-Length encoding. + +5) Matrix Transpose : This is used automatically in Delta Encoding and Deduplication. + This attempts to transpose columnar repeating sequences of + bytes into row-wise sequences so that compression algorithms + can work better. Memory Usage ============ diff --git a/config b/config index 87225f5..50a16c4 100755 --- a/config +++ b/config @@ -17,6 +17,10 @@ ${prog} [] --with-openssl= (Default: System) This defaults to the system's OpenSSL library. You can use this option if you want to use an alternate OpenSSL installation. +--with-zlib= (Default: System) + Enable building against an alternate Zlib installation. +--with-bzlib= (Default: System) + Enable building against an alternate Bzip2 library installation. --use-key256 Use 256-bit encryption keys. Default key length is 128-bit. --help Display this help message.
@@ -47,6 +51,8 @@ keccak_srcs= keccak_hdrs= keccak_srcs_asm= lto_flag= +zlib_prefix= +bzlib_prefix= # Try a simple compilation cat << _EOF > tst.c @@ -114,6 +120,12 @@ do --with-openssl=*) openssl_prefix=`echo ${arg1} | cut -f2 -d"="` ;; + --with-zlib=*) + zlib_prefix=`echo ${arg1} | cut -f2 -d"="` + ;; + --with-bzlib=*) + bzlib_prefix=`echo ${arg1} | cut -f2 -d"="` + ;; --use-key256) keylen='-DKEYLEN=32' ;; @@ -263,11 +275,26 @@ fi # Detect other library packages -for libname in "libbz2" "libz" +for libspec in "libbz2:${bzlib_prefix}" "libz:${zlib_prefix}" do - for lib in "/lib64" "/usr/lib64" "/lib" "/usr/lib" "/lib/x86_64-linux-gnu" \ - "/usr/lib/x86_64-linux-gnu" \ - "${prefix}/lib64" "${prefix}/lib" "${prefix}/lib/x86_64-linux-gnu" + _OIFS="$IFS" + IFS=":" + set -- ${libspec} + libname=$1 + pref=$2 + IFS="$_OIFS" + + use_prefix="${pref}" + if [ "x${pref}" = "x" ] + then + use_prefix="$prefix" + fi + for lib in "${pref}/lib64" "${pref}/usr/lib64" "${pref}/lib" "${pref}/usr/lib" \ + "${pref}/lib/x86_64-linux-gnu" "${pref}/usr/lib/x86_64-linux-gnu" \ + "${pref}/local/lib64" "${pref}/usr/local/lib64" "${pref}/local/lib" "${pref}/usr/local/lib" \ + "${pref}/local/lib/x86_64-linux-gnu" "${pref}/usr/local/lib/x86_64-linux-gnu" \ + "${use_prefix}/lib64" "${use_prefix}/lib" "${use_prefix}/lib/x86_64-linux-gnu" \ + "${use_prefix}/usr/lib/x86_64-linux-gnu" do if [ -d ${lib} ] then @@ -288,31 +315,49 @@ done if [ "x${libbz2_libdir}" = "x" ] then - echo "ERROR: Libbz2 not detected." - echo " You may have to install libbz2-devel or libbz2-dev" + if [ "x$bzlib_prefix" = "x" ] + then + echo "ERROR: Libbz2 not detected." + echo " You may have to install libbz2-devel or libbz2-dev" + else + echo "ERROR: Bzip2 library not detected in given prefix." + fi exit 1 fi if [ "x${libz_libdir}" = "x" ] then - echo "ERROR: Zlib not detected." - echo " You may have to install libz-devel or libz-dev" + if [ "x$zlib_prefix" = "x" ] + then + echo "ERROR: Zlib not detected." 
+ echo " You may have to install libz-devel or libz-dev" + else + echo "ERROR: Zlib not detected in given prefix." + fi exit 1 fi libbz2_inc= libz_inc= # Detect other library headers -for hdr in "libbz2_inc:bzlib.h" "libz_inc:zlib.h" +for hdr in "libbz2_inc:bzlib.h:${bzlib_prefix}" "libz_inc:zlib.h:${zlib_prefix}" do _OIFS="$IFS" IFS=":" set -- ${hdr} var=$1 hdrf=$2 + pref=$3 IFS="$_OIFS" - for inc in "${prefix}/include" "/usr/include" + use_prefix="${pref}" + if [ "x${pref}" = "x" ] + then + use_prefix="$prefix" + fi + for inc in "${pref}/include" "${pref}/usr/include" \ + "${pref}/local/include" "${pref}/usr/local/include" \ + "${use_prefix}/include" "${use_prefix}/usr/include" do if [ -d ${inc} ] then diff --git a/delta2/delta2.c b/delta2/delta2.c index f185821..292f5db 100644 --- a/delta2/delta2.c +++ b/delta2/delta2.c @@ -34,8 +34,9 @@ * objects are output by the encoder: * 1) A literal run of unmodified bytes. Header: 1 zero byte followed * by a 64bit length in bytes. - * 2) A literal run of transposed bytes containing at least 87% below - * threshold sequences. + * 2) A literal run of transposed bytes containing sequences that are + * below threshold and the total span of those sequences is at least + * 87% of the entire run. * Header: 1 byte stride length with high bit set. * 64bit length of span in bytes. * 3) An encoded run length of a series in arithmetic progression. @@ -175,7 +176,7 @@ delta2_encode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen, int if (gtot1 > 0) { /* * Encode previous literal run, if any. If the literal run - * has enough (90%+) large sequences just below threshold, + * has enough (87%+) large sequences just below threshold, * do a matrix transpose on the range in the hope of achieving * a better compression ratio. 
*/ diff --git a/main.c b/main.c index 10488db..c9e9e89 100644 --- a/main.c +++ b/main.c @@ -272,6 +272,7 @@ preproc_decompress(compress_func_ptr dec_func, void *src, uint64_t srclen, void if (result != -1) { memcpy(src, dst, _dstlen); srclen = _dstlen; + *dstlen = _dstlen; } else { return (result); }