diff --git a/Makefile.in b/Makefile.in index 4606505..b4fd42b 100644 --- a/Makefile.in +++ b/Makefile.in @@ -22,23 +22,23 @@ # PROG= pcompress -MAINSRCS = main.c utils.c allocator.c zlib_compress.c bzip2_compress.c \ +MAINSRCS = main.c utils/utils.c allocator.c zlib_compress.c bzip2_compress.c \ lzma_compress.c ppmd_compress.c adaptive_compress.c lzfx_compress.c \ - lz4_compress.c none_compress.c -MAINHDRS = allocator.h pcompress.h utils.h + lz4_compress.c none_compress.c utils/xxhash.c +MAINHDRS = allocator.h pcompress.h utils/utils.h utils/xxhash.h MAINOBJS = $(MAINSRCS:.c=.o) -RABINSRCS = rabin/rabin_polynomial.c -RABINHDRS = rabin/rabin_polynomial.h utils.h +RABINSRCS = rabin/rabin_dedup.c +RABINHDRS = rabin/rabin_dedup.h utils/utils.h RABINOBJS = $(RABINSRCS:.c=.o) BSDIFFSRCS = bsdiff/bsdiff.c bsdiff/bspatch.c bsdiff/rle_encoder.c -BSDIFFHDRS = bsdiff/bscommon.h utils.h allocator.h +BSDIFFHDRS = bsdiff/bscommon.h utils/utils.h allocator.h BSDIFFOBJS = $(BSDIFFSRCS:.c=.o) LZMASRCS = lzma/LzmaEnc.c lzma/LzFind.c lzma/LzmaDec.c lzma/Threads.c lzma/LzFindMt.c LZMAHDRS = lzma/CpuArch.h lzma/LzFind.h lzma/LzmaEnc.h lzma/Types.h \ - lzma/LzHash.h lzma/LzmaDec.h utils.h lzma/LzFindMt.h lzma/Threads.h lzma/windows.h \ + lzma/LzHash.h lzma/LzmaDec.h utils/utils.h lzma/LzFindMt.h lzma/Threads.h lzma/windows.h \ lzma/Common/MyWindows.h lzma/Common/MyGuidDef.h lzma/basetyps.h LZMAOBJS = $(LZMASRCS:.c=.o) @@ -80,12 +80,12 @@ LIBBSCLIB = @LIBBSCLIB@ LIBBSCGEN_OPT = -fopenmp LIBBSCCPPFLAGS = -I$(LIBBSCDIR)/libbsc -DENABLE_PC_LIBBSC -BAKFILES = *~ lzma/*~ lzfx/*~ lz4/*~ rabin/*~ bsdiff/*~ lzp/*~ +BAKFILES = *~ lzma/*~ lzfx/*~ lz4/*~ rabin/*~ bsdiff/*~ lzp/*~ utils/*~ RM = rm -f COMMON_CPPFLAGS = -I. 
-I./lzma -I./lzfx -I./lz4 -I./rabin -I./bsdiff -DNODEFAULT_PROPS \ -DFILE_OFFSET_BITS=64 -D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32 \ - -I./lzp @LIBBSCCPPFLAGS@ -I./crypto/skein + -I./lzp @LIBBSCCPPFLAGS@ -I./crypto/skein -I./utils COMMON_VEC_FLAGS = -ftree-vectorize COMMON_LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block LDLIBS = -ldl -lbz2 $(ZLIB_DIR) -lz -lm @LIBBSCLFLAGS@ diff --git a/main.c b/main.c index 8235a34..e640877 100644 --- a/main.c +++ b/main.c @@ -44,7 +44,7 @@ #include #include #include -#include +#include #include /* diff --git a/pcompress.h b/pcompress.h index 65429b2..7da016a 100644 --- a/pcompress.h +++ b/pcompress.h @@ -32,7 +32,7 @@ extern "C" { #endif -#include +#include #define CHUNK_FLAG_SZ 1 #define ALGO_SZ 8 diff --git a/rabin/rabin_polynomial.c b/rabin/rabin_dedup.c similarity index 99% rename from rabin/rabin_polynomial.c rename to rabin/rabin_dedup.c index 8b04455..adb67a6 100755 --- a/rabin/rabin_polynomial.c +++ b/rabin/rabin_dedup.c @@ -1,6 +1,4 @@ /* - * rabin_polynomial.c - * * The rabin polynomial computation is derived from: * http://code.google.com/p/rabin-fingerprint-c/ * @@ -66,7 +64,7 @@ #include #include -#include "rabin_polynomial.h" +#include "rabin_dedup.h" extern int lzma_init(void **data, int *level, ssize_t chunksize); extern int lzma_compress(void *src, size_t srclen, void *dst, @@ -309,6 +307,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s fplist = (uint64_t *)(ctx->cbuf + ctx->real_chunksize - fplist_sz - 256 * 4); charcounts = (uint32_t *)(ctx->cbuf + ctx->real_chunksize - 256 * 4); memset(fplist, 0, fplist_sz); + memset(charcounts, 0, 256 * 4); fpos[0] = 0; fpos[1] = 0; len1 = 0; @@ -460,7 +459,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s ctx->blocks[blknum]->length = length; ctx->blocks[blknum]->ref = 0; ctx->blocks[blknum]->similar = 0; - ctx->blocks[blknum]->crc = lzma_crc64(buf1+last_offset, length, 0); + 
ctx->blocks[blknum]->crc = XXH_strong32(buf1+last_offset, length, 0); // Accumulate the 2 sketch values into a combined similarity checksum ctx->blocks[blknum]->cksum_n_offset = (cur_sketch + cur_sketch2) / 2; @@ -504,7 +503,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s j = (j > 0 ? j:1); ctx->blocks[blknum]->cksum_n_offset = (cur_sketch + cur_sketch2) / 2; ctx->blocks[blknum]->mean_n_length = cur_sketch / j; - ctx->blocks[blknum]->crc = lzma_crc64(buf1+last_offset, ctx->blocks[blknum]->length, 0); + ctx->blocks[blknum]->crc = XXH_strong32(buf1+last_offset, ctx->blocks[blknum]->length, 0); blknum++; last_offset = *size; } diff --git a/rabin/rabin_polynomial.h b/rabin/rabin_dedup.h similarity index 98% rename from rabin/rabin_polynomial.h rename to rabin/rabin_dedup.h index dee17cb..6e7f019 100644 --- a/rabin/rabin_polynomial.h +++ b/rabin/rabin_dedup.h @@ -168,6 +168,5 @@ extern void rabin_update_hdr(uchar_t *buf, ssize_t rabin_index_sz_cmp, extern void reset_rabin_context(rabin_context_t *ctx); extern uint32_t rabin_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta_flag); -extern uint64_t lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc); #endif /* _RABIN_POLY_H_ */ diff --git a/utils.c b/utils/utils.c similarity index 99% rename from utils.c rename to utils/utils.c index ba42553..75052cd 100644 --- a/utils.c +++ b/utils/utils.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include "utils.h" diff --git a/utils.h b/utils/utils.h similarity index 100% rename from utils.h rename to utils/utils.h diff --git a/utils/xxhash.c b/utils/xxhash.c new file mode 100644 index 0000000..5bd8ebc --- /dev/null +++ b/utils/xxhash.c @@ -0,0 +1,224 @@ +/* + * This file is a part of Pcompress, a chunked parallel multi- + * algorithm lossless compression and decompression program. + * + * Copyright (C) 2012 Moinak Ghosh. All rights reserved. + * Use is subject to license terms. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * moinakg@belenix.org, http://moinakg.wordpress.com/ + * + * This program includes partly-modified public domain source + * code from the LZMA SDK: http://www.7-zip.org/sdk.html + */ + +/* + xxHash - Fast Hash algorithm + Copyright (C) 2012, Yann Collet. + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - xxHash source repository : http://code.google.com/p/xxhash/ +*/ + + +//************************************** +// Includes +//************************************** +#include "xxhash.h" + + + +//************************************** +// Compiler Options +//************************************** +#ifdef _MSC_VER // Visual Studio +#define inline __forceinline // Visual is not C99, but supports some kind of inline +#endif + +// GCC does not support _rotl outside of Windows +#if !defined(_WIN32) +#define _rotl(x,r) ((x << r) | (x >> (32 - r))) +#endif + + + +//************************************** +// Constants +//************************************** +#define PRIME1 2654435761U +#define PRIME2 2246822519U +#define PRIME3 3266489917U +#define PRIME4 668265263U +#define PRIME5 0x165667b1 + + + +//**************************** +// Private functions +//**************************** + +// This version is for very small inputs (< 16 bytes) +inline unsigned int XXH_small(const void* key, int len, unsigned int seed) +{ + const unsigned char* p = (unsigned char*)key; + const unsigned char* const bEnd = p + len; + unsigned int idx = seed + PRIME1; + unsigned int crc = PRIME5; + const unsigned char* const limit = bEnd - 4; + + while (p> 15; + crc *= PRIME2; + crc ^= crc >> 13; + crc *= PRIME3; + crc ^= crc >> 16; + + return crc; +} + + + +//****************************** +// Hash functions 
+//****************************** +unsigned int XXH_fast32(const void* input, int len, unsigned int seed) +{ + // Special case, for small inputs + if (len < 16) return XXH_small(input, len, seed); + + { + const unsigned char* p = (const unsigned char*)input; + const unsigned char* const bEnd = p + len; + unsigned int v1 = seed + PRIME1; + unsigned int v2 = v1 * PRIME2 + len; + unsigned int v3 = v2 * PRIME3; + unsigned int v4 = v3 * PRIME4; + const unsigned char* const limit = bEnd - 16; + unsigned int crc; + + while (p> 11; + crc += (PRIME4+len) * PRIME1; + crc ^= crc >> 15; + crc *= PRIME2; + crc ^= crc >> 13; + + return crc; + } + +} + + + +unsigned int XXH_strong32(const void* input, int len, unsigned int seed) +{ + // Special case, for small inputs + if (len < 16) return XXH_small(input, len, seed); + + { + const unsigned char* p = (const unsigned char*)input; + const unsigned char* const bEnd = p + len; + unsigned int v1 = seed + PRIME1; + unsigned int v2 = v1 * PRIME2 + len; + unsigned int v3 = v2 * PRIME3; + unsigned int v4 = v3 * PRIME4; + const unsigned char* const limit = bEnd - 16; + unsigned int crc; + + while (p> 11; + crc += (PRIME4+len) * PRIME1; + crc ^= crc >> 15; + crc *= PRIME2; + crc ^= crc >> 13; + + return crc; + } + +} + + + + + diff --git a/utils/xxhash.h b/utils/xxhash.h new file mode 100644 index 0000000..3cf8803 --- /dev/null +++ b/utils/xxhash.h @@ -0,0 +1,80 @@ +/* + * This file is a part of Pcompress, a chunked parallel multi- + * algorithm lossless compression and decompression program. + * + * Copyright (C) 2012 Moinak Ghosh. All rights reserved. + * Use is subject to license terms. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * moinakg@belenix.org, http://moinakg.wordpress.com/ + */ + +/* + xxHash - Fast Hash algorithm + Header File + Copyright (C) 2012, Yann Collet. + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + You can contact the author at : + - xxHash source repository : http://code.google.com/p/xxhash/ +*/ +#pragma once + +#if defined (__cplusplus) +extern "C" { +#endif + + +//**************************** +// Hash Functions +//**************************** + +unsigned int XXH_fast32 (const void* input, int len, unsigned int seed); +unsigned int XXH_strong32(const void* input, int len, unsigned int seed); + +/* +XXH_fast32() : + Calculate the 32-bit hash of "input", of length "len" + "seed" can be used to alter the result + +XXH_strong32() : + Same as XXH_fast32(), but the resulting hash has stronger properties +*/ + + + +#if defined (__cplusplus) +} +#endif