Add xxHash for Rabin block checksums, slightly faster than CRC64.
Fix missing initialization of character counts table. Some file reorganization.
This commit is contained in:
parent
4ba840b255
commit
262566b59a
9 changed files with 320 additions and 18 deletions
18
Makefile.in
18
Makefile.in
|
@ -22,23 +22,23 @@
|
|||
#
|
||||
|
||||
PROG= pcompress
|
||||
MAINSRCS = main.c utils.c allocator.c zlib_compress.c bzip2_compress.c \
|
||||
MAINSRCS = main.c utils/utils.c allocator.c zlib_compress.c bzip2_compress.c \
|
||||
lzma_compress.c ppmd_compress.c adaptive_compress.c lzfx_compress.c \
|
||||
lz4_compress.c none_compress.c
|
||||
MAINHDRS = allocator.h pcompress.h utils.h
|
||||
lz4_compress.c none_compress.c utils/xxhash.c
|
||||
MAINHDRS = allocator.h pcompress.h utils/utils.h utils/xxhash.h
|
||||
MAINOBJS = $(MAINSRCS:.c=.o)
|
||||
|
||||
RABINSRCS = rabin/rabin_polynomial.c
|
||||
RABINHDRS = rabin/rabin_polynomial.h utils.h
|
||||
RABINSRCS = rabin/rabin_dedup.c
|
||||
RABINHDRS = rabin/rabin_dedup.h utils/utils.h
|
||||
RABINOBJS = $(RABINSRCS:.c=.o)
|
||||
|
||||
BSDIFFSRCS = bsdiff/bsdiff.c bsdiff/bspatch.c bsdiff/rle_encoder.c
|
||||
BSDIFFHDRS = bsdiff/bscommon.h utils.h allocator.h
|
||||
BSDIFFHDRS = bsdiff/bscommon.h utils/utils.h allocator.h
|
||||
BSDIFFOBJS = $(BSDIFFSRCS:.c=.o)
|
||||
|
||||
LZMASRCS = lzma/LzmaEnc.c lzma/LzFind.c lzma/LzmaDec.c lzma/Threads.c lzma/LzFindMt.c
|
||||
LZMAHDRS = lzma/CpuArch.h lzma/LzFind.h lzma/LzmaEnc.h lzma/Types.h \
|
||||
lzma/LzHash.h lzma/LzmaDec.h utils.h lzma/LzFindMt.h lzma/Threads.h lzma/windows.h \
|
||||
lzma/LzHash.h lzma/LzmaDec.h utils/utils.h lzma/LzFindMt.h lzma/Threads.h lzma/windows.h \
|
||||
lzma/Common/MyWindows.h lzma/Common/MyGuidDef.h lzma/basetyps.h
|
||||
LZMAOBJS = $(LZMASRCS:.c=.o)
|
||||
|
||||
|
@ -80,12 +80,12 @@ LIBBSCLIB = @LIBBSCLIB@
|
|||
LIBBSCGEN_OPT = -fopenmp
|
||||
LIBBSCCPPFLAGS = -I$(LIBBSCDIR)/libbsc -DENABLE_PC_LIBBSC
|
||||
|
||||
BAKFILES = *~ lzma/*~ lzfx/*~ lz4/*~ rabin/*~ bsdiff/*~ lzp/*~
|
||||
BAKFILES = *~ lzma/*~ lzfx/*~ lz4/*~ rabin/*~ bsdiff/*~ lzp/*~ utils/*~
|
||||
|
||||
RM = rm -f
|
||||
COMMON_CPPFLAGS = -I. -I./lzma -I./lzfx -I./lz4 -I./rabin -I./bsdiff -DNODEFAULT_PROPS \
|
||||
-DFILE_OFFSET_BITS=64 -D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32 \
|
||||
-I./lzp @LIBBSCCPPFLAGS@ -I./crypto/skein
|
||||
-I./lzp @LIBBSCCPPFLAGS@ -I./crypto/skein -I./utils
|
||||
COMMON_VEC_FLAGS = -ftree-vectorize
|
||||
COMMON_LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block
|
||||
LDLIBS = -ldl -lbz2 $(ZLIB_DIR) -lz -lm @LIBBSCLFLAGS@
|
||||
|
|
2
main.c
2
main.c
|
@ -44,7 +44,7 @@
|
|||
#include <utils.h>
|
||||
#include <pcompress.h>
|
||||
#include <allocator.h>
|
||||
#include <rabin_polynomial.h>
|
||||
#include <rabin_dedup.h>
|
||||
#include <lzp.h>
|
||||
|
||||
/*
|
||||
|
|
|
@ -32,7 +32,7 @@
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <rabin_polynomial.h>
|
||||
#include <rabin_dedup.h>
|
||||
|
||||
#define CHUNK_FLAG_SZ 1
|
||||
#define ALGO_SZ 8
|
||||
|
|
|
@ -1,6 +1,4 @@
|
|||
/*
|
||||
* rabin_polynomial.c
|
||||
*
|
||||
* The rabin polynomial computation is derived from:
|
||||
* http://code.google.com/p/rabin-fingerprint-c/
|
||||
*
|
||||
|
@ -66,7 +64,7 @@
|
|||
#include <utils.h>
|
||||
#include <pthread.h>
|
||||
|
||||
#include "rabin_polynomial.h"
|
||||
#include "rabin_dedup.h"
|
||||
|
||||
extern int lzma_init(void **data, int *level, ssize_t chunksize);
|
||||
extern int lzma_compress(void *src, size_t srclen, void *dst,
|
||||
|
@ -309,6 +307,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
|||
fplist = (uint64_t *)(ctx->cbuf + ctx->real_chunksize - fplist_sz - 256 * 4);
|
||||
charcounts = (uint32_t *)(ctx->cbuf + ctx->real_chunksize - 256 * 4);
|
||||
memset(fplist, 0, fplist_sz);
|
||||
memset(charcounts, 0, 256 * 4);
|
||||
fpos[0] = 0;
|
||||
fpos[1] = 0;
|
||||
len1 = 0;
|
||||
|
@ -460,7 +459,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
|||
ctx->blocks[blknum]->length = length;
|
||||
ctx->blocks[blknum]->ref = 0;
|
||||
ctx->blocks[blknum]->similar = 0;
|
||||
ctx->blocks[blknum]->crc = lzma_crc64(buf1+last_offset, length, 0);
|
||||
ctx->blocks[blknum]->crc = XXH_strong32(buf1+last_offset, length, 0);
|
||||
|
||||
// Accumulate the 2 sketch values into a combined similarity checksum
|
||||
ctx->blocks[blknum]->cksum_n_offset = (cur_sketch + cur_sketch2) / 2;
|
||||
|
@ -504,7 +503,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
|
|||
j = (j > 0 ? j:1);
|
||||
ctx->blocks[blknum]->cksum_n_offset = (cur_sketch + cur_sketch2) / 2;
|
||||
ctx->blocks[blknum]->mean_n_length = cur_sketch / j;
|
||||
ctx->blocks[blknum]->crc = lzma_crc64(buf1+last_offset, ctx->blocks[blknum]->length, 0);
|
||||
ctx->blocks[blknum]->crc = XXH_strong32(buf1+last_offset, ctx->blocks[blknum]->length, 0);
|
||||
blknum++;
|
||||
last_offset = *size;
|
||||
}
|
|
@ -168,6 +168,5 @@ extern void rabin_update_hdr(uchar_t *buf, ssize_t rabin_index_sz_cmp,
|
|||
extern void reset_rabin_context(rabin_context_t *ctx);
|
||||
extern uint32_t rabin_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo,
|
||||
int delta_flag);
|
||||
extern uint64_t lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc);
|
||||
|
||||
#endif /* _RABIN_POLY_H_ */
|
|
@ -31,7 +31,7 @@
|
|||
#include <stdio.h>
|
||||
#include <errno.h>
|
||||
#include <link.h>
|
||||
#include <rabin_polynomial.h>
|
||||
#include <rabin_dedup.h>
|
||||
#include <skein.h>
|
||||
|
||||
#include "utils.h"
|
224
utils/xxhash.c
Normal file
224
utils/xxhash.c
Normal file
|
@ -0,0 +1,224 @@
|
|||
/*
|
||||
* This file is a part of Pcompress, a chunked parallel multi-
|
||||
* algorithm lossless compression and decompression program.
|
||||
*
|
||||
* Copyright (C) 2012 Moinak Ghosh. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* moinakg@belenix.org, http://moinakg.wordpress.com/
|
||||
*
|
||||
* This program includes partly-modified public domain source
|
||||
* code from the LZMA SDK: http://www.7-zip.org/sdk.html
|
||||
*/
|
||||
|
||||
/*
|
||||
xxHash - Fast Hash algorithm
|
||||
Copyright (C) 2012, Yann Collet.
|
||||
BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact the author at :
|
||||
- xxHash source repository : http://code.google.com/p/xxhash/
|
||||
*/
|
||||
|
||||
|
||||
//**************************************
|
||||
// Includes
|
||||
//**************************************
|
||||
#include "xxhash.h"

#include <string.h>
|
||||
|
||||
|
||||
|
||||
//**************************************
|
||||
// Compiler Options
|
||||
//**************************************
|
||||
#ifdef _MSC_VER // Visual Studio
#define inline __forceinline // Visual is not C99, but supports some kind of inline
#endif

// GCC does not support _rotl outside of Windows, so provide a 32-bit
// rotate-left here. Every argument and the whole expansion are parenthesized
// so that a compound expression such as _rotl(a | b, n) is rotated as a unit.
// Note: x is evaluated twice, and r must be in 1..31 (a shift by 32 would be
// undefined for a 32-bit operand).
#if !defined(_WIN32)
#define _rotl(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
#endif
|
||||
|
||||
|
||||
|
||||
//**************************************
|
||||
// Constants
|
||||
//**************************************
|
||||
// Multiplicative mixing constants used by every hash routine in this file.
// PRIME5 is the initial accumulator value of the small-input path.
#define PRIME1   0x9E3779B1U
#define PRIME2   0x85EBCA77U
#define PRIME3   0xC2B2AE3DU
#define PRIME4   0x27D4EB2FU
#define PRIME5   0x165667b1
|
||||
|
||||
|
||||
|
||||
//****************************
|
||||
// Private functions
|
||||
//****************************
|
||||
|
||||
// This version is for very small inputs (< 16 bytes)
|
||||
inline unsigned int XXH_small(const void* key, int len, unsigned int seed)
|
||||
{
|
||||
const unsigned char* p = (unsigned char*)key;
|
||||
const unsigned char* const bEnd = p + len;
|
||||
unsigned int idx = seed + PRIME1;
|
||||
unsigned int crc = PRIME5;
|
||||
const unsigned char* const limit = bEnd - 4;
|
||||
|
||||
while (p<limit)
|
||||
{
|
||||
crc += ((*(unsigned int*)p) + idx++);
|
||||
crc += _rotl(crc, 17) * PRIME4;
|
||||
crc *= PRIME1;
|
||||
p+=4;
|
||||
}
|
||||
|
||||
while (p<bEnd)
|
||||
{
|
||||
crc += ((*p) + idx++);
|
||||
crc *= PRIME1;
|
||||
p++;
|
||||
}
|
||||
|
||||
crc += len;
|
||||
|
||||
crc ^= crc >> 15;
|
||||
crc *= PRIME2;
|
||||
crc ^= crc >> 13;
|
||||
crc *= PRIME3;
|
||||
crc ^= crc >> 16;
|
||||
|
||||
return crc;
|
||||
}
|
||||
|
||||
|
||||
|
||||
//******************************
|
||||
// Hash functions
|
||||
//******************************
|
||||
unsigned int XXH_fast32(const void* input, int len, unsigned int seed)
|
||||
{
|
||||
// Special case, for small inputs
|
||||
if (len < 16) return XXH_small(input, len, seed);
|
||||
|
||||
{
|
||||
const unsigned char* p = (const unsigned char*)input;
|
||||
const unsigned char* const bEnd = p + len;
|
||||
unsigned int v1 = seed + PRIME1;
|
||||
unsigned int v2 = v1 * PRIME2 + len;
|
||||
unsigned int v3 = v2 * PRIME3;
|
||||
unsigned int v4 = v3 * PRIME4;
|
||||
const unsigned char* const limit = bEnd - 16;
|
||||
unsigned int crc;
|
||||
|
||||
while (p<limit)
|
||||
{
|
||||
v1 = _rotl(v1, 13) + (*(unsigned int*)p); p+=4;
|
||||
v2 = _rotl(v2, 11) + (*(unsigned int*)p); p+=4;
|
||||
v3 = _rotl(v3, 17) + (*(unsigned int*)p); p+=4;
|
||||
v4 = _rotl(v4, 19) + (*(unsigned int*)p); p+=4;
|
||||
}
|
||||
|
||||
p = bEnd - 16;
|
||||
v1 += _rotl(v1, 17); v2 += _rotl(v2, 19); v3 += _rotl(v3, 13); v4 += _rotl(v4, 11);
|
||||
v1 *= PRIME1; v2 *= PRIME1; v3 *= PRIME1; v4 *= PRIME1;
|
||||
v1 += *(unsigned int*)p; p+=4; v2 += *(unsigned int*)p; p+=4; v3 += *(unsigned int*)p; p+=4; v4 += *(unsigned int*)p; // p+=4;
|
||||
v1 *= PRIME2; v2 *= PRIME2; v3 *= PRIME2; v4 *= PRIME2;
|
||||
v1 += _rotl(v1, 11); v2 += _rotl(v2, 17); v3 += _rotl(v3, 19); v4 += _rotl(v4, 13);
|
||||
v1 *= PRIME3; v2 *= PRIME3; v3 *= PRIME3; v4 *= PRIME3;
|
||||
|
||||
crc = v1 + _rotl(v2, 3) + _rotl(v3, 6) + _rotl(v4, 9);
|
||||
crc ^= crc >> 11;
|
||||
crc += (PRIME4+len) * PRIME1;
|
||||
crc ^= crc >> 15;
|
||||
crc *= PRIME2;
|
||||
crc ^= crc >> 13;
|
||||
|
||||
return crc;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
unsigned int XXH_strong32(const void* input, int len, unsigned int seed)
|
||||
{
|
||||
// Special case, for small inputs
|
||||
if (len < 16) return XXH_small(input, len, seed);
|
||||
|
||||
{
|
||||
const unsigned char* p = (const unsigned char*)input;
|
||||
const unsigned char* const bEnd = p + len;
|
||||
unsigned int v1 = seed + PRIME1;
|
||||
unsigned int v2 = v1 * PRIME2 + len;
|
||||
unsigned int v3 = v2 * PRIME3;
|
||||
unsigned int v4 = v3 * PRIME4;
|
||||
const unsigned char* const limit = bEnd - 16;
|
||||
unsigned int crc;
|
||||
|
||||
while (p<limit)
|
||||
{
|
||||
v1 += _rotl(v1, 13); v1 *= PRIME1; v1 += (*(unsigned int*)p); p+=4;
|
||||
v2 += _rotl(v2, 11); v2 *= PRIME1; v2 += (*(unsigned int*)p); p+=4;
|
||||
v3 += _rotl(v3, 17); v3 *= PRIME1; v3 += (*(unsigned int*)p); p+=4;
|
||||
v4 += _rotl(v4, 19); v4 *= PRIME1; v4 += (*(unsigned int*)p); p+=4;
|
||||
}
|
||||
|
||||
p = bEnd - 16;
|
||||
v1 += _rotl(v1, 17); v2 += _rotl(v2, 19); v3 += _rotl(v3, 13); v4 += _rotl(v4, 11);
|
||||
v1 *= PRIME1; v2 *= PRIME1; v3 *= PRIME1; v4 *= PRIME1;
|
||||
v1 += *(unsigned int*)p; p+=4; v2 += *(unsigned int*)p; p+=4; v3 += *(unsigned int*)p; p+=4; v4 += *(unsigned int*)p; // p+=4;
|
||||
v1 *= PRIME2; v2 *= PRIME2; v3 *= PRIME2; v4 *= PRIME2;
|
||||
v1 += _rotl(v1, 11); v2 += _rotl(v2, 17); v3 += _rotl(v3, 19); v4 += _rotl(v4, 13);
|
||||
v1 *= PRIME3; v2 *= PRIME3; v3 *= PRIME3; v4 *= PRIME3;
|
||||
|
||||
crc = v1 + _rotl(v2, 3) + _rotl(v3, 6) + _rotl(v4, 9);
|
||||
crc ^= crc >> 11;
|
||||
crc += (PRIME4+len) * PRIME1;
|
||||
crc ^= crc >> 15;
|
||||
crc *= PRIME2;
|
||||
crc ^= crc >> 13;
|
||||
|
||||
return crc;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
80
utils/xxhash.h
Normal file
80
utils/xxhash.h
Normal file
|
@ -0,0 +1,80 @@
|
|||
/*
|
||||
* This file is a part of Pcompress, a chunked parallel multi-
|
||||
* algorithm lossless compression and decompression program.
|
||||
*
|
||||
* Copyright (C) 2012 Moinak Ghosh. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* moinakg@belenix.org, http://moinakg.wordpress.com/
|
||||
*/
|
||||
|
||||
/*
|
||||
xxHash - Fast Hash algorithm
|
||||
Header File
|
||||
Copyright (C) 2012, Yann Collet.
|
||||
BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact the author at :
|
||||
- xxHash source repository : http://code.google.com/p/xxhash/
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#if defined (__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
//****************************
|
||||
// Hash Functions
|
||||
//****************************
|
||||
|
||||
unsigned int XXH_fast32 (const void* input, int len, unsigned int seed);
|
||||
unsigned int XXH_strong32(const void* input, int len, unsigned int seed);
|
||||
|
||||
/*
|
||||
XXH_fast32() :
|
||||
Calculate the 32-bit hash of "input", of length "len"
|
||||
"seed" can be used to alter the result
|
||||
|
||||
XXH_strong32() :
|
||||
Same as XXH_fast32(), but the resulting hash has stronger properties
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#if defined (__cplusplus)
|
||||
}
|
||||
#endif
|
Loading…
Reference in a new issue