From 26bb1372575df03e66327e0e86123356a97baa4a Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Fri, 25 Jan 2013 00:10:12 +0530 Subject: [PATCH] Changes for generalized runtime SSE/AVX/XOP detection. Multi instruction set XXhash build with runtime selection. Extend CPUID code to detect more instruction sets. Add options for BLAKE2 hash. Move GCC builtins into utils header. Bump file format version number due to extended digest flags. Add descriptions to digest list. --- INSTALL | 19 ++++++++++ Makefile.in | 22 +++++++++-- config | 17 +++++---- crypto/crypto_utils.c | 46 ++++++++++++++++------- crypto/crypto_utils.h | 7 +++- crypto/scrypt/crypto_aesctr.c | 4 +- crypto/sha2/sha256.c | 2 +- main.c | 12 +++--- pcompress.h | 2 +- utils/cpuid.c | 20 +++++++++- utils/sse_level.c | 7 +++- utils/utils.c | 11 ++++++ utils/utils.h | 18 +++++++-- utils/xxhash.c | 14 +++---- utils/xxhash.h | 16 +++++--- utils/xxhash_base.c | 71 +++++++++++++++++++++++++++++++++++ utils/xxhash_sse2.c | 3 ++ utils/xxhash_sse4.c | 3 ++ 18 files changed, 236 insertions(+), 58 deletions(-) create mode 100644 utils/xxhash_base.c create mode 100644 utils/xxhash_sse2.c create mode 100644 utils/xxhash_sse4.c diff --git a/INSTALL b/INSTALL index b2dbc4c..e4ed9bd 100644 --- a/INSTALL +++ b/INSTALL @@ -1,5 +1,17 @@ Copyright (c) 2012 Moinak Ghosh +Prerequisites +============= +64-bit System. +GCC 4.4 (with mpfr, ppl and cloog support packages for loop + vectorization). +libz (zlib) and development packages. +Libbz2 and development packages. +Libbsc source tree if BSC support is desired. + See below: + "Steps for building with libbsc support". +OpenSSL version 0.9.8 or greater. + Basic Installation ================== The simplest process to build and install this utility is: @@ -96,6 +108,13 @@ is not the usual GNU Autoconf script. Enable building against an alternate Bzip2 and library installation. +--no-sse-detect Do not try to detect the CPU's SSE capability. 
This + mode will simply use SSE2 as the fallback default. + Using SSE4 and later improves performance significantly. + +--use-key256 Use 256-bit encryption keys. Default key length is + 128-bit. + --help Display the help message. Steps for building with libbsc support diff --git a/Makefile.in b/Makefile.in index a9a26b9..de63368 100644 --- a/Makefile.in +++ b/Makefile.in @@ -24,9 +24,9 @@ PROG= pcompress MAINSRCS = main.c utils/utils.c allocator.c lzma_compress.c ppmd_compress.c \ adaptive_compress.c lzfx_compress.c lz4_compress.c none_compress.c \ - utils/xxhash.c utils/heapq.c utils/cpuid.c + utils/xxhash_base.c utils/heapq.c utils/cpuid.c MAINHDRS = allocator.h pcompress.h utils/utils.h utils/xxhash.h utils/heapq.h \ - utils/cpuid.h + utils/cpuid.h utils/xxhash.h MAINOBJS = $(MAINSRCS:.c=.o) CRYPTO_SRCS = crypto/aes/crypto_aes.c crypto/scrypt/crypto_scrypt-nosse.c \ @@ -37,6 +37,12 @@ CRYPTO_HDRS = crypto/crypto_utils.h crypto/scrypt/crypto_scrypt.h \ CRYPTO_OBJS = $(CRYPTO_SRCS:.c=.o) CRYPTO_CPPFLAGS=-I@OPENSSL_INCDIR@ +XXHASH_SRCS = utils/xxhash.c +XXHASH_SSE4_SRCS = utils/xxhash_sse4.c +XXHASH_SSE2_SRCS = utils/xxhash_sse2.c +XXHASH_OBJS = utils/xxhash_sse4.o utils/xxhash_sse2.o +XXHASH_HDRS = utils/xxhash.h + ZLIB_SRCS = zlib_compress.c ZLIB_HDRS = $(MAINHDRS) ZLIB_OBJS = $(ZLIB_SRCS:.c=.o) @@ -156,7 +162,7 @@ LDLIBS = -ldl -L./buildtmp -Wl,-R@LIBBZ2_DIR@ -lbz2 -L./buildtmp -Wl,-R@LIBZ_DIR OBJS = $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(LZFXOBJS) $(LZ4OBJS) $(CRCOBJS) \ $(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) $(DELTA2OBJS) @LIBBSCWRAPOBJ@ $(SKEINOBJS) \ $(SKEIN_BLOCK_OBJ) @SHA256ASM_OBJS@ @SHA256_OBJS@ $(KECCAK_OBJS) $(KECCAK_OBJS_ASM) \ -$(TRANSP_OBJS) $(CRYPTO_OBJS) $(ZLIB_OBJS) $(BZLIB_OBJS) +$(TRANSP_OBJS) $(CRYPTO_OBJS) $(ZLIB_OBJS) $(BZLIB_OBJS) $(XXHASH_OBJS) DEBUG_LINK = g++ -pthread @LIBBSCGEN_OPT@ @EXTRA_OPT_FLAGS@ DEBUG_COMPILE = gcc -g -c @EXTRA_OPT_FLAGS@ @@ -185,8 +191,12 @@ COMPILE_cpp = @COMPILE_cpp@ VEC_FLAGS = @VEC_FLAGS@ LOOP_OPTFLAGS = 
@LOOP_OPTFLAGS@ CPPFLAGS = @CPPFLAGS@ @NO_SLAB_CPPFLAGS@ @DEBUG_STATS_CPPFLAGS@ -GEN_OPT = @GEN_OPT@ +GEN_OPT = @GEN_OPT@ @SSE_OPT_FLAGS@ +BASE_OPT = @GEN_OPT@ PREFIX=@PREFIX@ +SSE4_OPT_FLAG = -msse4.2 +SSE3_OPT_FLAG = -mssse3 +SSE2_OPT_FLAG = -msse2 SKEIN_FLAGS = $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) @FPTR_FLAG@ SHA256_FLAGS = $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) @FPTR_FLAG@ @@ -257,6 +267,10 @@ $(ZLIB_OBJS): $(ZLIB_SRCS) $(ZLIB_HDRS) $(BZLIB_OBJS): $(BZLIB_SRCS) $(BZLIB_HDRS) $(COMPILE) $(GEN_OPT) $(BZLIB_CPPFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@ +$(XXHASH_OBJS): $(XXHASH_SSE4_SRCS) $(XXHASH_SSE2_SRCS) $(XXHASH_HDRS) $(XXHASH_SRCS) + $(COMPILE) $(BASE_OPT) $(SSE4_OPT_FLAG) $(CPPFLAGS) $(XXHASH_SSE4_SRCS) -o $(XXHASH_SSE4_SRCS:.c=.o) + $(COMPILE) $(BASE_OPT) $(SSE2_OPT_FLAG) $(CPPFLAGS) $(XXHASH_SSE2_SRCS) -o $(XXHASH_SSE2_SRCS:.c=.o) + $(MAINOBJS): $(MAINSRCS) $(MAINHDRS) $(COMPILE) $(GEN_OPT) $(LOOP_OPTFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@ diff --git a/config b/config index cb7cb24..12139e2 100755 --- a/config +++ b/config @@ -21,6 +21,7 @@ ${prog} [] Enable building against an alternate Zlib installation. --with-bzlib= (Default: System) Enable building against an alternate Bzip2 and library installation. +--no-sse-detect Do NOT attempt to probe the system's SSE/AVX capability for build flags. --use-key256 Use 256-bit encryption keys. Default key length is 128-bit. --help Display this help message. @@ -54,7 +55,7 @@ extra_opt_flags= zlib_prefix= bzlib_prefix= sse_detect=1 -default_sse="-msse2" +sse_opt_flags="-msse2" rm -rf ./buildtmp mkdir ./buildtmp @@ -76,7 +77,7 @@ gcc tst.c -o tst if [ $? -ne 0 ] then echo "ERROR:" - echo "Cannot compile a simple program. GCC 4.1 and above is required" + echo "Cannot compile a simple program. GCC 4.4 and above is required" echo "to build this program. Please include installation bindir of GCC in the PATH." 
echo "" rm -f tst.c @@ -140,7 +141,7 @@ do --use-key256) keylen='-DKEYLEN=32' ;; - --no-sse-check) + --no-sse-detect) sse_detect=0 ;; --help) usage $0;; @@ -180,10 +181,10 @@ IFS=. set -- ${vers} IFS="$OIFS" -if [ $1 -lt 4 -o $2 -lt 1 ] +if [ $1 -lt 4 ] || [ $1 -eq 4 -a $2 -lt 4 ] then echo "ERROR:" - echo "GCC version 4.1 or above is required." + echo "GCC version 4.4 or above is required." echo "" exit 1 fi @@ -209,9 +210,7 @@ then exit 1 fi rm -f sse_level - extra_opt_flags="${extra_opt_flags} -msse${sse_ver}" -else - extra_opt_flags="${extra_opt_flags} ${default_sse}" + sse_opt_flags="-m${sse_ver}" fi @@ -498,6 +497,7 @@ sha256objsvar="SHA256_OBJS" yasmvar="YASM" fptr_flag_var="FPTR_FLAG" extra_opt_flags_var="EXTRA_OPT_FLAGS" +sse_opt_flags_var="SSE_OPT_FLAGS" openssllibdirvar="OPENSSL_LIBDIR" opensslincdirvar="OPENSSL_INCDIR" @@ -550,5 +550,6 @@ s#@${keccak_hdrs_var}@#${keccak_hdrs}#g s#@${keccak_srcs_var}@#${keccak_srcs}#g s#@${keccak_srcs_asm_var}@#${keccak_srcs_asm}#g s#@${extra_opt_flags_var}@#${extra_opt_flags}#g +s#@${sse_opt_flags_var}@#${sse_opt_flags}#g " > Makefile diff --git a/crypto/crypto_utils.c b/crypto/crypto_utils.c index 81148ae..a300103 100644 --- a/crypto/crypto_utils.c +++ b/crypto/crypto_utils.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include "crypto_utils.h" @@ -54,17 +54,29 @@ static int geturandom_bytes(uchar_t rbytes[32]); typedef void (*ckinit_func_ptr)(void); static struct { const char *name; + const char *desc; cksum_t cksum_id; int bytes, mac_bytes; ckinit_func_ptr init_func; } cksum_props[] = { - {"CRC64", CKSUM_CRC64, 8, 32, NULL}, - {"SKEIN256", CKSUM_SKEIN256, 32, 32, NULL}, - {"SKEIN512", CKSUM_SKEIN512, 64, 64, NULL}, - {"SHA256", CKSUM_SHA256, 32, 32, init_sha256}, - {"SHA512", CKSUM_SHA512, 64, 64, NULL}, - {"KECCAK256", CKSUM_KECCAK256, 32, 32, NULL}, - {"KECCAK512", CKSUM_KECCAK512, 64, 64, NULL} + {"CRC64", "Fast 64-bit CRC from LZMA SDK.", + CKSUM_CRC64, 8, 32, NULL}, + {"SKEIN256", "256-bit SKEIN a NIST SHA3
runners-up (90% faster than Keccak).", + CKSUM_SKEIN256, 32, 32, NULL}, + {"SKEIN512", "512-bit SKEIN", + CKSUM_SKEIN512, 64, 64, NULL}, + {"SHA256", "Intel's optimized (SSE,AVX) 256-bit SHA2 implementation for x86.", + CKSUM_SHA256, 32, 32, init_sha256}, + {"SHA512", "512-bit SHA2 from OpenSSL's crypto library.", + CKSUM_SHA512, 64, 64, NULL}, + {"KECCAK256", "Official 256-bit NIST SHA3 optimized implementation.", + CKSUM_KECCAK256, 32, 32, NULL}, + {"KECCAK512", "Official 512-bit NIST SHA3 optimized implementation.", + CKSUM_KECCAK512, 64, 64, NULL}, + {"BLAKE256", "Very fast 256-bit BLAKE2, derived from the NIST SHA3 runner-up BLAKE.", + CKSUM_BLAKE256, 32, 32, NULL}, + {"BLAKE512", "Very fast 512-bit BLAKE2, derived from the NIST SHA3 runner-up BLAKE.", + CKSUM_BLAKE512, 64, 64, NULL} }; static int cksum_provider = PROVIDER_OPENSSL; @@ -213,12 +225,9 @@ init_sha256(void) cksum_provider = PROVIDER_OPENSSL; #else #ifdef __x86_64__ - processor_info_t pc; - cksum_provider = PROVIDER_OPENSSL; - cpuid_basic_identify(&pc); - if (pc.proc_type == PROC_X64_INTEL || pc.proc_type == PROC_X64_AMD) { - if (opt_Init_SHA(&pc) == 0) { + if (proc_info.proc_type == PROC_X64_INTEL || proc_info.proc_type == PROC_X64_AMD) { + if (opt_Init_SHA(&proc_info) == 0) { cksum_provider = PROVIDER_X64_OPT; } } @@ -226,6 +235,15 @@ init_sha256(void) #endif } +void +list_checksums(FILE *strm, char *pad) /* Print one "<pad><name> - <desc>" line per cksum_props entry to strm. */ +{ + size_t i; + for (i=0; i<(sizeof (cksum_props)/sizeof (cksum_props[0])); i++) { + fprintf(strm, "%s%10s - %s\n", pad, cksum_props[i].name, cksum_props[i].desc); + } +} + /* * Check if either the given checksum name or id is valid and * return it's properties. 
@@ -235,7 +253,7 @@ get_checksum_props(const char *name, int *cksum, int *cksum_bytes, int *mac_byte { int i; - for (i=0; ibuf)); dat = _mm_loadu_si128((__m128i *)(inbuf+pos)); odat = _mm_xor_si128(cblk, dat); diff --git a/crypto/sha2/sha256.c b/crypto/sha2/sha256.c index 86b7930..f8960ea 100644 --- a/crypto/sha2/sha256.c +++ b/crypto/sha2/sha256.c @@ -89,7 +89,7 @@ APS_NAMESPACE(Init_SHA) (processor_info_t *pc) if (pc->proc_type == PROC_X64_INTEL || pc->proc_type == PROC_X64_AMD) { if (pc->avx_level > 0) { sha_update_func = sha256_avx; - + } else if (pc->sse_level >= 4) { sha_update_func = sha256_sse4; diff --git a/main.c b/main.c index d06c73b..fb54d0c 100644 --- a/main.c +++ b/main.c @@ -158,15 +158,16 @@ usage(void) " NOTE - Both -L and -P can be used together to give maximum benefit on most.\n" " datasets.\n" " '-S' \n" - " - Specify chunk checksum to use: CRC64, SKEIN256, SKEIN512, SHA256 and\n" - " SHA512. Default one is SKEIN256.\n" + " - Specify chunk checksum to use:\n\n", + UTILITY_VERSION, exec_name, exec_name, exec_name, exec_name, exec_name, exec_name); + list_checksums(stderr, " "); + fprintf(stderr, "\n" " '-F' - Perform Fixed-Block Deduplication. Faster than '-D' in some cases\n" " but with lower deduplication ratio.\n" " '-B' <1..5>\n" " - Specify an average Dedupe block size. 1 - 4K, 2 - 8K ... 
5 - 64K.\n" " '-M' - Display memory allocator statistics\n" - " '-C' - Display compression statistics\n\n", - UTILITY_VERSION, exec_name, exec_name, exec_name, exec_name, exec_name, exec_name); + " '-C' - Display compression statistics\n\n"); } void @@ -716,7 +717,7 @@ start_decompress(const char *filename, const char *to_filename) err = 1; goto uncomp_done; } - if (version < VERSION-2) { + if (version < VERSION-3) { fprintf(stderr, "Unsupported version: %d\n", version); err = 1; goto uncomp_done; @@ -2340,6 +2341,7 @@ main(int argc, char *argv[]) exit(1); } main_cancel = 0; + init_pcompress(); if (cksum == 0) get_checksum_props(DEFAULT_CKSUM, &cksum, &cksum_bytes, &mac_bytes); diff --git a/pcompress.h b/pcompress.h index f3d6fb5..042538f 100644 --- a/pcompress.h +++ b/pcompress.h @@ -38,7 +38,7 @@ extern "C" { #define CHUNK_FLAG_SZ 1 #define ALGO_SZ 8 #define MIN_CHUNK 2048 -#define VERSION 5 +#define VERSION 6 #define FLAG_DEDUP 1 #define FLAG_DEDUP_FIXED 2 #define FLAG_SINGLE_CHUNK 4 diff --git a/utils/cpuid.c b/utils/cpuid.c index b8ef1df..917846e 100644 --- a/utils/cpuid.c +++ b/utils/cpuid.c @@ -32,6 +32,10 @@ #define SSE4_1_FLAG 0x080000 #define SSE4_2_FLAG 0x100000 +#define SSE3_FLAG 0x1 +#define SSSE3_FLAG 0x200 +#define AVX_FLAG 0x10000000 +#define XOP_FLAG 0x800 void exec_cpuid(uint32_t *regs) @@ -112,6 +116,7 @@ cpuid_basic_identify(processor_info_t *pc) pc->avx_level = 0; pc->sse_level = 0; pc->sse_sub_level = 0; + pc->xop_avail = 0; if (strcmp(raw.vendor_str, "GenuineIntel") == 0) { pc->proc_type = PROC_X64_INTEL; @@ -131,12 +136,23 @@ cpuid_basic_identify(processor_info_t *pc) pc->sse_sub_level = 2; } } else { - pc->sse_level = 3; + if (raw.basic_cpuid[1][2] & SSE3_FLAG) { + pc->sse_level = 3; + if (raw.basic_cpuid[1][2] & SSSE3_FLAG) { + pc->sse_sub_level = 1; + } + } else { + pc->sse_level = 2; + } } pc->avx_level = 0; - if (raw.basic_cpuid[1][2] & (1 << 28)) { + if (raw.basic_cpuid[1][2] & AVX_FLAG) { pc->avx_level = 1; } + + if 
(raw.ext_cpuid[1][2] & XOP_FLAG) { + pc->xop_avail = 1; + } } } diff --git a/utils/sse_level.c b/utils/sse_level.c index b69837d..106f72d 100644 --- a/utils/sse_level.c +++ b/utils/sse_level.c @@ -7,7 +7,12 @@ main(void) { processor_info_t pc; cpuid_basic_identify(&pc); - printf("%d", pc.sse_level); + if (pc.sse_level == 3 && pc.sse_sub_level == 1) { + printf("ssse%d", pc.sse_level); + pc.sse_sub_level = 0; + } else { + printf("sse%d", pc.sse_level); + } if (pc.sse_sub_level > 0) printf(".%d\n", pc.sse_sub_level); else diff --git a/utils/utils.c b/utils/utils.c index d1e4c81..795bd2f 100644 --- a/utils/utils.c +++ b/utils/utils.c @@ -36,9 +36,20 @@ #include #include #include +#include +#include +#define _IN_UTILS_ #include "utils.h" +processor_info_t proc_info; + +void +init_pcompress() { + cpuid_basic_identify(&proc_info); + XXH32_module_init(); +} + void err_exit(int show_errno, const char *format, ...) { diff --git a/utils/utils.h b/utils/utils.h index e9017b9..f8228d6 100644 --- a/utils/utils.h +++ b/utils/utils.h @@ -91,13 +91,17 @@ typedef int32_t bsize_t; // These allow helping the compiler in some often-executed branches, whose // result is almost always the same. 
#ifdef __GNUC__ -# define likely(expr) __builtin_expect(expr, 1) -# define unlikely(expr) __builtin_expect(expr, 0) -# define ATOMIC_ADD(var, val) __sync_fetch_and_add(&var, val) -# define ATOMIC_SUB(var, val) __sync_fetch_and_sub(&var, val) +# define likely(expr) __builtin_expect(expr, 1) +# define unlikely(expr) __builtin_expect(expr, 0) +# define ATOMIC_ADD(var, val) __sync_fetch_and_add(&var, val) +# define ATOMIC_SUB(var, val) __sync_fetch_and_sub(&var, val) +# define PREFETCH_WRITE(x, n) __builtin_prefetch((x), 1, (n)) +# define PREFETCH_READ(x, n) __builtin_prefetch((x), 0, (n)) #else # define likely(expr) (expr) # define unlikely(expr) (expr) +# define PREFETCH_WRITE(x, n) +# define PREFETCH_READ(x, n) # if defined(sun) || defined (__sun) # include # define ATOMIC_ADD(var, val) atomic_add_int(&var, val) @@ -148,9 +152,14 @@ typedef struct { int sse_level; int sse_sub_level; int avx_level; + int xop_avail; proc_type_t proc_type; } processor_info_t; +#ifndef _IN_UTILS_ +extern processor_info_t proc_info; +#endif + extern void err_exit(int show_errno, const char *format, ...); extern const char *get_execname(const char *); extern int parse_numeric(int64_t *val, const char *str); @@ -165,6 +174,7 @@ extern uint64_t get_total_ram(); extern double get_wtime_millis(void); extern double get_mb_s(uint64_t bytes, double strt, double en); extern void init_algo_props(algo_props_t *props); +extern void init_pcompress(); /* Pointer type for compress and decompress functions. 
*/ typedef int (*compress_func_ptr)(void *src, uint64_t srclen, void *dst, diff --git a/utils/xxhash.c b/utils/xxhash.c index 1f7bae3..26966ba 100644 --- a/utils/xxhash.c +++ b/utils/xxhash.c @@ -82,8 +82,6 @@ # define XXH_BIG_ENDIAN 0 #endif - - //************************************** // Compiler-specific Options & Functions //************************************** @@ -141,7 +139,7 @@ static inline __m128i _x_mm_rotl_epi32(const __m128i a, int bits) // Simple Hash Functions //**************************** -unsigned int XXH32(const void* input, int len, unsigned int seed) +unsigned int CPUCAP_NM(XXH32)(const void* input, int len, unsigned int seed) { #if 0 // Simple version, good for code maintenance, but unfortunately slow for small inputs @@ -284,7 +282,7 @@ struct XXH_state32_t }; -void* XXH32_init (unsigned int seed) +void* CPUCAP_NM(XXH32_init) (unsigned int seed) { struct XXH_state32_t * state = (struct XXH_state32_t *) malloc ( sizeof(struct XXH_state32_t)); state->seed = seed; @@ -303,7 +301,7 @@ void* XXH32_init (unsigned int seed) } -int XXH32_feed (void* state_in, const void* input, int len) +int CPUCAP_NM(XXH32_feed) (void* state_in, const void* input, int len) { struct XXH_state32_t * state = state_in; const unsigned char* p = (const unsigned char*)input; @@ -437,7 +435,7 @@ int XXH32_feed (void* state_in, const void* input, int len) } -unsigned int XXH32_getIntermediateResult (void* state_in) +unsigned int CPUCAP_NM(XXH32_getIntermediateResult) (void* state_in) { struct XXH_state32_t * state = state_in; unsigned char * p = (unsigned char*)state->memory; @@ -489,9 +487,9 @@ unsigned int XXH32_getIntermediateResult (void* state_in) } -unsigned int XXH32_result (void* state_in) +unsigned int CPUCAP_NM(XXH32_result) (void* state_in) { - unsigned int h32 = XXH32_getIntermediateResult(state_in); + unsigned int h32 = CPUCAP_NM(XXH32_getIntermediateResult)(state_in); free(state_in); diff --git a/utils/xxhash.h b/utils/xxhash.h index d253fa0..a336a47 100644 
--- a/utils/xxhash.h +++ b/utils/xxhash.h @@ -63,12 +63,16 @@ It depends on successfully passing SMHasher test set. extern "C" { #endif +#ifndef CPUCAP_NM +#define CPUCAP_NM(x) x +#endif + //**************************** // Simple Hash Functions //**************************** -unsigned int XXH32 (const void* input, int len, unsigned int seed); +unsigned int CPUCAP_NM(XXH32) (const void* input, int len, unsigned int seed); /* XXH32() : @@ -86,9 +90,9 @@ XXH32() : // Advanced Hash Functions //**************************** -void* XXH32_init (unsigned int seed); -int XXH32_feed (void* state, const void* input, int len); -unsigned int XXH32_result (void* state); +void* CPUCAP_NM(XXH32_init) (unsigned int seed); +int CPUCAP_NM(XXH32_feed) (void* state, const void* input, int len); +unsigned int CPUCAP_NM(XXH32_result) (void* state); /* These functions calculate the xxhash of an input provided in several small packets, @@ -113,7 +117,7 @@ Memory will be freed by XXH32_result(). */ -unsigned int XXH32_getIntermediateResult (void* state); +unsigned int CPUCAP_NM(XXH32_getIntermediateResult) (void* state); /* This function does the same as XXH32_result(), generating a 32-bit hash, but preserve memory context. @@ -121,7 +125,7 @@ This way, it becomes possible to generate intermediate hashes, and then continue To free memory context, use XXH32_result(). 
*/ - +void XXH32_module_init(); #if defined (__cplusplus) } diff --git a/utils/xxhash_base.c b/utils/xxhash_base.c new file mode 100644 index 0000000..868994d --- /dev/null +++ b/utils/xxhash_base.c @@ -0,0 +1,71 @@ +#include +#include +#include +#include + +extern void* XXH32_init_SSE4 (unsigned int seed); +extern int XXH32_feed_SSE4 (void* state, const void* input, int len); +extern unsigned int XXH32_result_SSE4 (void* state); +extern unsigned int XXH32_getIntermediateResult_SSE4 (void* state); +extern unsigned int XXH32_SSE4 (const void* input, int len, unsigned int seed); + +extern void* XXH32_init_SSE2 (unsigned int seed); +extern int XXH32_feed_SSE2 (void* state, const void* input, int len); +extern unsigned int XXH32_result_SSE2 (void* state); +extern unsigned int XXH32_getIntermediateResult_SSE2 (void* state); +extern unsigned int XXH32_SSE2 (const void* input, int len, unsigned int seed); + +unsigned int (*xxh32)(const void* input, int len, unsigned int seed) = XXH32_SSE2; /* Dispatch pointers default to the SSE2 build so the XXH32* wrappers are callable before XXH32_module_init() runs. */ +void * (*xxh32_init)(unsigned int seed) = XXH32_init_SSE2; +int (*xxh32_feed)(void* state, const void* input, int len) = XXH32_feed_SSE2; +unsigned int (*xxh32_result)(void* state) = XXH32_result_SSE2; +unsigned int (*xxh32_getIntermediateResult)(void* state) = XXH32_getIntermediateResult_SSE2; +#include + +void +XXH32_module_init() { /* Select the SSE4 or SSE2 build from the sse_level recorded in proc_info. */ + if (proc_info.sse_level >= 4) { + xxh32 = XXH32_SSE4; + xxh32_init = XXH32_init_SSE4; + xxh32_feed = XXH32_feed_SSE4; + xxh32_result = XXH32_result_SSE4; + xxh32_getIntermediateResult = XXH32_getIntermediateResult_SSE4; + } else { + xxh32 = XXH32_SSE2; + xxh32_init = XXH32_init_SSE2; + xxh32_feed = XXH32_feed_SSE2; + xxh32_result = XXH32_result_SSE2; + xxh32_getIntermediateResult = XXH32_getIntermediateResult_SSE2; + } +} + +unsigned int +XXH32(const void* input, int len, unsigned int seed) +{ + return xxh32(input, len, seed); +} + +void* +XXH32_init(unsigned int seed) +{ + return xxh32_init(seed); +} + +int +XXH32_feed(void* state, const void* input, int len) +{ + return xxh32_feed(state, input, len); +} + +unsigned int 
+XXH32_result(void* state) +{ + return xxh32_result(state); +} + +unsigned int +XXH32_getIntermediateResult(void* state) +{ + return xxh32_getIntermediateResult(state); +} + diff --git a/utils/xxhash_sse2.c b/utils/xxhash_sse2.c new file mode 100644 index 0000000..1f13d2c --- /dev/null +++ b/utils/xxhash_sse2.c @@ -0,0 +1,3 @@ +#define CPUCAP_NM(x) x##_SSE2 +#include "xxhash.c" + diff --git a/utils/xxhash_sse4.c b/utils/xxhash_sse4.c new file mode 100644 index 0000000..4561c7e --- /dev/null +++ b/utils/xxhash_sse4.c @@ -0,0 +1,3 @@ +#define CPUCAP_NM(x) x##_SSE4 +#include "xxhash.c" +