Update and simplify the analyzer function to indicate text data for the Dict filter.

Fix archive header writing bug. Strip ^M chars from dict filter files. Include DICT preprocessing type. Fix a bunch of bugs found by Xcode.
2014-09-20 12:49:00 +05:30 · 2014-09-20 12:49:00 +05:30 · 071a9e2b26
commit 071a9e2b26
parent 4fedebc607
13 changed files with 617 additions and 448 deletions
--- a/.gitignore
+++ b/.gitignore
@ -24,3 +24,4 @@ libtool
 stamp-h1
 .libs
 buildtmp
 *.dSYM
--- a/Makefile.in
+++ b/Makefile.in
@ -31,9 +31,11 @@ GPP=@GPP@
 LIBVER=1
 MAINSRCS = utils/utils.c allocator.c lzma_compress.c ppmd_compress.c \
 	adaptive_compress.c lzfx_compress.c lz4_compress.c none_compress.c \
-	utils/xxhash_base.c utils/heap.c utils/cpuid.c pcompress.c
+	utils/xxhash_base.c utils/heap.c utils/cpuid.c filters/analyzer/analyzer.c \
 	pcompress.c
 MAINHDRS = allocator.h  pcompress.h  utils/utils.h utils/xxhash.h utils/heap.h \
-	utils/cpuid.h utils/xxhash.h archive/pc_archive.h filters/dispack/dis.hpp
+	utils/cpuid.h utils/xxhash.h archive/pc_archive.h filters/dispack/dis.hpp \
 	filters/analyzer/analyzer.h
 MAINOBJS = $(MAINSRCS:.c=.o)
 PROGSRCS = main.c
@ -233,7 +235,7 @@ BASE_CPPFLAGS = -I. -I./lzma -I./lzfx -I./lz4 -I./rabin -I./bsdiff -DNODEFAULT_P
 	-I./crypto/scrypt -I./crypto/aes -I./crypto @KEYLEN@ -I./rabin/global \
 	-I./crypto/keccak -I./filters/transpose -I./crypto/blake2 $(EXTRA_CPPFLAGS) \
 	-I./crypto/xsalsa20 -I./archive -pedantic -Wall -I./filters -fno-strict-aliasing \
-	-Wno-unused-but-set-variable -Wno-enum-compare \
+	-Wno-unused-but-set-variable -Wno-enum-compare -I./filters/analyzer \
 	@COMPAT_CPPFLAGS@ @XSALSA20_DEBUG@ -I@LIBARCHIVE_DIR@/libarchive -I./filters/packjpg \
 	-I./filters/packpnm @ENABLE_WAVPACK@
 COMMON_CPPFLAGS = $(BASE_CPPFLAGS) -std=gnu99
--- a/archive/pc_archive.c
+++ b/archive/pc_archive.c
@ -175,7 +175,7 @@ creat_write_callback(struct archive *arc, void *ctx, const void *buf, size_t len
 				pctx->btype = pctx->ctype;
 			} else {
 				if (pctx->arc_buf_pos < pctx->min_chunk) {
-					uint32_t diff = pctx->min_chunk - pctx->arc_buf_pos;
+					int diff = pctx->min_chunk - (int)(pctx->arc_buf_pos);
 					if (len > diff)
 						pctx->btype = pctx->ctype;
 					else
@ -918,9 +918,10 @@ copy_file_data(pc_ctx_t *pctx, struct archive *arc, struct archive_entry *entry,
 	size_t sz, offset, len;
 	ssize_t bytes_to_write;
 	uchar_t *mapbuf;
-	int rv, fd;
+	int rv, fd, typ1;
 	const char *fpath;
 	typ1 = typ;
 	offset = 0;
 	rv = 0;
 	sz = archive_entry_size(entry);
@ -1014,6 +1015,11 @@ do_map:
 					} else {
 						return (ARCHIVE_OK);
 					}
 				} else {
 					if (write_header(arc, entry) == -1) {
 						close(fd);
 						return (-1);
 					}
 				}
 			} else {
 				if (write_header(arc, entry) == -1) {
@ -1029,7 +1035,7 @@ do_map:
 		 * stage there is no need for blocking.
 		 */
 		wrtn = archive_write_data(arc, src, wlen);
-		if (wrtn < wlen) {
+		if (wrtn < (ssize_t)wlen) {
 			/* Write failed; this is bad */
 			log_msg(LOG_ERR, 0, "Data write error: %s", archive_error_string(arc));
 			rv = -1;
--- a/1
+++ b/1
@ -714,7 +714,6 @@ echo "*************** Running configure in libarchive ****************"
 (cd $libarchive_dir
 CC=${GCC} ./configure --disable-bsdtar --disable-bsdcpio --without-zlib --without-bz2lib --without-lzmadec --without-lzma  --without-lzo2  --without-nettle --without-openssl --without-xml2 --without-expat --disable-silent-rules --enable-shared=no --enable-static=yes --enable-xattr
 [ $? -ne 0 ] && exit $?
 cp Makefile Makefile.orig
 cat Makefile | sed '
 s@$(BUILT_SOURCES)@@
 s@$(libarchive_test_SOURCES)@@
--- a/filters/analyzer/analyzer.c
+++ b/filters/analyzer/analyzer.c
@ -0,0 +1,69 @@
 /*
 * This file is a part of Pcompress, a chunked parallel multi-
 * algorithm lossless compression and decompression program.
 *
 * Copyright (C) 2012-2014 Moinak Ghosh. All rights reserved.
 * Use is subject to license terms.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 3 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program.
 * If not, see <http://www.gnu.org/licenses/>.
 *
 * moinakg@belenix.org, http://moinakg.wordpress.com/
 */
 #include "utils.h"
 /*
  * Classify a buffer as text or binary from its byte-value histogram.
  *
  * The scan runs only when btype is TYPE_UNKNOWN or its subtype is
  * TYPE_ARCHIVE_TAR; any other btype is returned untouched.
  *
  * src        - buffer to inspect; read as unsigned bytes, never written.
  * srclen     - number of bytes available at src.
  * btype      - type already assigned to the data by the caller.
  * adapt_mode - accepted for interface compatibility; not used here.
  *
  * Returns TYPE_TEXT when fewer than 3/8 of the bytes have the high bit
  * set. TYPE_ENGLISH is OR-ed in when, additionally, high-bit bytes are
  * under 1/8 of the buffer, spaces exceed 1/128, 'a'+'e'+'t' exceed 1/16
  * and lowercase letters exceed 1/4. Otherwise btype is returned as given.
  */
 int
 analyze_buffer(void *src, uint64_t srclen, int btype, int adapt_mode)
 {
 	uchar_t *src1 = (uchar_t *)src;
 	int stype = PC_SUBTYPE(btype);
 	if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR) {
 		/*
 		 * Both histograms are only ever incremented, so they must
 		 * start out zeroed.
 		 */
 		uint32_t freq[256] = {0}, freq0x80[2] = {0};
 		uint64_t i, alphabetNum = 0, tot8b = 0;
 		uchar_t cur_byte;
 		/*
 		 * Build the byte histogram and count bytes with the high
 		 * bit set.
 		 */
 		for (i = 0; i < srclen; i++) {
 			cur_byte = src1[i];
 			tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
 			freq[cur_byte]++;
 		}
 		/* freq0x80[0]: 7-bit bytes, freq0x80[1]: bytes >= 0x80. */
 		for (i = 0; i < 256; i++)
 			freq0x80[i >> 7] += freq[i];
 		for (i = 'a'; i <= 'z'; i++)
 			alphabetNum += freq[i];
 		/*
 		 * Heuristics for detecting BINARY vs generic TEXT.
 		 * tot8b accumulated 0x80 per high-bit byte; scale it back
 		 * to a plain count.
 		 */
 		tot8b /= 0x80;
 		/*
 		 * '+' binds tighter than '>>', so each shift needs its own
 		 * parentheses to form the 1/4 + 1/8 threshold.
 		 */
 		if (tot8b < ((srclen >> 2) + (srclen >> 3))) {
 			btype = TYPE_TEXT;
 			if (freq0x80[1] < (srclen >> 3) && (freq[' '] > (srclen >> 7))
 			    && (freq['a'] + freq['e'] + freq['t'] > (srclen >> 4))
 			    && alphabetNum > (srclen >> 2)) {
 				btype |= TYPE_ENGLISH;
 			}
 		}
 	}
 	return (btype);
 }
--- a/filters/analyzer/analyzer.h
+++ b/filters/analyzer/analyzer.h
@ -0,0 +1,30 @@
 /*
 * This file is a part of Pcompress, a chunked parallel multi-
 * algorithm lossless compression and decompression program.
 *
 * Copyright (C) 2012-2014 Moinak Ghosh. All rights reserved.
 * Use is subject to license terms.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 3 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program.
 * If not, see <http://www.gnu.org/licenses/>.
 *
 * moinakg@belenix.org, http://moinakg.wordpress.com/
 */
 #ifndef	_ANALYZER_H
 #define	_ANALYZER_H
 #include <stdint.h>	/* uint64_t used in the prototype below */
 #ifdef	__cplusplus
 extern "C" {
 #endif
 /*
  * Inspect srclen bytes at src and return a data type for them.
  *
  * When btype is TYPE_UNKNOWN or has the tar-archive subtype, the buffer
  * is examined with byte-frequency heuristics and the result may become
  * TYPE_TEXT, optionally OR-ed with TYPE_ENGLISH. In every other case
  * btype is returned unchanged. adapt_mode is currently not consulted.
  */
 int analyze_buffer(void *src, uint64_t srclen, int btype, int adapt_mode);
 #ifdef	__cplusplus
 }
 #endif
 #endif
--- a/filters/delta2/delta2.c
+++ b/filters/delta2/delta2.c
@ -432,7 +432,7 @@ delta2_encode_real(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen
 		val = gtot1 + (srclen - (pos - src));
 	}
 	*dstlen = pos2 - dst;
-	return (val);
+	return ((int)val);
 }
 int
--- a/filters/dict/Common.h
+++ b/filters/dict/Common.h
@ -13,14 +13,14 @@ typedef int64_t i64;
 const u32 KB=1024;
 const u32 MB=1048576;
-const u32 MinBlockSize=8*KB;
+const u32 MinBlockSize=8*1024;
 const u32 MaxChunkBits=21;
-const u32 MaxChunkSize=(1<<(MaxChunkBits-1));
+const u32 MaxChunkSize=(1<<(21-1));
-const u32 MaxDictSize=512*MB;//Don't change
+const u32 MaxDictSize=512*1048576;//Don't change
-const u32 DefaultOutStreamBlockSize=128*KB;
+const u32 DefaultOutStreamBlockSize=128*1024;
-const u32 DefaultInBufferSize=MaxChunkSize;  //Should >=MaxChunkSize
+const u32 DefaultInBufferSize=(1<<(21-1));  //Should >=MaxChunkSize; spelled as a literal like MaxChunkSize above
 #define DLT_CHANNEL_MAX 5
 const u32 DltIndex[DLT_CHANNEL_MAX]={1,2,3,4,8};
--- a/filters/dict/DictFilter.cpp
+++ b/filters/dict/DictFilter.cpp
@ -34,6 +34,34 @@
 #include "DictFilter.h"
 #include "Common.h"
 // Word-dictionary text filter: one object holds the lookup tables used by
 // the forward (Forward_Dict) and inverse (Inverse_Dict) transforms.
 class DictFilter
 {
 public:
 	~DictFilter();
 	DictFilter();
 	// Forward transform of `size` bytes from src into dst.
 	// NOTE(review): presumably *dstsize receives the output length and the
 	// return value reports success/failure - confirm in the implementation.
 	u32 Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize);
 	// Inverse transform of `size` bytes from src into dst.
 	// NOTE(review): presumably *dstsize receives the output length - confirm.
 	void Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize);
 private:
 	// One node of the word tree: 26 child links plus the symbol stored
 	// at this node.
 	// NOTE(review): 26 links suggests one per letter 'a'..'z' - confirm.
 	typedef struct
 	{
 		u32 next[26];
 		u8 symbol;
 	} CTreeNode;
 	CTreeNode wordTree[MAX_WORDTREE_NODE_NUM];
 	// NOTE(review): "nodeMum" looks like a typo for "nodeNum"; left as is
 	// because code outside this view may reference the name.
 	u32 nodeMum;
 	u8 maxSymbol;
 	//The members above are used for the DICT transformer. Words are stored in trees.
 	u32 wordIndex[256];
 	//wordIndex is used for the DICT untransformer: choose words by symbols.
 	void MakeWordTree();  //Init the DICT transformer
 	// NOTE(review): x0, x1, i and k are not used in any code visible here;
 	// presumably scratch state for the transform loops - confirm.
 	u32 x0,x1;
 	u32 i,k;
 };
 const u32 wordNum = 123;
 u8 wordList[wordNum][8] =
--- a/filters/dict/DictFilter.h
+++ b/filters/dict/DictFilter.h
@ -35,34 +35,6 @@
 #include "Common.h"
 #define MAX_WORDTREE_NODE_NUM 300 //Enough now!
 // Word-dictionary text filter: one object holds the lookup tables used by
 // the forward (Forward_Dict) and inverse (Inverse_Dict) transforms.
 class DictFilter
 {
 public:
 	~DictFilter();
 	DictFilter();
 	// Forward transform of `size` bytes from src into dst.
 	// NOTE(review): presumably *dstsize receives the output length and the
 	// return value reports success/failure - confirm in the implementation.
 	u32 Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize);
 	// Inverse transform of `size` bytes from src into dst.
 	// NOTE(review): presumably *dstsize receives the output length - confirm.
 	void Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize);
 private:
 	// One node of the word tree: 26 child links plus the symbol stored
 	// at this node.
 	// NOTE(review): 26 links suggests one per letter 'a'..'z' - confirm.
 	typedef struct
 	{
 		u32 next[26];
 		u8 symbol;
 	} CTreeNode;
 	CTreeNode wordTree[MAX_WORDTREE_NODE_NUM];
 	// NOTE(review): "nodeMum" looks like a typo for "nodeNum"; left as is
 	// because code outside this view may reference the name.
 	u32 nodeMum;
 	u8 maxSymbol;
 	//The members above are used for the DICT transformer. Words are stored in trees.
 	u32 wordIndex[256];
 	//wordIndex is used for the DICT untransformer: choose words by symbols.
 	void MakeWordTree();  //Init the DICT transformer
 	// NOTE(review): x0, x1, i and k are not used in any code visible here;
 	// presumably scratch state for the transform loops - confirm.
 	u32 x0,x1;
 	u32 i,k;
 };
 #ifdef  __cplusplus
 extern "C" {
 #endif
--- a/pcompress.c
+++ b/pcompress.c
@ -56,6 +56,8 @@
 #include <errno.h>
 #include <pc_archive.h>
 #include <filters/dispack/dis.hpp>
 #include "analyzer.h"
 #include "filters/dict/DictFilter.h"
 /*
 * We use 8MB chunks by default.
@ -204,7 +206,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
    void *dst, uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data,
    algo_props_t *props)
 {
-	uchar_t *dest = (uchar_t *)dst, type = 0;
+	uchar_t *dest = (uchar_t *)dst, type = 0, atype;
 	int64_t result;
 	uint64_t _dstlen, fromlen;
 	uchar_t *from, *to;
@ -238,13 +240,45 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
 		}
 	}
 	/*
 	 * The analyzer is run below only for non-archive mode. When archiving the
 	 * archiver thread runs analyzer on incremental blocks and sets the type
 	 * accordingly.
 	 */
 	atype = btype;
 	/*
 	 * Run an analyzer on the data. At present the analyzer only tries
 	 * to detect if this is text for running the dict filter.
 	 */
 	if (pctx->enable_analyzer) {
 		atype = analyze_buffer(src, srclen, btype, pctx->adapt_mode);
 	}
 	/*
 	 * Enabling LZP also enables the DICT filter since we are dealing with text
 	 * in any case.
 	 */
 	if (pctx->lzp_preprocess && PC_TYPE(atype) == TYPE_TEXT) {
 		void *dct = new_dict_context();
 		_dstlen = fromlen;
 		result = dict_encode(dct, from, fromlen, to, &_dstlen);
 		delete_dict_context(dct);
 		if (result != -1) {
 			uchar_t *tmp;
 			tmp = from;
 			from = to;
 			to = tmp;
 			fromlen = _dstlen;
 			type |= PREPROC_TYPE_DICT;
 		}
 	}
 #ifndef _MPLV2_LICENSE_
 	if (pctx->lzp_preprocess && stype != TYPE_BMP && stype != TYPE_TIFF) {
 		int hashsize;
 		hashsize = lzp_hash_size(level);
 		result = lzp_compress((const uchar_t *)from, to, fromlen,
-				      hashsize, LZP_DEFAULT_LZPMINLEN, 0);
+					      hashsize, LZP_DEFAULT_LZPMINLEN, 0);
 		if (result >= 0 && result < srclen) {
 			uchar_t *tmp;
 			tmp = from;
@ -375,6 +409,20 @@ preproc_decompress(pc_ctx_t *pctx, compress_func_ptr dec_func, void *src, uint64
 #endif
 	}
 	if (type & PREPROC_TYPE_DICT) {
 		void *dct = new_dict_context();
 		result = dict_decode(dct, src, srclen, dst, &_dstlen);
 		delete_dict_context(dct);
 		if (result != -1) {
 			memcpy(src, dst, _dstlen);
 			srclen = _dstlen;
 			*dstlen = _dstlen;
 		} else {
 			log_msg(LOG_ERR, 0, "DICT decoding failed.");
 			return (result);
 		}
 	}
 	if (type & PREPROC_TYPE_DISPACK) {
 		result = dispack_decode((uchar_t *)src, srclen, (uchar_t *)dst, &_dstlen1);
 		if (result != -1) {
@ -1070,7 +1118,7 @@ start_decompress(pc_ctx_t *pctx, const char *filename, char *to_filename)
 			memset(zero, 0, MAX_PW_LEN);
 			fd = open(pctx->pwd_file, O_RDWR);
 			if (fd != -1) {
-				pw_len = lseek(fd, 0, SEEK_END);
+				pw_len = (int)lseek(fd, 0, SEEK_END);
 				if (pw_len != -1) {
 					if (pw_len > MAX_PW_LEN) pw_len = MAX_PW_LEN-1;
 					lseek(fd, 0, SEEK_SET);
@ -1552,9 +1600,11 @@ redo:
 	dedupe_index_sz = 0;
 	type = COMPRESSED;
 	/* Perform Dedup if enabled. */
 	if ((pctx->enable_rabin_scan || pctx->enable_fixed_scan)) {
 		dedupe_context_t *rctx;
 		uint64_t rb = tdat->rbytes;
 		/*
 		 * Compute checksum of original uncompressed chunk. When doing dedup
@ -1569,8 +1619,9 @@ redo:
 		rctx = tdat->rctx;
 		reset_dedupe_context(tdat->rctx);
 		rctx->cbuf = tdat->uncompressed_chunk;
-		dedupe_index_sz = dedupe_compress(tdat->rctx, tdat->cmp_seg, &(tdat->rbytes), 0,
+		dedupe_index_sz = dedupe_compress(tdat->rctx, tdat->cmp_seg, &rb, 0,
 						  NULL, tdat->cksum_mt);
 		tdat->rbytes = rb;
 		if (!rctx->valid) {
 			memcpy(tdat->uncompressed_chunk, tdat->cmp_seg, rbytes);
 			tdat->rbytes = rbytes;
@ -1744,6 +1795,10 @@ plain_index:
 	tdat->len_cmp += (pctx->cksum_bytes + pctx->mac_bytes);
 	rbytes = tdat->len_cmp - len_cmp; // HDR size for HMAC
 	/*
 	 * In adaptive mode return value from compression function function indicates
 	 * which algorithm was used on the chunk. We have to store that.
 	 */
 	if (pctx->adapt_mode)
 		type |= (rv << 4);
@ -2750,7 +2805,8 @@ init_algo(pc_ctx_t *pctx, const char *algo, int bail)
 		pctx->_deinit_func = adapt_deinit;
 		pctx->_stats_func = adapt_stats;
 		pctx->_props_func = adapt_props;
-		pctx->adapt_mode = 1;
+		pctx->adapt_mode = 2;
 		pctx->enable_analyzer = 1;
 		rv = 0;
 	} else if (memcmp(algorithm, "adapt", 5) == 0) {
@ -2761,6 +2817,7 @@ init_algo(pc_ctx_t *pctx, const char *algo, int bail)
 		pctx->_stats_func = adapt_stats;
 		pctx->_props_func = adapt_props;
 		pctx->adapt_mode = 1;
 		pctx->enable_analyzer = 1;
 		rv = 0;
 #ifdef ENABLE_PC_LIBBSC
 	} else if (memcmp(algorithm, "libbsc", 6) == 0) {
@ -2770,7 +2827,6 @@ init_algo(pc_ctx_t *pctx, const char *algo, int bail)
 		pctx->_deinit_func = libbsc_deinit;
 		pctx->_stats_func = libbsc_stats;
 		pctx->_props_func = libbsc_props;
 		pctx->adapt_mode = 1;
 		rv = 0;
 #endif
 	}
@ -3337,6 +3393,7 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[])
 		}
 		if (pctx->lzp_preprocess || pctx->enable_delta2_encode || pctx->dispack_preprocess) {
 			pctx->preprocess_mode = 1;
 			pctx->enable_analyzer = 1;
 		}
 		if (pctx->chunksize == 0) {
 			if (pctx->level < 9) {
--- a/pcompress.h
+++ b/pcompress.h
@ -60,13 +60,14 @@ extern "C" {
 #define	CHSIZE_MASK	0x80
 #define	BZIP2_A_NUM	16
 #define	LZMA_A_NUM	32
-#define	CHUNK_FLAG_DEDUP		2
+#define	CHUNK_FLAG_DEDUP	2
 #define	CHUNK_FLAG_PREPROC	4
 #define	COMP_EXTN	".pz"
-#define	PREPROC_TYPE_LZP		1
+#define	PREPROC_TYPE_LZP	1
 #define	PREPROC_TYPE_DELTA2	2
 #define	PREPROC_TYPE_DISPACK	4
 #define	PREPROC_TYPE_DICT	8
 #define	PREPROC_COMPRESSED	128
 /*
@ -212,6 +213,7 @@ typedef struct pc_ctx {
 	int delta2_nstrides;
 	int enable_rabin_split;
 	int enable_fixed_scan;
 	int enable_analyzer;
 	int preprocess_mode;
 	int lzp_preprocess;
 	int dispack_preprocess;
@ -275,7 +277,7 @@ struct cmp_data {
 	uchar_t *compressed_chunk;
 	uchar_t *uncompressed_chunk;
 	dedupe_context_t *rctx;
-	uint64_t rbytes;
+	int64_t rbytes;
 	uint64_t chunksize;
 	uint64_t len_cmp, len_cmp_be;
 	uchar_t checksum[CKSUM_MAX_BYTES];
--- a/utils/utils.c
+++ b/utils/utils.c
@ -383,14 +383,18 @@ get_total_ram()
 }
 #ifdef __APPLE__
 #define	NANO_SEC (1000000000ULL)
 /*
  * Replacement clock_gettime() compiled only under __APPLE__.
  *
  * As patched (the '+' lines): for CLOCK_MONOTONIC the mach_absolute_time()
  * tick count is scaled by sTimebaseInfo.numer / sTimebaseInfo.denom to
  * nanoseconds, split into ts->tv_sec and ts->tv_nsec, and 0 is returned.
  * Any other clk_id returns EINVAL and leaves *ts untouched.
  *
  * NOTE(review): POSIX clock_gettime() reports failure as -1 with errno
  * set; this shim returns the error code directly - confirm callers only
  * test for non-zero.
  * NOTE(review): sTimebaseInfo is not initialised in the lines shown here;
  * presumably it is filled in elsewhere before the first call - confirm.
  */
 int
 clock_gettime(int clk_id, struct timespec *ts)
 {
 	if (clk_id == CLOCK_MONOTONIC) {
-		uint64_t abstime = mach_absolute_time();
+		uint64_t nanotime = mach_absolute_time() *
-		return (abstime * sTimebaseInfo.numer / sTimebaseInfo.denom);
+		    sTimebaseInfo.numer / sTimebaseInfo.denom;
 		ts->tv_sec = nanotime / NANO_SEC;
 		ts->tv_nsec = nanotime % NANO_SEC;
 		return (0);
 	}
-	return (0);
+	return (EINVAL);
 }
 #endif
@ -543,8 +547,7 @@ log_msg(log_level_t log_level, int show_errno, const char *format, ...)
 		fputs(msg, stderr);
 	} else if (ldest.type == LOG_FILE) {
-		int rv;
+		(void) write(ldest.fd, msg, strlen(msg));
 		rv = write(ldest.fd, msg, strlen(msg));
 	} else {
 		ldest.cb(msg);
 	}