diff --git a/.gitignore b/.gitignore
index 2dc6b14..39e979a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,3 +24,4 @@ libtool
stamp-h1
.libs
buildtmp
+*.dSYM
diff --git a/Makefile.in b/Makefile.in
index a327204..8a71942 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -31,9 +31,11 @@ GPP=@GPP@
LIBVER=1
MAINSRCS = utils/utils.c allocator.c lzma_compress.c ppmd_compress.c \
adaptive_compress.c lzfx_compress.c lz4_compress.c none_compress.c \
- utils/xxhash_base.c utils/heap.c utils/cpuid.c pcompress.c
+ utils/xxhash_base.c utils/heap.c utils/cpuid.c filters/analyzer/analyzer.c \
+ pcompress.c
MAINHDRS = allocator.h pcompress.h utils/utils.h utils/xxhash.h utils/heap.h \
- utils/cpuid.h utils/xxhash.h archive/pc_archive.h filters/dispack/dis.hpp
+ utils/cpuid.h utils/xxhash.h archive/pc_archive.h filters/dispack/dis.hpp \
+ filters/analyzer/analyzer.h
MAINOBJS = $(MAINSRCS:.c=.o)
PROGSRCS = main.c
@@ -233,7 +235,7 @@ BASE_CPPFLAGS = -I. -I./lzma -I./lzfx -I./lz4 -I./rabin -I./bsdiff -DNODEFAULT_P
-I./crypto/scrypt -I./crypto/aes -I./crypto @KEYLEN@ -I./rabin/global \
-I./crypto/keccak -I./filters/transpose -I./crypto/blake2 $(EXTRA_CPPFLAGS) \
-I./crypto/xsalsa20 -I./archive -pedantic -Wall -I./filters -fno-strict-aliasing \
- -Wno-unused-but-set-variable -Wno-enum-compare \
+ -Wno-unused-but-set-variable -Wno-enum-compare -I./filters/analyzer \
@COMPAT_CPPFLAGS@ @XSALSA20_DEBUG@ -I@LIBARCHIVE_DIR@/libarchive -I./filters/packjpg \
-I./filters/packpnm @ENABLE_WAVPACK@
COMMON_CPPFLAGS = $(BASE_CPPFLAGS) -std=gnu99
diff --git a/archive/pc_archive.c b/archive/pc_archive.c
index 0b15d72..506c7d4 100644
--- a/archive/pc_archive.c
+++ b/archive/pc_archive.c
@@ -175,7 +175,7 @@ creat_write_callback(struct archive *arc, void *ctx, const void *buf, size_t len
pctx->btype = pctx->ctype;
} else {
if (pctx->arc_buf_pos < pctx->min_chunk) {
- uint32_t diff = pctx->min_chunk - pctx->arc_buf_pos;
+ int diff = pctx->min_chunk - (int)(pctx->arc_buf_pos);
if (len > diff)
pctx->btype = pctx->ctype;
else
@@ -918,9 +918,10 @@ copy_file_data(pc_ctx_t *pctx, struct archive *arc, struct archive_entry *entry,
size_t sz, offset, len;
ssize_t bytes_to_write;
uchar_t *mapbuf;
- int rv, fd;
+ int rv, fd, typ1;
const char *fpath;
+ typ1 = typ;
offset = 0;
rv = 0;
sz = archive_entry_size(entry);
@@ -1014,6 +1015,11 @@ do_map:
} else {
return (ARCHIVE_OK);
}
+ } else {
+ if (write_header(arc, entry) == -1) {
+ close(fd);
+ return (-1);
+ }
}
} else {
if (write_header(arc, entry) == -1) {
@@ -1029,7 +1035,7 @@ do_map:
* stage there is no need for blocking.
*/
wrtn = archive_write_data(arc, src, wlen);
- if (wrtn < wlen) {
+ if (wrtn < (ssize_t)wlen) {
/* Write failed; this is bad */
log_msg(LOG_ERR, 0, "Data write error: %s", archive_error_string(arc));
rv = -1;
diff --git a/config b/config
index df4b21b..5ed7569 100755
--- a/config
+++ b/config
@@ -714,7 +714,6 @@ echo "*************** Running configure in libarchive ****************"
(cd $libarchive_dir
CC=${GCC} ./configure --disable-bsdtar --disable-bsdcpio --without-zlib --without-bz2lib --without-lzmadec --without-lzma --without-lzo2 --without-nettle --without-openssl --without-xml2 --without-expat --disable-silent-rules --enable-shared=no --enable-static=yes --enable-xattr
[ $? -ne 0 ] && exit $?
- cp Makefile Makefile.orig
cat Makefile | sed '
s@$(BUILT_SOURCES)@@
s@$(libarchive_test_SOURCES)@@
diff --git a/filters/analyzer/analyzer.c b/filters/analyzer/analyzer.c
new file mode 100644
index 0000000..1dce8c4
--- /dev/null
+++ b/filters/analyzer/analyzer.c
@@ -0,0 +1,69 @@
+/*
+ * This file is a part of Pcompress, a chunked parallel multi-
+ * algorithm lossless compression and decompression program.
+ *
+ * Copyright (C) 2012-2014 Moinak Ghosh. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ * moinakg@belenix.org, http://moinakg.wordpress.com/
+ */
+
+#include "utils.h"
+
+/*
+ * Guess the content type of a buffer from simple byte statistics.
+ *
+ * src        - buffer to inspect (read only).
+ * srclen     - number of bytes in src.
+ * btype      - type already known to the caller. Only buffers whose type is
+ *              TYPE_UNKNOWN, or whose subtype is TYPE_ARCHIVE_TAR, are analyzed;
+ *              any other value is returned unchanged.
+ * adapt_mode - accepted for interface compatibility; not consulted here.
+ *
+ * Returns btype, replaced by TYPE_TEXT (optionally ORed with TYPE_ENGLISH)
+ * when the heuristics below classify the buffer as text.
+ */
+int
+analyze_buffer(void *src, uint64_t srclen, int btype, int adapt_mode)
+{
+	uchar_t *src1 = (uchar_t *)src;
+	int stype = PC_SUBTYPE(btype);
+
+	if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR) {
+		/* Histogram must start at zero; it is only ever incremented. */
+		uint64_t freq[256] = {0}, freq0x80[2] = {0};
+		uint64_t i, alphabetNum = 0, tot8b = 0;
+		uchar_t cur_byte;
+
+		/*
+		 * Build a byte histogram and count bytes with the high bit set.
+		 */
+		for (i = 0; i < srclen; i++) {
+			cur_byte = src1[i];
+			tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
+			freq[cur_byte]++;
+		}
+
+		/* freq0x80[0]: bytes 0x00-0x7f, freq0x80[1]: bytes 0x80-0xff. */
+		for (i = 0; i < 256; i++)
+			freq0x80[i >> 7] += freq[i];
+
+		for (i = 'a'; i <= 'z'; i++)
+			alphabetNum += freq[i];
+
+		/*
+		 * Heuristics for detecting BINARY vs generic TEXT.
+		 * tot8b was accumulated in units of 0x80, so scale it back to a
+		 * byte count first. Text is assumed when fewer than 3/8 of the
+		 * bytes have the high bit set. The shifts are parenthesized
+		 * because '+' binds tighter than '>>'.
+		 */
+		tot8b /= 0x80;
+		if (tot8b < ((srclen >> 2) + (srclen >> 3))) {
+			btype = TYPE_TEXT;
+			if (freq0x80[1] < (srclen >> 3) && (freq[' '] > (srclen >> 7))
+			    && (freq['a'] + freq['e'] + freq['t'] > (srclen >> 4))
+			    && alphabetNum > (srclen >> 2)) {
+				btype |= TYPE_ENGLISH;
+			}
+		}
+	}
+
+	return (btype);
+}
diff --git a/filters/analyzer/analyzer.h b/filters/analyzer/analyzer.h
new file mode 100644
index 0000000..922b596
--- /dev/null
+++ b/filters/analyzer/analyzer.h
@@ -0,0 +1,30 @@
+/*
+ * This file is a part of Pcompress, a chunked parallel multi-
+ * algorithm lossless compression and decompression program.
+ *
+ * Copyright (C) 2012-2014 Moinak Ghosh. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ * moinakg@belenix.org, http://moinakg.wordpress.com/
+ */
+
+#ifndef _ANALYZER_H
+#define _ANALYZER_H
+
+#include <stdint.h>	/* uint64_t, so the header is usable on its own */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Inspect srclen bytes at src and return a content type for them.
+ * btype is the type already known to the caller; it is returned unchanged
+ * unless it is TYPE_UNKNOWN or has subtype TYPE_ARCHIVE_TAR, in which case
+ * the result may become TYPE_TEXT, optionally ORed with TYPE_ENGLISH.
+ */
+int analyze_buffer(void *src, uint64_t srclen, int btype, int adapt_mode);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/filters/delta2/delta2.c b/filters/delta2/delta2.c
index 5ea572d..3503a88 100644
--- a/filters/delta2/delta2.c
+++ b/filters/delta2/delta2.c
@@ -432,7 +432,7 @@ delta2_encode_real(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen
val = gtot1 + (srclen - (pos - src));
}
*dstlen = pos2 - dst;
- return (val);
+ return ((int)val);
}
int
diff --git a/filters/dict/Common.h b/filters/dict/Common.h
index 166dee8..98061ee 100644
--- a/filters/dict/Common.h
+++ b/filters/dict/Common.h
@@ -1,66 +1,66 @@
-#ifndef _DATATYPE_H
-#define _DATATYPE_H
-
-#define CSA_VERSION 8
-
-
-typedef unsigned char u8;
-typedef uint16_t u16;
-typedef uint32_t u32;
-typedef uint64_t u64;
-typedef int i32;
-typedef int64_t i64;
-
-const u32 KB=1024;
-const u32 MB=1048576;
-const u32 MinBlockSize=8*KB;
-
-
-const u32 MaxChunkBits=21;
-const u32 MaxChunkSize=(1<<(MaxChunkBits-1));
-const u32 MaxDictSize=512*MB;//Don't change
-const u32 DefaultOutStreamBlockSize=128*KB;
-const u32 DefaultInBufferSize=MaxChunkSize; //Should >=MaxChunkSize
-#define DLT_CHANNEL_MAX 5
-const u32 DltIndex[DLT_CHANNEL_MAX]={1,2,3,4,8};
-
-
-#define SAFEFREE(x) do{if ((x)!=NULL) free(x);x=NULL;}while(0)
-
-
-#define ENCODE 1
-#define DECODE 2
-
-
-/*****ERRORS*****************/
-#define NO_ERROR 0
-#define CANT_OPEN_FILE (-100)
-#define CANT_CREATE_FILE (-99)
-#define NOT_CSC_FILE (-98)
-#define VERSION_INVALID (-97)
-#define CSC_FILE_INVALID (-95)
-#define DECODE_ERROR (-96)
-#define CANT_ALLOC_MEM (-94)
-#define ALREADY_INITIALIZED (-93)
-#define OPERATION_ERROR (-92)
-#define FILE_DIDNT_OPEN (-91)
-/*****ERRORS*****************/
-
-/******Block Type*************/
-#define DT_NONE 0
-#define DT_HARD 0x05
-#define DT_EXE 0x04
-#define DT_BAD 0x03
-#define DT_NORMAL 0x02
-#define DT_SKIP 0x01
-#define DT_AUDIO 0x06
-#define DT_RGB 0x07
-#define DT_FAST 0x08
-#define SIG_EOF 0x09
-#define DT_ENGTXT 0x0A
-#define DT_DLT 0x10
-#define DT_MAXINDEX 0x1F
-/******Block Type*************/
-
-
-#endif
+#ifndef _DATATYPE_H
+#define _DATATYPE_H
+
+#define CSA_VERSION 8
+
+
+/* Short fixed-width aliases used by the DICT filter sources. */
+typedef unsigned char u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+typedef int i32;
+typedef int64_t i64;
+
+/*
+ * Size constants. The derived values are written with literals instead of
+ * referring to KB/MB/MaxChunkBits -- presumably so that this header also
+ * compiles as C, where a const object is not a constant expression
+ * (NOTE(review): confirm).
+ */
+const u32 KB=1024;
+const u32 MB=1048576;
+const u32 MinBlockSize=8*1024;			/* 8 KB */
+
+
+const u32 MaxChunkBits=21;
+const u32 MaxChunkSize=(1<<(21-1));		/* 1 << (MaxChunkBits-1) */
+const u32 MaxDictSize=512*1048576;//Don't change
+const u32 DefaultOutStreamBlockSize=128*1024;	/* 128 KB */
+/* Same value as MaxChunkSize: the input buffer must hold a full chunk. */
+const u32 DefaultInBufferSize=(1<<(21-1)); //Should >=MaxChunkSize
+#define DLT_CHANNEL_MAX 5
+const u32 DltIndex[DLT_CHANNEL_MAX]={1,2,3,4,8};
+
+
+/* Free x if it is non-NULL and reset it to NULL. */
+#define SAFEFREE(x) do{if ((x)!=NULL) free(x);(x)=NULL;}while(0)
+
+
+#define ENCODE 1
+#define DECODE 2
+
+
+/*****ERRORS*****************/
+#define NO_ERROR 0
+#define CANT_OPEN_FILE (-100)
+#define CANT_CREATE_FILE (-99)
+#define NOT_CSC_FILE (-98)
+#define VERSION_INVALID (-97)
+#define CSC_FILE_INVALID (-95)
+#define DECODE_ERROR (-96)
+#define CANT_ALLOC_MEM (-94)
+#define ALREADY_INITIALIZED (-93)
+#define OPERATION_ERROR (-92)
+#define FILE_DIDNT_OPEN (-91)
+/*****ERRORS*****************/
+
+/******Block Type*************/
+#define DT_NONE 0
+#define DT_HARD 0x05
+#define DT_EXE 0x04
+#define DT_BAD 0x03
+#define DT_NORMAL 0x02
+#define DT_SKIP 0x01
+#define DT_AUDIO 0x06
+#define DT_RGB 0x07
+#define DT_FAST 0x08
+#define SIG_EOF 0x09
+#define DT_ENGTXT 0x0A
+#define DT_DLT 0x10
+#define DT_MAXINDEX 0x1F
+/******Block Type*************/
+
+
+#endif
diff --git a/filters/dict/DictFilter.cpp b/filters/dict/DictFilter.cpp
index 6aa6d87..bb0d151 100644
--- a/filters/dict/DictFilter.cpp
+++ b/filters/dict/DictFilter.cpp
@@ -1,278 +1,306 @@
-/*
- * This file is a part of Pcompress, a chunked parallel multi-
- * algorithm lossless compression and decompression program.
- *
- * Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
- * Use is subject to license terms.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 3 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this program.
- * If not, see .
- *
- * moinakg@gmail.com, http://moinakg.wordpress.com/
- */
-
-/*
- * Dict filter for text files. Adapted from Public Domain sources
- * of Fu Siyuan's CSC 3.2 archiver.
- */
-
-#include
-#include
-#include
-#include
-#include "DictFilter.h"
-#include "Common.h"
-
-const u32 wordNum = 123;
-
-u8 wordList[wordNum][8] =
-{
- "",
- "ac","ad","ai","al","am",
- "an","ar","as","at","ea",
- "ec","ed","ee","el","en",
- "er","es","et","id","ie",
- "ig","il","in","io","is",
- "it","of","ol","on","oo",
- "or","os","ou","ow","ul",
- "un","ur","us","ba","be",
- "ca","ce","co","ch","de",
- "di","ge","gh","ha","he",
- "hi","ho","ra","re","ri",
- "ro","rs","la","le","li",
- "lo","ld","ll","ly","se",
- "si","so","sh","ss","st",
- "ma","me","mi","ne","nc",
- "nd","ng","nt","pa","pe",
- "ta","te","ti","to","th",
- "tr","wa","ve",
- "all","and","but","dow",
- "for","had","hav","her",
- "him","his","man","mor",
- "not","now","one","out",
- "she","the","was","wer",
- "whi","whe","wit","you",
- "any","are",
- "that","said","with","have",
- "this","from","were","tion",
-};
-
-
-void
-DictFilter::MakeWordTree()
-{
- u32 i,j;
- u32 treePos;
- u8 symbolIndex = 0x82;
-
- nodeMum = 1;
-
- memset(wordTree,0,sizeof(wordTree));
-
- for (i = 1; i < wordNum; i++) {
- treePos = 0;
- for(j = 0; wordList[i][j] != 0; j++) {
- u32 idx = wordList[i][j] - 'a';
- if (wordTree[treePos].next[idx]) {
- treePos = wordTree[treePos].next[idx];
- } else {
- wordTree[treePos].next[idx] = nodeMum;
- treePos = nodeMum;
- nodeMum++;
- }
- }
- wordIndex[symbolIndex] = i;
- wordTree[treePos].symbol = symbolIndex++;
- }
-
- maxSymbol=symbolIndex;
-
-}
-
-
-DictFilter::DictFilter()
-{
- MakeWordTree();
-}
-
-
-
-DictFilter::~DictFilter()
-{
-}
-
-
-u32
-DictFilter::Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize)
-{
- if (size < 16384)
- return 0;
-
- u32 i,j,treePos = 0;
- u32 lastSymbol = 0;
- u32 dstSize = 0;
- u32 idx;
-
-
- for(i = 0; i < size-5;) {
- if (src[i] >= 'a' && src[i] <= 'z') {
-
- u32 matchSymbol = 0,longestWord = 0;
- treePos = 0;
- for(j = 0;;) {
- idx = src[i+j] - 'a';
- if (idx < 0 || idx > 25)
- break;
- if (wordTree[treePos].next[idx] == 0)
- break;
-
- treePos=wordTree[treePos].next[idx];
- j++;
- if (wordTree[treePos].symbol) {
- matchSymbol = wordTree[treePos].symbol;
- longestWord = j;
- }
- }
-
- if (matchSymbol) {
- dst[dstSize++] = matchSymbol;
- i += longestWord;
- continue;
- }
- lastSymbol = 0;
- dst[dstSize++] = src[i];
- i++;
- } else {
- if (src[i] >= 0x82) {
- dst[dstSize++] = 254;
- dst[dstSize++] = src[i];
- }
- else
- dst[dstSize++] = src[i];
-
- lastSymbol = 0;
- treePos = 0;
- i++;
- }
-
- }
-
- for (; i= 0x82) {
- dst[dstSize++] = 254;
- dst[dstSize++] = src[i];
- }
- else
- dst[dstSize++] = src[i];
- }
-
- if (dstSize > size*0.82)
- return 0;
-
- *dstsize = dstSize;
- return 1;
-}
-
-void
-DictFilter::Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize)
-{
-
- u32 i = 0,j;
- u32 dstPos = 0,idx;
-
- while(dstPos < *dstsize && i < size) {
- if (src[i] >= 0x82 && src[i] < maxSymbol) {
- idx = wordIndex[src[i]];
- for(j=0; wordList[idx][j]; j++)
- dst[dstPos++] = wordList[idx][j];
- }
- else if (src[i] == 254 && (i+1 < size && src[i+1] >= 0x82)) {
- i++;
- dst[dstPos++] = src[i];
- }
- else {
- dst[dstPos++] = src[i];
- }
-
- i++;
- }
- *dstsize = dstPos;
-}
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void *
-new_dict_context()
-{
- DictFilter *df = new DictFilter();
- return (static_cast(df));
-}
-
-void
-delete_dict_context(void *dict_ctx)
-{
- if (dict_ctx) {
- DictFilter *df = static_cast(dict_ctx);
- delete df;
- }
-}
-
-int
-dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
-{
- DictFilter *df = static_cast(dict_ctx);
- u32 fl = fromlen;
- u32 dl = *dstlen;
- u8 *dst;
-
- if (fromlen > UINT32_MAX)
- return (-1);
- U32_P(to) = LE32(fromlen);
- dst = to + 4;
- dl -= 4;
- if (df->Forward_Dict(from, fl, dst, &dl)) {
- *dstlen = dl + 4;
- return (0);
- }
- return (-1);
-}
-
-int
-dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
-{
- DictFilter *df = static_cast(dict_ctx);
- u32 fl = fromlen;
- u32 dl;
- u8 *src;
-
- dl = U32_P(from);
- if (dl > *dstlen) {
- log_msg(LOG_ERR, 0, "Destination overflow in dict_decode.");
- return (-1);
- }
- *dstlen = dl;
- src = from + 4;
- fl -= 4;
-
- df->Inverse_Dict(src, fl, to, &dl);
- if (dl < *dstlen)
- return (-1);
- return (0);
-}
-
-#ifdef __cplusplus
-}
-#endif
+/*
+ * This file is a part of Pcompress, a chunked parallel multi-
+ * algorithm lossless compression and decompression program.
+ *
+ * Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ * moinakg@gmail.com, http://moinakg.wordpress.com/
+ */
+
+/*
+ * Dict filter for text files. Adapted from Public Domain sources
+ * of Fu Siyuan's CSC 3.2 archiver.
+ */
+
+#include
+#include
+#include
+#include
+#include "DictFilter.h"
+#include "Common.h"
+
+/*
+ * Word-substitution (DICT) filter for text.
+ *
+ * Forward_Dict replaces lower-case letter sequences found in wordList with
+ * single-byte symbols; Inverse_Dict expands those symbols again. The
+ * constructor builds the lookup structures once per instance.
+ */
+class DictFilter
+{
+public:
+	DictFilter();
+	~DictFilter();
+
+	/* Encode size bytes of src into dst; returns 1 on success, 0 if declined. */
+	u32 Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize);
+	/* Decode size bytes of src into dst; *dstsize receives the bytes produced. */
+	void Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize);
+
+private:
+	/* One trie node: a child index per letter 'a'..'z' (0 = no child). */
+	struct CTreeNode
+	{
+		u32 next[26];
+		u8 symbol;	/* symbol of the word ending here, 0 if none */
+	};
+
+	/* Encoder side: the dictionary words stored as a trie. */
+	CTreeNode wordTree[MAX_WORDTREE_NODE_NUM];
+	u32 nodeMum;		/* number of trie nodes in use */
+	u8 maxSymbol;		/* one past the highest symbol assigned */
+
+	/* Decoder side: maps a symbol byte to its row in wordList. */
+	u32 wordIndex[256];
+
+	/* Fills wordTree, wordIndex, nodeMum and maxSymbol from wordList. */
+	void MakeWordTree();
+
+	/* Not referenced by any method in this file. */
+	u32 x0,x1;
+	u32 i,k;
+};
+
+/*
+ * Number of rows in wordList, including the empty placeholder at index 0.
+ */
+const u32 wordNum = 123;
+
+/*
+ * Dictionary of lower-case letter sequences replaced by the DICT filter.
+ * Row 0 is an unused empty entry; MakeWordTree() starts at row 1 and assigns
+ * symbols in table order beginning at 0x82, so reordering, adding or removing
+ * entries changes the encoded format. Every entry is at most four letters.
+ */
+u8 wordList[wordNum][8] =
+{
+ "",
+ "ac","ad","ai","al","am",
+ "an","ar","as","at","ea",
+ "ec","ed","ee","el","en",
+ "er","es","et","id","ie",
+ "ig","il","in","io","is",
+ "it","of","ol","on","oo",
+ "or","os","ou","ow","ul",
+ "un","ur","us","ba","be",
+ "ca","ce","co","ch","de",
+ "di","ge","gh","ha","he",
+ "hi","ho","ra","re","ri",
+ "ro","rs","la","le","li",
+ "lo","ld","ll","ly","se",
+ "si","so","sh","ss","st",
+ "ma","me","mi","ne","nc",
+ "nd","ng","nt","pa","pe",
+ "ta","te","ti","to","th",
+ "tr","wa","ve",
+ "all","and","but","dow",
+ "for","had","hav","her",
+ "him","his","man","mor",
+ "not","now","one","out",
+ "she","the","was","wer",
+ "whi","whe","wit","you",
+ "any","are",
+ "that","said","with","have",
+ "this","from","were","tion",
+};
+
+
+/*
+ * Build the encoder trie and the decoder symbol table from wordList.
+ *
+ * Each word (rows 1..wordNum-1) is inserted letter by letter into wordTree,
+ * allocating nodes sequentially from index 1 (index 0 is the root, and a
+ * child index of 0 means "no child"). The node where a word ends is tagged
+ * with the next free symbol, starting at 0x82, and wordIndex records which
+ * row that symbol stands for.
+ */
+void
+DictFilter::MakeWordTree()
+{
+	u8 nextSymbol = 0x82;
+
+	memset(wordTree,0,sizeof(wordTree));
+	nodeMum = 1;
+
+	for (u32 row = 1; row < wordNum; row++) {
+		u32 node = 0;
+
+		for (const u8 *letter = wordList[row]; *letter != 0; letter++) {
+			u32 &child = wordTree[node].next[*letter - 'a'];
+
+			/* Create the child on first use, then descend into it. */
+			if (child == 0)
+				child = nodeMum++;
+			node = child;
+		}
+
+		wordIndex[nextSymbol] = row;
+		wordTree[node].symbol = nextSymbol;
+		nextSymbol++;
+	}
+
+	maxSymbol = nextSymbol;
+}
+
+
+/* Construct a filter with its dictionary structures ready for use. */
+DictFilter::DictFilter()
+{
+	this->MakeWordTree();
+}
+
+
+/* Nothing to release: all state lives inside the object itself. */
+DictFilter::~DictFilter()
+{
+}
+
+
+/*
+ * Append one source byte to dst at *pos as a literal. Bytes >= 0x82 would
+ * collide with dictionary symbols, so they are preceded by the escape
+ * byte 254. Returns false, writing nothing, when fewer than the required
+ * 1 or 2 bytes remain below cap.
+ */
+static inline bool
+dict_put_literal(u8 byte, u8 *dst, u32 *pos, u32 cap)
+{
+	const u32 need = (byte >= 0x82) ? 2 : 1;
+
+	if (cap - *pos < need)
+		return false;
+	if (need == 2)
+		dst[(*pos)++] = 254;
+	dst[(*pos)++] = byte;
+	return true;
+}
+
+/*
+ * Dictionary-encode size bytes of src into dst.
+ *
+ * On entry *dstsize is the capacity of dst; on success it is set to the
+ * number of bytes written. Returns 1 on success. Returns 0 (leaving
+ * *dstsize untouched) when the input is shorter than 16384 bytes, when the
+ * output would not fit in dst, or when the output is larger than 82% of
+ * the input and the transform is therefore not worthwhile.
+ */
+u32
+DictFilter::Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize)
+{
+	if (size < 16384)
+		return 0;
+
+	const u32 dstCap = *dstsize;
+	u32 i, j, treePos;
+	u32 dstSize = 0;
+	u32 idx;
+
+	/*
+	 * The dictionary lookup reads ahead of i, so the main loop stops five
+	 * bytes before the end; the remainder is copied as literals below.
+	 */
+	for (i = 0; i < size-5;) {
+		if (src[i] >= 'a' && src[i] <= 'z') {
+			u32 matchSymbol = 0, longestWord = 0;
+
+			/* Walk the trie, remembering the longest word seen. */
+			treePos = 0;
+			for (j = 0;;) {
+				/* Unsigned wrap makes bytes below 'a' fail this test too. */
+				idx = src[i+j] - 'a';
+				if (idx > 25)
+					break;
+				if (wordTree[treePos].next[idx] == 0)
+					break;
+
+				treePos = wordTree[treePos].next[idx];
+				j++;
+				if (wordTree[treePos].symbol) {
+					matchSymbol = wordTree[treePos].symbol;
+					longestWord = j;
+				}
+			}
+
+			if (matchSymbol) {
+				if (dstSize >= dstCap)
+					return 0;
+				dst[dstSize++] = matchSymbol;
+				i += longestWord;
+				continue;
+			}
+		}
+
+		/* No dictionary word starts here: emit the byte itself. */
+		if (!dict_put_literal(src[i], dst, &dstSize, dstCap))
+			return 0;
+		i++;
+	}
+
+	for (; i < size; i++) {
+		if (!dict_put_literal(src[i], dst, &dstSize, dstCap))
+			return 0;
+	}
+
+	if (dstSize > size*0.82)
+		return 0;
+
+	*dstsize = dstSize;
+	return 1;
+}
+
+/*
+ * Reverse Forward_Dict: expand size bytes of src into dst.
+ *
+ * On entry *dstsize is the number of bytes the caller expects (and the
+ * capacity of dst); on return it holds the number of bytes actually
+ * produced. Decoding stops when the input is exhausted, when dst is full,
+ * or when a dictionary word would not fit in the remaining space. In the
+ * last case the word is not written, so a result smaller than the expected
+ * size tells the caller that the stream is truncated or corrupt.
+ */
+void
+DictFilter::Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize)
+{
+	const u32 dstCap = *dstsize;
+	u32 i = 0, j;
+	u32 dstPos = 0, idx;
+
+	while (dstPos < dstCap && i < size) {
+		if (src[i] >= 0x82 && src[i] < maxSymbol) {
+			/* Dictionary symbol: expand to the word it stands for. */
+			const u8 *word;
+			u32 wordLen = 0;
+
+			idx = wordIndex[src[i]];
+			word = wordList[idx];
+			while (word[wordLen])
+				wordLen++;
+			if (dstCap - dstPos < wordLen)
+				break;	/* would overrun dst */
+			for (j = 0; j < wordLen; j++)
+				dst[dstPos++] = word[j];
+		}
+		else if (src[i] == 254 && (i+1 < size && src[i+1] >= 0x82)) {
+			/* Escape byte: the next byte is a literal >= 0x82. */
+			i++;
+			dst[dstPos++] = src[i];
+		}
+		else {
+			dst[dstPos++] = src[i];
+		}
+
+		i++;
+	}
+	*dstsize = dstPos;
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Allocate a DictFilter and return it as an opaque handle for the C
+ * callers. Release it with delete_dict_context().
+ */
+void *
+new_dict_context()
+{
+	DictFilter *df = new DictFilter();
+	return (static_cast<void *>(df));
+}
+
+/*
+ * Destroy a handle obtained from new_dict_context(). Passing NULL is
+ * allowed and does nothing, since deleting a null pointer is a no-op.
+ */
+void
+delete_dict_context(void *dict_ctx)
+{
+	delete static_cast<DictFilter *>(dict_ctx);
+}
+
+/*
+ * Dictionary-encode fromlen bytes at from into to.
+ *
+ * The output starts with a 4-byte header holding the original length
+ * (stored through LE32), followed by the encoded data. On entry *dstlen is
+ * the capacity of to; on success it is set to the total bytes written.
+ *
+ * Returns 0 on success, or -1 if the input is longer than UINT32_MAX, the
+ * output buffer cannot hold the header, or the filter declines the data.
+ */
+int
+dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
+{
+	DictFilter *df = static_cast<DictFilter *>(dict_ctx);
+	uint64_t room;
+	u32 fl, dl;
+	u8 *dst;
+
+	if (fromlen > UINT32_MAX)
+		return (-1);
+	if (*dstlen < 4)
+		return (-1);	/* no room for the length header */
+	fl = (u32)fromlen;
+
+	/* Space left after the header, clamped to what a u32 can express. */
+	room = *dstlen - 4;
+	dl = (room > UINT32_MAX) ? UINT32_MAX : (u32)room;
+
+	U32_P(to) = LE32(fromlen);
+	dst = to + 4;
+	if (df->Forward_Dict(from, fl, dst, &dl)) {
+		*dstlen = (uint64_t)dl + 4;
+		return (0);
+	}
+	return (-1);
+}
+
+/*
+ * Decode a buffer produced by dict_encode().
+ *
+ * from/fromlen is the encoded data including its 4-byte length header.
+ * On entry *dstlen is the capacity of to; on return past the header check
+ * it is set to the original length recorded in the header.
+ *
+ * Returns 0 on success, or -1 if the input is too short to contain the
+ * header or longer than UINT32_MAX, the recorded length exceeds the
+ * capacity of to, or the decoded size does not match the recorded length.
+ */
+int
+dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
+{
+	DictFilter *df = static_cast<DictFilter *>(dict_ctx);
+	u32 fl, dl;
+	u8 *src;
+
+	if (fromlen < 4 || fromlen > UINT32_MAX)
+		return (-1);
+	fl = (u32)fromlen - 4;
+
+	/*
+	 * dict_encode() stores the length through LE32, so convert it back the
+	 * same way (assumes LE32 is its own inverse, i.e. a conditional byte
+	 * swap -- TODO confirm against its definition).
+	 */
+	dl = LE32(U32_P(from));
+	if (dl > *dstlen) {
+		log_msg(LOG_ERR, 0, "Destination overflow in dict_decode.");
+		return (-1);
+	}
+	*dstlen = dl;
+	src = from + 4;
+
+	df->Inverse_Dict(src, fl, to, &dl);
+	/* Anything other than exactly the recorded length means corruption. */
+	if (dl != *dstlen)
+		return (-1);
+	return (0);
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/filters/dict/DictFilter.h b/filters/dict/DictFilter.h
index 4b4baee..08187c6 100644
--- a/filters/dict/DictFilter.h
+++ b/filters/dict/DictFilter.h
@@ -1,80 +1,52 @@
-/*
- * This file is a part of Pcompress, a chunked parallel multi-
- * algorithm lossless compression and decompression program.
- *
- * Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
- * Use is subject to license terms.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 3 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this program.
- * If not, see .
- *
- * moinakg@gmail.com, http://moinakg.wordpress.com/
- */
-
-/*
- * Dict filter for text files. Adapted from Public Domain sources
- * of Fu Siyuan's CSC 3.2 archiver.
- */
-
-#ifndef _FILTERS_H
-#define _FILTERS_H
-
-#include
-
-#include "Common.h"
-#define MAX_WORDTREE_NODE_NUM 300 //Enough now!
-
-class DictFilter
-{
-public:
- ~DictFilter();
- DictFilter();
-
- u32 Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize);
- void Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize);
-
-private:
- typedef struct
- {
- u32 next[26];
- u8 symbol;
- } CTreeNode;
- CTreeNode wordTree[MAX_WORDTREE_NODE_NUM];
- u32 nodeMum;
- u8 maxSymbol;
- //Used for DICT transformer. Words are stored in trees.
-
- u32 wordIndex[256];
- //Used for DICT untransformer.choose words by symbols.
- void MakeWordTree(); //Init the DICT transformer
-
- u32 x0,x1;
- u32 i,k;
-};
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void *new_dict_context();
-void delete_dict_context(void *dict_ctx);
-
-int dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
-int dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
+/*
+ * This file is a part of Pcompress, a chunked parallel multi-
+ * algorithm lossless compression and decompression program.
+ *
+ * Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ * moinakg@gmail.com, http://moinakg.wordpress.com/
+ */
+
+/*
+ * Dict filter for text files. Adapted from Public Domain sources
+ * of Fu Siyuan's CSC 3.2 archiver.
+ */
+
+#ifndef _FILTERS_H
+#define _FILTERS_H
+
+#include
+
+#include "Common.h"
+#define MAX_WORDTREE_NODE_NUM 300 //Enough now!
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * C-callable interface to the DICT text filter.
+ *
+ * new_dict_context() returns an opaque filter handle and
+ * delete_dict_context() destroys it. The parameter list is spelled (void)
+ * so that C callers get a real prototype.
+ */
+void *new_dict_context(void);
+void delete_dict_context(void *dict_ctx);
+
+/*
+ * dict_encode() / dict_decode() transform fromlen bytes at from into to.
+ * *dstlen is updated with the output length. Both return 0 on success and
+ * -1 on failure.
+ */
+int dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
+int dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/pcompress.c b/pcompress.c
index b7d0073..d6b9f2a 100644
--- a/pcompress.c
+++ b/pcompress.c
@@ -56,6 +56,8 @@
#include
#include
#include
+#include "analyzer.h"
+#include "filters/dict/DictFilter.h"
/*
* We use 8MB chunks by default.
@@ -204,7 +206,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
void *dst, uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data,
algo_props_t *props)
{
- uchar_t *dest = (uchar_t *)dst, type = 0;
+ uchar_t *dest = (uchar_t *)dst, type = 0, atype;
int64_t result;
uint64_t _dstlen, fromlen;
uchar_t *from, *to;
@@ -238,13 +240,45 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
}
}
+ /*
+ * The analyzer is run below only for non-archive mode. When archiving the
+ * archiver thread runs analyzer on incremental blocks and sets the type
+ * accordingly.
+ */
+ atype = btype;
+ /*
+ * Run an analyzer on the data. At present the analyzer only tries
+ * to detect if this is text for running the dict filter.
+ */
+ if (pctx->enable_analyzer) {
+ atype = analyze_buffer(src, srclen, btype, pctx->adapt_mode);
+ }
+
+ /*
+ * Enabling LZP also enables the DICT filter since we are dealing with text
+ * in any case.
+ */
+ if (pctx->lzp_preprocess && PC_TYPE(atype) == TYPE_TEXT) {
+ void *dct = new_dict_context();
+ _dstlen = fromlen;
+ result = dict_encode(dct, from, fromlen, to, &_dstlen);
+ delete_dict_context(dct);
+ if (result != -1) {
+ uchar_t *tmp;
+ tmp = from;
+ from = to;
+ to = tmp;
+ fromlen = _dstlen;
+ type |= PREPROC_TYPE_DICT;
+ }
+ }
#ifndef _MPLV2_LICENSE_
if (pctx->lzp_preprocess && stype != TYPE_BMP && stype != TYPE_TIFF) {
int hashsize;
hashsize = lzp_hash_size(level);
result = lzp_compress((const uchar_t *)from, to, fromlen,
- hashsize, LZP_DEFAULT_LZPMINLEN, 0);
+ hashsize, LZP_DEFAULT_LZPMINLEN, 0);
if (result >= 0 && result < srclen) {
uchar_t *tmp;
tmp = from;
@@ -375,6 +409,20 @@ preproc_decompress(pc_ctx_t *pctx, compress_func_ptr dec_func, void *src, uint64
#endif
}
+ if (type & PREPROC_TYPE_DICT) {
+ void *dct = new_dict_context();
+ result = dict_decode(dct, src, srclen, dst, &_dstlen);
+ delete_dict_context(dct);
+ if (result != -1) {
+ memcpy(src, dst, _dstlen);
+ srclen = _dstlen;
+ *dstlen = _dstlen;
+ } else {
+ log_msg(LOG_ERR, 0, "DICT decoding failed.");
+ return (result);
+ }
+ }
+
if (type & PREPROC_TYPE_DISPACK) {
result = dispack_decode((uchar_t *)src, srclen, (uchar_t *)dst, &_dstlen1);
if (result != -1) {
@@ -689,13 +737,13 @@ cont:
* Compressed length: 8 bytes.
* Checksum: Upto 64 bytes.
* Chunk flags: 1 byte.
- *
+ *
* Chunk Flags, 8 bits:
* I I I I I I I I
* | | | | | |
* | '-----' | | `- 0 - Uncompressed
* | | | | 1 - Compressed
- * | | | |
+ * | | | |
* | | | `---- 1 - Chunk was Deduped
* | | `------- 1 - Chunk was pre-compressed
* | |
@@ -1070,7 +1118,7 @@ start_decompress(pc_ctx_t *pctx, const char *filename, char *to_filename)
memset(zero, 0, MAX_PW_LEN);
fd = open(pctx->pwd_file, O_RDWR);
if (fd != -1) {
- pw_len = lseek(fd, 0, SEEK_END);
+ pw_len = (int)lseek(fd, 0, SEEK_END);
if (pw_len != -1) {
if (pw_len > MAX_PW_LEN) pw_len = MAX_PW_LEN-1;
lseek(fd, 0, SEEK_SET);
@@ -1552,9 +1600,11 @@ redo:
dedupe_index_sz = 0;
type = COMPRESSED;
+
/* Perform Dedup if enabled. */
if ((pctx->enable_rabin_scan || pctx->enable_fixed_scan)) {
dedupe_context_t *rctx;
+ uint64_t rb = tdat->rbytes;
/*
* Compute checksum of original uncompressed chunk. When doing dedup
@@ -1569,8 +1619,9 @@ redo:
rctx = tdat->rctx;
reset_dedupe_context(tdat->rctx);
rctx->cbuf = tdat->uncompressed_chunk;
- dedupe_index_sz = dedupe_compress(tdat->rctx, tdat->cmp_seg, &(tdat->rbytes), 0,
+ dedupe_index_sz = dedupe_compress(tdat->rctx, tdat->cmp_seg, &rb, 0,
NULL, tdat->cksum_mt);
+ tdat->rbytes = rb;
if (!rctx->valid) {
memcpy(tdat->uncompressed_chunk, tdat->cmp_seg, rbytes);
tdat->rbytes = rbytes;
@@ -1744,6 +1795,10 @@ plain_index:
tdat->len_cmp += (pctx->cksum_bytes + pctx->mac_bytes);
rbytes = tdat->len_cmp - len_cmp; // HDR size for HMAC
+ /*
+ * In adaptive mode the return value from the compression function indicates
+ * which algorithm was used on the chunk. We have to store that.
+ */
if (pctx->adapt_mode)
type |= (rv << 4);
@@ -2750,7 +2805,8 @@ init_algo(pc_ctx_t *pctx, const char *algo, int bail)
pctx->_deinit_func = adapt_deinit;
pctx->_stats_func = adapt_stats;
pctx->_props_func = adapt_props;
- pctx->adapt_mode = 1;
+ pctx->adapt_mode = 2;
+ pctx->enable_analyzer = 1;
rv = 0;
} else if (memcmp(algorithm, "adapt", 5) == 0) {
@@ -2761,6 +2817,7 @@ init_algo(pc_ctx_t *pctx, const char *algo, int bail)
pctx->_stats_func = adapt_stats;
pctx->_props_func = adapt_props;
pctx->adapt_mode = 1;
+ pctx->enable_analyzer = 1;
rv = 0;
#ifdef ENABLE_PC_LIBBSC
} else if (memcmp(algorithm, "libbsc", 6) == 0) {
@@ -2770,7 +2827,6 @@ init_algo(pc_ctx_t *pctx, const char *algo, int bail)
pctx->_deinit_func = libbsc_deinit;
pctx->_stats_func = libbsc_stats;
pctx->_props_func = libbsc_props;
- pctx->adapt_mode = 1;
rv = 0;
#endif
}
@@ -3337,6 +3393,7 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[])
}
if (pctx->lzp_preprocess || pctx->enable_delta2_encode || pctx->dispack_preprocess) {
pctx->preprocess_mode = 1;
+ pctx->enable_analyzer = 1;
}
if (pctx->chunksize == 0) {
if (pctx->level < 9) {
diff --git a/pcompress.h b/pcompress.h
index c383c70..9787f6c 100644
--- a/pcompress.h
+++ b/pcompress.h
@@ -60,13 +60,14 @@ extern "C" {
#define CHSIZE_MASK 0x80
#define BZIP2_A_NUM 16
#define LZMA_A_NUM 32
-#define CHUNK_FLAG_DEDUP 2
+#define CHUNK_FLAG_DEDUP 2
#define CHUNK_FLAG_PREPROC 4
#define COMP_EXTN ".pz"
-#define PREPROC_TYPE_LZP 1
+#define PREPROC_TYPE_LZP 1
#define PREPROC_TYPE_DELTA2 2
#define PREPROC_TYPE_DISPACK 4
+#define PREPROC_TYPE_DICT 8
#define PREPROC_COMPRESSED 128
/*
@@ -212,6 +213,7 @@ typedef struct pc_ctx {
int delta2_nstrides;
int enable_rabin_split;
int enable_fixed_scan;
+ int enable_analyzer;
int preprocess_mode;
int lzp_preprocess;
int dispack_preprocess;
@@ -275,7 +277,7 @@ struct cmp_data {
uchar_t *compressed_chunk;
uchar_t *uncompressed_chunk;
dedupe_context_t *rctx;
- uint64_t rbytes;
+ int64_t rbytes;
uint64_t chunksize;
uint64_t len_cmp, len_cmp_be;
uchar_t checksum[CKSUM_MAX_BYTES];
diff --git a/utils/utils.c b/utils/utils.c
index 41669d2..08ab227 100644
--- a/utils/utils.c
+++ b/utils/utils.c
@@ -383,14 +383,18 @@ get_total_ram()
}
#ifdef __APPLE__
+#define NANO_SEC (1000000000ULL)
+/*
+ * Minimal clock_gettime() replacement for Apple platforms, built on
+ * mach_absolute_time(). Only CLOCK_MONOTONIC is supported: the tick count is
+ * scaled to nanoseconds with sTimebaseInfo and split into seconds and
+ * nanoseconds in *ts. Returns 0 on success; for any other clock id it
+ * returns -1 with errno set to EINVAL, following the POSIX convention.
+ */
int
clock_gettime(int clk_id, struct timespec *ts)
{
if (clk_id == CLOCK_MONOTONIC) {
- uint64_t abstime = mach_absolute_time();
- return (abstime * sTimebaseInfo.numer / sTimebaseInfo.denom);
+	uint64_t nanotime = mach_absolute_time() *
+	    sTimebaseInfo.numer / sTimebaseInfo.denom;
+	ts->tv_sec = nanotime / NANO_SEC;
+	ts->tv_nsec = nanotime % NANO_SEC;
+	return (0);
}
- return (0);
+	/* Unsupported clock: report it the way callers of the real API expect. */
+	errno = EINVAL;
+	return (-1);
}
#endif
@@ -543,8 +547,7 @@ log_msg(log_level_t log_level, int show_errno, const char *format, ...)
fputs(msg, stderr);
} else if (ldest.type == LOG_FILE) {
- int rv;
- rv = write(ldest.fd, msg, strlen(msg));
+ (void) write(ldest.fd, msg, strlen(msg));
} else {
ldest.cb(msg);
}