diff --git a/Makefile.in b/Makefile.in
index e5fcdf4..a327204 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -158,6 +158,10 @@ DISPACKSRCS = filters/dispack/dis.cpp
DISPACKHDRS = filters/dispack/dis.hpp filters/dispack/types.hpp
DISPACKOBJS = $(DISPACKSRCS:.cpp=.o)
+# Sources, headers and derived object list of the dict text filter (filters/dict).
+DICTSRCS = filters/dict/DictFilter.cpp
+DICTHDRS = filters/dict/DictFilter.h filters/dict/Common.h
+DICTOBJS = $(DICTSRCS:.cpp=.o)
+
SKEIN_BLOCK_C = crypto/skein/skein_block.c
SKEIN_BLOCK_ASM = crypto/skein/skein_block_x64.s
SKEIN_BLOCK_SRC = @SKEIN_BLOCK@
@@ -246,7 +250,7 @@ $(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) $(DELTA2OBJS) @LIBBSCWRAPOBJ@ $(SKEINOBJS)
$(SKEIN_BLOCK_OBJ) @SHA2ASM_OBJS@ @SHA2_OBJS@ $(KECCAK_OBJS) $(KECCAK_OBJS_ASM) \
$(TRANSP_OBJS) $(CRYPTO_OBJS) $(ZLIB_OBJS) $(BZLIB_OBJS) $(XXHASH_OBJS) $(BLAKE2_OBJS) \
@CRYPTO_COMPAT_OBJS@ $(CRYPTO_ASM_OBJS) $(ARCHIVEOBJS) $(PJPGOBJS) $(DISPACKOBJS) $(PPNMOBJS) \
-$(WAVPKOBJS)
+$(WAVPKOBJS) $(DICTOBJS)
DEBUG_LINK = $(GPP) -pthread @LIBBSCGEN_OPT@ @EXTRA_OPT_FLAGS@ -fopenmp -fPIC
DEBUG_COMPILE = $(GCC) -g -c @EXTRA_OPT_FLAGS@ -fPIC @USE_CLANG_AS@
@@ -340,6 +344,10 @@ $(DISPACKOBJS): $(DISPACKSRCS) $(DISPACKHDRS)
$(COMPILE_cpp) $(COMMON_VEC_FLAGS) @DEBUG_STATS_CPPFLAGS@ @SSE_OPT_FLAGS@ @USE_CLANG_AS@ -O2 -fsched-spec-load \
-Wno-variadic-macros $(VEC_FLAGS) $(COMMON_CPPFLAGS_cpp) $(@:.o=.cpp) -o $@
+# Compile the dict filter; $(@:.o=.cpp) maps the object being built back to its source file.
+$(DICTOBJS): $(DICTSRCS) $(DICTHDRS)
+ $(COMPILE_cpp) $(COMMON_VEC_FLAGS) @DEBUG_STATS_CPPFLAGS@ @SSE_OPT_FLAGS@ @USE_CLANG_AS@ -O2 -fsched-spec-load \
+ -Wno-variadic-macros $(VEC_FLAGS) $(COMMON_CPPFLAGS_cpp) $(@:.o=.cpp) -o $@
+
$(SKEIN_BLOCK_OBJ): $(SKEIN_BLOCK_SRC)
$(COMPILE) $(SKEIN_FLAGS) $(SKEIN_BLOCK_SRC) -o $@
diff --git a/filters/dict/Common.h b/filters/dict/Common.h
new file mode 100644
index 0000000..166dee8
--- /dev/null
+++ b/filters/dict/Common.h
@@ -0,0 +1,66 @@
+#ifndef _DATATYPE_H
+#define _DATATYPE_H
+
+/*
+ * Shared integer typedefs, size constants, error codes and block-type tags
+ * for the dict filter. DictFilter.cpp only relies on the u8/u32 typedefs;
+ * the remaining definitions are kept as they were adapted.
+ *
+ * The header pulls in the standard headers it depends on so that it can be
+ * included first in a translation unit.
+ */
+#include <stdint.h>	/* uint16_t, uint32_t, uint64_t, int64_t */
+#include <stdlib.h>	/* free(), NULL (used by SAFEFREE) */
+
+#define CSA_VERSION 8
+
+
+/* Short fixed-width integer aliases. */
+typedef unsigned char u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+typedef int i32;
+typedef int64_t i64;
+
+const u32 KB=1024;
+const u32 MB=1048576;
+const u32 MinBlockSize=8*KB;
+
+
+const u32 MaxChunkBits=21;
+const u32 MaxChunkSize=(1<<(MaxChunkBits-1));	/* 1 << 20 = 1 MB */
+const u32 MaxDictSize=512*MB;//Don't change
+const u32 DefaultOutStreamBlockSize=128*KB;
+const u32 DefaultInBufferSize=MaxChunkSize; //Should >=MaxChunkSize
+#define DLT_CHANNEL_MAX 5
+const u32 DltIndex[DLT_CHANNEL_MAX]={1,2,3,4,8};
+
+
+/*
+ * Free a pointer and reset it to NULL. free(NULL) is a no-op, so no explicit
+ * NULL test is needed; the argument is parenthesised so that the assignment
+ * applies to the whole expression passed in.
+ */
+#define SAFEFREE(x) do{free(x);(x)=NULL;}while(0)
+
+
+#define ENCODE 1
+#define DECODE 2
+
+
+/*****ERRORS*****************/
+#define NO_ERROR 0
+#define CANT_OPEN_FILE (-100)
+#define CANT_CREATE_FILE (-99)
+#define NOT_CSC_FILE (-98)
+#define VERSION_INVALID (-97)
+#define CSC_FILE_INVALID (-95)
+#define DECODE_ERROR (-96)
+#define CANT_ALLOC_MEM (-94)
+#define ALREADY_INITIALIZED (-93)
+#define OPERATION_ERROR (-92)
+#define FILE_DIDNT_OPEN (-91)
+/*****ERRORS*****************/
+
+/******Block Type*************/
+#define DT_NONE 0
+#define DT_HARD 0x05
+#define DT_EXE 0x04
+#define DT_BAD 0x03
+#define DT_NORMAL 0x02
+#define DT_SKIP 0x01
+#define DT_AUDIO 0x06
+#define DT_RGB 0x07
+#define DT_FAST 0x08
+#define SIG_EOF 0x09
+#define DT_ENGTXT 0x0A
+#define DT_DLT 0x10
+#define DT_MAXINDEX 0x1F
+/******Block Type*************/
+
+
+#endif
diff --git a/filters/dict/DictFilter.cpp b/filters/dict/DictFilter.cpp
new file mode 100644
index 0000000..6aa6d87
--- /dev/null
+++ b/filters/dict/DictFilter.cpp
@@ -0,0 +1,278 @@
+/*
+ * This file is a part of Pcompress, a chunked parallel multi-
+ * algorithm lossless compression and decompression program.
+ *
+ * Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ * moinakg@gmail.com, http://moinakg.wordpress.com/
+ */
+
+/*
+ * Dict filter for text files. Adapted from Public Domain sources
+ * of Fu Siyuan's CSC 3.2 archiver.
+ */
+
+/*
+ * NOTE(review): the header names of the four system includes were missing
+ * from the patch text; the list below is reconstructed from what this file
+ * uses (memset, UINT32_MAX) - confirm against the upstream source.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include "DictFilter.h"
+#include "Common.h"
+
+/*
+ * Static dictionary of frequent English letter groups and short words.
+ *
+ * - Entry 0 is an unused placeholder; MakeWordTree() starts at index 1.
+ * - Entries must consist of lowercase 'a'..'z' only, because the word tree
+ *   indexes its children by (ch - 'a').
+ * - Entry i (i >= 1) is assigned the output symbol 0x82 + (i - 1), so the 122
+ *   words occupy symbols 0x82..0xFB and stay clear of the escape byte 254.
+ * - The longest entries are 4 characters, well inside the 8-byte rows.
+ *
+ * The table is read-only and private to this file, hence static const.
+ */
+const u32 wordNum = 123;
+
+static const u8 wordList[wordNum][8] =
+{
+	"",
+	"ac","ad","ai","al","am",
+	"an","ar","as","at","ea",
+	"ec","ed","ee","el","en",
+	"er","es","et","id","ie",
+	"ig","il","in","io","is",
+	"it","of","ol","on","oo",
+	"or","os","ou","ow","ul",
+	"un","ur","us","ba","be",
+	"ca","ce","co","ch","de",
+	"di","ge","gh","ha","he",
+	"hi","ho","ra","re","ri",
+	"ro","rs","la","le","li",
+	"lo","ld","ll","ly","se",
+	"si","so","sh","ss","st",
+	"ma","me","mi","ne","nc",
+	"nd","ng","nt","pa","pe",
+	"ta","te","ti","to","th",
+	"tr","wa","ve",
+	"all","and","but","dow",
+	"for","had","hav","her",
+	"him","his","man","mor",
+	"not","now","one","out",
+	"she","the","was","wer",
+	"whi","whe","wit","you",
+	"any","are",
+	"that","said","with","have",
+	"this","from","were","tion",
+};
+
+
+/*
+ * Build the lookup structures used by both directions of the filter.
+ *
+ * Every word of wordList (index 1 onwards) is inserted into the letter trie
+ * wordTree, whose node 0 is the root. The node that ends a word records the
+ * symbol standing for it; symbols are handed out consecutively starting at
+ * 0x82. wordIndex maps a symbol back to its wordList index, and maxSymbol is
+ * left at one past the last symbol assigned.
+ */
+void
+DictFilter::MakeWordTree()
+{
+	u8 nextSymbol = 0x82;
+
+	memset(wordTree,0,sizeof(wordTree));
+	nodeMum = 1;	/* node 0 is the root, so the first free node is 1 */
+
+	for (u32 w = 1; w < wordNum; w++) {
+		u32 node = 0;
+
+		for (const u8 *ch = wordList[w]; *ch != 0; ch++) {
+			u32 &child = wordTree[node].next[*ch - 'a'];
+
+			/* A zero link means "no child yet": allocate the next free node. */
+			if (child == 0)
+				child = nodeMum++;
+			node = child;
+		}
+		wordIndex[nextSymbol] = w;
+		wordTree[node].symbol = nextSymbol++;
+	}
+
+	maxSymbol = nextSymbol;
+}
+
+
+/* Construct a filter whose word tree and symbol table are ready for use. */
+DictFilter::DictFilter()
+{
+	this->MakeWordTree();
+}
+
+
+
+/* Nothing to do on destruction. */
+DictFilter::~DictFilter() {}
+
+
+/*
+ * Append one literal byte to dst. Bytes >= 0x82 share their range with the
+ * word symbols and are therefore written as the escape byte 254 followed by
+ * the byte itself. Returns false, writing nothing, when the byte (plus its
+ * escape) would not fit within 'limit' output bytes.
+ */
+static inline bool
+dict_put_literal(u8 c, u8 *dst, u32 &pos, u32 limit)
+{
+	const u32 need = (c >= 0x82) ? 2 : 1;
+
+	if (need > limit || pos > limit - need)
+		return false;
+	if (c >= 0x82)
+		dst[pos++] = 254;
+	dst[pos++] = c;
+	return true;
+}
+
+/*
+ * Dictionary-encode a text buffer.
+ *
+ * Runs of lowercase letters are matched against the word tree and the longest
+ * matching word is replaced by its one-byte symbol; everything else is copied
+ * as a literal (escaped when >= 0x82). The last 5 input bytes are always
+ * copied as literals so the matcher never reads past the end of src.
+ *
+ * src, size - input buffer and its length.
+ * dst       - output buffer.
+ * dstsize   - in: capacity of dst in bytes; out: encoded length (only updated
+ *             on success).
+ *
+ * Returns 1 on success. Returns 0 when the input is shorter than 16384 bytes,
+ * or when the encoded form would be larger than 82% of the input or than the
+ * capacity of dst. In the failure cases the contents of dst are unspecified.
+ *
+ * Encoding stops as soon as the size limit would be crossed: the result would
+ * be rejected anyway, and escaping can otherwise grow the output to twice the
+ * input and overrun dst.
+ */
+u32
+DictFilter::Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize)
+{
+	if (size < 16384)
+		return 0;
+
+	/* dstSize > size*0.82 is the same test as dstSize > floor(size*0.82). */
+	u32 limit = static_cast<u32>(size * 0.82);
+	if (*dstsize < limit)
+		limit = *dstsize;
+
+	u32 i = 0;
+	u32 dstSize = 0;
+
+	while (i < size - 5) {
+		u32 matchSymbol = 0, longestWord = 0;
+
+		if (src[i] >= 'a' && src[i] <= 'z') {
+			u32 treePos = 0;
+
+			/* Walk the trie, remembering the longest complete word seen. */
+			for (u32 j = 0;;) {
+				/* Unsigned wrap-around maps anything below 'a' above 25. */
+				u32 idx = src[i + j] - 'a';
+				if (idx > 25)
+					break;
+				if (wordTree[treePos].next[idx] == 0)
+					break;
+
+				treePos = wordTree[treePos].next[idx];
+				j++;
+				if (wordTree[treePos].symbol) {
+					matchSymbol = wordTree[treePos].symbol;
+					longestWord = j;
+				}
+			}
+		}
+
+		if (matchSymbol) {
+			if (dstSize >= limit)
+				return 0;
+			dst[dstSize++] = matchSymbol;
+			i += longestWord;
+		} else {
+			if (!dict_put_literal(src[i], dst, dstSize, limit))
+				return 0;
+			i++;
+		}
+	}
+
+	/* Tail: no word matching, only literal copy with escaping. */
+	for (; i < size; i++) {
+		if (!dict_put_literal(src[i], dst, dstSize, limit))
+			return 0;
+	}
+
+	*dstsize = dstSize;
+	return 1;
+}
+
+/*
+ * Decode a buffer produced by Forward_Dict.
+ *
+ * src, size - encoded input and its length.
+ * dst       - output buffer.
+ * dstsize   - in: number of bytes to produce (capacity of dst);
+ *             out: number of bytes actually written.
+ *
+ * Symbols in [0x82, maxSymbol) expand to their dictionary word, the escape
+ * byte 254 followed by a byte >= 0x82 yields that byte, and anything else is
+ * copied unchanged. Decoding stops when either the input is exhausted or the
+ * requested number of bytes has been produced.
+ *
+ * A word is only expanded when it fits entirely in the remaining space. If it
+ * does not (possible only with a corrupt stream) decoding stops early, so
+ * *dstsize ends up smaller than requested and the caller can detect the
+ * error instead of dst being overrun.
+ */
+void
+DictFilter::Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize)
+{
+	const u32 cap = *dstsize;
+	u32 i = 0;
+	u32 dstPos = 0;
+
+	while (dstPos < cap && i < size) {
+		const u8 c = src[i];
+
+		if (c >= 0x82 && c < maxSymbol) {
+			const u8 *word = wordList[wordIndex[c]];
+			u32 len = 0;
+
+			while (word[len] != 0)
+				len++;
+			if (len > cap - dstPos)
+				break;
+			for (u32 j = 0; j < len; j++)
+				dst[dstPos++] = word[j];
+		}
+		else if (c == 254 && i + 1 < size && src[i + 1] >= 0x82) {
+			i++;
+			dst[dstPos++] = src[i];
+		}
+		else {
+			dst[dstPos++] = c;
+		}
+
+		i++;
+	}
+	*dstsize = dstPos;
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Allocate a dict filter and return it as an opaque handle for the C API.
+ * Release it with delete_dict_context().
+ */
+void *
+new_dict_context()
+{
+	DictFilter *df = new DictFilter();
+	return (static_cast<void *>(df));
+}
+
+/*
+ * Destroy a handle obtained from new_dict_context(). A NULL handle is
+ * ignored.
+ */
+void
+delete_dict_context(void *dict_ctx)
+{
+	if (dict_ctx) {
+		DictFilter *df = static_cast<DictFilter *>(dict_ctx);
+		delete df;
+	}
+}
+
+/*
+ * C entry point: dictionary-encode 'from' into 'to'.
+ *
+ * Output layout: a 4-byte little-endian copy of the original length followed
+ * by the encoded data.
+ *
+ * dict_ctx - handle from new_dict_context().
+ * from     - input buffer of fromlen bytes.
+ * to       - output buffer.
+ * dstlen   - in: capacity of 'to'; out: total bytes written (header included),
+ *            updated only on success.
+ *
+ * Returns 0 on success and -1 when the input does not fit in 32 bits, the
+ * output buffer cannot even hold the header, or the filter declined to encode
+ * the data.
+ */
+int
+dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
+{
+	DictFilter *df = static_cast<DictFilter *>(dict_ctx);
+	u32 fl, dl;
+
+	/* Validate before narrowing to 32 bits or subtracting the header size. */
+	if (fromlen > UINT32_MAX || *dstlen < 4)
+		return (-1);
+	fl = static_cast<u32>(fromlen);
+	if (*dstlen - 4 > UINT32_MAX)
+		dl = UINT32_MAX;
+	else
+		dl = static_cast<u32>(*dstlen - 4);
+
+	U32_P(to) = LE32(fl);
+	if (df->Forward_Dict(from, fl, to + 4, &dl)) {
+		*dstlen = static_cast<uint64_t>(dl) + 4;
+		return (0);
+	}
+	return (-1);
+}
+
+/*
+ * C entry point: decode a buffer produced by dict_encode().
+ *
+ * dict_ctx - handle from new_dict_context().
+ * from     - encoded input of fromlen bytes: a 4-byte little-endian original
+ *            length followed by the encoded data.
+ * to       - output buffer.
+ * dstlen   - in: capacity of 'to'; out: the original length taken from the
+ *            header.
+ *
+ * Returns 0 on success and -1 when the input is too short to hold the header
+ * or too long for 32 bits, the output buffer is too small, or the decoded
+ * data is shorter than the header announced.
+ */
+int
+dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
+{
+	DictFilter *df = static_cast<DictFilter *>(dict_ctx);
+	u32 fl, dl;
+
+	/* The header must be present, and the payload length must fit in u32. */
+	if (fromlen < 4 || fromlen > UINT32_MAX) {
+		log_msg(LOG_ERR, 0, "Invalid input length in dict_decode.");
+		return (-1);
+	}
+
+	/* The encoder stores the length with LE32(); read it back the same way. */
+	dl = LE32(U32_P(from));
+	if (dl > *dstlen) {
+		log_msg(LOG_ERR, 0, "Destination overflow in dict_decode.");
+		return (-1);
+	}
+	*dstlen = dl;
+	fl = static_cast<u32>(fromlen - 4);
+
+	df->Inverse_Dict(from + 4, fl, to, &dl);
+	if (dl < *dstlen)
+		return (-1);
+	return (0);
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/filters/dict/DictFilter.h b/filters/dict/DictFilter.h
new file mode 100644
index 0000000..4b4baee
--- /dev/null
+++ b/filters/dict/DictFilter.h
@@ -0,0 +1,80 @@
+/*
+ * This file is a part of Pcompress, a chunked parallel multi-
+ * algorithm lossless compression and decompression program.
+ *
+ * Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ * moinakg@gmail.com, http://moinakg.wordpress.com/
+ */
+
+/*
+ * Dict filter for text files. Adapted from Public Domain sources
+ * of Fu Siyuan's CSC 3.2 archiver.
+ */
+
+#ifndef _FILTERS_H
+#define _FILTERS_H
+
+/*
+ * NOTE(review): the header name was missing from the patch text; <utils.h> is
+ * assumed because the C API below uses uchar_t - confirm against upstream.
+ */
+#include <utils.h>
+
+/*
+ * Everything up to the matching #endif is C++ only, so that C sources can
+ * include this header for the extern "C" API that follows.
+ */
+#ifdef __cplusplus
+
+#include "Common.h"
+/*
+ * Capacity of the word tree. The 122 words of wordList in DictFilter.cpp need
+ * at most 287 nodes (the root plus one node per letter), so 300 is enough.
+ */
+#define MAX_WORDTREE_NODE_NUM 300 //Enough now!
+
+/*
+ * Text filter that replaces frequent English letter groups with one-byte
+ * symbols (Forward_Dict) and restores them again (Inverse_Dict).
+ */
+class DictFilter
+{
+public:
+	~DictFilter();
+	DictFilter();
+
+	/* Encode src into dst; returns 1 on success, 0 if the data was not encoded. */
+	u32 Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize);
+	/* Decode src into dst; *dstsize receives the number of bytes written. */
+	void Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize);
+
+private:
+	/* One trie node: child links indexed by letter - 'a' (0 = none). */
+	typedef struct
+	{
+		u32 next[26];
+		u8 symbol;	/* symbol of the word ending here, 0 if none */
+	} CTreeNode;
+	CTreeNode wordTree[MAX_WORDTREE_NODE_NUM];
+	u32 nodeMum;	/* number of tree nodes in use */
+	u8 maxSymbol;	/* one past the last symbol assigned */
+	//Used for DICT transformer. Words are stored in trees.
+
+	u32 wordIndex[256];
+	//Used for DICT untransformer.choose words by symbols.
+	void MakeWordTree(); //Init the DICT transformer
+};
+
+#endif /* __cplusplus */
+
+/*
+ * C-callable interface to the dict filter. The context is an opaque handle.
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Create a filter context; release it with delete_dict_context(). */
+void *new_dict_context(void);
+/* Destroy a context created by new_dict_context(); NULL is ignored. */
+void delete_dict_context(void *dict_ctx);
+
+/*
+ * Encode/decode fromlen bytes of 'from' into 'to'. On entry *dstlen is the
+ * capacity of 'to'. Both return 0 on success and -1 on failure.
+ */
+int dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
+int dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/pcompress.c b/pcompress.c
index 7d44fd7..b7d0073 100644
--- a/pcompress.c
+++ b/pcompress.c
@@ -191,7 +191,6 @@ show_compression_stats(pc_ctx_t *pctx)
/*
* Wrapper functions to pre-process the buffer and then call the main compression routine.
- * At present only LZP pre-compression is used below. Some extra metadata is added:
*
* Byte 0: A flag to indicate which pre-processor was used.
* Byte 1 - Byte 8: Size of buffer after pre-processing
diff --git a/utils/utils.h b/utils/utils.h
index e61855f..4271d3a 100644
--- a/utils/utils.h
+++ b/utils/utils.h
@@ -277,7 +277,7 @@ typedef enum {
/*
* Sub-types.
*/
-#define NUM_SUB_TYPES 33
+#define NUM_SUB_TYPES 34
TYPE_EXE32 = 8,
TYPE_JPEG = 16,
TYPE_MARKUP = 24,
@@ -309,7 +309,8 @@ typedef enum {
TYPE_DICOM = 232,
TYPE_PNM = 240,
TYPE_PACKPNM = 248,
- TYPE_WAV = 256
+ TYPE_WAV = 256,
+ TYPE_ENGLISH = 264
} data_type_t;
/*