Dict filter work in progress.

2014-09-18 22:51:25 +05:30 · 2014-09-18 22:51:25 +05:30 · 4fedebc607
commit 4fedebc607
parent f34962f8cc
6 changed files with 436 additions and 4 deletions
--- a/Makefile.in
+++ b/Makefile.in
@ -158,6 +158,10 @@ DISPACKSRCS = filters/dispack/dis.cpp
 DISPACKHDRS = filters/dispack/dis.hpp filters/dispack/types.hpp
 DISPACKOBJS = $(DISPACKSRCS:.cpp=.o)

+# Dict filter (filters/dict): source list, header list, and the object list
+# obtained from the sources by replacing the .cpp suffix with .o.
+DICTSRCS = filters/dict/DictFilter.cpp
+DICTHDRS = filters/dict/DictFilter.h filters/dict/Common.h
+DICTOBJS = $(DICTSRCS:.cpp=.o)
+
 SKEIN_BLOCK_C = crypto/skein/skein_block.c
 SKEIN_BLOCK_ASM = crypto/skein/skein_block_x64.s
 SKEIN_BLOCK_SRC = @SKEIN_BLOCK@
@ -246,7 +250,7 @@ $(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) $(DELTA2OBJS) @LIBBSCWRAPOBJ@ $(SKEINOBJS)
 $(SKEIN_BLOCK_OBJ) @SHA2ASM_OBJS@ @SHA2_OBJS@ $(KECCAK_OBJS) $(KECCAK_OBJS_ASM) \
 $(TRANSP_OBJS) $(CRYPTO_OBJS) $(ZLIB_OBJS) $(BZLIB_OBJS) $(XXHASH_OBJS) $(BLAKE2_OBJS) \
@CRYPTO_COMPAT_OBJS@ $(CRYPTO_ASM_OBJS) $(ARCHIVEOBJS) $(PJPGOBJS) $(DISPACKOBJS) $(PPNMOBJS) \
-$(WAVPKOBJS)
+$(WAVPKOBJS) $(DICTOBJS)

 DEBUG_LINK = $(GPP) -pthread @LIBBSCGEN_OPT@ @EXTRA_OPT_FLAGS@ -fopenmp -fPIC
 DEBUG_COMPILE = $(GCC) -g -c @EXTRA_OPT_FLAGS@ -fPIC @USE_CLANG_AS@
@ -340,6 +344,10 @@ $(DISPACKOBJS): $(DISPACKSRCS) $(DISPACKHDRS)
 	$(COMPILE_cpp) $(COMMON_VEC_FLAGS) @DEBUG_STATS_CPPFLAGS@ @SSE_OPT_FLAGS@ @USE_CLANG_AS@ -O2 -fsched-spec-load \
 	-Wno-variadic-macros $(VEC_FLAGS) $(COMMON_CPPFLAGS_cpp) $(@:.o=.cpp) -o $@

+# Compile the dict filter objects. Each object is rebuilt when any dict source
+# or header changes; $(@:.o=.cpp) maps the target back to its source file.
+$(DICTOBJS): $(DICTSRCS) $(DICTHDRS)
+	$(COMPILE_cpp) $(COMMON_VEC_FLAGS) @DEBUG_STATS_CPPFLAGS@ @SSE_OPT_FLAGS@ @USE_CLANG_AS@ -O2 -fsched-spec-load \
+	-Wno-variadic-macros $(VEC_FLAGS) $(COMMON_CPPFLAGS_cpp) $(@:.o=.cpp) -o $@
+
 $(SKEIN_BLOCK_OBJ): $(SKEIN_BLOCK_SRC)
 	$(COMPILE) $(SKEIN_FLAGS) $(SKEIN_BLOCK_SRC) -o $@

--- a/filters/dict/Common.h
+++ b/filters/dict/Common.h
@ -0,0 +1,66 @@
+#ifndef _DATATYPE_H
+#define _DATATYPE_H
+
+/*
+ * Shared integer typedefs, size limits, error codes and block-type tags
+ * used by the dict filter sources.
+ */
+
+/* Fixed-width integer types used by the typedefs below. */
+#include <stdint.h>
+/* free(), used by SAFEFREE. */
+#include <stdlib.h>
+
+/* NOTE(review): presumably the CSC archive format version - confirm. */
+#define CSA_VERSION 8
+
+
+/* Short aliases for the integer types used throughout the filter. */
+typedef unsigned char u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+typedef int i32;
+typedef int64_t i64;
+
+const u32 KB=1024;
+const u32 MB=1048576;
+const u32 MinBlockSize=8*KB;
+
+
+const u32 MaxChunkBits=21;
+const u32 MaxChunkSize=(1<<(MaxChunkBits-1));
+const u32 MaxDictSize=512*MB;//Don't change
+const u32 DefaultOutStreamBlockSize=128*KB;
+const u32 DefaultInBufferSize=MaxChunkSize;  //Must be >= MaxChunkSize
+#define DLT_CHANNEL_MAX 5
+const u32 DltIndex[DLT_CHANNEL_MAX]={1,2,3,4,8};
+
+
+/*
+ * Free x when it is non-NULL and reset it to NULL. The argument is evaluated
+ * more than once, so it must be a side-effect-free lvalue.
+ */
+#define SAFEFREE(x) do{if ((x)!=NULL) free(x);(x)=NULL;}while(0)
+
+
+#define ENCODE 1
+#define	DECODE 2
+
+
+/*****ERRORS*****************/
+#define NO_ERROR 0
+#define CANT_OPEN_FILE (-100)
+#define CANT_CREATE_FILE (-99)
+#define NOT_CSC_FILE (-98)
+#define VERSION_INVALID (-97)
+#define CSC_FILE_INVALID (-95)
+#define DECODE_ERROR (-96)
+#define CANT_ALLOC_MEM (-94)
+#define ALREADY_INITIALIZED (-93)
+#define OPERATION_ERROR (-92)
+#define FILE_DIDNT_OPEN (-91)
+/*****ERRORS*****************/
+
+/******Block Type*************/
+#define DT_NONE 0
+#define DT_HARD 0x05
+#define DT_EXE 0x04
+#define DT_BAD 0x03
+#define DT_NORMAL 0x02
+#define DT_SKIP 0x01
+#define DT_AUDIO 0x06
+#define DT_RGB 0x07
+#define DT_FAST 0x08
+#define SIG_EOF 0x09
+#define DT_ENGTXT 0x0A
+#define DT_DLT 0x10
+#define DT_MAXINDEX 0x1F
+/******Block Type*************/
+
+
+#endif
--- a/filters/dict/DictFilter.cpp
+++ b/filters/dict/DictFilter.cpp
@ -0,0 +1,278 @@
+/*
+ * This file is a part of Pcompress, a chunked parallel multi-
+ * algorithm lossless compression and decompression program.
+ *
+ * Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ * moinakg@gmail.com, http://moinakg.wordpress.com/
+ */
+
+/*
+ * Dict filter for text files. Adapted from Public Domain sources
+ * of Fu Siyuan's CSC 3.2 archiver.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <stdint.h>
+#include "DictFilter.h"
+#include "Common.h"
+
+/*
+ * Dictionary of lowercase letter groups replaced by single-byte symbols.
+ *
+ * Entry 0 is an empty placeholder and is never inserted into the tree
+ * (MakeWordTree starts at index 1). The remaining 122 entries are two-,
+ * three- and four-letter strings made only of 'a'..'z'. Symbols are handed
+ * out in table order starting at 0x82, so reordering, adding or removing
+ * entries changes the encoded byte values.
+ */
+const u32 wordNum = 123;
+
+u8 wordList[wordNum][8] =
+{
+	"",
+	"ac","ad","ai","al","am",
+	"an","ar","as","at","ea",
+	"ec","ed","ee","el","en",
+	"er","es","et","id","ie",
+	"ig","il","in","io","is",
+	"it","of","ol","on","oo",
+	"or","os","ou","ow","ul",
+	"un","ur","us","ba","be",
+	"ca","ce","co","ch","de",
+	"di","ge","gh","ha","he",
+	"hi","ho","ra","re","ri",
+	"ro","rs","la","le","li",
+	"lo","ld","ll","ly","se",
+	"si","so","sh","ss","st",
+	"ma","me","mi","ne","nc",
+	"nd","ng","nt","pa","pe",
+	"ta","te","ti","to","th",
+	"tr","wa","ve",
+	"all","and","but","dow",
+	"for","had","hav","her",
+	"him","his","man","mor",
+	"not","now","one","out",
+	"she","the","was","wer",
+	"whi","whe","wit","you",
+	"any","are",
+	"that","said","with","have",
+	"this","from","were","tion",
+};
+
+
+/*
+ * Build the letter trie over wordList and assign one symbol per word.
+ *
+ * Node 0 is the root; new nodes are taken from wordTree in increasing order
+ * via nodeMum. Word number w (1-based) gets symbol 0x82 + (w - 1), recorded
+ * both on the trie node that ends the word and in wordIndex[] for decoding.
+ * maxSymbol ends up one past the last symbol handed out.
+ */
+void
+DictFilter::MakeWordTree()
+{
+	u8 nextSymbol = 0x82;
+
+	memset(wordTree,0,sizeof(wordTree));
+	nodeMum = 1;
+
+	for (u32 word = 1; word < wordNum; word++) {
+		u32 node = 0;
+
+		/* Walk the word letter by letter, creating missing children. */
+		for (const u8 *ch = wordList[word]; *ch != 0; ch++) {
+			u32 &child = wordTree[node].next[*ch - 'a'];
+
+			if (child == 0)
+				child = nodeMum++;
+			node = child;
+		}
+
+		wordIndex[nextSymbol] = word;
+		wordTree[node].symbol = nextSymbol;
+		nextSymbol++;
+	}
+
+	maxSymbol = nextSymbol;
+}
+
+
+/*
+ * Construct a filter and build its word tree and symbol tables right away.
+ */
+DictFilter::DictFilter()
+{
+	MakeWordTree();
+}
+
+
+
+/*
+ * Nothing to release: the destructor body is empty.
+ */
+DictFilter::~DictFilter()
+{
+}
+
+
+/*
+ * Append one input byte that is not emitted as a dictionary symbol.
+ * Bytes >= 0x82 collide with the symbol range and are prefixed with the
+ * escape byte 254, so they need two output bytes. Returns 1 on success and
+ * 0 when dst (capacity cap, *pos bytes already used) has no room left.
+ */
+static inline int
+dict_emit_literal(u8 c, u8 *dst, u32 *pos, u32 cap)
+{
+	const u32 need = (c >= 0x82) ? 2 : 1;
+
+	if (cap - *pos < need)
+		return 0;
+	if (c >= 0x82)
+		dst[(*pos)++] = 254;
+	dst[(*pos)++] = c;
+	return 1;
+}
+
+/*
+ * Forward (encoding) transform: replace dictionary words with one-byte
+ * symbols and escape literal bytes that fall into the symbol range.
+ *
+ * src, size - input buffer and its length; inputs shorter than 16384 bytes
+ *             are rejected.
+ * dst       - output buffer.
+ * dstsize   - in: capacity of dst in bytes; out: bytes written (updated
+ *             only on success).
+ *
+ * Returns 1 when the transform was applied, 0 when it was not (input too
+ * small, output would not fit into dst, or output larger than 82% of the
+ * input). On a 0 return the contents of dst are unspecified.
+ */
+u32
+DictFilter::Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize)
+{
+	if (size < 16384)
+		return 0;
+
+	const u32 dstCap = *dstsize;
+	u32 i,j,treePos;
+	u32 dstSize = 0;
+	u32 idx;
+
+	/*
+	 * The last 5 bytes are never matched against the tree, so the lookahead
+	 * src[i+j] below (j is at most 4, the longest word) stays inside src.
+	 */
+	for(i = 0; i < size-5;) {
+		if (src[i] >= 'a' && src[i] <= 'z') {
+
+			u32 matchSymbol = 0,longestWord = 0;
+			treePos = 0;
+			for(j = 0;;) {
+				/* Unsigned wrap-around makes anything below 'a' exceed 25. */
+				idx = src[i+j] - 'a';
+				if (idx > 25)
+					break;
+				if (wordTree[treePos].next[idx] == 0)
+					break;
+
+				treePos=wordTree[treePos].next[idx];
+				j++;
+				/* Remember the longest dictionary word seen so far. */
+				if (wordTree[treePos].symbol) {
+					matchSymbol = wordTree[treePos].symbol;
+					longestWord = j;
+				}
+			}
+
+			if (matchSymbol) {
+				if (dstSize >= dstCap)
+					return 0;
+				dst[dstSize++] = matchSymbol;
+				i += longestWord;
+				continue;
+			}
+		}
+
+		/* Unmatched letter or non-letter byte: copy it, escaping if needed. */
+		if (!dict_emit_literal(src[i], dst, &dstSize, dstCap))
+			return 0;
+		i++;
+	}
+
+	/* Tail bytes are copied as literals only. */
+	for (; i<size; i++) {
+		if (!dict_emit_literal(src[i], dst, &dstSize, dstCap))
+			return 0;
+	}
+
+	/* Not worth it unless the output shrank to at most 82% of the input. */
+	if (dstSize > size*0.82)
+		return 0;
+
+	*dstsize = dstSize;
+	return 1;
+}
+
+/*
+ * Inverse (decoding) transform: expand dictionary symbols back into words
+ * and drop the 254 escape byte in front of escaped literals.
+ *
+ * src, size - encoded input and its length.
+ * dst       - output buffer.
+ * dstsize   - in: maximum number of bytes to produce; out: bytes actually
+ *             written.
+ *
+ * Decoding stops when the input is exhausted, when the output limit is
+ * reached, or when the next word would not fit below the limit. In the last
+ * case the reported size is smaller than the limit, so a caller expecting
+ * exactly that many bytes can detect the malformed input.
+ */
+void
+DictFilter::Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize)
+{
+
+	const u32 dstCap = *dstsize;
+	u32 i = 0,j;
+	u32 dstPos = 0,idx;
+
+	while(dstPos < dstCap && i < size) {
+		if (src[i] >= 0x82 && src[i] < maxSymbol) {
+			u32 wordLen = 0;
+
+			idx = wordIndex[src[i]];
+			while (wordList[idx][wordLen])
+				wordLen++;
+			/* Never write past the caller's limit. */
+			if (wordLen > dstCap - dstPos)
+				break;
+			for(j=0; j < wordLen; j++)
+				dst[dstPos++] = wordList[idx][j];
+		}
+		else if (src[i] == 254 && (i+1 < size && src[i+1] >= 0x82)) {
+			/* Escaped literal: skip the escape byte, copy the next one. */
+			i++;
+			dst[dstPos++] = src[i];
+		}
+		else {
+			dst[dstPos++] = src[i];
+		}
+
+		i++;
+	}
+	*dstsize = dstPos;
+}
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Allocate a DictFilter and hand it back as an opaque context pointer for
+ * the C-callable dict_* functions.
+ */
+void *
+new_dict_context()
+{
+	return (static_cast<void *>(new DictFilter()));
+}
+
+/*
+ * Destroy a context created by new_dict_context(). A NULL context is
+ * accepted and ignored (deleting a null pointer does nothing).
+ */
+void
+delete_dict_context(void *dict_ctx)
+{
+	delete static_cast<DictFilter *>(dict_ctx);
+}
+
+/*
+ * Apply the dict transform to a buffer.
+ *
+ * dict_ctx - context from new_dict_context().
+ * from     - input bytes; fromlen must fit in 32 bits.
+ * to       - output buffer. The first 4 bytes receive the original length
+ *            (stored via LE32); the transformed data follows.
+ * dstlen   - in: capacity of to; out: header plus transformed bytes
+ *            (updated only on success).
+ *
+ * Returns 0 on success and -1 when the input is too large, the output
+ * buffer cannot hold the header, or Forward_Dict declines the input.
+ */
+int
+dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
+{
+	DictFilter *df = static_cast<DictFilter *>(dict_ctx);
+	u32 fl;
+	u32 dl;
+	uint64_t avail;
+	u8 *dst;
+
+	/* Validate before narrowing to 32 bits or subtracting the header. */
+	if (fromlen > UINT32_MAX || *dstlen < 4)
+		return (-1);
+	fl = static_cast<u32>(fromlen);
+
+	/* Clamp, rather than truncate, an output capacity above 32 bits. */
+	avail = *dstlen - 4;
+	dl = (avail > UINT32_MAX) ? UINT32_MAX : static_cast<u32>(avail);
+
+	U32_P(to) = LE32(fl);
+	dst = to + 4;
+	if (df->Forward_Dict(from, fl, dst, &dl)) {
+		*dstlen = dl + 4;
+		return (0);
+	}
+	return (-1);
+}
+
+/*
+ * Undo dict_encode().
+ *
+ * dict_ctx - context from new_dict_context().
+ * from     - encoded buffer: 4-byte original length followed by the
+ *            transformed data.
+ * fromlen  - size of from including the 4-byte header.
+ * to       - output buffer.
+ * dstlen   - in: capacity of to; out: the original length taken from the
+ *            header.
+ *
+ * Returns 0 on success and -1 when the input is shorter than the header or
+ * too large, the recorded length exceeds the capacity of to, or fewer bytes
+ * than recorded could be reconstructed.
+ */
+int
+dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
+{
+	DictFilter *df = static_cast<DictFilter *>(dict_ctx);
+	u32 fl;
+	u32 dl;
+	u8 *src;
+
+	/* Reject inputs without a full header or with a payload above 32 bits. */
+	if (fromlen < 4 || fromlen - 4 > UINT32_MAX)
+		return (-1);
+	fl = static_cast<u32>(fromlen - 4);
+
+	/* The header is written through LE32 by dict_encode; read it the same way. */
+	dl = LE32(U32_P(from));
+	if (dl > *dstlen) {
+		log_msg(LOG_ERR, 0, "Destination overflow in dict_decode.");
+		return (-1);
+	}
+	*dstlen = dl;
+	src = from + 4;
+
+	df->Inverse_Dict(src, fl, to, &dl);
+	if (dl < *dstlen)
+		return (-1);
+	return (0);
+}
+
+#ifdef  __cplusplus
+}
+#endif
--- a/filters/dict/DictFilter.h
+++ b/filters/dict/DictFilter.h
@ -0,0 +1,80 @@
+/*
+ * This file is a part of Pcompress, a chunked parallel multi-
+ * algorithm lossless compression and decompression program.
+ *
+ * Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ * moinakg@gmail.com, http://moinakg.wordpress.com/
+ */
+
+/*
+ * Dict filter for text files. Adapted from Public Domain sources
+ * of Fu Siyuan's CSC 3.2 archiver.
+ */
+
+#ifndef _FILTERS_H
+#define _FILTERS_H
+
+#include <utils.h>
+
+#include "Common.h"
+#define MAX_WORDTREE_NODE_NUM 300 //Enough now!
+
+/*
+ * Word-substitution filter: Forward_Dict replaces dictionary words with
+ * one-byte symbols, Inverse_Dict expands them again.
+ */
+class DictFilter
+{
+public:
+	DictFilter();
+	~DictFilter();
+
+	/* Encode src into dst; returns non-zero when the transform was applied. */
+	u32 Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize);
+	/* Decode src into dst; *dstsize receives the number of bytes written. */
+	void Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize);
+
+private:
+	/* One trie node: a child index per letter 'a'..'z' and an optional symbol. */
+	struct CTreeNode
+	{
+		u32 next[26];
+		u8 symbol;
+	};
+
+	/* Encoder side: the words stored as a trie, plus the node count. */
+	CTreeNode wordTree[MAX_WORDTREE_NODE_NUM];
+	u32 nodeMum;
+	u8 maxSymbol;
+
+	/* Decoder side: maps a symbol byte to its word index. */
+	u32 wordIndex[256];
+
+	/* Fills wordTree, wordIndex, nodeMum and maxSymbol. */
+	void MakeWordTree();
+
+	u32 x0;
+	u32 x1;
+	u32 i;
+	u32 k;
+};
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+/* Create / destroy the opaque filter context used by the calls below. */
+void *new_dict_context();
+void delete_dict_context(void *dict_ctx);
+
+/*
+ * Encode or decode a buffer with the dict filter. Both return 0 on success
+ * and -1 on failure, and report the number of output bytes through dstlen.
+ * The encoded form starts with a 4-byte field holding the original length.
+ */
+int dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
+int dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif
--- a/pcompress.c
+++ b/pcompress.c
@ -191,7 +191,6 @@ show_compression_stats(pc_ctx_t *pctx)

 /*
 * Wrapper functions to pre-process the buffer and then call the main compression routine.
- * At present only LZP pre-compression is used below. Some extra metadata is added:
 *
 * Byte 0: A flag to indicate which pre-processor was used.
 * Byte 1 - Byte 8: Size of buffer after pre-processing
--- a/utils/utils.h
+++ b/utils/utils.h
@ -277,7 +277,7 @@ typedef enum {
 	/*
 	 * Sub-types.
 	 */
-#define	NUM_SUB_TYPES	33
+#define	NUM_SUB_TYPES	34
 	TYPE_EXE32 = 8,
 	TYPE_JPEG = 16,
 	TYPE_MARKUP = 24,
@ -309,7 +309,8 @@ typedef enum {
 	TYPE_DICOM = 232,
 	TYPE_PNM = 240,
 	TYPE_PACKPNM = 248,
-	TYPE_WAV = 256
+	TYPE_WAV = 256,
+	TYPE_ENGLISH = 264
 } data_type_t;

 /*