diff --git a/Makefile.in b/Makefile.in
index e5fcdf4..a327204 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -158,6 +158,10 @@ DISPACKSRCS = filters/dispack/dis.cpp
 DISPACKHDRS = filters/dispack/dis.hpp filters/dispack/types.hpp
 DISPACKOBJS = $(DISPACKSRCS:.cpp=.o)
 
+DICTSRCS = filters/dict/DictFilter.cpp
+DICTHDRS = filters/dict/DictFilter.h filters/dict/Common.h
+DICTOBJS = $(DICTSRCS:.cpp=.o)
+
 SKEIN_BLOCK_C = crypto/skein/skein_block.c
 SKEIN_BLOCK_ASM = crypto/skein/skein_block_x64.s
 SKEIN_BLOCK_SRC = @SKEIN_BLOCK@
@@ -246,7 +250,7 @@ $(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) $(DELTA2OBJS) @LIBBSCWRAPOBJ@ $(SKEINOBJS)
 $(SKEIN_BLOCK_OBJ) @SHA2ASM_OBJS@ @SHA2_OBJS@ $(KECCAK_OBJS) $(KECCAK_OBJS_ASM) \
 $(TRANSP_OBJS) $(CRYPTO_OBJS) $(ZLIB_OBJS) $(BZLIB_OBJS) $(XXHASH_OBJS) $(BLAKE2_OBJS) \
 @CRYPTO_COMPAT_OBJS@ $(CRYPTO_ASM_OBJS) $(ARCHIVEOBJS) $(PJPGOBJS) $(DISPACKOBJS) $(PPNMOBJS) \
-$(WAVPKOBJS)
+$(WAVPKOBJS) $(DICTOBJS)
 
 DEBUG_LINK = $(GPP) -pthread @LIBBSCGEN_OPT@ @EXTRA_OPT_FLAGS@ -fopenmp -fPIC
 DEBUG_COMPILE = $(GCC) -g -c @EXTRA_OPT_FLAGS@ -fPIC @USE_CLANG_AS@
@@ -340,6 +344,10 @@ $(DISPACKOBJS): $(DISPACKSRCS) $(DISPACKHDRS)
 	$(COMPILE_cpp) $(COMMON_VEC_FLAGS) @DEBUG_STATS_CPPFLAGS@ @SSE_OPT_FLAGS@ @USE_CLANG_AS@ -O2 -fsched-spec-load \
 	-Wno-variadic-macros $(VEC_FLAGS) $(COMMON_CPPFLAGS_cpp) $(@:.o=.cpp) -o $@
 
+$(DICTOBJS): $(DICTSRCS) $(DICTHDRS)
+	$(COMPILE_cpp) $(COMMON_VEC_FLAGS) @DEBUG_STATS_CPPFLAGS@ @SSE_OPT_FLAGS@ @USE_CLANG_AS@ -O2 -fsched-spec-load \
+	-Wno-variadic-macros $(VEC_FLAGS) $(COMMON_CPPFLAGS_cpp) $(@:.o=.cpp) -o $@
+
 $(SKEIN_BLOCK_OBJ): $(SKEIN_BLOCK_SRC)
 	$(COMPILE) $(SKEIN_FLAGS) $(SKEIN_BLOCK_SRC) -o $@
 
diff --git a/filters/dict/Common.h b/filters/dict/Common.h
new file mode 100644
index 0000000..166dee8
--- /dev/null
+++ b/filters/dict/Common.h
@@ -0,0 +1,66 @@
+#ifndef _DATATYPE_H
+#define _DATATYPE_H
+
+/* Keep this header self-contained: fixed-width types and free() are used below. */
+#include <stdint.h>
+#include <stdlib.h>
+
+#define CSA_VERSION 8
+
+/* Short integer aliases used throughout the dict filter sources. */
+typedef unsigned char u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+typedef int i32;
+typedef int64_t i64;
+
+const u32 KB=1024;
+const u32 MB=1048576;
+const u32 MinBlockSize=8*KB;
+
+const u32 MaxChunkBits=21;
+const u32 MaxChunkSize=(1<<(MaxChunkBits-1));
+const u32 MaxDictSize=512*MB;//Don't change
+const u32 DefaultOutStreamBlockSize=128*KB;
+const u32 DefaultInBufferSize=MaxChunkSize; //Should >=MaxChunkSize
+#define DLT_CHANNEL_MAX 5
+const u32 DltIndex[DLT_CHANNEL_MAX]={1,2,3,4,8};
+
+/* Free x when it is non-NULL, then reset it to NULL; x must be an lvalue. */
+#define SAFEFREE(x) do{if ((x)!=NULL) free(x);(x)=NULL;}while(0)
+
+#define ENCODE 1
+#define DECODE 2
+
+/*****ERRORS*****************/
+#define NO_ERROR 0
+#define CANT_OPEN_FILE (-100)
+#define CANT_CREATE_FILE (-99)
+#define NOT_CSC_FILE (-98)
+#define VERSION_INVALID (-97)
+#define CSC_FILE_INVALID (-95)
+#define DECODE_ERROR (-96)
+#define CANT_ALLOC_MEM (-94)
+#define ALREADY_INITIALIZED (-93)
+#define OPERATION_ERROR (-92)
+#define FILE_DIDNT_OPEN (-91)
+/*****ERRORS*****************/
+
+/******Block Type*************/
+#define DT_NONE 0
+#define DT_HARD 0x05
+#define DT_EXE 0x04
+#define DT_BAD 0x03
+#define DT_NORMAL 0x02
+#define DT_SKIP 0x01
+#define DT_AUDIO 0x06
+#define DT_RGB 0x07
+#define DT_FAST 0x08
+#define SIG_EOF 0x09
+#define DT_ENGTXT 0x0A
+#define DT_DLT 0x10
+#define DT_MAXINDEX 0x1F
+/******Block Type*************/
+
+#endif
diff --git a/filters/dict/DictFilter.cpp b/filters/dict/DictFilter.cpp
new file mode 100644
index 0000000..6aa6d87
--- /dev/null
+++ b/filters/dict/DictFilter.cpp
@@ -0,0 +1,278 @@
+/*
+ * This file is a part of Pcompress, a chunked parallel multi-
+ * algorithm lossless compression and decompression program.
+ *
+ * Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ * moinakg@gmail.com, http://moinakg.wordpress.com/
+ */
+
+/*
+ * Dict filter for text files. Adapted from Public Domain sources
+ * of Fu Siyuan's CSC 3.2 archiver.
+ */
+
+/* NOTE(review): the four system header names below were lost in extraction and are reconstructed - confirm. */
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <utils.h>
+#include "DictFilter.h"
+#include "Common.h"
+
+/* Number of wordList entries, including the unused empty entry at index 0. */
+const u32 wordNum = 123;
+
+/* Lowercase letter groups to substitute; entry i (i >= 1) is coded as byte 0x82 + i - 1. */
+u8 wordList[wordNum][8] =
+{
+	"",
+	"ac","ad","ai","al","am",
+	"an","ar","as","at","ea",
+	"ec","ed","ee","el","en",
+	"er","es","et","id","ie",
+	"ig","il","in","io","is",
+	"it","of","ol","on","oo",
+	"or","os","ou","ow","ul",
+	"un","ur","us","ba","be",
+	"ca","ce","co","ch","de",
+	"di","ge","gh","ha","he",
+	"hi","ho","ra","re","ri",
+	"ro","rs","la","le","li",
+	"lo","ld","ll","ly","se",
+	"si","so","sh","ss","st",
+	"ma","me","mi","ne","nc",
+	"nd","ng","nt","pa","pe",
+	"ta","te","ti","to","th",
+	"tr","wa","ve",
+	"all","and","but","dow",
+	"for","had","hav","her",
+	"him","his","man","mor",
+	"not","now","one","out",
+	"she","the","was","wer",
+	"whi","whe","wit","you",
+	"any","are",
+	"that","said","with","have",
+	"this","from","were","tion",
+};
+
+/*
+ * Build the lookup trie from wordList, giving each word a one-byte symbol
+ * starting at 0x82; wordIndex maps a symbol back to its wordList entry.
+ */
+void
+DictFilter::MakeWordTree()
+{
+	u32 i, j, treePos;
+	u8 symbolIndex = 0x82;
+
+	nodeMum = 1;	/* Node 0 is the root; new nodes are handed out from 1. */
+	memset(wordTree, 0, sizeof (wordTree));
+
+	for (i = 1; i < wordNum; i++) {
+		treePos = 0;
+		for (j = 0; wordList[i][j] != 0; j++) {
+			u32 idx = wordList[i][j] - 'a';
+			if (wordTree[treePos].next[idx]) {
+				treePos = wordTree[treePos].next[idx];
+			} else {
+				wordTree[treePos].next[idx] = nodeMum;
+				treePos = nodeMum;
+				nodeMum++;
+			}
+		}
+		wordIndex[symbolIndex] = i;
+		wordTree[treePos].symbol = symbolIndex++;
+	}
+	maxSymbol = symbolIndex;	/* One past the last assigned symbol. */
+}
+
+/* Construct a filter with its word trie built and ready for use. */
+DictFilter::DictFilter()
+{
+	MakeWordTree();
+}
+
+DictFilter::~DictFilter()
+{
+}
+
+/*
+ * Encode src[0..size) into dst: dictionary words become one-byte symbols and
+ * literal bytes >= 0x82 are prefixed with the escape byte 254. On entry
+ * *dstsize is the capacity of dst; on success it receives the encoded length.
+ * Returns 1 on success. Returns 0, leaving *dstsize untouched, when size is
+ * below 16384 or the output would exceed 82% of size or the capacity of dst.
+ */
+u32
+DictFilter::Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize)
+{
+	if (size < 16384)
+		return 0;
+
+	u32 i, j, idx, treePos, need;
+	u32 dstSize = 0;
+	/* Largest acceptable output; checking it before each write keeps dst in bounds. */
+	u32 limit = (u32)(size * 0.82);
+	if (limit > *dstsize)
+		limit = *dstsize;
+
+	for (i = 0; i < size;) {
+		u32 matchSymbol = 0, longestWord = 0;
+
+		/* Words are only looked up while more than 5 input bytes remain. */
+		if (i < size - 5 && src[i] >= 'a' && src[i] <= 'z') {
+			treePos = 0;
+			for (j = 0;;) {
+				idx = src[i+j] - 'a';	/* Unsigned: bytes below 'a' wrap above 25. */
+				if (idx > 25)
+					break;
+				if (wordTree[treePos].next[idx] == 0)
+					break;
+				treePos = wordTree[treePos].next[idx];
+				j++;
+				if (wordTree[treePos].symbol) {
+					matchSymbol = wordTree[treePos].symbol;
+					longestWord = j;
+				}
+			}
+		}
+
+		/* An unmatched byte >= 0x82 needs two output bytes (escape + literal). */
+		need = (!matchSymbol && src[i] >= 0x82) ? 2 : 1;
+		if (limit - dstSize < need)
+			return 0;
+		if (matchSymbol) {
+			dst[dstSize++] = matchSymbol;
+			i += longestWord;
+		} else {
+			if (src[i] >= 0x82)
+				dst[dstSize++] = 254;
+			dst[dstSize++] = src[i];
+			i++;
+		}
+	}
+
+	*dstsize = dstSize;
+	return 1;
+}
+
+/*
+ * Decode size bytes of Forward_Dict() output from src into dst. On entry
+ * *dstsize is the capacity of dst; on return it is the number of bytes
+ * written. Decoding stops once the input or the capacity is exhausted.
+ */
+void
+DictFilter::Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize)
+{
+	u32 i = 0, j;
+	u32 dstPos = 0, idx;
+	const u32 dstMax = *dstsize;
+
+	while (dstPos < dstMax && i < size) {
+		if (src[i] >= 0x82 && src[i] < maxSymbol) {
+			idx = wordIndex[src[i]];
+			/* Bound the expansion so a word can never run past the end of dst. */
+			for (j = 0; wordList[idx][j] && dstPos < dstMax; j++)
+				dst[dstPos++] = wordList[idx][j];
+		} else if (src[i] == 254 && (i+1 < size && src[i+1] >= 0x82)) {
+			i++;	/* Skip the escape byte and copy the literal that follows. */
+			dst[dstPos++] = src[i];
+		} else {
+			dst[dstPos++] = src[i];
+		}
+		i++;
+	}
+	*dstsize = dstPos;
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Allocate a filter context; release it with delete_dict_context(). */
+void *
+new_dict_context()
+{
+	DictFilter *df = new DictFilter();
+	return (static_cast<void *>(df));
+}
+
+/* Destroy a context created by new_dict_context(); NULL is ignored. */
+void
+delete_dict_context(void *dict_ctx)
+{
+	if (dict_ctx) {
+		DictFilter *df = static_cast<DictFilter *>(dict_ctx);
+		delete df;
+	}
+}
+
+/*
+ * Dict-encode fromlen bytes into 'to', preceded by the original length as a
+ * 32-bit value stored via LE32(). *dstlen is the capacity of 'to' on entry
+ * and the total bytes written on success. Returns 0 on success, -1 otherwise.
+ */
+int
+dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
+{
+	DictFilter *df = static_cast<DictFilter *>(dict_ctx);
+	u32 fl, dl;
+
+	/* Lengths must fit in 32 bits and 'to' must have room for the 4-byte header. */
+	if (fromlen > UINT32_MAX || *dstlen < 4)
+		return (-1);
+	fl = (u32)fromlen;
+	dl = (*dstlen - 4 > UINT32_MAX) ? UINT32_MAX : (u32)(*dstlen - 4);
+	U32_P(to) = LE32(fl);
+	if (df->Forward_Dict(from, fl, to + 4, &dl)) {
+		*dstlen = dl + 4;
+		return (0);
+	}
+	return (-1);
+}
+
+/*
+ * Reverse dict_encode(): read the 32-bit original length from 'from' and
+ * decode the rest into 'to'. *dstlen is the capacity of 'to' on entry and
+ * the decoded length on success. Returns 0 on success, -1 otherwise.
+ */
+int
+dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
+{
+	DictFilter *df = static_cast<DictFilter *>(dict_ctx);
+	u32 dl;
+
+	/* The 4-byte length header must be present and the payload fit in 32 bits. */
+	if (fromlen < 4 || fromlen - 4 > UINT32_MAX)
+		return (-1);
+	dl = LE32(U32_P(from));	/* Mirrors the LE32() applied by dict_encode(). */
+	if (dl > *dstlen) {
+		log_msg(LOG_ERR, 0, "Destination overflow in dict_decode.");
+		return (-1);
+	}
+	*dstlen = dl;
+	df->Inverse_Dict(from + 4, (u32)(fromlen - 4), to, &dl);
+	if (dl < *dstlen)
+		return (-1);
+	return (0);
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/filters/dict/DictFilter.h b/filters/dict/DictFilter.h
new file mode 100644
index 0000000..4b4baee
--- /dev/null
+++ b/filters/dict/DictFilter.h
@@ -0,0 +1,80 @@
+/*
+ * This file is a part of Pcompress, a chunked parallel multi-
+ * algorithm lossless compression and decompression program.
+ *
+ * Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ * moinakg@gmail.com, http://moinakg.wordpress.com/
+ */
+
+/*
+ * Dict filter for text files. Adapted from Public Domain sources
+ * of Fu Siyuan's CSC 3.2 archiver.
+ */
+
+#ifndef _FILTERS_H
+#define _FILTERS_H
+
+#include <utils.h>	/* NOTE(review): header name was lost in extraction and is reconstructed - confirm. */
+
+#ifdef __cplusplus
+#include "Common.h"
+#define MAX_WORDTREE_NODE_NUM 300 //Enough now!
+
+/* Word-substitution filter for text; C++ only, so hidden from C translation units. */
+class DictFilter
+{
+public:
+	~DictFilter();
+	DictFilter();
+
+	/* Forward_Dict returns non-zero on success; both store the bytes written to dst in *dstsize. */
+	u32 Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize);
+	void Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize);
+
+private:
+	typedef struct
+	{
+		u32 next[26];
+		u8 symbol;
+	} CTreeNode;
+	/* Used for DICT transformer. Words are stored in trees. */
+	CTreeNode wordTree[MAX_WORDTREE_NODE_NUM];
+	u32 nodeMum;
+	u8 maxSymbol;
+	/* Used for DICT untransformer. Chooses words by symbols. */
+	u32 wordIndex[256];
+	void MakeWordTree(); //Init the DICT transformer
+};
+#endif /* __cplusplus */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void *new_dict_context(void);
+void delete_dict_context(void *dict_ctx);
+
+int dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
+int dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/pcompress.c b/pcompress.c
index 7d44fd7..b7d0073 100644
--- a/pcompress.c
+++ b/pcompress.c
@@ -191,7 +191,6 @@ show_compression_stats(pc_ctx_t *pctx)
 
 /*
  * Wrapper functions to pre-process the buffer and then call the main compression routine.
- * At present only LZP pre-compression is used below. Some extra metadata is added:
  *
  * Byte 0: A flag to indicate which pre-processor was used.
  * Byte 1 - Byte 8: Size of buffer after pre-processing
diff --git a/utils/utils.h b/utils/utils.h
index e61855f..4271d3a 100644
--- a/utils/utils.h
+++ b/utils/utils.h
@@ -277,7 +277,7 @@ typedef enum {
 /*
  * Sub-types.
  */
-#define NUM_SUB_TYPES 33
+#define NUM_SUB_TYPES 34
 	TYPE_EXE32 = 8,
 	TYPE_JPEG = 16,
 	TYPE_MARKUP = 24,
@@ -309,7 +309,8 @@ typedef enum {
 	TYPE_DICOM = 232,
 	TYPE_PNM = 240,
 	TYPE_PACKPNM = 248,
-	TYPE_WAV = 256
+	TYPE_WAV = 256,
+	TYPE_ENGLISH = 264
 } data_type_t;
 
 /*