Dict filter work in progress.
This commit is contained in:
parent
f34962f8cc
commit
4fedebc607
6 changed files with 436 additions and 4 deletions
10
Makefile.in
10
Makefile.in
|
@ -158,6 +158,10 @@ DISPACKSRCS = filters/dispack/dis.cpp
|
|||
DISPACKHDRS = filters/dispack/dis.hpp filters/dispack/types.hpp
|
||||
DISPACKOBJS = $(DISPACKSRCS:.cpp=.o)
|
||||
|
||||
DICTSRCS = filters/dict/DictFilter.cpp
|
||||
DICTHDRS = filters/dict/DictFilter.h filters/dict/Common.h
|
||||
DICTOBJS = $(DICTSRCS:.cpp=.o)
|
||||
|
||||
SKEIN_BLOCK_C = crypto/skein/skein_block.c
|
||||
SKEIN_BLOCK_ASM = crypto/skein/skein_block_x64.s
|
||||
SKEIN_BLOCK_SRC = @SKEIN_BLOCK@
|
||||
|
@ -246,7 +250,7 @@ $(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) $(DELTA2OBJS) @LIBBSCWRAPOBJ@ $(SKEINOBJS)
|
|||
$(SKEIN_BLOCK_OBJ) @SHA2ASM_OBJS@ @SHA2_OBJS@ $(KECCAK_OBJS) $(KECCAK_OBJS_ASM) \
|
||||
$(TRANSP_OBJS) $(CRYPTO_OBJS) $(ZLIB_OBJS) $(BZLIB_OBJS) $(XXHASH_OBJS) $(BLAKE2_OBJS) \
|
||||
@CRYPTO_COMPAT_OBJS@ $(CRYPTO_ASM_OBJS) $(ARCHIVEOBJS) $(PJPGOBJS) $(DISPACKOBJS) $(PPNMOBJS) \
|
||||
$(WAVPKOBJS)
|
||||
$(WAVPKOBJS) $(DICTOBJS)
|
||||
|
||||
DEBUG_LINK = $(GPP) -pthread @LIBBSCGEN_OPT@ @EXTRA_OPT_FLAGS@ -fopenmp -fPIC
|
||||
DEBUG_COMPILE = $(GCC) -g -c @EXTRA_OPT_FLAGS@ -fPIC @USE_CLANG_AS@
|
||||
|
@ -340,6 +344,10 @@ $(DISPACKOBJS): $(DISPACKSRCS) $(DISPACKHDRS)
|
|||
$(COMPILE_cpp) $(COMMON_VEC_FLAGS) @DEBUG_STATS_CPPFLAGS@ @SSE_OPT_FLAGS@ @USE_CLANG_AS@ -O2 -fsched-spec-load \
|
||||
-Wno-variadic-macros $(VEC_FLAGS) $(COMMON_CPPFLAGS_cpp) $(@:.o=.cpp) -o $@
|
||||
|
||||
$(DICTOBJS): $(DICTSRCS) $(DICTHDRS)
|
||||
$(COMPILE_cpp) $(COMMON_VEC_FLAGS) @DEBUG_STATS_CPPFLAGS@ @SSE_OPT_FLAGS@ @USE_CLANG_AS@ -O2 -fsched-spec-load \
|
||||
-Wno-variadic-macros $(VEC_FLAGS) $(COMMON_CPPFLAGS_cpp) $(@:.o=.cpp) -o $@
|
||||
|
||||
$(SKEIN_BLOCK_OBJ): $(SKEIN_BLOCK_SRC)
|
||||
$(COMPILE) $(SKEIN_FLAGS) $(SKEIN_BLOCK_SRC) -o $@
|
||||
|
||||
|
|
66
filters/dict/Common.h
Normal file
66
filters/dict/Common.h
Normal file
|
@ -0,0 +1,66 @@
|
|||
#ifndef _DATATYPE_H
|
||||
#define _DATATYPE_H
|
||||
|
||||
/*
 * The fixed-width aliases below are built on <stdint.h> types, so pull the
 * header in here instead of relying on every includer to have done so first.
 */
#include <stdint.h>

#define CSA_VERSION 8

/* Short integer aliases used throughout the dict filter sources. */
typedef unsigned char u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
typedef int i32;
typedef int64_t i64;
|
||||
|
||||
/* Byte-count units. */
const u32 KB = 1u << 10;
const u32 MB = 1u << 20;

const u32 MinBlockSize = 8 * KB;

/* MaxChunkSize is 2^(MaxChunkBits - 1) bytes. */
const u32 MaxChunkBits = 21;
const u32 MaxChunkSize = 1u << (MaxChunkBits - 1);

const u32 MaxDictSize = 512 * MB;	/* Don't change */
const u32 DefaultOutStreamBlockSize = 128 * KB;

/* Must be at least MaxChunkSize. */
const u32 DefaultInBufferSize = MaxChunkSize;

/* Number of entries in DltIndex. */
#define DLT_CHANNEL_MAX 5
const u32 DltIndex[DLT_CHANNEL_MAX] = { 1, 2, 3, 4, 8 };
|
||||
|
||||
|
||||
/*
 * Release a heap pointer and reset it to NULL so it cannot be freed twice.
 * free(NULL) is a no-op, so no explicit NULL test is needed. The argument is
 * parenthesized at every use; it is evaluated twice, so it must be an lvalue
 * expression without side effects. The includer must provide <stdlib.h>.
 */
#define SAFEFREE(x) do { free(x); (x) = NULL; } while (0)

/* Mode selectors; presumably transform direction - not referenced in the dict filter code itself. */
#define ENCODE 1
#define DECODE 2
|
||||
|
||||
|
||||
/*
 * Status codes: zero is success, every failure code is negative.
 * Failure codes are listed in ascending numeric order.
 */
#define NO_ERROR		0

#define CANT_OPEN_FILE		(-100)
#define CANT_CREATE_FILE	(-99)
#define NOT_CSC_FILE		(-98)
#define VERSION_INVALID		(-97)
#define DECODE_ERROR		(-96)
#define CSC_FILE_INVALID	(-95)
#define CANT_ALLOC_MEM		(-94)
#define ALREADY_INITIALIZED	(-93)
#define OPERATION_ERROR		(-92)
#define FILE_DIDNT_OPEN		(-91)
|
||||
|
||||
/*
 * Block type tags, listed in ascending numeric order.
 * SIG_EOF shares the same numbering space as the DT_* values.
 */
#define DT_NONE		0
#define DT_SKIP		0x01
#define DT_NORMAL	0x02
#define DT_BAD		0x03
#define DT_EXE		0x04
#define DT_HARD		0x05
#define DT_AUDIO	0x06
#define DT_RGB		0x07
#define DT_FAST		0x08
#define SIG_EOF		0x09
#define DT_ENGTXT	0x0A
#define DT_DLT		0x10
#define DT_MAXINDEX	0x1F
|
||||
|
||||
|
||||
#endif
|
278
filters/dict/DictFilter.cpp
Normal file
278
filters/dict/DictFilter.cpp
Normal file
|
@ -0,0 +1,278 @@
|
|||
/*
|
||||
* This file is a part of Pcompress, a chunked parallel multi-
|
||||
* algorithm lossless compression and decompression program.
|
||||
*
|
||||
* Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this program.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* moinakg@gmail.com, http://moinakg.wordpress.com/
|
||||
*/
|
||||
|
||||
/*
|
||||
* Dict filter for text files. Adapted from Public Domain sources
|
||||
* of Fu Siyuan's CSC 3.2 archiver.
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <strings.h>
|
||||
#include <stdint.h>
|
||||
#include "DictFilter.h"
|
||||
#include "Common.h"
|
||||
|
||||
const u32 wordNum = 123;
|
||||
|
||||
u8 wordList[wordNum][8] =
|
||||
{
|
||||
"",
|
||||
"ac","ad","ai","al","am",
|
||||
"an","ar","as","at","ea",
|
||||
"ec","ed","ee","el","en",
|
||||
"er","es","et","id","ie",
|
||||
"ig","il","in","io","is",
|
||||
"it","of","ol","on","oo",
|
||||
"or","os","ou","ow","ul",
|
||||
"un","ur","us","ba","be",
|
||||
"ca","ce","co","ch","de",
|
||||
"di","ge","gh","ha","he",
|
||||
"hi","ho","ra","re","ri",
|
||||
"ro","rs","la","le","li",
|
||||
"lo","ld","ll","ly","se",
|
||||
"si","so","sh","ss","st",
|
||||
"ma","me","mi","ne","nc",
|
||||
"nd","ng","nt","pa","pe",
|
||||
"ta","te","ti","to","th",
|
||||
"tr","wa","ve",
|
||||
"all","and","but","dow",
|
||||
"for","had","hav","her",
|
||||
"him","his","man","mor",
|
||||
"not","now","one","out",
|
||||
"she","the","was","wer",
|
||||
"whi","whe","wit","you",
|
||||
"any","are",
|
||||
"that","said","with","have",
|
||||
"this","from","were","tion",
|
||||
};
|
||||
|
||||
|
||||
void
|
||||
DictFilter::MakeWordTree()
|
||||
{
|
||||
u32 i,j;
|
||||
u32 treePos;
|
||||
u8 symbolIndex = 0x82;
|
||||
|
||||
nodeMum = 1;
|
||||
|
||||
memset(wordTree,0,sizeof(wordTree));
|
||||
|
||||
for (i = 1; i < wordNum; i++) {
|
||||
treePos = 0;
|
||||
for(j = 0; wordList[i][j] != 0; j++) {
|
||||
u32 idx = wordList[i][j] - 'a';
|
||||
if (wordTree[treePos].next[idx]) {
|
||||
treePos = wordTree[treePos].next[idx];
|
||||
} else {
|
||||
wordTree[treePos].next[idx] = nodeMum;
|
||||
treePos = nodeMum;
|
||||
nodeMum++;
|
||||
}
|
||||
}
|
||||
wordIndex[symbolIndex] = i;
|
||||
wordTree[treePos].symbol = symbolIndex++;
|
||||
}
|
||||
|
||||
maxSymbol=symbolIndex;
|
||||
|
||||
}
|
||||
|
||||
|
||||
/* Construct a filter with its word trie and symbol table ready for use. */
DictFilter::DictFilter()
{
	MakeWordTree();
}
|
||||
|
||||
|
||||
|
||||
/* Empty destructor: no cleanup is performed here. */
DictFilter::~DictFilter()
{
}
|
||||
|
||||
|
||||
/*
 * Forward dictionary transform.
 *
 * Scans src for runs of lowercase letters and replaces the longest trie
 * match starting at each position with its one-byte symbol. All other bytes
 * are copied through; a literal byte >= 0x82 is prefixed with the escape
 * byte 254 so the decoder can tell it apart from a word symbol.
 *
 * src, size - input buffer and its length in bytes.
 * dst       - output buffer. At most (u32)(size * 0.82) + 2 bytes are
 *             written to it.
 * dstsize   - receives the output length on success; not modified on failure.
 *
 * Returns 1 on success. Returns 0 when size is below 16384 or when the
 * output would exceed 82% of the input; dst may then hold partial output.
 */
u32
DictFilter::Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize)
{
	if (size < 16384)
		return 0;

	/*
	 * Output longer than this is rejected. For an integer length,
	 * "dstSize > size*0.82" is the same test as "dstSize > limit".
	 * Checking after every step (instead of once at the end) keeps writes
	 * to dst bounded: each step emits at most two bytes.
	 */
	const u32 limit = (u32)(size * 0.82);
	u32 i;
	u32 dstSize = 0;

	/*
	 * Main pass. Stopping 5 bytes short of the end keeps the look-ahead
	 * src[i+j] in bounds, since no dictionary word is longer than 4 letters.
	 */
	for (i = 0; i < size-5;) {
		if (src[i] >= 'a' && src[i] <= 'z') {
			u32 matchSymbol = 0, longestWord = 0;
			u32 treePos = 0;

			/* Descend the trie, remembering the longest complete word seen. */
			for (u32 j = 0;;) {
				/* Unsigned: bytes below 'a' wrap to a value above 25. */
				u32 idx = src[i+j] - 'a';
				if (idx > 25)
					break;

				u32 child = wordTree[treePos].next[idx];
				if (child == 0)
					break;

				treePos = child;
				j++;
				if (wordTree[treePos].symbol) {
					matchSymbol = wordTree[treePos].symbol;
					longestWord = j;
				}
			}

			if (matchSymbol) {
				dst[dstSize++] = (u8)matchSymbol;
				i += longestWord;
			} else {
				dst[dstSize++] = src[i];
				i++;
			}
		} else {
			if (src[i] >= 0x82)
				dst[dstSize++] = 254;	/* escape prefix */
			dst[dstSize++] = src[i];
			i++;
		}

		if (dstSize > limit)
			return 0;
	}

	/* Tail: too short for a safe trie look-ahead, so only escape/copy. */
	for (; i < size; i++) {
		if (src[i] >= 0x82)
			dst[dstSize++] = 254;
		dst[dstSize++] = src[i];

		if (dstSize > limit)
			return 0;
	}

	*dstsize = dstSize;
	return 1;
}
|
||||
|
||||
void
|
||||
DictFilter::Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize)
|
||||
{
|
||||
|
||||
u32 i = 0,j;
|
||||
u32 dstPos = 0,idx;
|
||||
|
||||
while(dstPos < *dstsize && i < size) {
|
||||
if (src[i] >= 0x82 && src[i] < maxSymbol) {
|
||||
idx = wordIndex[src[i]];
|
||||
for(j=0; wordList[idx][j]; j++)
|
||||
dst[dstPos++] = wordList[idx][j];
|
||||
}
|
||||
else if (src[i] == 254 && (i+1 < size && src[i+1] >= 0x82)) {
|
||||
i++;
|
||||
dst[dstPos++] = src[i];
|
||||
}
|
||||
else {
|
||||
dst[dstPos++] = src[i];
|
||||
}
|
||||
|
||||
i++;
|
||||
}
|
||||
*dstsize = dstPos;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void *
|
||||
new_dict_context()
|
||||
{
|
||||
DictFilter *df = new DictFilter();
|
||||
return (static_cast<void *>(df));
|
||||
}
|
||||
|
||||
void
|
||||
delete_dict_context(void *dict_ctx)
|
||||
{
|
||||
if (dict_ctx) {
|
||||
DictFilter *df = static_cast<DictFilter *>(dict_ctx);
|
||||
delete df;
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
|
||||
{
|
||||
DictFilter *df = static_cast<DictFilter *>(dict_ctx);
|
||||
u32 fl = fromlen;
|
||||
u32 dl = *dstlen;
|
||||
u8 *dst;
|
||||
|
||||
if (fromlen > UINT32_MAX)
|
||||
return (-1);
|
||||
U32_P(to) = LE32(fromlen);
|
||||
dst = to + 4;
|
||||
dl -= 4;
|
||||
if (df->Forward_Dict(from, fl, dst, &dl)) {
|
||||
*dstlen = dl + 4;
|
||||
return (0);
|
||||
}
|
||||
return (-1);
|
||||
}
|
||||
|
||||
int
|
||||
dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
|
||||
{
|
||||
DictFilter *df = static_cast<DictFilter *>(dict_ctx);
|
||||
u32 fl = fromlen;
|
||||
u32 dl;
|
||||
u8 *src;
|
||||
|
||||
dl = U32_P(from);
|
||||
if (dl > *dstlen) {
|
||||
log_msg(LOG_ERR, 0, "Destination overflow in dict_decode.");
|
||||
return (-1);
|
||||
}
|
||||
*dstlen = dl;
|
||||
src = from + 4;
|
||||
fl -= 4;
|
||||
|
||||
df->Inverse_Dict(src, fl, to, &dl);
|
||||
if (dl < *dstlen)
|
||||
return (-1);
|
||||
return (0);
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
80
filters/dict/DictFilter.h
Normal file
80
filters/dict/DictFilter.h
Normal file
|
@ -0,0 +1,80 @@
|
|||
/*
|
||||
* This file is a part of Pcompress, a chunked parallel multi-
|
||||
* algorithm lossless compression and decompression program.
|
||||
*
|
||||
* Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this program.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* moinakg@gmail.com, http://moinakg.wordpress.com/
|
||||
*/
|
||||
|
||||
/*
|
||||
* Dict filter for text files. Adapted from Public Domain sources
|
||||
* of Fu Siyuan's CSC 3.2 archiver.
|
||||
*/
|
||||
|
||||
#ifndef _FILTERS_H
|
||||
#define _FILTERS_H
|
||||
|
||||
#include <utils.h>
|
||||
|
||||
#include "Common.h"
|
||||
#define MAX_WORDTREE_NODE_NUM 300 //Enough now!
|
||||
|
||||
class DictFilter
|
||||
{
|
||||
public:
|
||||
~DictFilter();
|
||||
DictFilter();
|
||||
|
||||
u32 Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize);
|
||||
void Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize);
|
||||
|
||||
private:
|
||||
typedef struct
|
||||
{
|
||||
u32 next[26];
|
||||
u8 symbol;
|
||||
} CTreeNode;
|
||||
CTreeNode wordTree[MAX_WORDTREE_NODE_NUM];
|
||||
u32 nodeMum;
|
||||
u8 maxSymbol;
|
||||
//Used for DICT transformer. Words are stored in trees.
|
||||
|
||||
u32 wordIndex[256];
|
||||
//Used for DICT untransformer.choose words by symbols.
|
||||
void MakeWordTree(); //Init the DICT transformer
|
||||
|
||||
u32 x0,x1;
|
||||
u32 i,k;
|
||||
};
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Create a dict filter context; returned as an opaque handle. */
void *new_dict_context();
/* Destroy a context created by new_dict_context(); a NULL handle is ignored. */
void delete_dict_context(void *dict_ctx);

/*
 * Apply the forward dict transform to from[0..fromlen) and write a 4-byte
 * length header followed by the transformed data into 'to'. On success
 * returns 0 and sets *dstlen to the number of bytes written; returns -1 if
 * fromlen exceeds UINT32_MAX or the transform was not applied.
 */
int dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
/*
 * Reverse dict_encode(). On entry *dstlen is the capacity of 'to'; it is set
 * to the length recorded in the header. Returns 0 on success, -1 if that
 * length exceeds the capacity or fewer bytes than recorded were decoded.
 */
int dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
|
@ -191,7 +191,6 @@ show_compression_stats(pc_ctx_t *pctx)
|
|||
|
||||
/*
|
||||
* Wrapper functions to pre-process the buffer and then call the main compression routine.
|
||||
* At present only LZP pre-compression is used below. Some extra metadata is added:
|
||||
*
|
||||
* Byte 0: A flag to indicate which pre-processor was used.
|
||||
* Byte 1 - Byte 8: Size of buffer after pre-processing
|
||||
|
|
|
@ -277,7 +277,7 @@ typedef enum {
|
|||
/*
|
||||
* Sub-types.
|
||||
*/
|
||||
#define NUM_SUB_TYPES 33
|
||||
#define NUM_SUB_TYPES 34
|
||||
TYPE_EXE32 = 8,
|
||||
TYPE_JPEG = 16,
|
||||
TYPE_MARKUP = 24,
|
||||
|
@ -309,7 +309,8 @@ typedef enum {
|
|||
TYPE_DICOM = 232,
|
||||
TYPE_PNM = 240,
|
||||
TYPE_PACKPNM = 248,
|
||||
TYPE_WAV = 256
|
||||
TYPE_WAV = 256,
|
||||
TYPE_ENGLISH = 264
|
||||
} data_type_t;
|
||||
|
||||
/*
|
||||
|
|
Loading…
Reference in a new issue