pcompress/filters/dict/DictFilter.cpp
Moinak Ghosh e7081eb5a3 Git commit - rehash. Incorrect earlier commit.
Implement Separate metadata stream.
Fix blatant wrong check in Bzip2 compressor.
Implement E8E9 filter fallback in Dispack.
Improve dict buffer size checks.
Reduce thread count to control memory usage in archive mode.
2014-10-24 23:30:40 +05:30

347 lines
7 KiB
C++

/*
* This file is a part of Pcompress, a chunked parallel multi-
* algorithm lossless compression and decompression program.
*
* Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
* Use is subject to license terms.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program.
* If not, see <http://www.gnu.org/licenses/>.
*
* moinakg@gmail.com, http://moinakg.wordpress.com/
*/
/*
* Dict filter for text files. Adapted from Public Domain sources
* of Fu Siyuan's CSC 3.2 archiver.
*/
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <stdint.h>
#include <stdio.h>
#include "DictFilter.h"
#include "Common.h"
#include "utils.h"
/*
 * Buffer classifier implemented outside this file (C linkage).
 * dict_encode() passes its return value through PC_TYPE() and only runs
 * the dictionary filter when the result equals TYPE_TEXT.
 */
extern "C" {
extern int analyze_buffer(void *src, uint64_t srclen);
}
/*
 * Word-substitution filter for text buffers.
 *
 * Forward_Dict() replaces lowercase words found in wordList with single
 * code bytes (0x82 up to, but not including, maxSymbol) and prefixes input
 * bytes >= 0x82 with the escape byte 254. Inverse_Dict() reverses that
 * mapping.
 */
class DictFilter
{
public:
	DictFilter();
	~DictFilter();

	u32 Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize);
	void Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize);

private:
	/*
	 * One node of the letter tree built by MakeWordTree().
	 * next[c - 'a'] is the wordTree index of the child reached through
	 * letter c, or 0 when there is no such child. symbol is the code byte
	 * of the word that ends at this node, or 0 when no word ends here.
	 */
	struct CTreeNode
	{
		u32 next[26];
		u8 symbol;
	};

	/* Builds wordTree and wordIndex from wordList; run by the constructor. */
	void MakeWordTree();

	/* Encoder side: the dictionary words stored as a tree, root at index 0. */
	CTreeNode wordTree[MAX_WORDTREE_NODE_NUM];
	/* Number of wordTree nodes in use (spelling kept as in the sources). */
	u32 nodeMum;
	/* One past the highest code byte assigned to a word. */
	u8 maxSymbol;
	/* Decoder side: maps a code byte to the index of its word in wordList. */
	u32 wordIndex[256];

	/* Not referenced by any method in this file. */
	u32 x0;
	u32 x1;
	u32 i;
	u32 k;
};
/*
 * Dictionary of lowercase words and letter groups replaced by the filter.
 *
 * Entry 0 is an empty placeholder: MakeWordTree() starts at index 1 and
 * gives entry i the code byte 0x82 + (i - 1). The order of the entries
 * therefore determines the encoded byte values and must not be changed
 * without breaking previously encoded data.
 *
 * Each entry is a NUL terminated string; Inverse_Dict() copies an entry
 * up to its terminating NUL.
 */
const u32 wordNum = 123;
u8 wordList[wordNum][8] =
{
"",
/* two-letter entries */
"ac","ad","ai","al","am",
"an","ar","as","at","ea",
"ec","ed","ee","el","en",
"er","es","et","id","ie",
"ig","il","in","io","is",
"it","of","ol","on","oo",
"or","os","ou","ow","ul",
"un","ur","us","ba","be",
"ca","ce","co","ch","de",
"di","ge","gh","ha","he",
"hi","ho","ra","re","ri",
"ro","rs","la","le","li",
"lo","ld","ll","ly","se",
"si","so","sh","ss","st",
"ma","me","mi","ne","nc",
"nd","ng","nt","pa","pe",
"ta","te","ti","to","th",
"tr","wa","ve",
/* three-letter entries */
"all","and","but","dow",
"for","had","hav","her",
"him","his","man","mor",
"not","now","one","out",
"she","the","was","wer",
"whi","whe","wit","you",
"any","are",
/* four-letter entries */
"that","said","with","have",
"this","from","were","tion",
};
void
DictFilter::MakeWordTree()
{
u32 i,j;
u32 treePos;
u8 symbolIndex = 0x82;
nodeMum = 1;
memset(wordTree,0,sizeof(wordTree));
for (i = 1; i < wordNum; i++) {
treePos = 0;
for(j = 0; wordList[i][j] != 0; j++) {
u32 idx = wordList[i][j] - 'a';
if (wordTree[treePos].next[idx]) {
treePos = wordTree[treePos].next[idx];
} else {
wordTree[treePos].next[idx] = nodeMum;
treePos = nodeMum;
nodeMum++;
}
}
wordIndex[symbolIndex] = i;
wordTree[treePos].symbol = symbolIndex++;
}
maxSymbol=symbolIndex;
}
/*
 * Construct a filter with its word tree and symbol lookup table built.
 */
DictFilter::DictFilter()
{
	this->MakeWordTree();
}
/*
 * Nothing to release: the destructor body is intentionally empty.
 */
DictFilter::~DictFilter() {}
u32
DictFilter::Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize)
{
if (size < 16384)
return 0;
u32 i,j,treePos = 0;
u32 lastSymbol = 0;
u32 dstSize = 0;
int idx;
for(i = 0; i < size-5;) {
if (src[i] >= 'a' && src[i] <= 'z') {
u32 matchSymbol = 0,longestWord = 0;
treePos = 0;
for(j = 0;;) {
idx = src[i+j] - 'a';
if (idx < 0 || idx > 25)
break;
if (wordTree[treePos].next[idx] == 0)
break;
treePos=wordTree[treePos].next[idx];
j++;
if (wordTree[treePos].symbol) {
matchSymbol = wordTree[treePos].symbol;
longestWord = j;
}
}
if (matchSymbol) {
dst[dstSize++] = matchSymbol;
i += longestWord;
continue;
}
lastSymbol = 0;
dst[dstSize++] = src[i];
i++;
} else {
if (src[i] >= 0x82) {
dst[dstSize++] = 254;
dst[dstSize++] = src[i];
}
else
dst[dstSize++] = src[i];
lastSymbol = 0;
treePos = 0;
i++;
}
}
for (; i<size; i++) {
if (src[i] >= 0x82) {
dst[dstSize++] = 254;
dst[dstSize++] = src[i];
}
else
dst[dstSize++] = src[i];
}
if (dstSize > size*0.82)
return 0;
*dstsize = dstSize;
return 1;
}
/*
 * Decode a buffer produced by Forward_Dict().
 *
 * src      encoded input
 * size     number of encoded bytes
 * dst      output buffer
 * dstsize  in: number of bytes to produce / available in dst
 *          out: number of bytes actually written
 *
 * Decoding stops when dst is full or the input is exhausted. It also stops
 * early, without writing the word, when a dictionary word would not fit
 * into the remaining space; this only happens with malformed input and
 * leaves *dstsize below the requested length so the caller can detect it.
 */
void
DictFilter::Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize)
{
	const u32 capacity = *dstsize;
	u32 i = 0;
	u32 dstPos = 0;

	while (dstPos < capacity && i < size) {
		const u8 c = src[i];

		if (c >= 0x82 && c < maxSymbol) {
			/* Code byte: expand to the dictionary word it stands for. */
			const u8 *word = wordList[wordIndex[c]];
			u32 len = 0;

			while (word[len] != 0)
				len++;
			if (len > capacity - dstPos)
				break;
			memcpy(dst + dstPos, word, len);
			dstPos += len;
		} else if (c == 254 && i+1 < size && src[i+1] >= 0x82) {
			/* Escape byte: the following byte is a literal >= 0x82. */
			i++;
			dst[dstPos++] = src[i];
		} else {
			dst[dstPos++] = c;
		}
		i++;
	}
	*dstsize = dstPos;
}
#ifdef __cplusplus
extern "C" {
#endif
void *
new_dict_context()
{
DictFilter *df = new DictFilter();
return (static_cast<void *>(df));
}
/*
 * Destroy a context obtained from new_dict_context().
 * A null pointer is accepted and ignored.
 */
void
delete_dict_context(void *dict_ctx)
{
	/* A delete-expression on a null pointer does nothing. */
	delete static_cast<DictFilter *>(dict_ctx);
}
/*
 * Apply the dictionary filter to a buffer.
 *
 * dict_ctx  context from new_dict_context()
 * from      input buffer
 * fromlen   input length; must fit into 32 bits
 * to        output buffer
 * dstlen    in: bytes available in to
 *           out: bytes produced (only updated on success)
 *
 * Output layout: a 4-byte header holding the original length (stored via
 * LE32) followed by the encoded payload.
 *
 * Returns 1 when the buffer was encoded. Returns -1 when the filter was
 * not applied: input too large, output buffer too small for the header,
 * content not classified as TYPE_TEXT, or Forward_Dict() declined.
 * The header bytes of to may have been written even when -1 is returned.
 */
int
dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
{
	DictFilter *df = static_cast<DictFilter *>(dict_ctx);
	u32 fl;
	u32 dl;
	int atype;
	uchar_t *dst;
	DEBUG_STAT_EN(double strt, en);

	/*
	 * Dict can't handle > 4GB buffers :-O
	 */
	if (fromlen > UINT32_MAX)
		return (-1);

	/* The 4-byte length header plus at least one payload byte must fit. */
	if (*dstlen <= 4)
		return (-1);

	fl = (u32)fromlen;
	/* Clamp instead of truncating: a huge buffer must not look tiny. */
	dl = (*dstlen > UINT32_MAX) ? UINT32_MAX : (u32)(*dstlen);

	DEBUG_STAT_EN(strt = get_wtime_millis());
	atype = analyze_buffer(from, fromlen);
	if (PC_TYPE(atype) == TYPE_TEXT) {
		U32_P(to) = LE32(fl);
		dst = to + 4;
		dl -= 4;
		if (df->Forward_Dict(from, fl, dst, &dl)) {
			/* Header (4 bytes) + payload; nothing else was written. */
			*dstlen = (uint64_t)dl + 4;
			DEBUG_STAT_EN(en = get_wtime_millis());
			DEBUG_STAT_EN(fprintf(stderr, "DICT: fromlen: %" PRIu64 ", dstlen: %" PRIu64 "\n",
			    fromlen, *dstlen));
			DEBUG_STAT_EN(fprintf(stderr, "DICT: Processed at %.3f MB/s\n",
			    get_mb_s(fromlen, strt, en)));
			return (1);
		}
	}
	DEBUG_STAT_EN(fprintf(stderr, "No DICT\n"));
	return (-1);
}
int
dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
{
DictFilter *df = static_cast<DictFilter *>(dict_ctx);
u32 fl;
u32 dl;
u8 *src;
DEBUG_STAT_EN(double strt, en);
if (fromlen > UINT32_MAX) {
log_msg(LOG_ERR, 0, "Dict decode buffer too big!");
return (-1);
}
fl = (u32)fromlen;
DEBUG_STAT_EN(strt = get_wtime_millis());
dl = U32_P(from);
if (dl > *dstlen) {
log_msg(LOG_ERR, 0, "Destination overflow in dict_decode. Need: %" PRIu64 ", Got: %" PRIu64 "\n",
dl, *dstlen);
return (-1);
}
*dstlen = dl;
src = from + 4;
fl -= 4;
df->Inverse_Dict(src, fl, to, &dl);
if (dl < *dstlen) {
log_msg(LOG_ERR, 0, "dict_decode: Expected: %" PRIu64 ", Got: %" PRIu64 "\n",
*dstlen, dl);
return (-1);
}
DEBUG_STAT_EN(en = get_wtime_millis());
DEBUG_STAT_EN(fprintf(stderr, "DICT: fromlen: %" PRIu64 ", dstlen: %" PRIu64 "\n",
fromlen, *dstlen));
DEBUG_STAT_EN(fprintf(stderr, "DICT: Processed at %.3f MB/s\n",
get_mb_s(fromlen, strt, en)));
return (0);
}
#ifdef __cplusplus
}
#endif