Update and simplify the analyzer function to indicate text data for the Dict filter.

Fix archive header writing bug.
Strip ^M chars from dict filter files.
Include DICT preprocessing type.
Fix a bunch of bugs found by Xcode.
This commit is contained in:
Moinak Ghosh 2014-09-20 12:49:00 +05:30
parent 4fedebc607
commit 071a9e2b26
13 changed files with 617 additions and 448 deletions

1
.gitignore vendored
View file

@ -24,3 +24,4 @@ libtool
stamp-h1 stamp-h1
.libs .libs
buildtmp buildtmp
*.dSYM

View file

@ -31,9 +31,11 @@ GPP=@GPP@
LIBVER=1 LIBVER=1
MAINSRCS = utils/utils.c allocator.c lzma_compress.c ppmd_compress.c \ MAINSRCS = utils/utils.c allocator.c lzma_compress.c ppmd_compress.c \
adaptive_compress.c lzfx_compress.c lz4_compress.c none_compress.c \ adaptive_compress.c lzfx_compress.c lz4_compress.c none_compress.c \
utils/xxhash_base.c utils/heap.c utils/cpuid.c pcompress.c utils/xxhash_base.c utils/heap.c utils/cpuid.c filters/analyzer/analyzer.c \
pcompress.c
MAINHDRS = allocator.h pcompress.h utils/utils.h utils/xxhash.h utils/heap.h \ MAINHDRS = allocator.h pcompress.h utils/utils.h utils/xxhash.h utils/heap.h \
utils/cpuid.h utils/xxhash.h archive/pc_archive.h filters/dispack/dis.hpp utils/cpuid.h utils/xxhash.h archive/pc_archive.h filters/dispack/dis.hpp \
filters/analyzer/analyzer.h
MAINOBJS = $(MAINSRCS:.c=.o) MAINOBJS = $(MAINSRCS:.c=.o)
PROGSRCS = main.c PROGSRCS = main.c
@ -233,7 +235,7 @@ BASE_CPPFLAGS = -I. -I./lzma -I./lzfx -I./lz4 -I./rabin -I./bsdiff -DNODEFAULT_P
-I./crypto/scrypt -I./crypto/aes -I./crypto @KEYLEN@ -I./rabin/global \ -I./crypto/scrypt -I./crypto/aes -I./crypto @KEYLEN@ -I./rabin/global \
-I./crypto/keccak -I./filters/transpose -I./crypto/blake2 $(EXTRA_CPPFLAGS) \ -I./crypto/keccak -I./filters/transpose -I./crypto/blake2 $(EXTRA_CPPFLAGS) \
-I./crypto/xsalsa20 -I./archive -pedantic -Wall -I./filters -fno-strict-aliasing \ -I./crypto/xsalsa20 -I./archive -pedantic -Wall -I./filters -fno-strict-aliasing \
-Wno-unused-but-set-variable -Wno-enum-compare \ -Wno-unused-but-set-variable -Wno-enum-compare -I./filters/analyzer \
@COMPAT_CPPFLAGS@ @XSALSA20_DEBUG@ -I@LIBARCHIVE_DIR@/libarchive -I./filters/packjpg \ @COMPAT_CPPFLAGS@ @XSALSA20_DEBUG@ -I@LIBARCHIVE_DIR@/libarchive -I./filters/packjpg \
-I./filters/packpnm @ENABLE_WAVPACK@ -I./filters/packpnm @ENABLE_WAVPACK@
COMMON_CPPFLAGS = $(BASE_CPPFLAGS) -std=gnu99 COMMON_CPPFLAGS = $(BASE_CPPFLAGS) -std=gnu99

View file

@ -175,7 +175,7 @@ creat_write_callback(struct archive *arc, void *ctx, const void *buf, size_t len
pctx->btype = pctx->ctype; pctx->btype = pctx->ctype;
} else { } else {
if (pctx->arc_buf_pos < pctx->min_chunk) { if (pctx->arc_buf_pos < pctx->min_chunk) {
uint32_t diff = pctx->min_chunk - pctx->arc_buf_pos; int diff = pctx->min_chunk - (int)(pctx->arc_buf_pos);
if (len > diff) if (len > diff)
pctx->btype = pctx->ctype; pctx->btype = pctx->ctype;
else else
@ -918,9 +918,10 @@ copy_file_data(pc_ctx_t *pctx, struct archive *arc, struct archive_entry *entry,
size_t sz, offset, len; size_t sz, offset, len;
ssize_t bytes_to_write; ssize_t bytes_to_write;
uchar_t *mapbuf; uchar_t *mapbuf;
int rv, fd; int rv, fd, typ1;
const char *fpath; const char *fpath;
typ1 = typ;
offset = 0; offset = 0;
rv = 0; rv = 0;
sz = archive_entry_size(entry); sz = archive_entry_size(entry);
@ -1014,6 +1015,11 @@ do_map:
} else { } else {
return (ARCHIVE_OK); return (ARCHIVE_OK);
} }
} else {
if (write_header(arc, entry) == -1) {
close(fd);
return (-1);
}
} }
} else { } else {
if (write_header(arc, entry) == -1) { if (write_header(arc, entry) == -1) {
@ -1029,7 +1035,7 @@ do_map:
* stage there is no need for blocking. * stage there is no need for blocking.
*/ */
wrtn = archive_write_data(arc, src, wlen); wrtn = archive_write_data(arc, src, wlen);
if (wrtn < wlen) { if (wrtn < (ssize_t)wlen) {
/* Write failed; this is bad */ /* Write failed; this is bad */
log_msg(LOG_ERR, 0, "Data write error: %s", archive_error_string(arc)); log_msg(LOG_ERR, 0, "Data write error: %s", archive_error_string(arc));
rv = -1; rv = -1;

1
config
View file

@ -714,7 +714,6 @@ echo "*************** Running configure in libarchive ****************"
(cd $libarchive_dir (cd $libarchive_dir
CC=${GCC} ./configure --disable-bsdtar --disable-bsdcpio --without-zlib --without-bz2lib --without-lzmadec --without-lzma --without-lzo2 --without-nettle --without-openssl --without-xml2 --without-expat --disable-silent-rules --enable-shared=no --enable-static=yes --enable-xattr CC=${GCC} ./configure --disable-bsdtar --disable-bsdcpio --without-zlib --without-bz2lib --without-lzmadec --without-lzma --without-lzo2 --without-nettle --without-openssl --without-xml2 --without-expat --disable-silent-rules --enable-shared=no --enable-static=yes --enable-xattr
[ $? -ne 0 ] && exit $? [ $? -ne 0 ] && exit $?
cp Makefile Makefile.orig
cat Makefile | sed ' cat Makefile | sed '
s@$(BUILT_SOURCES)@@ s@$(BUILT_SOURCES)@@
s@$(libarchive_test_SOURCES)@@ s@$(libarchive_test_SOURCES)@@

View file

@ -0,0 +1,69 @@
/*
* This file is a part of Pcompress, a chunked parallel multi-
* algorithm lossless compression and decompression program.
*
* Copyright (C) 2012-2014 Moinak Ghosh. All rights reserved.
* Use is subject to license terms.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program.
* If not, see <http://www.gnu.org/licenses/>.
*
* moinakg@belenix.org, http://moinakg.wordpress.com/
*/
#include "utils.h"
int
analyze_buffer(void *src, uint64_t srclen, int btype, int adapt_mode)
{
uchar_t *src1 = (uchar_t *)src;
int stype = PC_SUBTYPE(btype);
if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR) {
uint32_t freq[256], freq0x80[2] = {0};
uint64_t i, alphabetNum = 0, tot8b = 0;
uchar_t cur_byte;
/*
* Count number of 8-bit binary bytes and XML tags in source.
*/
tot8b = 0;
for (i = 0; i < srclen; i++) {
cur_byte = src1[i];
tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
freq[cur_byte]++;
}
for (i = 0; i < 256; i++)
freq0x80[i>>7]+=freq[i];
for(i = 'a'; i <= 'z'; i++)
alphabetNum+=freq[i];
/*
* Heuristics for detecting BINARY vs generic TEXT
*/
tot8b /= 0x80;
if (tot8b < (srclen>>2 + srclen>>3)) {
btype = TYPE_TEXT;
if (freq0x80[1]<(srclen>>3) && (freq[' ']>(srclen>>7))
&& (freq['a']+freq['e']+freq['t']>(srclen>>4))
&& alphabetNum>(srclen>>2)) {
btype |= TYPE_ENGLISH;
}
}
}
return (btype);
}

View file

@ -0,0 +1,30 @@
/*
* This file is a part of Pcompress, a chunked parallel multi-
* algorithm lossless compression and decompression program.
*
* Copyright (C) 2012-2014 Moinak Ghosh. All rights reserved.
* Use is subject to license terms.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program.
* If not, see <http://www.gnu.org/licenses/>.
*
* moinakg@belenix.org, http://moinakg.wordpress.com/
*/
#ifndef _ANALYZER_H
#define _ANALYZER_H

#include <stdint.h>	/* uint64_t is used by the prototype below */

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Inspect srclen bytes at src and return a buffer type for them.
 *
 * btype is the type already known for the buffer and is returned as is
 * unless the analyzer decides to refine it. adapt_mode is passed through
 * by callers from the compression context.
 */
int analyze_buffer(void *src, uint64_t srclen, int btype, int adapt_mode);

#ifdef __cplusplus
}
#endif

#endif

View file

@ -432,7 +432,7 @@ delta2_encode_real(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen
val = gtot1 + (srclen - (pos - src)); val = gtot1 + (srclen - (pos - src));
} }
*dstlen = pos2 - dst; *dstlen = pos2 - dst;
return (val); return ((int)val);
} }
int int

View file

@ -1,66 +1,66 @@
#ifndef _DATATYPE_H #ifndef _DATATYPE_H
#define _DATATYPE_H #define _DATATYPE_H
#define CSA_VERSION 8 #define CSA_VERSION 8
typedef unsigned char u8; typedef unsigned char u8;
typedef uint16_t u16; typedef uint16_t u16;
typedef uint32_t u32; typedef uint32_t u32;
typedef uint64_t u64; typedef uint64_t u64;
typedef int i32; typedef int i32;
typedef int64_t i64; typedef int64_t i64;
const u32 KB=1024; const u32 KB=1024;
const u32 MB=1048576; const u32 MB=1048576;
const u32 MinBlockSize=8*KB; const u32 MinBlockSize=8*1024;
const u32 MaxChunkBits=21; const u32 MaxChunkBits=21;
const u32 MaxChunkSize=(1<<(MaxChunkBits-1)); const u32 MaxChunkSize=(1<<(21-1));
const u32 MaxDictSize=512*MB;//Don't change const u32 MaxDictSize=512*1048576;//Don't change
const u32 DefaultOutStreamBlockSize=128*KB; const u32 DefaultOutStreamBlockSize=128*1024;
const u32 DefaultInBufferSize=MaxChunkSize; //Should >=MaxChunkSize const u32 DefaultInBufferSize=21; //Should >=MaxChunkSize
#define DLT_CHANNEL_MAX 5 #define DLT_CHANNEL_MAX 5
const u32 DltIndex[DLT_CHANNEL_MAX]={1,2,3,4,8}; const u32 DltIndex[DLT_CHANNEL_MAX]={1,2,3,4,8};
#define SAFEFREE(x) do{if ((x)!=NULL) free(x);x=NULL;}while(0) #define SAFEFREE(x) do{if ((x)!=NULL) free(x);x=NULL;}while(0)
#define ENCODE 1 #define ENCODE 1
#define DECODE 2 #define DECODE 2
/*****ERRORS*****************/ /*****ERRORS*****************/
#define NO_ERROR 0 #define NO_ERROR 0
#define CANT_OPEN_FILE (-100) #define CANT_OPEN_FILE (-100)
#define CANT_CREATE_FILE (-99) #define CANT_CREATE_FILE (-99)
#define NOT_CSC_FILE (-98) #define NOT_CSC_FILE (-98)
#define VERSION_INVALID (-97) #define VERSION_INVALID (-97)
#define CSC_FILE_INVALID (-95) #define CSC_FILE_INVALID (-95)
#define DECODE_ERROR (-96) #define DECODE_ERROR (-96)
#define CANT_ALLOC_MEM (-94) #define CANT_ALLOC_MEM (-94)
#define ALREADY_INITIALIZED (-93) #define ALREADY_INITIALIZED (-93)
#define OPERATION_ERROR (-92) #define OPERATION_ERROR (-92)
#define FILE_DIDNT_OPEN (-91) #define FILE_DIDNT_OPEN (-91)
/*****ERRORS*****************/ /*****ERRORS*****************/
/******Block Type*************/ /******Block Type*************/
#define DT_NONE 0 #define DT_NONE 0
#define DT_HARD 0x05 #define DT_HARD 0x05
#define DT_EXE 0x04 #define DT_EXE 0x04
#define DT_BAD 0x03 #define DT_BAD 0x03
#define DT_NORMAL 0x02 #define DT_NORMAL 0x02
#define DT_SKIP 0x01 #define DT_SKIP 0x01
#define DT_AUDIO 0x06 #define DT_AUDIO 0x06
#define DT_RGB 0x07 #define DT_RGB 0x07
#define DT_FAST 0x08 #define DT_FAST 0x08
#define SIG_EOF 0x09 #define SIG_EOF 0x09
#define DT_ENGTXT 0x0A #define DT_ENGTXT 0x0A
#define DT_DLT 0x10 #define DT_DLT 0x10
#define DT_MAXINDEX 0x1F #define DT_MAXINDEX 0x1F
/******Block Type*************/ /******Block Type*************/
#endif #endif

View file

@ -1,278 +1,306 @@
/* /*
* This file is a part of Pcompress, a chunked parallel multi- * This file is a part of Pcompress, a chunked parallel multi-
* algorithm lossless compression and decompression program. * algorithm lossless compression and decompression program.
* *
* Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved. * Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
* *
* This program is free software; you can redistribute it and/or * This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public * modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either * License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version. * version 3 of the License, or (at your option) any later version.
* *
* This program is distributed in the hope that it will be useful, * This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details. * Lesser General Public License for more details.
* *
* You should have received a copy of the GNU Lesser General Public * You should have received a copy of the GNU Lesser General Public
* License along with this program. * License along with this program.
* If not, see <http://www.gnu.org/licenses/>. * If not, see <http://www.gnu.org/licenses/>.
* *
* moinakg@gmail.com, http://moinakg.wordpress.com/ * moinakg@gmail.com, http://moinakg.wordpress.com/
*/ */
/* /*
* Dict filter for text files. Adapted from Public Domain sources * Dict filter for text files. Adapted from Public Domain sources
* of Fu Siyuan's CSC 3.2 archiver. * of Fu Siyuan's CSC 3.2 archiver.
*/ */
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <strings.h> #include <strings.h>
#include <stdint.h> #include <stdint.h>
#include "DictFilter.h" #include "DictFilter.h"
#include "Common.h" #include "Common.h"
const u32 wordNum = 123; class DictFilter
{
u8 wordList[wordNum][8] = public:
{ ~DictFilter();
"", DictFilter();
"ac","ad","ai","al","am",
"an","ar","as","at","ea", u32 Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize);
"ec","ed","ee","el","en", void Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize);
"er","es","et","id","ie",
"ig","il","in","io","is", private:
"it","of","ol","on","oo", typedef struct
"or","os","ou","ow","ul", {
"un","ur","us","ba","be", u32 next[26];
"ca","ce","co","ch","de", u8 symbol;
"di","ge","gh","ha","he", } CTreeNode;
"hi","ho","ra","re","ri", CTreeNode wordTree[MAX_WORDTREE_NODE_NUM];
"ro","rs","la","le","li", u32 nodeMum;
"lo","ld","ll","ly","se", u8 maxSymbol;
"si","so","sh","ss","st", //Used for DICT transformer. Words are stored in trees.
"ma","me","mi","ne","nc",
"nd","ng","nt","pa","pe", u32 wordIndex[256];
"ta","te","ti","to","th", //Used for DICT untransformer.choose words by symbols.
"tr","wa","ve", void MakeWordTree(); //Init the DICT transformer
"all","and","but","dow",
"for","had","hav","her", u32 x0,x1;
"him","his","man","mor", u32 i,k;
"not","now","one","out", };
"she","the","was","wer",
"whi","whe","wit","you", const u32 wordNum = 123;
"any","are",
"that","said","with","have", u8 wordList[wordNum][8] =
"this","from","were","tion", {
}; "",
"ac","ad","ai","al","am",
"an","ar","as","at","ea",
void "ec","ed","ee","el","en",
DictFilter::MakeWordTree() "er","es","et","id","ie",
{ "ig","il","in","io","is",
u32 i,j; "it","of","ol","on","oo",
u32 treePos; "or","os","ou","ow","ul",
u8 symbolIndex = 0x82; "un","ur","us","ba","be",
"ca","ce","co","ch","de",
nodeMum = 1; "di","ge","gh","ha","he",
"hi","ho","ra","re","ri",
memset(wordTree,0,sizeof(wordTree)); "ro","rs","la","le","li",
"lo","ld","ll","ly","se",
for (i = 1; i < wordNum; i++) { "si","so","sh","ss","st",
treePos = 0; "ma","me","mi","ne","nc",
for(j = 0; wordList[i][j] != 0; j++) { "nd","ng","nt","pa","pe",
u32 idx = wordList[i][j] - 'a'; "ta","te","ti","to","th",
if (wordTree[treePos].next[idx]) { "tr","wa","ve",
treePos = wordTree[treePos].next[idx]; "all","and","but","dow",
} else { "for","had","hav","her",
wordTree[treePos].next[idx] = nodeMum; "him","his","man","mor",
treePos = nodeMum; "not","now","one","out",
nodeMum++; "she","the","was","wer",
} "whi","whe","wit","you",
} "any","are",
wordIndex[symbolIndex] = i; "that","said","with","have",
wordTree[treePos].symbol = symbolIndex++; "this","from","were","tion",
} };
maxSymbol=symbolIndex;
void
} DictFilter::MakeWordTree()
{
u32 i,j;
DictFilter::DictFilter() u32 treePos;
{ u8 symbolIndex = 0x82;
MakeWordTree();
} nodeMum = 1;
memset(wordTree,0,sizeof(wordTree));
DictFilter::~DictFilter() for (i = 1; i < wordNum; i++) {
{ treePos = 0;
} for(j = 0; wordList[i][j] != 0; j++) {
u32 idx = wordList[i][j] - 'a';
if (wordTree[treePos].next[idx]) {
u32 treePos = wordTree[treePos].next[idx];
DictFilter::Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize) } else {
{ wordTree[treePos].next[idx] = nodeMum;
if (size < 16384) treePos = nodeMum;
return 0; nodeMum++;
}
u32 i,j,treePos = 0; }
u32 lastSymbol = 0; wordIndex[symbolIndex] = i;
u32 dstSize = 0; wordTree[treePos].symbol = symbolIndex++;
u32 idx; }
maxSymbol=symbolIndex;
for(i = 0; i < size-5;) {
if (src[i] >= 'a' && src[i] <= 'z') { }
u32 matchSymbol = 0,longestWord = 0;
treePos = 0; DictFilter::DictFilter()
for(j = 0;;) { {
idx = src[i+j] - 'a'; MakeWordTree();
if (idx < 0 || idx > 25) }
break;
if (wordTree[treePos].next[idx] == 0)
break;
DictFilter::~DictFilter()
treePos=wordTree[treePos].next[idx]; {
j++; }
if (wordTree[treePos].symbol) {
matchSymbol = wordTree[treePos].symbol;
longestWord = j; u32
} DictFilter::Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize)
} {
if (size < 16384)
if (matchSymbol) { return 0;
dst[dstSize++] = matchSymbol;
i += longestWord; u32 i,j,treePos = 0;
continue; u32 lastSymbol = 0;
} u32 dstSize = 0;
lastSymbol = 0; u32 idx;
dst[dstSize++] = src[i];
i++;
} else { for(i = 0; i < size-5;) {
if (src[i] >= 0x82) { if (src[i] >= 'a' && src[i] <= 'z') {
dst[dstSize++] = 254;
dst[dstSize++] = src[i]; u32 matchSymbol = 0,longestWord = 0;
} treePos = 0;
else for(j = 0;;) {
dst[dstSize++] = src[i]; idx = src[i+j] - 'a';
if (idx < 0 || idx > 25)
lastSymbol = 0; break;
treePos = 0; if (wordTree[treePos].next[idx] == 0)
i++; break;
}
treePos=wordTree[treePos].next[idx];
} j++;
if (wordTree[treePos].symbol) {
for (; i<size; i++) { matchSymbol = wordTree[treePos].symbol;
if (src[i] >= 0x82) { longestWord = j;
dst[dstSize++] = 254; }
dst[dstSize++] = src[i]; }
}
else if (matchSymbol) {
dst[dstSize++] = src[i]; dst[dstSize++] = matchSymbol;
} i += longestWord;
continue;
if (dstSize > size*0.82) }
return 0; lastSymbol = 0;
dst[dstSize++] = src[i];
*dstsize = dstSize; i++;
return 1; } else {
} if (src[i] >= 0x82) {
dst[dstSize++] = 254;
void dst[dstSize++] = src[i];
DictFilter::Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize) }
{ else
dst[dstSize++] = src[i];
u32 i = 0,j;
u32 dstPos = 0,idx; lastSymbol = 0;
treePos = 0;
while(dstPos < *dstsize && i < size) { i++;
if (src[i] >= 0x82 && src[i] < maxSymbol) { }
idx = wordIndex[src[i]];
for(j=0; wordList[idx][j]; j++) }
dst[dstPos++] = wordList[idx][j];
} for (; i<size; i++) {
else if (src[i] == 254 && (i+1 < size && src[i+1] >= 0x82)) { if (src[i] >= 0x82) {
i++; dst[dstSize++] = 254;
dst[dstPos++] = src[i]; dst[dstSize++] = src[i];
} }
else { else
dst[dstPos++] = src[i]; dst[dstSize++] = src[i];
} }
i++; if (dstSize > size*0.82)
} return 0;
*dstsize = dstPos;
} *dstsize = dstSize;
return 1;
#ifdef __cplusplus }
extern "C" {
#endif void
DictFilter::Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize)
void * {
new_dict_context()
{ u32 i = 0,j;
DictFilter *df = new DictFilter(); u32 dstPos = 0,idx;
return (static_cast<void *>(df));
} while(dstPos < *dstsize && i < size) {
if (src[i] >= 0x82 && src[i] < maxSymbol) {
void idx = wordIndex[src[i]];
delete_dict_context(void *dict_ctx) for(j=0; wordList[idx][j]; j++)
{ dst[dstPos++] = wordList[idx][j];
if (dict_ctx) { }
DictFilter *df = static_cast<DictFilter *>(dict_ctx); else if (src[i] == 254 && (i+1 < size && src[i+1] >= 0x82)) {
delete df; i++;
} dst[dstPos++] = src[i];
} }
else {
int dst[dstPos++] = src[i];
dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen) }
{
DictFilter *df = static_cast<DictFilter *>(dict_ctx); i++;
u32 fl = fromlen; }
u32 dl = *dstlen; *dstsize = dstPos;
u8 *dst; }
if (fromlen > UINT32_MAX) #ifdef __cplusplus
return (-1); extern "C" {
U32_P(to) = LE32(fromlen); #endif
dst = to + 4;
dl -= 4; void *
if (df->Forward_Dict(from, fl, dst, &dl)) { new_dict_context()
*dstlen = dl + 4; {
return (0); DictFilter *df = new DictFilter();
} return (static_cast<void *>(df));
return (-1); }
}
void
int delete_dict_context(void *dict_ctx)
dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen) {
{ if (dict_ctx) {
DictFilter *df = static_cast<DictFilter *>(dict_ctx); DictFilter *df = static_cast<DictFilter *>(dict_ctx);
u32 fl = fromlen; delete df;
u32 dl; }
u8 *src; }
dl = U32_P(from); int
if (dl > *dstlen) { dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
log_msg(LOG_ERR, 0, "Destination overflow in dict_decode."); {
return (-1); DictFilter *df = static_cast<DictFilter *>(dict_ctx);
} u32 fl = fromlen;
*dstlen = dl; u32 dl = *dstlen;
src = from + 4; u8 *dst;
fl -= 4;
if (fromlen > UINT32_MAX)
df->Inverse_Dict(src, fl, to, &dl); return (-1);
if (dl < *dstlen) U32_P(to) = LE32(fromlen);
return (-1); dst = to + 4;
return (0); dl -= 4;
} if (df->Forward_Dict(from, fl, dst, &dl)) {
*dstlen = dl + 4;
#ifdef __cplusplus return (0);
} }
#endif return (-1);
}
int
dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
{
DictFilter *df = static_cast<DictFilter *>(dict_ctx);
u32 fl = fromlen;
u32 dl;
u8 *src;
dl = U32_P(from);
if (dl > *dstlen) {
log_msg(LOG_ERR, 0, "Destination overflow in dict_decode.");
return (-1);
}
*dstlen = dl;
src = from + 4;
fl -= 4;
df->Inverse_Dict(src, fl, to, &dl);
if (dl < *dstlen)
return (-1);
return (0);
}
#ifdef __cplusplus
}
#endif

View file

@ -1,80 +1,52 @@
/* /*
* This file is a part of Pcompress, a chunked parallel multi- * This file is a part of Pcompress, a chunked parallel multi-
* algorithm lossless compression and decompression program. * algorithm lossless compression and decompression program.
* *
* Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved. * Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
* *
* This program is free software; you can redistribute it and/or * This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public * modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either * License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version. * version 3 of the License, or (at your option) any later version.
* *
* This program is distributed in the hope that it will be useful, * This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details. * Lesser General Public License for more details.
* *
* You should have received a copy of the GNU Lesser General Public * You should have received a copy of the GNU Lesser General Public
* License along with this program. * License along with this program.
* If not, see <http://www.gnu.org/licenses/>. * If not, see <http://www.gnu.org/licenses/>.
* *
* moinakg@gmail.com, http://moinakg.wordpress.com/ * moinakg@gmail.com, http://moinakg.wordpress.com/
*/ */
/* /*
* Dict filter for text files. Adapted from Public Domain sources * Dict filter for text files. Adapted from Public Domain sources
* of Fu Siyuan's CSC 3.2 archiver. * of Fu Siyuan's CSC 3.2 archiver.
*/ */
#ifndef _FILTERS_H #ifndef _FILTERS_H
#define _FILTERS_H #define _FILTERS_H
#include <utils.h> #include <utils.h>
#include "Common.h" #include "Common.h"
#define MAX_WORDTREE_NODE_NUM 300 //Enough now! #define MAX_WORDTREE_NODE_NUM 300 //Enough now!
class DictFilter #ifdef __cplusplus
{ extern "C" {
public: #endif
~DictFilter();
DictFilter(); void *new_dict_context();
void delete_dict_context(void *dict_ctx);
u32 Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize);
void Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize); int dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
int dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
private:
typedef struct #ifdef __cplusplus
{ }
u32 next[26]; #endif
u8 symbol;
} CTreeNode; #endif
CTreeNode wordTree[MAX_WORDTREE_NODE_NUM];
u32 nodeMum;
u8 maxSymbol;
//Used for DICT transformer. Words are stored in trees.
u32 wordIndex[256];
//Used for DICT untransformer.choose words by symbols.
void MakeWordTree(); //Init the DICT transformer
u32 x0,x1;
u32 i,k;
};
#ifdef __cplusplus
extern "C" {
#endif
void *new_dict_context();
void delete_dict_context(void *dict_ctx);
int dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
int dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
#ifdef __cplusplus
}
#endif
#endif

View file

@ -56,6 +56,8 @@
#include <errno.h> #include <errno.h>
#include <pc_archive.h> #include <pc_archive.h>
#include <filters/dispack/dis.hpp> #include <filters/dispack/dis.hpp>
#include "analyzer.h"
#include "filters/dict/DictFilter.h"
/* /*
* We use 8MB chunks by default. * We use 8MB chunks by default.
@ -204,7 +206,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
void *dst, uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data, void *dst, uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data,
algo_props_t *props) algo_props_t *props)
{ {
uchar_t *dest = (uchar_t *)dst, type = 0; uchar_t *dest = (uchar_t *)dst, type = 0, atype;
int64_t result; int64_t result;
uint64_t _dstlen, fromlen; uint64_t _dstlen, fromlen;
uchar_t *from, *to; uchar_t *from, *to;
@ -238,13 +240,45 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
} }
} }
/*
* The analyzer is run below only for non-archive mode. When archiving the
* archiver thread runs analyzer on incremental blocks and sets the type
* accordingly.
*/
atype = btype;
/*
* Run an analyzer on the data. At present the analyzer only tries
* to detect if this is text for running the dict filter.
*/
if (pctx->enable_analyzer) {
atype = analyze_buffer(src, srclen, btype, pctx->adapt_mode);
}
/*
* Enabling LZP also enables the DICT filter since we are dealing with text
* in any case.
*/
if (pctx->lzp_preprocess && PC_TYPE(atype) == TYPE_TEXT) {
void *dct = new_dict_context();
_dstlen = fromlen;
result = dict_encode(dct, from, fromlen, to, &_dstlen);
delete_dict_context(dct);
if (result != -1) {
uchar_t *tmp;
tmp = from;
from = to;
to = tmp;
fromlen = _dstlen;
type |= PREPROC_TYPE_DICT;
}
}
#ifndef _MPLV2_LICENSE_ #ifndef _MPLV2_LICENSE_
if (pctx->lzp_preprocess && stype != TYPE_BMP && stype != TYPE_TIFF) { if (pctx->lzp_preprocess && stype != TYPE_BMP && stype != TYPE_TIFF) {
int hashsize; int hashsize;
hashsize = lzp_hash_size(level); hashsize = lzp_hash_size(level);
result = lzp_compress((const uchar_t *)from, to, fromlen, result = lzp_compress((const uchar_t *)from, to, fromlen,
hashsize, LZP_DEFAULT_LZPMINLEN, 0); hashsize, LZP_DEFAULT_LZPMINLEN, 0);
if (result >= 0 && result < srclen) { if (result >= 0 && result < srclen) {
uchar_t *tmp; uchar_t *tmp;
tmp = from; tmp = from;
@ -375,6 +409,20 @@ preproc_decompress(pc_ctx_t *pctx, compress_func_ptr dec_func, void *src, uint64
#endif #endif
} }
if (type & PREPROC_TYPE_DICT) {
void *dct = new_dict_context();
result = dict_decode(dct, src, srclen, dst, &_dstlen);
delete_dict_context(dct);
if (result != -1) {
memcpy(src, dst, _dstlen);
srclen = _dstlen;
*dstlen = _dstlen;
} else {
log_msg(LOG_ERR, 0, "DICT decoding failed.");
return (result);
}
}
if (type & PREPROC_TYPE_DISPACK) { if (type & PREPROC_TYPE_DISPACK) {
result = dispack_decode((uchar_t *)src, srclen, (uchar_t *)dst, &_dstlen1); result = dispack_decode((uchar_t *)src, srclen, (uchar_t *)dst, &_dstlen1);
if (result != -1) { if (result != -1) {
@ -689,13 +737,13 @@ cont:
* Compressed length: 8 bytes. * Compressed length: 8 bytes.
* Checksum: Upto 64 bytes. * Checksum: Upto 64 bytes.
* Chunk flags: 1 byte. * Chunk flags: 1 byte.
* *
* Chunk Flags, 8 bits: * Chunk Flags, 8 bits:
* I I I I I I I I * I I I I I I I I
* | | | | | | * | | | | | |
* | '-----' | | `- 0 - Uncompressed * | '-----' | | `- 0 - Uncompressed
* | | | | 1 - Compressed * | | | | 1 - Compressed
* | | | | * | | | |
* | | | `---- 1 - Chunk was Deduped * | | | `---- 1 - Chunk was Deduped
* | | `------- 1 - Chunk was pre-compressed * | | `------- 1 - Chunk was pre-compressed
* | | * | |
@ -1070,7 +1118,7 @@ start_decompress(pc_ctx_t *pctx, const char *filename, char *to_filename)
memset(zero, 0, MAX_PW_LEN); memset(zero, 0, MAX_PW_LEN);
fd = open(pctx->pwd_file, O_RDWR); fd = open(pctx->pwd_file, O_RDWR);
if (fd != -1) { if (fd != -1) {
pw_len = lseek(fd, 0, SEEK_END); pw_len = (int)lseek(fd, 0, SEEK_END);
if (pw_len != -1) { if (pw_len != -1) {
if (pw_len > MAX_PW_LEN) pw_len = MAX_PW_LEN-1; if (pw_len > MAX_PW_LEN) pw_len = MAX_PW_LEN-1;
lseek(fd, 0, SEEK_SET); lseek(fd, 0, SEEK_SET);
@ -1552,9 +1600,11 @@ redo:
dedupe_index_sz = 0; dedupe_index_sz = 0;
type = COMPRESSED; type = COMPRESSED;
/* Perform Dedup if enabled. */ /* Perform Dedup if enabled. */
if ((pctx->enable_rabin_scan || pctx->enable_fixed_scan)) { if ((pctx->enable_rabin_scan || pctx->enable_fixed_scan)) {
dedupe_context_t *rctx; dedupe_context_t *rctx;
uint64_t rb = tdat->rbytes;
/* /*
* Compute checksum of original uncompressed chunk. When doing dedup * Compute checksum of original uncompressed chunk. When doing dedup
@ -1569,8 +1619,9 @@ redo:
rctx = tdat->rctx; rctx = tdat->rctx;
reset_dedupe_context(tdat->rctx); reset_dedupe_context(tdat->rctx);
rctx->cbuf = tdat->uncompressed_chunk; rctx->cbuf = tdat->uncompressed_chunk;
dedupe_index_sz = dedupe_compress(tdat->rctx, tdat->cmp_seg, &(tdat->rbytes), 0, dedupe_index_sz = dedupe_compress(tdat->rctx, tdat->cmp_seg, &rb, 0,
NULL, tdat->cksum_mt); NULL, tdat->cksum_mt);
tdat->rbytes = rb;
if (!rctx->valid) { if (!rctx->valid) {
memcpy(tdat->uncompressed_chunk, tdat->cmp_seg, rbytes); memcpy(tdat->uncompressed_chunk, tdat->cmp_seg, rbytes);
tdat->rbytes = rbytes; tdat->rbytes = rbytes;
@ -1744,6 +1795,10 @@ plain_index:
tdat->len_cmp += (pctx->cksum_bytes + pctx->mac_bytes); tdat->len_cmp += (pctx->cksum_bytes + pctx->mac_bytes);
rbytes = tdat->len_cmp - len_cmp; // HDR size for HMAC rbytes = tdat->len_cmp - len_cmp; // HDR size for HMAC
/*
* In adaptive mode the return value from the compression function indicates
* which algorithm was used on the chunk. We have to store that.
*/
if (pctx->adapt_mode) if (pctx->adapt_mode)
type |= (rv << 4); type |= (rv << 4);
@ -2750,7 +2805,8 @@ init_algo(pc_ctx_t *pctx, const char *algo, int bail)
pctx->_deinit_func = adapt_deinit; pctx->_deinit_func = adapt_deinit;
pctx->_stats_func = adapt_stats; pctx->_stats_func = adapt_stats;
pctx->_props_func = adapt_props; pctx->_props_func = adapt_props;
pctx->adapt_mode = 1; pctx->adapt_mode = 2;
pctx->enable_analyzer = 1;
rv = 0; rv = 0;
} else if (memcmp(algorithm, "adapt", 5) == 0) { } else if (memcmp(algorithm, "adapt", 5) == 0) {
@ -2761,6 +2817,7 @@ init_algo(pc_ctx_t *pctx, const char *algo, int bail)
pctx->_stats_func = adapt_stats; pctx->_stats_func = adapt_stats;
pctx->_props_func = adapt_props; pctx->_props_func = adapt_props;
pctx->adapt_mode = 1; pctx->adapt_mode = 1;
pctx->enable_analyzer = 1;
rv = 0; rv = 0;
#ifdef ENABLE_PC_LIBBSC #ifdef ENABLE_PC_LIBBSC
} else if (memcmp(algorithm, "libbsc", 6) == 0) { } else if (memcmp(algorithm, "libbsc", 6) == 0) {
@ -2770,7 +2827,6 @@ init_algo(pc_ctx_t *pctx, const char *algo, int bail)
pctx->_deinit_func = libbsc_deinit; pctx->_deinit_func = libbsc_deinit;
pctx->_stats_func = libbsc_stats; pctx->_stats_func = libbsc_stats;
pctx->_props_func = libbsc_props; pctx->_props_func = libbsc_props;
pctx->adapt_mode = 1;
rv = 0; rv = 0;
#endif #endif
} }
@ -3337,6 +3393,7 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[])
} }
if (pctx->lzp_preprocess || pctx->enable_delta2_encode || pctx->dispack_preprocess) { if (pctx->lzp_preprocess || pctx->enable_delta2_encode || pctx->dispack_preprocess) {
pctx->preprocess_mode = 1; pctx->preprocess_mode = 1;
pctx->enable_analyzer = 1;
} }
if (pctx->chunksize == 0) { if (pctx->chunksize == 0) {
if (pctx->level < 9) { if (pctx->level < 9) {

View file

@ -60,13 +60,14 @@ extern "C" {
#define CHSIZE_MASK 0x80 #define CHSIZE_MASK 0x80
#define BZIP2_A_NUM 16 #define BZIP2_A_NUM 16
#define LZMA_A_NUM 32 #define LZMA_A_NUM 32
#define CHUNK_FLAG_DEDUP 2 #define CHUNK_FLAG_DEDUP 2
#define CHUNK_FLAG_PREPROC 4 #define CHUNK_FLAG_PREPROC 4
#define COMP_EXTN ".pz" #define COMP_EXTN ".pz"
#define PREPROC_TYPE_LZP 1 #define PREPROC_TYPE_LZP 1
#define PREPROC_TYPE_DELTA2 2 #define PREPROC_TYPE_DELTA2 2
#define PREPROC_TYPE_DISPACK 4 #define PREPROC_TYPE_DISPACK 4
#define PREPROC_TYPE_DICT 8
#define PREPROC_COMPRESSED 128 #define PREPROC_COMPRESSED 128
/* /*
@ -212,6 +213,7 @@ typedef struct pc_ctx {
int delta2_nstrides; int delta2_nstrides;
int enable_rabin_split; int enable_rabin_split;
int enable_fixed_scan; int enable_fixed_scan;
int enable_analyzer;
int preprocess_mode; int preprocess_mode;
int lzp_preprocess; int lzp_preprocess;
int dispack_preprocess; int dispack_preprocess;
@ -275,7 +277,7 @@ struct cmp_data {
uchar_t *compressed_chunk; uchar_t *compressed_chunk;
uchar_t *uncompressed_chunk; uchar_t *uncompressed_chunk;
dedupe_context_t *rctx; dedupe_context_t *rctx;
uint64_t rbytes; int64_t rbytes;
uint64_t chunksize; uint64_t chunksize;
uint64_t len_cmp, len_cmp_be; uint64_t len_cmp, len_cmp_be;
uchar_t checksum[CKSUM_MAX_BYTES]; uchar_t checksum[CKSUM_MAX_BYTES];

View file

@ -383,14 +383,18 @@ get_total_ram()
} }
#ifdef __APPLE__ #ifdef __APPLE__
#define NANO_SEC (1000000000ULL)
int int
clock_gettime(int clk_id, struct timespec *ts) clock_gettime(int clk_id, struct timespec *ts)
{ {
if (clk_id == CLOCK_MONOTONIC) { if (clk_id == CLOCK_MONOTONIC) {
uint64_t abstime = mach_absolute_time(); uint64_t nanotime = mach_absolute_time() *
return (abstime * sTimebaseInfo.numer / sTimebaseInfo.denom); sTimebaseInfo.numer / sTimebaseInfo.denom;
ts->tv_sec = nanotime / NANO_SEC;
ts->tv_nsec = nanotime % NANO_SEC;
return (0);
} }
return (0); return (EINVAL);
} }
#endif #endif
@ -543,8 +547,7 @@ log_msg(log_level_t log_level, int show_errno, const char *format, ...)
fputs(msg, stderr); fputs(msg, stderr);
} else if (ldest.type == LOG_FILE) { } else if (ldest.type == LOG_FILE) {
int rv; (void) write(ldest.fd, msg, strlen(msg));
rv = write(ldest.fd, msg, strlen(msg));
} else { } else {
ldest.cb(msg); ldest.cb(msg);
} }