diff --git a/.gitignore b/.gitignore index 2dc6b14..39e979a 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,4 @@ libtool stamp-h1 .libs buildtmp +*.dSYM diff --git a/Makefile.in b/Makefile.in index a327204..8a71942 100644 --- a/Makefile.in +++ b/Makefile.in @@ -31,9 +31,11 @@ GPP=@GPP@ LIBVER=1 MAINSRCS = utils/utils.c allocator.c lzma_compress.c ppmd_compress.c \ adaptive_compress.c lzfx_compress.c lz4_compress.c none_compress.c \ - utils/xxhash_base.c utils/heap.c utils/cpuid.c pcompress.c + utils/xxhash_base.c utils/heap.c utils/cpuid.c filters/analyzer/analyzer.c \ + pcompress.c MAINHDRS = allocator.h pcompress.h utils/utils.h utils/xxhash.h utils/heap.h \ - utils/cpuid.h utils/xxhash.h archive/pc_archive.h filters/dispack/dis.hpp + utils/cpuid.h utils/xxhash.h archive/pc_archive.h filters/dispack/dis.hpp \ + filters/analyzer/analyzer.h MAINOBJS = $(MAINSRCS:.c=.o) PROGSRCS = main.c @@ -233,7 +235,7 @@ BASE_CPPFLAGS = -I. -I./lzma -I./lzfx -I./lz4 -I./rabin -I./bsdiff -DNODEFAULT_P -I./crypto/scrypt -I./crypto/aes -I./crypto @KEYLEN@ -I./rabin/global \ -I./crypto/keccak -I./filters/transpose -I./crypto/blake2 $(EXTRA_CPPFLAGS) \ -I./crypto/xsalsa20 -I./archive -pedantic -Wall -I./filters -fno-strict-aliasing \ - -Wno-unused-but-set-variable -Wno-enum-compare \ + -Wno-unused-but-set-variable -Wno-enum-compare -I./filters/analyzer \ @COMPAT_CPPFLAGS@ @XSALSA20_DEBUG@ -I@LIBARCHIVE_DIR@/libarchive -I./filters/packjpg \ -I./filters/packpnm @ENABLE_WAVPACK@ COMMON_CPPFLAGS = $(BASE_CPPFLAGS) -std=gnu99 diff --git a/archive/pc_archive.c b/archive/pc_archive.c index 0b15d72..506c7d4 100644 --- a/archive/pc_archive.c +++ b/archive/pc_archive.c @@ -175,7 +175,7 @@ creat_write_callback(struct archive *arc, void *ctx, const void *buf, size_t len pctx->btype = pctx->ctype; } else { if (pctx->arc_buf_pos < pctx->min_chunk) { - uint32_t diff = pctx->min_chunk - pctx->arc_buf_pos; + int diff = pctx->min_chunk - (int)(pctx->arc_buf_pos); if (len > diff) pctx->btype = 
pctx->ctype; else @@ -918,9 +918,10 @@ copy_file_data(pc_ctx_t *pctx, struct archive *arc, struct archive_entry *entry, size_t sz, offset, len; ssize_t bytes_to_write; uchar_t *mapbuf; - int rv, fd; + int rv, fd, typ1; const char *fpath; + typ1 = typ; offset = 0; rv = 0; sz = archive_entry_size(entry); @@ -1014,6 +1015,11 @@ do_map: } else { return (ARCHIVE_OK); } + } else { + if (write_header(arc, entry) == -1) { + close(fd); + return (-1); + } } } else { if (write_header(arc, entry) == -1) { @@ -1029,7 +1035,7 @@ do_map: * stage there is no need for blocking. */ wrtn = archive_write_data(arc, src, wlen); - if (wrtn < wlen) { + if (wrtn < (ssize_t)wlen) { /* Write failed; this is bad */ log_msg(LOG_ERR, 0, "Data write error: %s", archive_error_string(arc)); rv = -1; diff --git a/config b/config index df4b21b..5ed7569 100755 --- a/config +++ b/config @@ -714,7 +714,6 @@ echo "*************** Running configure in libarchive ****************" (cd $libarchive_dir CC=${GCC} ./configure --disable-bsdtar --disable-bsdcpio --without-zlib --without-bz2lib --without-lzmadec --without-lzma --without-lzo2 --without-nettle --without-openssl --without-xml2 --without-expat --disable-silent-rules --enable-shared=no --enable-static=yes --enable-xattr [ $? -ne 0 ] && exit $? - cp Makefile Makefile.orig cat Makefile | sed ' s@$(BUILT_SOURCES)@@ s@$(libarchive_test_SOURCES)@@ diff --git a/filters/analyzer/analyzer.c b/filters/analyzer/analyzer.c new file mode 100644 index 0000000..1dce8c4 --- /dev/null +++ b/filters/analyzer/analyzer.c @@ -0,0 +1,69 @@ +/* + * This file is a part of Pcompress, a chunked parallel multi- + * algorithm lossless compression and decompression program. + * + * Copyright (C) 2012-2014 Moinak Ghosh. All rights reserved. + * Use is subject to license terms. 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program.
+ * If not, see .
+ *
+ * moinakg@belenix.org, http://moinakg.wordpress.com/
+ */
+
+#include "utils.h"
+/* Returns btype, re-tagged TYPE_TEXT (optionally |TYPE_ENGLISH) when an unknown/tar buffer looks textual; adapt_mode is unused. */
+int
+analyze_buffer(void *src, uint64_t srclen, int btype, int adapt_mode)
+{
+	uchar_t *src1 = (uchar_t *)src;
+	int stype = PC_SUBTYPE(btype);
+
+	if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR) {
+		uint32_t freq[256] = {0}, freq0x80[2] = {0};
+		uint64_t i, alphabetNum = 0, tot8b = 0;
+		uchar_t cur_byte;
+
+		/*
+		 * Build a byte histogram and sum the high bit of every byte (scaled by 0x80).
+		 */
+		tot8b = 0;
+		for (i = 0; i < srclen; i++) {
+			cur_byte = src1[i];
+			tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
+			freq[cur_byte]++;
+		}
+
+		for (i = 0; i < 256; i++)
+			freq0x80[i>>7]+=freq[i];
+
+		for(i = 'a'; i <= 'z'; i++)
+			alphabetNum+=freq[i];
+
+		/*
+		 * Heuristics for detecting BINARY vs generic TEXT (TEXT if < 3/8 of bytes are 8-bit)
+		 */
+		tot8b /= 0x80;
+		if (tot8b < ((srclen>>2) + (srclen>>3))) {
+			btype = TYPE_TEXT;
+			if (freq0x80[1]<(srclen>>3) && (freq[' ']>(srclen>>7))
+			    && (freq['a']+freq['e']+freq['t']>(srclen>>4))
+			    && alphabetNum>(srclen>>2)) {
+				btype |= TYPE_ENGLISH;
+			}
+		}
+	}
+
+	return (btype);
+}
diff --git a/filters/analyzer/analyzer.h b/filters/analyzer/analyzer.h
new file mode 100644
index 0000000..922b596
--- /dev/null
+++ b/filters/analyzer/analyzer.h
@@ -0,0 +1,30 @@
+/*
+ * This file is a part of Pcompress, a chunked parallel multi-
+ * algorithm lossless compression and decompression program.
+ *
+ * Copyright (C) 2012-2014 Moinak Ghosh. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program.
+ * If not, see .
+ * + * moinakg@belenix.org, http://moinakg.wordpress.com/ + */ + +#ifndef _ANALYZER_H +#define _ANALYZER_H + +int analyze_buffer(void *src, uint64_t srclen, int btype, int adapt_mode); + +#endif diff --git a/filters/delta2/delta2.c b/filters/delta2/delta2.c index 5ea572d..3503a88 100644 --- a/filters/delta2/delta2.c +++ b/filters/delta2/delta2.c @@ -432,7 +432,7 @@ delta2_encode_real(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen val = gtot1 + (srclen - (pos - src)); } *dstlen = pos2 - dst; - return (val); + return ((int)val); } int diff --git a/filters/dict/Common.h b/filters/dict/Common.h index 166dee8..98061ee 100644 --- a/filters/dict/Common.h +++ b/filters/dict/Common.h @@ -1,66 +1,66 @@ -#ifndef _DATATYPE_H -#define _DATATYPE_H - -#define CSA_VERSION 8 - - -typedef unsigned char u8; -typedef uint16_t u16; -typedef uint32_t u32; -typedef uint64_t u64; -typedef int i32; -typedef int64_t i64; - -const u32 KB=1024; -const u32 MB=1048576; -const u32 MinBlockSize=8*KB; - - -const u32 MaxChunkBits=21; -const u32 MaxChunkSize=(1<<(MaxChunkBits-1)); -const u32 MaxDictSize=512*MB;//Don't change -const u32 DefaultOutStreamBlockSize=128*KB; -const u32 DefaultInBufferSize=MaxChunkSize; //Should >=MaxChunkSize -#define DLT_CHANNEL_MAX 5 -const u32 DltIndex[DLT_CHANNEL_MAX]={1,2,3,4,8}; - - -#define SAFEFREE(x) do{if ((x)!=NULL) free(x);x=NULL;}while(0) - - -#define ENCODE 1 -#define DECODE 2 - - -/*****ERRORS*****************/ -#define NO_ERROR 0 -#define CANT_OPEN_FILE (-100) -#define CANT_CREATE_FILE (-99) -#define NOT_CSC_FILE (-98) -#define VERSION_INVALID (-97) -#define CSC_FILE_INVALID (-95) -#define DECODE_ERROR (-96) -#define CANT_ALLOC_MEM (-94) -#define ALREADY_INITIALIZED (-93) -#define OPERATION_ERROR (-92) -#define FILE_DIDNT_OPEN (-91) -/*****ERRORS*****************/ - -/******Block Type*************/ -#define DT_NONE 0 -#define DT_HARD 0x05 -#define DT_EXE 0x04 -#define DT_BAD 0x03 -#define DT_NORMAL 0x02 -#define DT_SKIP 0x01 -#define 
DT_AUDIO 0x06
-#define DT_RGB 0x07
-#define DT_FAST 0x08
-#define SIG_EOF 0x09
-#define DT_ENGTXT 0x0A
-#define DT_DLT 0x10
-#define DT_MAXINDEX 0x1F
-/******Block Type*************/
-
-
-#endif
+#ifndef _DATATYPE_H
+#define _DATATYPE_H
+
+#define CSA_VERSION 8
+
+
+typedef unsigned char u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+typedef int i32;
+typedef int64_t i64;
+
+const u32 KB=1024;
+const u32 MB=1048576;
+const u32 MinBlockSize=8*1024;
+
+
+const u32 MaxChunkBits=21;
+const u32 MaxChunkSize=(1<<(21-1));
+const u32 MaxDictSize=512*1048576;//Don't change
+const u32 DefaultOutStreamBlockSize=128*1024;
+const u32 DefaultInBufferSize=(1<<(21-1)); //Should >=MaxChunkSize
+#define DLT_CHANNEL_MAX 5
+const u32 DltIndex[DLT_CHANNEL_MAX]={1,2,3,4,8};
+
+
+#define SAFEFREE(x) do{if ((x)!=NULL) free(x);x=NULL;}while(0)
+
+
+#define ENCODE 1
+#define DECODE 2
+
+
+/*****ERRORS*****************/
+#define NO_ERROR 0
+#define CANT_OPEN_FILE (-100)
+#define CANT_CREATE_FILE (-99)
+#define NOT_CSC_FILE (-98)
+#define VERSION_INVALID (-97)
+#define CSC_FILE_INVALID (-95)
+#define DECODE_ERROR (-96)
+#define CANT_ALLOC_MEM (-94)
+#define ALREADY_INITIALIZED (-93)
+#define OPERATION_ERROR (-92)
+#define FILE_DIDNT_OPEN (-91)
+/*****ERRORS*****************/
+
+/******Block Type*************/
+#define DT_NONE 0
+#define DT_HARD 0x05
+#define DT_EXE 0x04
+#define DT_BAD 0x03
+#define DT_NORMAL 0x02
+#define DT_SKIP 0x01
+#define DT_AUDIO 0x06
+#define DT_RGB 0x07
+#define DT_FAST 0x08
+#define SIG_EOF 0x09
+#define DT_ENGTXT 0x0A
+#define DT_DLT 0x10
+#define DT_MAXINDEX 0x1F
+/******Block Type*************/
+
+
+#endif
diff --git a/filters/dict/DictFilter.cpp b/filters/dict/DictFilter.cpp
index 6aa6d87..bb0d151 100644
--- a/filters/dict/DictFilter.cpp
+++ b/filters/dict/DictFilter.cpp
@@ -1,278 +1,306 @@
-/*
- * This file is a part of Pcompress, a chunked parallel multi-
- * algorithm lossless compression and decompression program.
- * - * Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved. - * Use is subject to license terms. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 3 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program. - * If not, see . - * - * moinakg@gmail.com, http://moinakg.wordpress.com/ - */ - -/* - * Dict filter for text files. Adapted from Public Domain sources - * of Fu Siyuan's CSC 3.2 archiver. - */ - -#include -#include -#include -#include -#include "DictFilter.h" -#include "Common.h" - -const u32 wordNum = 123; - -u8 wordList[wordNum][8] = -{ - "", - "ac","ad","ai","al","am", - "an","ar","as","at","ea", - "ec","ed","ee","el","en", - "er","es","et","id","ie", - "ig","il","in","io","is", - "it","of","ol","on","oo", - "or","os","ou","ow","ul", - "un","ur","us","ba","be", - "ca","ce","co","ch","de", - "di","ge","gh","ha","he", - "hi","ho","ra","re","ri", - "ro","rs","la","le","li", - "lo","ld","ll","ly","se", - "si","so","sh","ss","st", - "ma","me","mi","ne","nc", - "nd","ng","nt","pa","pe", - "ta","te","ti","to","th", - "tr","wa","ve", - "all","and","but","dow", - "for","had","hav","her", - "him","his","man","mor", - "not","now","one","out", - "she","the","was","wer", - "whi","whe","wit","you", - "any","are", - "that","said","with","have", - "this","from","were","tion", -}; - - -void -DictFilter::MakeWordTree() -{ - u32 i,j; - u32 treePos; - u8 symbolIndex = 0x82; - - nodeMum = 1; - - memset(wordTree,0,sizeof(wordTree)); - - for (i = 1; i < 
wordNum; i++) { - treePos = 0; - for(j = 0; wordList[i][j] != 0; j++) { - u32 idx = wordList[i][j] - 'a'; - if (wordTree[treePos].next[idx]) { - treePos = wordTree[treePos].next[idx]; - } else { - wordTree[treePos].next[idx] = nodeMum; - treePos = nodeMum; - nodeMum++; - } - } - wordIndex[symbolIndex] = i; - wordTree[treePos].symbol = symbolIndex++; - } - - maxSymbol=symbolIndex; - -} - - -DictFilter::DictFilter() -{ - MakeWordTree(); -} - - - -DictFilter::~DictFilter() -{ -} - - -u32 -DictFilter::Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize) -{ - if (size < 16384) - return 0; - - u32 i,j,treePos = 0; - u32 lastSymbol = 0; - u32 dstSize = 0; - u32 idx; - - - for(i = 0; i < size-5;) { - if (src[i] >= 'a' && src[i] <= 'z') { - - u32 matchSymbol = 0,longestWord = 0; - treePos = 0; - for(j = 0;;) { - idx = src[i+j] - 'a'; - if (idx < 0 || idx > 25) - break; - if (wordTree[treePos].next[idx] == 0) - break; - - treePos=wordTree[treePos].next[idx]; - j++; - if (wordTree[treePos].symbol) { - matchSymbol = wordTree[treePos].symbol; - longestWord = j; - } - } - - if (matchSymbol) { - dst[dstSize++] = matchSymbol; - i += longestWord; - continue; - } - lastSymbol = 0; - dst[dstSize++] = src[i]; - i++; - } else { - if (src[i] >= 0x82) { - dst[dstSize++] = 254; - dst[dstSize++] = src[i]; - } - else - dst[dstSize++] = src[i]; - - lastSymbol = 0; - treePos = 0; - i++; - } - - } - - for (; i= 0x82) { - dst[dstSize++] = 254; - dst[dstSize++] = src[i]; - } - else - dst[dstSize++] = src[i]; - } - - if (dstSize > size*0.82) - return 0; - - *dstsize = dstSize; - return 1; -} - -void -DictFilter::Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize) -{ - - u32 i = 0,j; - u32 dstPos = 0,idx; - - while(dstPos < *dstsize && i < size) { - if (src[i] >= 0x82 && src[i] < maxSymbol) { - idx = wordIndex[src[i]]; - for(j=0; wordList[idx][j]; j++) - dst[dstPos++] = wordList[idx][j]; - } - else if (src[i] == 254 && (i+1 < size && src[i+1] >= 0x82)) { - i++; - dst[dstPos++] = src[i]; - } - 
else { - dst[dstPos++] = src[i]; - } - - i++; - } - *dstsize = dstPos; -} - -#ifdef __cplusplus -extern "C" { -#endif - -void * -new_dict_context() -{ - DictFilter *df = new DictFilter(); - return (static_cast(df)); -} - -void -delete_dict_context(void *dict_ctx) -{ - if (dict_ctx) { - DictFilter *df = static_cast(dict_ctx); - delete df; - } -} - -int -dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen) -{ - DictFilter *df = static_cast(dict_ctx); - u32 fl = fromlen; - u32 dl = *dstlen; - u8 *dst; - - if (fromlen > UINT32_MAX) - return (-1); - U32_P(to) = LE32(fromlen); - dst = to + 4; - dl -= 4; - if (df->Forward_Dict(from, fl, dst, &dl)) { - *dstlen = dl + 4; - return (0); - } - return (-1); -} - -int -dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen) -{ - DictFilter *df = static_cast(dict_ctx); - u32 fl = fromlen; - u32 dl; - u8 *src; - - dl = U32_P(from); - if (dl > *dstlen) { - log_msg(LOG_ERR, 0, "Destination overflow in dict_decode."); - return (-1); - } - *dstlen = dl; - src = from + 4; - fl -= 4; - - df->Inverse_Dict(src, fl, to, &dl); - if (dl < *dstlen) - return (-1); - return (0); -} - -#ifdef __cplusplus -} -#endif +/* + * This file is a part of Pcompress, a chunked parallel multi- + * algorithm lossless compression and decompression program. + * + * Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved. + * Use is subject to license terms. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this program. + * If not, see . + * + * moinakg@gmail.com, http://moinakg.wordpress.com/ + */ + +/* + * Dict filter for text files. Adapted from Public Domain sources + * of Fu Siyuan's CSC 3.2 archiver. + */ + +#include +#include +#include +#include +#include "DictFilter.h" +#include "Common.h" + +class DictFilter +{ +public: + ~DictFilter(); + DictFilter(); + + u32 Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize); + void Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize); + +private: + typedef struct + { + u32 next[26]; + u8 symbol; + } CTreeNode; + CTreeNode wordTree[MAX_WORDTREE_NODE_NUM]; + u32 nodeMum; + u8 maxSymbol; + //Used for DICT transformer. Words are stored in trees. + + u32 wordIndex[256]; + //Used for DICT untransformer.choose words by symbols. + void MakeWordTree(); //Init the DICT transformer + + u32 x0,x1; + u32 i,k; +}; + +const u32 wordNum = 123; + +u8 wordList[wordNum][8] = +{ + "", + "ac","ad","ai","al","am", + "an","ar","as","at","ea", + "ec","ed","ee","el","en", + "er","es","et","id","ie", + "ig","il","in","io","is", + "it","of","ol","on","oo", + "or","os","ou","ow","ul", + "un","ur","us","ba","be", + "ca","ce","co","ch","de", + "di","ge","gh","ha","he", + "hi","ho","ra","re","ri", + "ro","rs","la","le","li", + "lo","ld","ll","ly","se", + "si","so","sh","ss","st", + "ma","me","mi","ne","nc", + "nd","ng","nt","pa","pe", + "ta","te","ti","to","th", + "tr","wa","ve", + "all","and","but","dow", + "for","had","hav","her", + "him","his","man","mor", + "not","now","one","out", + "she","the","was","wer", + "whi","whe","wit","you", + "any","are", + "that","said","with","have", + "this","from","were","tion", +}; + + +void +DictFilter::MakeWordTree() +{ + u32 i,j; + u32 treePos; + u8 symbolIndex = 0x82; + + nodeMum = 1; + + memset(wordTree,0,sizeof(wordTree)); + + for (i = 1; i < wordNum; i++) { + treePos = 0; + for(j = 0; wordList[i][j] != 0; 
j++) { + u32 idx = wordList[i][j] - 'a'; + if (wordTree[treePos].next[idx]) { + treePos = wordTree[treePos].next[idx]; + } else { + wordTree[treePos].next[idx] = nodeMum; + treePos = nodeMum; + nodeMum++; + } + } + wordIndex[symbolIndex] = i; + wordTree[treePos].symbol = symbolIndex++; + } + + maxSymbol=symbolIndex; + +} + + +DictFilter::DictFilter() +{ + MakeWordTree(); +} + + + +DictFilter::~DictFilter() +{ +} + + +u32 +DictFilter::Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize) +{ + if (size < 16384) + return 0; + + u32 i,j,treePos = 0; + u32 lastSymbol = 0; + u32 dstSize = 0; + u32 idx; + + + for(i = 0; i < size-5;) { + if (src[i] >= 'a' && src[i] <= 'z') { + + u32 matchSymbol = 0,longestWord = 0; + treePos = 0; + for(j = 0;;) { + idx = src[i+j] - 'a'; + if (idx < 0 || idx > 25) + break; + if (wordTree[treePos].next[idx] == 0) + break; + + treePos=wordTree[treePos].next[idx]; + j++; + if (wordTree[treePos].symbol) { + matchSymbol = wordTree[treePos].symbol; + longestWord = j; + } + } + + if (matchSymbol) { + dst[dstSize++] = matchSymbol; + i += longestWord; + continue; + } + lastSymbol = 0; + dst[dstSize++] = src[i]; + i++; + } else { + if (src[i] >= 0x82) { + dst[dstSize++] = 254; + dst[dstSize++] = src[i]; + } + else + dst[dstSize++] = src[i]; + + lastSymbol = 0; + treePos = 0; + i++; + } + + } + + for (; i= 0x82) { + dst[dstSize++] = 254; + dst[dstSize++] = src[i]; + } + else + dst[dstSize++] = src[i]; + } + + if (dstSize > size*0.82) + return 0; + + *dstsize = dstSize; + return 1; +} + +void +DictFilter::Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize) +{ + + u32 i = 0,j; + u32 dstPos = 0,idx; + + while(dstPos < *dstsize && i < size) { + if (src[i] >= 0x82 && src[i] < maxSymbol) { + idx = wordIndex[src[i]]; + for(j=0; wordList[idx][j]; j++) + dst[dstPos++] = wordList[idx][j]; + } + else if (src[i] == 254 && (i+1 < size && src[i+1] >= 0x82)) { + i++; + dst[dstPos++] = src[i]; + } + else { + dst[dstPos++] = src[i]; + } + + i++; + } + *dstsize = 
dstPos; +} + +#ifdef __cplusplus +extern "C" { +#endif + +void * +new_dict_context() +{ + DictFilter *df = new DictFilter(); + return (static_cast(df)); +} + +void +delete_dict_context(void *dict_ctx) +{ + if (dict_ctx) { + DictFilter *df = static_cast(dict_ctx); + delete df; + } +} + +int +dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen) +{ + DictFilter *df = static_cast(dict_ctx); + u32 fl = fromlen; + u32 dl = *dstlen; + u8 *dst; + + if (fromlen > UINT32_MAX) + return (-1); + U32_P(to) = LE32(fromlen); + dst = to + 4; + dl -= 4; + if (df->Forward_Dict(from, fl, dst, &dl)) { + *dstlen = dl + 4; + return (0); + } + return (-1); +} + +int +dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen) +{ + DictFilter *df = static_cast(dict_ctx); + u32 fl = fromlen; + u32 dl; + u8 *src; + + dl = U32_P(from); + if (dl > *dstlen) { + log_msg(LOG_ERR, 0, "Destination overflow in dict_decode."); + return (-1); + } + *dstlen = dl; + src = from + 4; + fl -= 4; + + df->Inverse_Dict(src, fl, to, &dl); + if (dl < *dstlen) + return (-1); + return (0); +} + +#ifdef __cplusplus +} +#endif diff --git a/filters/dict/DictFilter.h b/filters/dict/DictFilter.h index 4b4baee..08187c6 100644 --- a/filters/dict/DictFilter.h +++ b/filters/dict/DictFilter.h @@ -1,80 +1,52 @@ -/* - * This file is a part of Pcompress, a chunked parallel multi- - * algorithm lossless compression and decompression program. - * - * Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved. - * Use is subject to license terms. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 3 of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program. - * If not, see . - * - * moinakg@gmail.com, http://moinakg.wordpress.com/ - */ - -/* - * Dict filter for text files. Adapted from Public Domain sources - * of Fu Siyuan's CSC 3.2 archiver. - */ - -#ifndef _FILTERS_H -#define _FILTERS_H - -#include - -#include "Common.h" -#define MAX_WORDTREE_NODE_NUM 300 //Enough now! - -class DictFilter -{ -public: - ~DictFilter(); - DictFilter(); - - u32 Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize); - void Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize); - -private: - typedef struct - { - u32 next[26]; - u8 symbol; - } CTreeNode; - CTreeNode wordTree[MAX_WORDTREE_NODE_NUM]; - u32 nodeMum; - u8 maxSymbol; - //Used for DICT transformer. Words are stored in trees. - - u32 wordIndex[256]; - //Used for DICT untransformer.choose words by symbols. - void MakeWordTree(); //Init the DICT transformer - - u32 x0,x1; - u32 i,k; -}; - -#ifdef __cplusplus -extern "C" { -#endif - -void *new_dict_context(); -void delete_dict_context(void *dict_ctx); - -int dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen); -int dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen); - -#ifdef __cplusplus -} -#endif - -#endif +/* + * This file is a part of Pcompress, a chunked parallel multi- + * algorithm lossless compression and decompression program. + * + * Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved. + * Use is subject to license terms. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program. + * If not, see . + * + * moinakg@gmail.com, http://moinakg.wordpress.com/ + */ + +/* + * Dict filter for text files. Adapted from Public Domain sources + * of Fu Siyuan's CSC 3.2 archiver. + */ + +#ifndef _FILTERS_H +#define _FILTERS_H + +#include + +#include "Common.h" +#define MAX_WORDTREE_NODE_NUM 300 //Enough now! + +#ifdef __cplusplus +extern "C" { +#endif + +void *new_dict_context(); +void delete_dict_context(void *dict_ctx); + +int dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen); +int dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/pcompress.c b/pcompress.c index b7d0073..d6b9f2a 100644 --- a/pcompress.c +++ b/pcompress.c @@ -56,6 +56,8 @@ #include #include #include +#include "analyzer.h" +#include "filters/dict/DictFilter.h" /* * We use 8MB chunks by default. 
@@ -204,7 +206,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t void *dst, uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data, algo_props_t *props) { - uchar_t *dest = (uchar_t *)dst, type = 0; + uchar_t *dest = (uchar_t *)dst, type = 0, atype; int64_t result; uint64_t _dstlen, fromlen; uchar_t *from, *to; @@ -238,13 +240,45 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t } } + /* + * The analyzer is run below only for non-archive mode. When archiving the + * archiver thread runs analyzer on incremental blocks and sets the type + * accordingly. + */ + atype = btype; + /* + * Run an analyzer on the data. At present the analyzer only tries + * to detect if this is text for running the dict filter. + */ + if (pctx->enable_analyzer) { + atype = analyze_buffer(src, srclen, btype, pctx->adapt_mode); + } + + /* + * Enabling LZP also enables the DICT filter since we are dealing with text + * in any case. + */ + if (pctx->lzp_preprocess && PC_TYPE(atype) == TYPE_TEXT) { + void *dct = new_dict_context(); + _dstlen = fromlen; + result = dict_encode(dct, from, fromlen, to, &_dstlen); + delete_dict_context(dct); + if (result != -1) { + uchar_t *tmp; + tmp = from; + from = to; + to = tmp; + fromlen = _dstlen; + type |= PREPROC_TYPE_DICT; + } + } #ifndef _MPLV2_LICENSE_ if (pctx->lzp_preprocess && stype != TYPE_BMP && stype != TYPE_TIFF) { int hashsize; hashsize = lzp_hash_size(level); result = lzp_compress((const uchar_t *)from, to, fromlen, - hashsize, LZP_DEFAULT_LZPMINLEN, 0); + hashsize, LZP_DEFAULT_LZPMINLEN, 0); if (result >= 0 && result < srclen) { uchar_t *tmp; tmp = from; @@ -375,6 +409,20 @@ preproc_decompress(pc_ctx_t *pctx, compress_func_ptr dec_func, void *src, uint64 #endif } + if (type & PREPROC_TYPE_DICT) { + void *dct = new_dict_context(); + result = dict_decode(dct, src, srclen, dst, &_dstlen); + delete_dict_context(dct); + if (result != -1) { + memcpy(src, dst, _dstlen); + 
srclen = _dstlen; + *dstlen = _dstlen; + } else { + log_msg(LOG_ERR, 0, "DICT decoding failed."); + return (result); + } + } + if (type & PREPROC_TYPE_DISPACK) { result = dispack_decode((uchar_t *)src, srclen, (uchar_t *)dst, &_dstlen1); if (result != -1) { @@ -689,13 +737,13 @@ cont: * Compressed length: 8 bytes. * Checksum: Upto 64 bytes. * Chunk flags: 1 byte. - * + * * Chunk Flags, 8 bits: * I I I I I I I I * | | | | | | * | '-----' | | `- 0 - Uncompressed * | | | | 1 - Compressed - * | | | | + * | | | | * | | | `---- 1 - Chunk was Deduped * | | `------- 1 - Chunk was pre-compressed * | | @@ -1070,7 +1118,7 @@ start_decompress(pc_ctx_t *pctx, const char *filename, char *to_filename) memset(zero, 0, MAX_PW_LEN); fd = open(pctx->pwd_file, O_RDWR); if (fd != -1) { - pw_len = lseek(fd, 0, SEEK_END); + pw_len = (int)lseek(fd, 0, SEEK_END); if (pw_len != -1) { if (pw_len > MAX_PW_LEN) pw_len = MAX_PW_LEN-1; lseek(fd, 0, SEEK_SET); @@ -1552,9 +1600,11 @@ redo: dedupe_index_sz = 0; type = COMPRESSED; + /* Perform Dedup if enabled. */ if ((pctx->enable_rabin_scan || pctx->enable_fixed_scan)) { dedupe_context_t *rctx; + uint64_t rb = tdat->rbytes; /* * Compute checksum of original uncompressed chunk. When doing dedup @@ -1569,8 +1619,9 @@ redo: rctx = tdat->rctx; reset_dedupe_context(tdat->rctx); rctx->cbuf = tdat->uncompressed_chunk; - dedupe_index_sz = dedupe_compress(tdat->rctx, tdat->cmp_seg, &(tdat->rbytes), 0, + dedupe_index_sz = dedupe_compress(tdat->rctx, tdat->cmp_seg, &rb, 0, NULL, tdat->cksum_mt); + tdat->rbytes = rb; if (!rctx->valid) { memcpy(tdat->uncompressed_chunk, tdat->cmp_seg, rbytes); tdat->rbytes = rbytes; @@ -1744,6 +1795,10 @@ plain_index: tdat->len_cmp += (pctx->cksum_bytes + pctx->mac_bytes); rbytes = tdat->len_cmp - len_cmp; // HDR size for HMAC + /* + * In adaptive mode return value from compression function function indicates + * which algorithm was used on the chunk. We have to store that. 
+ */ if (pctx->adapt_mode) type |= (rv << 4); @@ -2750,7 +2805,8 @@ init_algo(pc_ctx_t *pctx, const char *algo, int bail) pctx->_deinit_func = adapt_deinit; pctx->_stats_func = adapt_stats; pctx->_props_func = adapt_props; - pctx->adapt_mode = 1; + pctx->adapt_mode = 2; + pctx->enable_analyzer = 1; rv = 0; } else if (memcmp(algorithm, "adapt", 5) == 0) { @@ -2761,6 +2817,7 @@ init_algo(pc_ctx_t *pctx, const char *algo, int bail) pctx->_stats_func = adapt_stats; pctx->_props_func = adapt_props; pctx->adapt_mode = 1; + pctx->enable_analyzer = 1; rv = 0; #ifdef ENABLE_PC_LIBBSC } else if (memcmp(algorithm, "libbsc", 6) == 0) { @@ -2770,7 +2827,6 @@ init_algo(pc_ctx_t *pctx, const char *algo, int bail) pctx->_deinit_func = libbsc_deinit; pctx->_stats_func = libbsc_stats; pctx->_props_func = libbsc_props; - pctx->adapt_mode = 1; rv = 0; #endif } @@ -3337,6 +3393,7 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[]) } if (pctx->lzp_preprocess || pctx->enable_delta2_encode || pctx->dispack_preprocess) { pctx->preprocess_mode = 1; + pctx->enable_analyzer = 1; } if (pctx->chunksize == 0) { if (pctx->level < 9) { diff --git a/pcompress.h b/pcompress.h index c383c70..9787f6c 100644 --- a/pcompress.h +++ b/pcompress.h @@ -60,13 +60,14 @@ extern "C" { #define CHSIZE_MASK 0x80 #define BZIP2_A_NUM 16 #define LZMA_A_NUM 32 -#define CHUNK_FLAG_DEDUP 2 +#define CHUNK_FLAG_DEDUP 2 #define CHUNK_FLAG_PREPROC 4 #define COMP_EXTN ".pz" -#define PREPROC_TYPE_LZP 1 +#define PREPROC_TYPE_LZP 1 #define PREPROC_TYPE_DELTA2 2 #define PREPROC_TYPE_DISPACK 4 +#define PREPROC_TYPE_DICT 8 #define PREPROC_COMPRESSED 128 /* @@ -212,6 +213,7 @@ typedef struct pc_ctx { int delta2_nstrides; int enable_rabin_split; int enable_fixed_scan; + int enable_analyzer; int preprocess_mode; int lzp_preprocess; int dispack_preprocess; @@ -275,7 +277,7 @@ struct cmp_data { uchar_t *compressed_chunk; uchar_t *uncompressed_chunk; dedupe_context_t *rctx; - uint64_t rbytes; + int64_t rbytes; uint64_t chunksize; 
uint64_t len_cmp, len_cmp_be; uchar_t checksum[CKSUM_MAX_BYTES]; diff --git a/utils/utils.c b/utils/utils.c index 41669d2..08ab227 100644 --- a/utils/utils.c +++ b/utils/utils.c @@ -383,14 +383,18 @@ get_total_ram() } #ifdef __APPLE__ +#define NANO_SEC (1000000000ULL) int clock_gettime(int clk_id, struct timespec *ts) { if (clk_id == CLOCK_MONOTONIC) { - uint64_t abstime = mach_absolute_time(); - return (abstime * sTimebaseInfo.numer / sTimebaseInfo.denom); + uint64_t nanotime = mach_absolute_time() * + sTimebaseInfo.numer / sTimebaseInfo.denom; + ts->tv_sec = nanotime / NANO_SEC; + ts->tv_nsec = nanotime % NANO_SEC; + return (0); } - return (0); + return (EINVAL); } #endif @@ -543,8 +547,7 @@ log_msg(log_level_t log_level, int show_errno, const char *format, ...) fputs(msg, stderr); } else if (ldest.type == LOG_FILE) { - int rv; - rv = write(ldest.fd, msg, strlen(msg)); + (void) write(ldest.fd, msg, strlen(msg)); } else { ldest.cb(msg); }