Add Dispack filter with auto-detection of x86 executables in archive mode.

More elaborate magic-header-based detection of 32-bit and 64-bit x86 binaries.
Always use fast-mode LZ4 in Adaptive modes.
This commit is contained in:
Moinak Ghosh 2013-11-24 19:45:58 +05:30
parent 1e2c3e479a
commit 0192790c02
13 changed files with 1293 additions and 45 deletions

View file

@ -30,7 +30,7 @@ MAINSRCS = utils/utils.c allocator.c lzma_compress.c ppmd_compress.c \
adaptive_compress.c lzfx_compress.c lz4_compress.c none_compress.c \
utils/xxhash_base.c utils/heap.c utils/cpuid.c pcompress.c
MAINHDRS = allocator.h pcompress.h utils/utils.h utils/xxhash.h utils/heap.h \
utils/cpuid.h utils/xxhash.h archive/pc_archive.h
utils/cpuid.h utils/xxhash.h archive/pc_archive.h filters/dispack/dis.hpp
MAINOBJS = $(MAINSRCS:.c=.o)
PROGSRCS = main.c
@ -142,6 +142,10 @@ PJPGHDRS = filters/packjpg/aricoder.h filters/packjpg/bitops.h filters/packjpg/d
filters/packjpg/packjpglib.h filters/packjpg/pjpgtbl.h
PJPGOBJS = $(PJPGSRCS:.cpp=.o)
DISPACKSRCS = filters/dispack/dis.cpp
DISPACKHDRS = filters/dispack/dis.hpp filters/dispack/types.hpp
DISPACKOBJS = $(DISPACKSRCS:.cpp=.o)
SKEIN_BLOCK_C = crypto/skein/skein_block.c
SKEIN_BLOCK_ASM = crypto/skein/skein_block_x64.s
SKEIN_BLOCK_SRC = @SKEIN_BLOCK@
@ -218,7 +222,7 @@ OBJS = $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(LZFXOBJS) $(LZ4OBJS) $(CRCOBJS) \
$(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) $(DELTA2OBJS) @LIBBSCWRAPOBJ@ $(SKEINOBJS) \
$(SKEIN_BLOCK_OBJ) @SHA2ASM_OBJS@ @SHA2_OBJS@ $(KECCAK_OBJS) $(KECCAK_OBJS_ASM) \
$(TRANSP_OBJS) $(CRYPTO_OBJS) $(ZLIB_OBJS) $(BZLIB_OBJS) $(XXHASH_OBJS) $(BLAKE2_OBJS) \
@CRYPTO_COMPAT_OBJS@ $(CRYPTO_ASM_OBJS) $(ARCHIVEOBJS) $(PJPGOBJS)
@CRYPTO_COMPAT_OBJS@ $(CRYPTO_ASM_OBJS) $(ARCHIVEOBJS) $(PJPGOBJS) $(DISPACKOBJS)
DEBUG_LINK = g++ -pthread @LIBBSCGEN_OPT@ @EXTRA_OPT_FLAGS@ -fopenmp -fPIC
DEBUG_COMPILE = gcc -g -c @EXTRA_OPT_FLAGS@ -fPIC
@ -296,6 +300,10 @@ $(PJPGOBJS): $(PJPGSRCS) $(PJPGHDRS)
$(COMPILE_cpp) $(COMMON_VEC_FLAGS) @SSE_OPT_FLAGS@ -O2 -fsched-spec-load \
$(VEC_FLAGS) -DBUILD_LIB $(COMMON_CPPFLAGS_cpp) $(@:.o=.cpp) -o $@
$(DISPACKOBJS): $(DISPACKSRCS) $(DISPACKHDRS)
$(COMPILE_cpp) $(COMMON_VEC_FLAGS) @SSE_OPT_FLAGS@ -O2 -fsched-spec-load \
$(VEC_FLAGS) $(COMMON_CPPFLAGS_cpp) $(@:.o=.cpp) -o $@
$(SKEIN_BLOCK_OBJ): $(SKEIN_BLOCK_SRC)
$(COMPILE) $(SKEIN_FLAGS) $(SKEIN_BLOCK_SRC) -o $@

View file

@ -124,7 +124,7 @@ adapt_init(void **data, int *level, int nthreads, uint64_t chunksize,
int file_version, compress_op_t op)
{
struct adapt_data *adat = (struct adapt_data *)(*data);
int rv = 0;
int rv = 0, lv = 1;
if (!adat) {
adat = (struct adapt_data *)slab_alloc(NULL, sizeof (struct adapt_data));
@ -137,7 +137,7 @@ adapt_init(void **data, int *level, int nthreads, uint64_t chunksize,
* compression level.
*/
if (rv == 0)
rv = lz4_init(&(adat->lz4_data), 1, nthreads, chunksize, file_version, op);
rv = lz4_init(&(adat->lz4_data), &lv, nthreads, chunksize, file_version, op);
adat->lzma_data = NULL;
adat->bsc_data = NULL;
*data = adat;
@ -178,8 +178,9 @@ adapt2_init(void **data, int *level, int nthreads, uint64_t chunksize,
* otherwise incompressible data. So we always use it at the lowest and fastest
* compression level.
*/
lv = 1;
if (rv == 0)
rv = lz4_init(&(adat->lz4_data), 1, nthreads, chunksize, file_version, op);
rv = lz4_init(&(adat->lz4_data), &lv, nthreads, chunksize, file_version, op);
*data = adat;
if (*level > 9) *level = 9;
}

View file

@ -915,6 +915,7 @@ do_map:
if (typ == TYPE_UNKNOWN) {
pctx->ctype = detect_type_by_data(src, len);
typ = pctx->ctype;
if (typ != TYPE_UNKNOWN) {
if (typetab[(typ >> 3)].filter_func != NULL) {
int64_t rv;
@ -1428,6 +1429,9 @@ out:
/* TTA1 packed into 32-bit integer. */
# define TTA1 (0x54544131)
/* Magic for different MSDOS COM file types. */
# define COM_MAGIC (0xcd21)
#else
/* 0x7fELF packed into 32-bit integer. */
# define ELFINT (0x464c457fU)
@ -1443,6 +1447,9 @@ out:
/* TTA1 packed into 32-bit integer. */
# define TTA1 (0x31415454)
/* Magic for different MSDOS COM file types. */
# define COM_MAGIC (0x21cd)
#endif
/*
@ -1454,12 +1461,63 @@ detect_type_by_data(uchar_t *buf, size_t len)
// At least a few bytes.
if (len < 16) return (TYPE_UNKNOWN);
if (U32_P(buf) == ELFINT)
return (TYPE_BINARY|TYPE_EXE); // Regular ELF
if ((buf[0] == 'M' || buf[0] == 'L') && buf[1] == 'Z')
return (TYPE_BINARY|TYPE_EXE); // MSDOS Exe
if (buf[0] == 0xe9)
return (TYPE_BINARY|TYPE_EXE); // MSDOS COM
if (U32_P(buf) == ELFINT) { // Regular ELF, check for 32/64-bit, core dump
if (*(buf + 16) != 4) {
if (*(buf + 4) == 2) {
return (TYPE_BINARY|TYPE_EXE64);
} else {
return (TYPE_BINARY|TYPE_EXE32);
}
} else {
return (TYPE_BINARY);
}
}
if (buf[1] == 'Z') {
// Check for MSDOS/Windows Exe types
if (buf[0] == 'L') {
return (TYPE_BINARY|TYPE_EXE32);
} else if (buf[0] == 'M') {
// If relocation table is less than 0x40 bytes into file then
// it is a 32-bit MSDOS exe.
if (LE16(U16_P(buf + 0x18)) < 0x40) {
return (TYPE_BINARY|TYPE_EXE32);
} else {
uint32_t off = LE32(U32_P(buf + 0x3c));
// This is non-MSDOS, check whether PE
if (off < len - 3) {
if (buf[off] == 'P' && buf[off+1] == 'E' &&
buf[off+2] == '\0' && buf[off+3] == '\0') {
// This is a PE executable.
// Check 32/64-bit.
off = LE32(U32_P(buf + 0x3c))+4;
if (LE16(U16_P(buf + off)) == 0x8664) {
return (TYPE_BINARY|TYPE_EXE64);
} else {
return (TYPE_BINARY|TYPE_EXE32);
}
} else {
return (TYPE_BINARY|TYPE_EXE32);
}
}
}
}
}
// MSDOS COM types
if (buf[0] == 0xe9 || buf[0] == 0xeb) {
if (LE16(U16_P(buf + 0x1fe)) == 0xaa55)
return (TYPE_BINARY|TYPE_EXE32); // MSDOS COM
else
return (TYPE_BINARY);
}
if (U16_P(buf + 2) == COM_MAGIC || U16_P(buf + 4) == COM_MAGIC ||
U16_P(buf + 4) == COM_MAGIC || U16_P(buf + 5) == COM_MAGIC ||
U16_P(buf + 13) == COM_MAGIC || U16_P(buf + 18) == COM_MAGIC ||
U16_P(buf + 23) == COM_MAGIC || U16_P(buf + 30) == COM_MAGIC ||
U16_P(buf + 70) == COM_MAGIC) {
return (TYPE_BINARY|TYPE_EXE32); // MSDOS COM
}
if (U32_P(buf) == TZINT)
return (TYPE_BINARY); // Timezone data
if (U32_P(buf) == PPMINT)

1067
filters/dispack/dis.cpp Normal file

File diff suppressed because it is too large Load diff

41
filters/dispack/dis.hpp Normal file
View file

@ -0,0 +1,41 @@
/*
* This file is a part of Pcompress, a chunked parallel multi-
* algorithm lossless compression and decompression program.
*
* Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
* Use is subject to license terms.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program.
* If not, see <http://www.gnu.org/licenses/>.
*
* moinakg@belenix.org, http://moinakg.wordpress.com/
*/
/*
 * Public entry points of the Dispack filter, a preprocessing stage that
 * works on the x86 instruction stream of executable data.
 */
#ifndef __DIS_HPP__
#define __DIS_HPP__
/* NOTE(review): uchar_t is presumably declared by utils.h - confirm. */
#include <utils.h>
/* Give the two functions C linkage when this header is compiled as C++. */
#ifdef __cplusplus
extern "C" {
#endif
/*
 * Encode `fromlen` bytes at `from` into `to`.
 *
 * `_dstlen` is an in/out length pointer: the caller visible in this change
 * sets it to `fromlen` before the call and, on success, reads the encoded
 * length back from it.
 * NOTE(review): the incoming value looks like the capacity of `to` -
 * confirm against dis.cpp.
 *
 * Returns an int status; the visible caller treats -1 as "not encoded" and
 * keeps using the original buffer in that case.
 */
int dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *_dstlen);
/*
 * Decode `fromlen` bytes at `from` (produced by dispack_encode) into `to`.
 *
 * On success the visible caller reads the decoded length from `dstlen`.
 * NOTE(review): whether `dstlen` must hold the capacity of `to` on entry
 * is not visible here - confirm against dis.cpp.
 *
 * Returns an int status; the visible caller treats -1 as failure and
 * propagates it.
 */
int dispack_decode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
#ifdef __cplusplus
}
#endif
#endif

51
filters/dispack/types.hpp Normal file
View file

@ -0,0 +1,51 @@
/*
* This file is a part of Pcompress, a chunked parallel multi-
* algorithm lossless compression and decompression program.
*
* Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
* Use is subject to license terms.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program.
* If not, see <http://www.gnu.org/licenses/>.
*
* moinakg@belenix.org, http://moinakg.wordpress.com/
*/
/*
 * Short type aliases and byte-swap name mappings for the Dispack filter
 * sources. The s-prefixed names are fixed-purpose aliases for built-in and
 * <stdint.h> types.
 */
#include <stdint.h>
#include <inttypes.h>
/* Provides htons()/htonl(), used by the _byteswap_* mappings below. */
#include <arpa/inet.h>
#ifndef __TYPES_HPP__
#define __TYPES_HPP__
/*
 * Integer aliases. The 8/16/32-bit ones are spelled with built-in types.
 * NOTE(review): this assumes short is 16 bits and int is 32 bits on every
 * supported target; only the 64-bit aliases use exact-width types.
 */
typedef unsigned char sU8;
typedef signed char sS8;
typedef unsigned short sU16;
typedef signed short sS16;
typedef unsigned int sU32;
typedef signed int sS32;
typedef uint64_t sU64;
typedef int64_t sS64;
typedef int sInt;
typedef char sChar;
/* Uses the built-in `bool`, so this header needs a C++ compiler (or <stdbool.h>). */
typedef bool sBool;
typedef float sF32;
typedef double sF64;
#define sTRUE true
#define sFALSE false
/*
 * Map the _byteswap_ushort/_byteswap_ulong names onto htons()/htonl().
 * NOTE(review): htons/htonl convert host order to big-endian, so they swap
 * bytes only on little-endian hosts and are no-ops on big-endian ones.
 * Confirm that "produce big-endian" (not "always swap") is what dis.cpp needs.
 */
#define _byteswap_ushort htons
#define _byteswap_ulong htonl
#endif

View file

@ -55,6 +55,7 @@
#include <ctype.h>
#include <errno.h>
#include <pc_archive.h>
#include <filters/dispack/dis.hpp>
/*
* We use 8MB chunks by default.
@ -215,6 +216,23 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
fromlen = srclen;
result = 0;
/*
* If Dispack is enabled it has to be done first since Dispack analyses the
* x86 instruction stream in the raw data.
*/
if (pctx->dispack_preprocess && PC_SUBTYPE(btype) == TYPE_EXE32) {
_dstlen = fromlen;
result = dispack_encode((uchar_t *)from, fromlen, to, &_dstlen);
if (result != -1) {
uchar_t *tmp;
tmp = from;
from = to;
to = tmp;
fromlen = _dstlen;
type |= PREPROC_TYPE_DISPACK;
}
}
if (pctx->lzp_preprocess) {
int hashsize;
@ -335,7 +353,23 @@ preproc_decompress(pc_ctx_t *pctx, compress_func_ptr dec_func, void *src, uint64
*dstlen = result;
}
if (!(type & (PREPROC_COMPRESSED | PREPROC_TYPE_DELTA2 | PREPROC_TYPE_LZP)) && type > 0) {
/*
* If Dispack is enabled it has to be done first since Dispack analyses the
* x86 instruction stream in the raw data.
*/
if (type & PREPROC_TYPE_DISPACK) {
result = dispack_decode((uchar_t *)src, srclen, (uchar_t *)dst, &_dstlen);
if (result != -1) {
memcpy(src, dst, _dstlen);
srclen = _dstlen;
*dstlen = _dstlen;
} else {
return (result);
}
}
if (!(type & (PREPROC_COMPRESSED | PREPROC_TYPE_DELTA2 | PREPROC_TYPE_LZP | PREPROC_TYPE_DISPACK))
&& type > 0) {
log_msg(LOG_ERR, 0, "Invalid preprocessing flags: %d", type);
return (-1);
}
@ -3153,8 +3187,9 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[])
if (pctx->level > 9) ff.enable_packjpg = 1;
init_filters(&ff);
pctx->enable_packjpg = ff.enable_packjpg;
if (pctx->level > 8) pctx->dispack_preprocess = 1;
}
if (pctx->lzp_preprocess || pctx->enable_delta2_encode) {
if (pctx->lzp_preprocess || pctx->enable_delta2_encode || pctx->dispack_preprocess) {
pctx->preprocess_mode = 1;
}
} else if (pctx->do_uncompress) {

View file

@ -60,6 +60,7 @@ extern "C" {
#define PREPROC_TYPE_LZP 1
#define PREPROC_TYPE_DELTA2 2
#define PREPROC_TYPE_DISPACK 4
#define PREPROC_COMPRESSED 128
/*
@ -205,6 +206,7 @@ typedef struct pc_ctx {
int enable_fixed_scan;
int preprocess_mode;
int lzp_preprocess;
int dispack_preprocess;
int encrypt_type;
int archive_mode;
int verbose;

View file

@ -88,16 +88,6 @@ struct ext_entry {
{"upp" , TYPE_TEXT, 3},
{"mom" , TYPE_TEXT, 3},
{"tmac" , TYPE_TEXT, 4},
{"exe" , TYPE_BINARY|TYPE_EXE, 3},
{"dll" , TYPE_BINARY|TYPE_EXE, 3},
{"bin" , TYPE_BINARY|TYPE_EXE, 3},
{"o" , TYPE_BINARY|TYPE_EXE, 1},
{"a" , TYPE_BINARY|TYPE_EXE, 1},
{"obj" , TYPE_BINARY|TYPE_EXE, 3},
{"so" , TYPE_BINARY|TYPE_EXE, 2},
{"com" , TYPE_BINARY|TYPE_EXE, 3},
{"xpi" , TYPE_BINARY|TYPE_EXE, 3},
{"off" , TYPE_BINARY|TYPE_EXE, 3},
{"pdf" , TYPE_BINARY, 3},
{"jpg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG, 3},
{"jpeg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG, 4},
@ -152,5 +142,7 @@ struct ext_entry {
{"SVGZ" , TYPE_BINARY, 4},
{"ODT" , TYPE_BINARY, 3},
{"3DM" , TYPE_BINARY, 3},
{"chm" , TYPE_BINARY, 3},
{"CHM" , TYPE_BINARY, 3},
};
#endif

View file

@ -78,16 +78,6 @@ am,TYPE_TEXT
upp,TYPE_TEXT
mom,TYPE_TEXT
tmac,TYPE_TEXT
exe,TYPE_BINARY|TYPE_EXE
dll,TYPE_BINARY|TYPE_EXE
bin,TYPE_BINARY|TYPE_EXE
o,TYPE_BINARY|TYPE_EXE
a,TYPE_BINARY|TYPE_EXE
obj,TYPE_BINARY|TYPE_EXE
so,TYPE_BINARY|TYPE_EXE
com,TYPE_BINARY|TYPE_EXE
xpi,TYPE_BINARY|TYPE_EXE
off,TYPE_BINARY|TYPE_EXE
pdf,TYPE_BINARY
jpg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG
jpeg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG
@ -142,3 +132,5 @@ swf,TYPE_BINARY
SVGZ,TYPE_BINARY,TYPE_COMPRESSED|TYPE_COMPRESSED_GZ
ODT,TYPE_BINARY,TYPE_COMPRESSED|TYPE_COMPRESSED_ZIP
3DM,TYPE_BINARY
chm,TYPE_BINARY
CHM,TYPE_BINARY

View file

@ -12,14 +12,14 @@
/* small adjustments to _a_ to make values distinct */
ub1 tab[] = {
125,0,0,220,235,125,82,0,113,0,0,7,0,0,82,0,
0,0,7,124,0,0,82,0,0,125,0,7,0,220,125,120,
0,0,0,0,22,0,0,113,0,113,113,0,0,125,85,0,
113,0,11,113,125,7,0,0,0,40,0,113,85,0,0,125,
0,113,0,0,113,0,125,183,40,27,7,15,58,183,113,0,
124,0,0,22,125,220,0,40,0,87,87,125,113,0,183,125,
0,125,87,7,0,85,0,0,59,229,85,7,135,116,0,146,
0,0,82,0,0,0,200,0,56,125,0,0,61,202,0,0,
125,0,0,87,7,113,82,120,113,0,0,113,0,0,113,125,
0,0,7,113,0,113,0,0,0,7,0,131,0,85,0,22,
0,113,0,0,85,0,0,113,0,113,125,113,0,7,22,0,
82,0,0,113,125,125,0,0,0,0,0,113,22,0,0,125,
0,87,0,0,113,0,125,183,82,0,124,88,40,125,0,0,
124,0,168,125,0,125,0,40,0,82,125,113,113,125,116,0,
0,0,113,85,0,88,0,0,42,27,0,0,0,40,183,61,
0,0,0,0,0,111,17,0,87,125,0,0,166,91,0,0,
};
/* The hash function */

View file

@ -8,7 +8,7 @@
extern ub1 tab[];
#define PHASHLEN 0x80 /* length of hash mapping table */
#define PHASHNKEYS 141 /* How many keys were hashed */
#define PHASHNKEYS 133 /* How many keys were hashed */
#define PHASHRANGE 256 /* Range any input might map to */
#define PHASHSALT 0x9e3779b9 /* internal, initialize normal hash */

View file

@ -245,8 +245,8 @@ typedef enum {
/*
* Sub-types.
*/
#define NUM_SUB_TYPES 20
TYPE_EXE = 8,
#define NUM_SUB_TYPES 24
TYPE_EXE32 = 8,
TYPE_JPEG = 16,
TYPE_MARKUP = 24,
TYPE_COMPRESSED_GZ = 32,
@ -268,7 +268,8 @@ typedef enum {
TYPE_PACKJPG = 160,
TYPE_DNA_SEQ = 168,
TYPE_MJPEG = 176,
TYPE_AUDIO_COMPRESSED = 184
TYPE_AUDIO_COMPRESSED = 184,
TYPE_EXE64 = 192
} data_type_t;
/*