A bunch of improvements and fixes.

- Fix heap corruption in DICT Filter.
- Set the default Dedup block size to 8KB.
- Revamp executable file handling: Part#1.
- Develop a new E8E9 filter that works better than Dispack on raw data blocks.
- Remove block-based Dispack encoding. File-specific Dispack filter to be added.
- Improve file header based executable file detection.
- Introduce new sorting algorithm for filenames without extension.
This commit is contained in:
Moinak Ghosh 2014-12-11 19:15:36 +05:30
parent 4c62e4db60
commit f970b41e34
8 changed files with 206 additions and 186 deletions

View file

@ -198,17 +198,18 @@ creat_write_callback(struct archive *arc, void *ctx, const void *buf, size_t len
} else {
if (pctx->arc_buf_pos < pctx->min_chunk) {
int diff = pctx->min_chunk - (int)(pctx->arc_buf_pos);
if (len >= diff)
if (len >= diff) {
pctx->btype = pctx->ctype;
else
} else {
pctx->ctype = pctx->btype;
}
pctx->interesting = 1;
} else {
pctx->arc_writing = 0;
Sem_Post(&(pctx->read_sem));
Sem_Wait(&(pctx->write_sem));
tbuf = pctx->arc_buf + pctx->arc_buf_pos;
pctx->arc_writing = 1;
tbuf = pctx->arc_buf;
pctx->btype = pctx->ctype;
}
}
@ -258,6 +259,7 @@ archiver_read(void *ctx, void *buf, uint64_t count)
pctx->btype = TYPE_UNKNOWN;
Sem_Post(&(pctx->write_sem));
Sem_Wait(&(pctx->read_sem));
pctx->arc_buf = NULL;
return (pctx->arc_buf_pos);
}
@ -722,8 +724,50 @@ add_pathname(const char *fpath, const struct stat *sb,
i = 0;
if (!dot) {
while (basename[i] != '\0' && i < NAMELEN) {
member->name[i] = basename[i]; i++;
int plen = strlen(fpath);
int nsep;
/*
* Filenames without an extension are sorted based on
* their entire path characteristics. This mostly avoids
* unwanted mixing of different file types if we just
* sort by filename.
*
* For every path separator we take the first character
* of the directory name limited by NAMELEN chars. Counting
* is backward from the basename itself. If less than
* NAMELEN path separators are present (i.e. fewer than
* NAMELEN level dir nesting) then remaining chars are filled
* from the basename.
*/
nsep = 0;
for (i = 0; i < plen; i++) {
if (fpath[i] == '/') {
nsep++;
}
}
if (nsep < NAMELEN) {
int diff = NAMELEN - nsep;
nsep = NAMELEN-1;
i = ftwbuf->base + diff;
while (diff > 0) {
member->name[nsep] = fpath[i];
nsep--;
i--;
diff--;
}
} else {
nsep = NAMELEN-1;
}
i = ftwbuf->base;
while (nsep > -1 && i > 0) {
if (fpath[i-1] == '/') {
member->name[nsep] = fpath[i];
nsep--;
}
i--;
}
// Clear 64-bit MSB
member->size &= 0x7FFFFFFFFFFFFFFF;
@ -1807,6 +1851,8 @@ out:
static int
detect_type_by_data(uchar_t *buf, size_t len)
{
uint16_t leval;
// At least a few bytes.
if (len < 10) return (TYPE_UNKNOWN);
@ -1875,16 +1921,24 @@ detect_type_by_data(uchar_t *buf, size_t len)
} else {
uint32_t off = LE32(U32_P(buf + 0x3c));
// This is non-MSDOS, check whether PE
if (off < len - 3) {
if (off < len - 100) {
if (buf[off] == 'P' && buf[off+1] == 'E' &&
buf[off+2] == '\0' && buf[off+3] == '\0') {
uint16_t id;
// This is a PE executable.
// Check 32/64-bit.
off = LE32(U32_P(buf + 0x3c))+4;
if (LE16(U16_P(buf + off)) == 0x8664) {
return (TYPE_BINARY|TYPE_EXE64);
off = LE32(U32_P(buf + 0x3c))+24;
id = LE16(U16_P(buf + off));
if (id == 0x010b || id == 0x020b) {
off = LE32(U32_P(buf + 0x3c))+4;
id = LE16(U16_P(buf + off));
if (id == 0x8664)
return (TYPE_BINARY|TYPE_EXE64);
else
return (TYPE_BINARY|TYPE_EXE32);
} else {
return (TYPE_BINARY|TYPE_EXE32);
return (TYPE_BINARY);
}
} else {
return (TYPE_BINARY|TYPE_EXE32);
@ -1922,6 +1976,21 @@ detect_type_by_data(uchar_t *buf, size_t len)
return (TYPE_BINARY);
}
// x86 Unix format object files (COFF)
leval = LE16(U16_P(buf));
if (leval == 0502 || leval == 0503 || leval == 0510 || leval == 0511 ||
leval == 0512 || leval == 0514 || leval == 0522) {
return (TYPE_BINARY|TYPE_EXE32);
}
// AMD64 COFF
if (leval == 0x8664)
return (TYPE_BINARY|TYPE_EXE64);
// Intel BIOS ROM images
if (*buf == 0x55 && *(buf + 1) == 0xaa)
return (TYPE_BINARY|TYPE_EXE32);
if (U16_P(buf + 2) == COM_MAGIC || U16_P(buf + 4) == COM_MAGIC ||
U16_P(buf + 4) == COM_MAGIC || U16_P(buf + 5) == COM_MAGIC ||
U16_P(buf + 13) == COM_MAGIC || U16_P(buf + 18) == COM_MAGIC ||

View file

@ -156,6 +156,8 @@ DictFilter::Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize)
for(i = 0; i < size-5;) {
if (dstSize > *dstsize-4)
return (0);
if (src[i] >= 'a' && src[i] <= 'z') {
u32 matchSymbol = 0,longestWord = 0;
@ -199,6 +201,8 @@ DictFilter::Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize)
}
for (; i<size; i++) {
if (dstSize > *dstsize-4)
return (0);
if (src[i] >= 0x82) {
dst[dstSize++] = 254;
dst[dstSize++] = src[i];
@ -282,7 +286,7 @@ dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64
dst = to + 4;
dl -= 4;
if (df->Forward_Dict(from, fl, dst, &dl)) {
*dstlen = dl + 8;
*dstlen = dl + 4;
DEBUG_STAT_EN(en = get_wtime_millis());
DEBUG_STAT_EN(fprintf(stderr, "DICT: fromlen: %" PRIu64 ", dstlen: %" PRIu64 "\n",
fromlen, *dstlen));

View file

@ -24,6 +24,7 @@
#include "types.hpp"
#include "dis.hpp"
#include <utils.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@ -151,7 +152,6 @@ using namespace std;
#define DISFILTER_BLOCK (32768)
#define DISFILTERED 1
#define ORIGSIZE 2
#define E8E9 4
#define NORMAL_HDR (1 + 2)
#define EXTENDED_HDR (1 + 2 + 2)
// Dispack min reduction should be 8%, otherwise we abort
@ -927,139 +927,86 @@ is_x86_code(uchar_t *buf, int len)
return (freq[0x8b] > avgFreq && freq[0x00] > avgFreq * 2 && freq[0xE8] > 6);
}
/*
* E8E9 Filter from CSC 3.2 (Fu Siyuan). This is applied to blocks that can't
* be Disfiltered.
*
* The filter is a byte pipeline: every input byte is pushed through
* E89cache_byte(), which keeps the most recent bytes in the two words x0/x1
* and hands back the oldest one. E89forward()/E89inverse() rewrite x0 while
* it is still in the pipeline whenever the top byte of x1 is 0xE8 or 0xE9.
*
* All state lives in static data members, so it is shared by every call;
* nothing in this class serializes access to it.
*/
class EFilter
{
public:
/*
* Forward transform of src[0..size), performed in place: each byte is fed to
* E89forward() and every non-negative result is written back at src[j++].
* Once the input is consumed, the bytes still held in the pipeline are
* drained with E89flush() until it returns a negative value.
*/
static void Forward_E89(sU8 *src, sU32 size)
{
sU32 i,j;
sS32 c;
E89init();
for(i=0, j=0; i < size; i++) {
c = E89forward(src[i]);
if (c >= 0) src[j++]=c;
}
while((c = E89flush()) >= 0) src[j++] = c;
}
/*
* Inverse transform of src[0..size), performed in place. Same driver loop as
* Forward_E89() but each byte goes through E89inverse().
*/
static void Inverse_E89( sU8* src, sU32 size)
{
sU32 i,j;
sS32 c;
E89init();
for(i=0, j=0; i < size; i++) {
c = E89inverse(src[i]);
if (c >= 0) src[j++]=c;
}
while((c = E89flush()) >= 0) src[j++] = c;
}
protected:
// Pipeline words: new bytes enter at the top of x0, x0's low byte moves to
// the top of x1, and x1's low byte is what E89cache_byte() returns.
static sU32 x0,x1;
// i counts bytes pushed so far; k is the value i must reach before the next
// opcode check is allowed.
static sU32 i,k;
static sU8 cs; // cache size, F8 - 5 bytes
// Constructor and destructor are protected; only the static entry points
// above are public.
~EFilter() {}
EFilter() {}
// Reset the pipeline state to its starting values.
static void E89init(void)
{
cs = 0xFF;
x0 = x1 = 0;
i = 0;
k = 5;
}
/*
* Push byte c into the pipeline and return the byte that falls out of the
* low end of x1, or -1 while the top bit of cs is still set. cs is shifted
* left by one and i is incremented on every call.
*/
static sS32 E89cache_byte(sS32 c)
{
sS32 d = cs&0x80 ? -1 : (sU8)(x1);
x1 >>= 8;
x1 |= (x0<<24);
x0 >>= 8;
x0 |= (c<<24);
cs <<= 1; i++;
return d;
}
// Bit/byte permutation applied to the adjusted value by E89forward().
static sU32 E89xswap(sU32 x)
{
x<<=7;
return (x>>24)|((sU8)(x>>16)<<8)|((sU8)(x>>8)<<16)|((sU8)(x)<<(24-7));
}
// Bit/byte permutation applied by E89inverse() before it subtracts i.
static sU32 E89yswap(sU32 x)
{
x = ((sU8)(x>>24)<<7)|((sU8)(x>>16)<<8)|((sU8)(x>>8)<<16)|(x<<24);
return x>>7;
}
/*
* Forward step for one input byte. Once i has reached k and the top byte of
* x1 is 0xE8 or 0xE9, the next check is pushed out to i+4. If
* x0 - 0xFF000000 is below 0x02000000 (assuming sU32 is a 32-bit unsigned
* type, so the subtraction wraps), i is added to it, the sum is masked to
* 25 bits, permuted with E89xswap() and stored back into x0. The new byte c
* is then pushed through the pipeline.
*/
static sS32 E89forward(sS32 c)
{
sU32 x;
if(i >= k) {
if((x1&0xFE000000) == 0xE8000000) {
k = i+4;
x= x0 - 0xFF000000;
if( x<0x02000000 ) {
x = (x+i) & 0x01FFFFFF;
x = E89xswap(x);
x0 = x + 0xFF000000;
}
}
}
return E89cache_byte(c);
}
/*
* Inverse step for one input byte. Uses the same trigger condition as
* E89forward(), but permutes with E89yswap() first and then subtracts i
* before masking to 25 bits and storing back into x0.
*/
static sS32 E89inverse(sS32 c)
{
sU32 x;
if(i >= k) {
if((x1&0xFE000000) == 0xE8000000) {
k = i+4;
x = x0 - 0xFF000000;
if(x < 0x02000000) {
x = E89yswap(x);
x = (x-i) & 0x01FFFFFF;
x0 = x + 0xFF000000;
}
}
}
return E89cache_byte(c);
}
/*
* Drain step, called repeatedly by the public entry points after the input
* is exhausted. While cs differs from 0xFF, zero bytes are pushed through the
* pipeline (incrementing cs after each push) and the byte that comes out of
* the final push is returned. When cs equals 0xFF the state is reset with
* E89init() and -1 is returned, which ends the caller's drain loop.
*/
static sS32 E89flush(void)
{
sS32 d;
if(cs != 0xFF) {
while(cs & 0x80) E89cache_byte(0),++cs;
d = E89cache_byte(0); ++cs;
return d;
} else {
E89init();
return -1;
}
}
};
/*
* Linker weirdo!
* Out-of-class definitions for the static data members declared in EFilter.
*/
sU32 EFilter::x0;
sU32 EFilter::x1;
sU32 EFilter::i;
sU32 EFilter::k;
sU8 EFilter::cs;
#ifdef __cplusplus
extern "C" {
#endif
/*
* E8 E9 Call/Jmp transform routines. Convert relative Call and Jmp addresses
* to absolute values to improve compression. A couple of tricks are employed:
* 1) Avoid transforming zero addresses or where adding the current offset to
* the presumed address results in a zero result. This avoids a bunch of
* false positives.
* 2) Store transformed values in big-endian format. This improves compression.
*/
/*
 * Forward E8/E9 transform, applied to src[0..sz) in place.
 *
 * For every position i whose byte is 0xE8 or 0xE9 and whose byte at i+4 is
 * 0x00 or 0xFF, the three bytes at i+1..i+3 are read as a little-endian
 * 24-bit value. If that value is non-zero, i is added to it (modulo 2^24);
 * if the sum is also non-zero it is written back in big-endian byte order.
 * Any other operand is left untouched.
 *
 * Returns 0 on success (including buffers too short to hold a candidate)
 * and -1 if sz does not fit in 32 bits. The buffer is not modified on -1.
 *
 * NOTE(review): an operand for which the sum wraps to zero is left as-is;
 * Inverse_E89() cannot tell it apart from a transformed operand, so that
 * case does not round-trip. Fixing it needs a coordinated format change.
 */
int
Forward_E89(uint8_t *src, uint64_t sz)
{
	uint32_t i;
	uint32_t size;

	if (sz > UINT32_MAX) {
		return (-1);
	}
	size = (uint32_t)sz;

	/*
	 * A candidate needs the opcode plus four following bytes. Bailing out
	 * here also keeps the unsigned "size - 4" below from wrapping around.
	 */
	if (size < 5) {
		return (0);
	}

	for (i = 0; i < size - 4; i++) {
		uint32_t off;

		if ((src[i] & 0xfe) != 0xe8) {
			continue;
		}
		if (src[i+4] != 0 && src[i+4] != 0xff) {
			continue;
		}

		off = (uint32_t)src[i+1] | ((uint32_t)src[i+2] << 8) |
		    ((uint32_t)src[i+3] << 16);
		if (off == 0) {
			continue;
		}

		off = (off + i) & 0xffffff;
		if (off == 0) {
			continue;
		}

		/* Store the adjusted value most-significant byte first. */
		src[i+1] = (uint8_t)(off >> 16);
		src[i+2] = (uint8_t)(off >> 8);
		src[i+3] = (uint8_t)off;
	}
	return (0);
}
/*
 * Inverse E8/E9 transform, applied to src[0..sz) in place.
 *
 * Walks the buffer from the last candidate position (size - 5) down to and
 * including position 0, i.e. in the reverse of the order used by
 * Forward_E89(). For every position i whose byte is 0xE8 or 0xE9 and whose
 * byte at i+4 is 0x00 or 0xFF, the three bytes at i+1..i+3 are read as a
 * big-endian 24-bit value. If that value is non-zero, i is subtracted from
 * it (modulo 2^24); if the result is also non-zero it is written back in
 * little-endian byte order.
 *
 * Returns 0 on success (including buffers too short to hold a candidate)
 * and -1 if sz does not fit in 32 bits. The buffer is not modified on -1.
 *
 * NOTE(review): Forward_E89() leaves an operand untouched when adding the
 * position wraps it to zero. Such an operand looks like a transformed one
 * here, so that case does not round-trip; fixing it needs a coordinated
 * format change in the encoder.
 */
int
Inverse_E89(uint8_t *src, uint64_t sz)
{
	uint32_t i;
	uint32_t size;

	if (sz > UINT32_MAX) {
		return (-1);
	}
	size = (uint32_t)sz;

	/*
	 * A candidate needs the opcode plus four following bytes. Bailing out
	 * here also keeps the unsigned "size - 4" below from wrapping around.
	 */
	if (size < 5) {
		return (0);
	}

	/*
	 * Counts i from size - 5 down to 0 inclusive. Position 0 must be
	 * visited because the forward pass transforms it as well.
	 */
	for (i = size - 4; i-- > 0; ) {
		uint32_t val;

		if ((src[i] & 0xfe) != 0xe8) {
			continue;
		}
		if (src[i+4] != 0 && src[i+4] != 0xff) {
			continue;
		}

		val = (uint32_t)src[i+3] | ((uint32_t)src[i+2] << 8) |
		    ((uint32_t)src[i+1] << 16);
		if (val == 0) {
			continue;
		}

		val = (val - i) & 0xffffff;
		if (val == 0) {
			continue;
		}

		/* Restore the original least-significant-byte-first order. */
		src[i+1] = (uint8_t)val;
		src[i+2] = (uint8_t)(val >> 8);
		src[i+3] = (uint8_t)(val >> 16);
	}
	return (0);
}
/*
* 32-bit x86 executable packer top-level routines. Detected x86 executable data
* are passed through these encoding routines. The data chunk is split into 32KB
@ -1067,7 +1014,7 @@ extern "C" {
* a block contains valid x86 code by trying to estimate some instruction metrics.
*/
int
dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen, int stype)
{
uchar_t *pos, *hdr, type, *pos_to, *to_last;
sU32 len;
@ -1094,7 +1041,6 @@ dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
sU16 origsize;
sU32 out;
sU8 *rv;
int dis_tried;
if (len > DISFILTER_BLOCK)
sz = DISFILTER_BLOCK;
@ -1113,11 +1059,9 @@ dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
}
out = sz;
dis_tried = 0;
if (is_x86_code(pos, sz)) {
ctx.ResetCtx(0, sz);
rv = DisFilter(ctx, pos, sz, 0, pos_to, out);
dis_tried = 1;
} else {
rv = NULL;
}
@ -1126,15 +1070,6 @@ dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
return (-1);
}
memcpy(pos_to, pos, origsize);
/*
* If Dispack failed, we apply a simple E8E9 filter
* on the block.
*/
if (dis_tried) {
EFilter::Forward_E89(pos_to, origsize);
type |= E8E9;
}
*hdr = type;
hdr++;
U16_P(hdr) = LE16(origsize);
@ -1218,8 +1153,6 @@ dispack_decode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
* This only happens if this block was detected as x86 instruction
* stream and Dispack was tried but it failed.
*/
if (type & E8E9)
EFilter::Inverse_E89(pos_to, cmpsz);
pos += cmpsz;
pos_to += cmpsz;
len -= cmpsz;

View file

@ -31,9 +31,13 @@
extern "C" {
#endif
int dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *_dstlen);
int dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *_dstlen,
int stype);
int dispack_decode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
int Forward_E89(uint8_t *src, uint64_t sz);
int Inverse_E89(uint8_t *src, uint64_t sz);
#ifdef __cplusplus
}
#endif

View file

@ -232,22 +232,23 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
}
/*
* If Dispack is enabled it has to be done first since Dispack analyses the
* x86 instruction stream in the raw data.
* AR archives are typically static libraries. So we Dispack them unconditionally.
* TODO: Is this too much to assume in the generic case? Can we look inside ar archives?
* Dispack is used for 32-bit EXE files via a libarchive filter routine.
* However if Dispack fails or 64-bit exes are detected we apply an E8E9
* CALL/JMP transform filter.
*/
if (pctx->dispack_preprocess && (stype == TYPE_EXE32 || stype == TYPE_EXE64 ||
stype == TYPE_ARCHIVE_AR)) {
_dstlen = fromlen;
result = dispack_encode((uchar_t *)from, fromlen, to, &_dstlen);
if (result != -1) {
uchar_t *tmp;
tmp = from;
from = to;
to = tmp;
fromlen = _dstlen;
type |= PREPROC_TYPE_DISPACK;
if (pctx->exe_preprocess) {
if (stype == TYPE_EXE32 || stype == TYPE_EXE64 ||
stype == TYPE_ARCHIVE_AR) {
_dstlen = fromlen;
memcpy(to, from, fromlen);
if (Forward_E89(to, fromlen) == 0) {
uchar_t *tmp;
tmp = from;
from = to;
to = tmp;
fromlen = _dstlen;
type |= PREPROC_TYPE_E8E9;
}
}
}
@ -449,7 +450,18 @@ preproc_decompress(pc_ctx_t *pctx, compress_func_ptr dec_func, void *src, uint64
}
}
if (type & PREPROC_TYPE_DISPACK) {
if (type & PREPROC_TYPE_E8E9) {
_dstlen1 = srclen;
memcpy(dst, src, srclen);
result = Inverse_E89(dst, srclen);
if (result != -1) {
*dstlen = _dstlen1;
} else {
log_msg(LOG_ERR, 0, "E8E9 decoding failed.");
return (result);
}
} else if (type & PREPROC_TYPE_DISPACK) { // Backward compatibility
result = dispack_decode((uchar_t *)src, srclen, (uchar_t *)dst, &_dstlen1);
if (result != -1) {
*dstlen = _dstlen1;
@ -1769,7 +1781,6 @@ redo:
dedupe_index_sz = 0;
type = COMPRESSED;
/* Perform Dedup if enabled. */
if ((pctx->enable_rabin_scan || pctx->enable_fixed_scan)) {
dedupe_context_t *rctx;
@ -3106,7 +3117,6 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[])
char *pos;
struct filter_flags ff;
pctx->level = -1;
err = 0;
pctx->keylen = DEFAULT_KEYLEN;
@ -3171,7 +3181,6 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[])
break;
case 'B':
pctx->advanced_opts = 1;
pctx->rab_blk_size = atoi(optarg);
if (pctx->rab_blk_size < 0 || pctx->rab_blk_size > 5) {
log_msg(LOG_ERR, 0, "Average Dedupe block size must be in range 0 (2k), 1 (4k) .. 5 (64k)");
@ -3293,7 +3302,7 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[])
case 'x':
pctx->advanced_opts = 1;
pctx->dispack_preprocess = 1;
pctx->exe_preprocess = 1;
break;
case 'T':
@ -3415,11 +3424,11 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[])
}
/*
* Dispack, PackJPG and WavPack are only valid when archiving files.
* EXE, PackJPG and WavPack are only valid when archiving files.
*/
if ((pctx->dispack_preprocess || ff.enable_packjpg || ff.enable_wavpack)
if ((pctx->exe_preprocess || ff.enable_packjpg || ff.enable_wavpack)
&& !pctx->archive_mode) {
log_msg(LOG_ERR, 0, "Dispack Executable Preprocessor and PackJPG are "
log_msg(LOG_ERR, 0, "Executable File Preprocessor and PackJPG are "
"only valid when archiving.");
return (1);
}
@ -3597,7 +3606,7 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[])
init_filters(&ff);
pctx->enable_packjpg = ff.enable_packjpg;
pctx->enable_wavpack = ff.enable_wavpack;
if (pctx->level > 8) pctx->dispack_preprocess = 1;
if (pctx->level > 8) pctx->exe_preprocess = 1;
if (pctx->meta_stream != -1)
pctx->meta_stream = 1;
else
@ -3622,7 +3631,7 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[])
}
if (pctx->level > 9) pctx->delta2_nstrides = NSTRIDES_EXTRA;
}
if (pctx->lzp_preprocess || pctx->enable_delta2_encode || pctx->dispack_preprocess) {
if (pctx->lzp_preprocess || pctx->enable_delta2_encode || pctx->exe_preprocess) {
pctx->preprocess_mode = 1;
pctx->enable_analyzer = 1;
}

View file

@ -71,6 +71,7 @@ extern "C" {
#define PREPROC_TYPE_DELTA2 2
#define PREPROC_TYPE_DISPACK 4
#define PREPROC_TYPE_DICT 8
#define PREPROC_TYPE_E8E9 16
#define PREPROC_COMPRESSED 128
/*
@ -220,7 +221,7 @@ typedef struct pc_ctx {
int enable_analyzer;
int preprocess_mode;
int lzp_preprocess;
int dispack_preprocess;
int exe_preprocess;
int encrypt_type;
int archive_mode;
int enable_archive_sort;

View file

@ -74,7 +74,7 @@
//Use prime constant from Bulat Ziganshin's REP. Seems to work best across wide range of data.
#define RAB_POLYNOMIAL_CONST 153191
#define POLY_MASK (0xffffffffffULL)
#define RAB_BLK_DEFAULT 1
#define RAB_BLK_DEFAULT 2
#define RAB_BLK_MIN_BITS 11
#define LZMA_WINDOW_MAX (128L * 1024L * 1024L)
#define RAB_POLYNOMIAL_WIN_SIZE 16

View file

@ -255,7 +255,7 @@ Read_Adjusted(int fd, uchar_t *buf, uint64_t count, int64_t *rabin_count, void *
rcount = Read(fd, buf2, count);
if (rcount > 0) {
rcount += *rabin_count;
if (rcount == count) {
if (rcount == count + *rabin_count) {
uint64_t rc, rbc;
rc = rcount;
rbc = *rabin_count;