f970b41e34
- Fix heap corruption in DICT Filter. - Make default Dedup block size as 8KB. - Revamp executable file handling: Part#1. - Developed new E8E9 filter that works better than Dispack on raw data blocks. - Remove block-based Dispack encoding. File-specific Dispack filter to be added. - Improve file header based executable file detection. - Introduce new sorting algorithm for filenames without extension.
1168 lines
38 KiB
C++
1168 lines
38 KiB
C++
/*
|
|
* This file is a part of Pcompress, a chunked parallel multi-
|
|
* algorithm lossless compression and decompression program.
|
|
*
|
|
* Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
|
|
* Use is subject to license terms.
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 3 of the License, or (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with this program.
|
|
* If not, see <http://www.gnu.org/licenses/>.
|
|
*
|
|
* moinakg@belenix.org, http://moinakg.wordpress.com/
|
|
*/
|
|
|
|
#include "types.hpp"
|
|
#include "dis.hpp"
|
|
#include <utils.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#ifndef __APPLE__
|
|
#include <malloc.h>
|
|
#endif
|
|
#include <assert.h>
|
|
#include <iostream>
|
|
|
|
using namespace std;
|
|
|
|
/* Version history:
|
|
*
|
|
* 1.00 (Nov 2009) Initial release
|
|
* 1.01 (Jan 2011) Don't assert on bytes > MAXINSTR when dealing with jump tables
|
|
* 1.02 (Nov 2013) (Moinak Ghosh) Changes to integrate with Pcompress.
|
|
* Adapted and modified from:
|
|
* http://www.farbrausch.de/~fg/code/disfilter/
|
|
*/
|
|
|
|
/****************************************************************************/
|
|
|
|
/* This is a filter for x86 binary code, intended to improve its compressibility
 * by standard algorithms. The basic ideas are quite old; for example, the LZX
 * algorithm used in Microsoft .CAB files uses a special preprocessor that
 * converts the target address in CALL opcodes from a relative offset to an
 * absolute address. This simple transform greatly helps both LZ-based and
 * statistical coders: the same function being called repeatedly now results
 * in the same byte sequence for the call being repeated, instead of having
 * a different encoding every time. The preprocessor doesn't really understand
 * the instruction stream; it just looks for a 0xE8 byte (the opcode for near
 * call) and adds the current position to the 4 bytes that follow it.
 *
 * Most modern compressors include this filter or variations, to be used on .EXE
 * files; newer variants usually try to detect whether the target offset would be
 * within the executable image to reduce the number of false positives. Another
 * common modification stores the transformed offsets in big endian byte order:
 * this clusters the high bits (which are likely to be similar along a stretch of
 * code) together with the opcode, again yielding somewhat better compression.
 *
 * However, all this is based on a very limited understanding of x86 binary code.
 * It is possible to do significantly better with a more thorough understanding of
 * the bytestream and its underlying structure. This algorithm borrows heavily from the
 * Split-Stream^2 method described in [1] (or, more precisely, an earlier variant
 * published sometime in 2004; I don't remember the details anymore). It also introduces
 * some (to my knowledge) novel ideas, though.
 *
 * The basic idea behind Split-Stream is to disassemble the target program,
 * splitting it into several distinct streams that can be coded separately. Examples
 * of such streams are the opcodes themselves, 8 bit immediates, 32 bit immediates,
 * jump and call target addresses, and so on - the idea being that the individual
 * fields are highly correlated amongst themselves, but largely independent of each
 * other. Splitting the streams reduces the context dilution (the inclusion of
 * irrelevant values in the context used for prediction) that otherwise harms compression
 * in compiled code. Since the actual compressor in kkrunchy is a LZ-based dictionary
 * coder and not a context coder, there's no easy way to mix multiple models or use
 * alphabets with more than 256 symbols; hence the streams are simply stored sequentially,
 * with a small header denoting the size of each. This interface sacrifices some
 * compression potential, but has the advantage that the filter inputs and outputs
 * simple bytestreams; kkrunchy actually compresses the (several hundred bytes long)
 * unfiltering code along with the transformed code, so part of the decompressor is
 * stored in compressed form. This results in a somewhat peculiar "bootstrapping"
 * decompression process but saved roughly 200 bytes when it was originally written;
 * a big enough gain to be worth it when targeting 64k executables.
 *
 * The actual list of streams that are identified can be found below (the "Streams" enum).
 * To categorize which byte belongs where, the code needs to be disassembled. This
 * is simpler than it sounds, given the complexity of x86 instruction encoding;
 * luckily, there's no need to fully "understand" each instruction. We mainly need to
 * be able to identify the opcode, the addressing mode used, and the presence of
 * immediate data fields. This is implemented using a mostly table-driven disassembler.
 * Since the original decoder was heavily optimized for size and the tables need to be
 * included with the decoder, the encoding is very compact: It mainly consists of two
 * tables of 256 entries each with 4 bits per entry used - the first table describing
 * one-byte opcodes, the second for two-byte opcodes (when this code was written, there
 * were no three-byte opcodes yet). There are some simplifications present in the tables
 * and the disassembler, where doing so poses no problems. For example, all prefixes
 * are treated as one-byte opcodes with no operands; this is incorrect, but as long as
 * the encoder and decoder agree on it, there's no problem. There's also no need to
 * distinguish between different instructions when they all have the same addressing modes
 * and combination of immediate operands. All this gets rid of a lot of special cases.
 * There is one significant deviation from the PPMexe paper [1], though: the code
 * is very careful never to assume that its parsing of the instruction stream is correct,
 * and absolutely no irreversible transforms take place (such as the instruction
 * rescheduling in [1]). Unrecognizable and invalid opcodes are preserved. This is done
 * by using a very uncommon opcode as escape code, encapsulating otherwise invalid
 * sequences within the bytestream. This property is critical in practice: code sections
 * often contain jump tables and other data that isn't decodable as x86 instruction
 * stream. Corrupting such data during the compression process is unacceptable.
 *
 * The target addresses of near jumps and calls of course still get converted from
 * relative to absolute; additionally, all values larger than 8 bit are stored in big
 * endian byte order. Both transforms are trivial to undo on the decoder side and yield
 * notable improvements in compression ratio. Additionally, the last 255 call targets
 * are kept in an array that's updated using the "move to front" heuristic. If a target
 * occurs repeatedly (as is common in practice), the offset doesn't need to be coded at
 * all; instead the position in the array is transmitted. (This is the ST_CALL_IDX
 * stream). Additionally, the instruction stream is analyzed to identify potential
 * call targets (i.e. start addresses of functions) even before they are first
 * referenced: if a RET or INT3 opcode is found in the instruction stream, the filter
 * assumes that the next instruction is likely to start a new function (MSVC++ uses
 * INT3 opcodes to fill the "no man's land" between functions) and adds its address to
 * the function table automatically. Typical overall hit rates for the function table
 * are between 70 and 80 per cent - so only a quarter of all call target addresses ever
 * needs to be stored explicitly.
 *
 * The most common type of data intermixed with code sections is jump tables and
 * virtual function tables. Generally speaking, any data inside the code section is
 * bad for the filter; its statistics are very different from the binary code being
 * encoded which hurts compression, and it causes the disassembler to lose sync
 * temporarily. To work around this problem, the encoder tries to identify jump
 * tables, using another escape code to identify them in the output stream. The
 * heuristic used here is rather simple, but works very well: When an instruction
 * is expected, the encoder looks at the next 12 bytes. If they evaluate to
 * addresses within the code section when interpreted as 3 dwords, the encoder assumes
 * that it has found a jump table (or vtable). Jump table entries are encoded the
 * same way that call targets are.
 *
 * [1] "PPMexe: Program Compression"
 * M. Drinic, D. Kirovski, and H. Vo, MS Research
 * ACM Transactions on Programming Languages and Systems, Vol.29, (no.1), 2007.
 * http://research.microsoft.com/en-us/um/people/darkok/papers/TOPLAS.pdf
 */
|
|
|
|
// Size constant for a Dispack filter block (32 KiB).
#define DISFILTER_BLOCK (32768)

// NOTE(review): DISFILTERED / ORIGSIZE look like per-block header flag values and
// NORMAL_HDR / EXTENDED_HDR like header sizes in bytes (1 + 2 and 1 + 2 + 2);
// their users are outside this section of the file -- confirm before relying on this.
#define DISFILTERED 1
#define ORIGSIZE 2
#define NORMAL_HDR (1 + 2)
#define EXTENDED_HDR (1 + 2 + 2)

// Dispack min reduction should be 8%, otherwise we abort
// (2622 is 8% of DISFILTER_BLOCK, rounded up).
#define DIS_MIN_REDUCE (2622)

#define MAXINSTR 15 // maximum size of a single instruction in bytes (actually, decodeable ones are shorter)
|
|
|
|
// x86 opcode bytes the filter treats specially, plus the two opcodes that are
// repurposed as escape markers inside the ST_OP stream.
enum Opcodes
{
  // 1-byte opcodes of special interest (for one reason or another)
  OP_2BYTE  = 0x0f,   // start of 2-byte opcode
  OP_OSIZE  = 0x66,   // operand size prefix
  OP_CALLF  = 0x9a,   // far call
  OP_RETNI  = 0xc2,   // ret near+immediate
  OP_RETN   = 0xc3,   // ret near
  OP_ENTER  = 0xc8,
  OP_INT3   = 0xcc,
  OP_INTO   = 0xce,
  OP_CALLN  = 0xe8,   // near call
  OP_JMPF   = 0xea,   // far jump
  OP_ICEBP  = 0xf1,

  // escape codes we use (these need to be 1-byte opcodes without an address or immediate operand!)
  ESCAPE    = OP_ICEBP, // next ST_OP byte is a raw, undecodable byte
  JUMPTAB   = OP_INTO   // a run of jump table entries follows
};
|
|
|
|
// formats
|
|
// Per-opcode format descriptor stored in Table1/Table2/TableX.
// Bits 0-1 (fMODE) select the encoding mode; bits 2-3 (fTYPE) select the
// immediate size (fNM/fMR modes) or the address operand kind (fAM mode).
enum InstructionFormat
{
  // encoding mode
  fNM     = 0x0,      // no ModRM
  fAM     = 0x1,      // no ModRM, "address mode" (jumps or direct addresses)
  fMR     = 0x2,      // ModRM present
  fMEXTRA = 0x3,      // ModRM present, includes extra bits for opcode
  fMODE   = 0x3,      // bitmask for mode

  // no ModRM: size of immediate operand
  fNI     = 0x0,      // no immediate
  fBI     = 0x4,      // byte immediate
  fWI     = 0x8,      // word immediate
  fDI     = 0xc,      // dword immediate
  fTYPE   = 0xc,      // type mask

  // address mode: type of address operand
  fAD     = 0x0,      // absolute address
  fDA     = 0x4,      // dword absolute jump target
  fBR     = 0x8,      // byte relative jump target
  fDR     = 0xc,      // dword relative jump target

  // others
  fERR    = 0xf       // denotes invalid opcodes
};
|
|
|
|
// Indices of the separate output streams the instruction bytes are split into.
// ST_MAX is the number of physical streams; the aliases after it name logical
// components that share a physical stream.
enum Streams
{
  ST_OP,              // prefixes, first byte of opcode
  ST_SIB,             // SIB byte
  ST_CALL_IDX,        // call table index
  ST_DISP8_R0,        // byte displacement on ModRM, reg no. 0 and following
  ST_DISP8_R1, ST_DISP8_R2, ST_DISP8_R3, ST_DISP8_R4, ST_DISP8_R5, ST_DISP8_R6, ST_DISP8_R7,
  ST_JUMP8,           // short jump
  ST_IMM8,            // 8-bit immediate
  ST_IMM16,           // 16-bit immediate
  ST_IMM32,           // 32-bit immediate
  ST_DISP32,          // 32-bit displacement
  ST_ADDR32,          // 32-bit direct address
  ST_CALL32,          // 32-bit call target
  ST_JUMP32,          // 32-bit jump target

  ST_MAX,

  // these components of the instruction stream are also identified
  // separately, but stored together with another stream since there's
  // high correlation between them (or just because one stream provides
  // good context to predict the other)
  ST_MODRM = ST_OP,           // ModRM byte
  ST_OP2 = ST_OP,             // second byte of opcode
  ST_AJUMP32 = ST_JUMP32,     // absolute jump target
  ST_JUMPTBL_COUNT = ST_OP    // entry count byte following a JUMPTAB escape
};
|
|
|
|
/****************************************************************************/
|
|
|
|
// These helper functions assume that this code is being compiled on a
// little-endian platform with no alignment restrictions on data accesses.
// If this isn't a safe assumption, change these functions appropriately.
// All byte order dependent operations end up calling them.
//
// I also use the VC++ _byteswap intrinsics to implement big endian stores;
// if your compiler doesn't have them, it should be trivial to get rid of them.
|
|
|
|
static inline sU8 Load8(const sU8 *s) { return *s; }
|
|
static inline sU16 Load16(const sU8 *s) { return *((const sU16 *) s); }
|
|
static inline sU16 Load16B(const sU8 *s) { return _byteswap_ushort(Load16(s)); }
|
|
static inline sU32 Load32(const sU8 *s) { return *((const sU32 *) s); }
|
|
static inline sU32 Load32B(const sU8 *s) { return _byteswap_ulong(Load32(s)); }
|
|
|
|
static inline void Store8(sU8 *d,sU8 v) { *d = v; }
|
|
static inline void Store16(sU8 *d,sU16 v) { *((sU16 *) d) = v; }
|
|
static inline void Store16B(sU8 *d,sU16 v) { *((sU16 *) d) = _byteswap_ushort(v); }
|
|
static inline void Store32(sU8 *d,sU32 v) { *((sU32 *) d) = v; }
|
|
static inline void Store32B(sU8 *d,sU32 v) { *((sU32 *) d) = _byteswap_ulong(v); }
|
|
|
|
static inline sU8 Fetch8(sU8 *&s) { return *s++; }
|
|
static inline sU16 Fetch16(sU8 *&s) { sU16 v = Load16(s); s += 2; return v; }
|
|
static inline sU16 Fetch16B(sU8 *&s) { sU16 v = Load16B(s); s += 2; return v; }
|
|
static inline sU32 Fetch32(sU8 *&s) { sU32 v = Load32(s); s += 4; return v; }
|
|
static inline sU32 Fetch32B(sU8 *&s) { sU32 v = Load32B(s); s += 4; return v; }
|
|
|
|
static inline sU8 Write8(sU8 *&d,sU8 v) { Store8(d,v); d += 1; return v; }
|
|
static inline sU16 Write16(sU8 *&d,sU16 v) { Store16(d,v); d += 2; return v; }
|
|
static inline sU32 Write32(sU8 *&d,sU32 v) { Store32(d,v); d += 4; return v; }
|
|
|
|
/****************************************************************************/
|
|
|
|
// Shifts table[0..pos-1] up by one slot (overwriting table[pos]) and stores
// val in table[0]. Returns val. The caller must ensure pos is a valid index.
static sU32 MoveToFront(sU32 *table,sInt pos,sU32 val)
{
  for(;pos > 0;pos--)
    table[pos] = table[pos-1];

  table[0] = val;
  return val;
}
|
|
|
|
// Inserts val at the front of a 256-entry move-to-front table; every existing
// entry moves down one slot and the previous content of slot 255 is dropped.
static inline void AddMTF(sU32 *mtf,sU32 val)
{
  MoveToFront(mtf,255,val);
}
|
|
|
|
// Looks val up in the first 255 slots of the move-to-front table.
// Hit:  the entry is moved to the front and its previous index (0..254) is returned.
// Miss: val is inserted at the front via AddMTF and -1 is returned.
// Callers store the result + 1 in a single byte, which is why only 255 slots
// are searched.
static sInt FindMTF(sU32 *mtf,sU32 val)
{
  for(sInt i=0;i<255;i++)
  {
    if(mtf[i] == val)
    {
      MoveToFront(mtf,i,val);
      return i;
    }
  }

  AddMTF(mtf,val);
  return -1;
}
|
|
|
|
/****************************************************************************/
|
|
|
|
struct DataBuffer
|
|
{
|
|
sInt Size,Max;
|
|
sU8 *Data;
|
|
|
|
DataBuffer()
|
|
{
|
|
Max = 256;
|
|
Data = (sU8 *) malloc(Max);
|
|
ResetBuffer();
|
|
}
|
|
|
|
void ResetBuffer()
|
|
{
|
|
Size = 0;
|
|
}
|
|
|
|
~DataBuffer()
|
|
{
|
|
free(Data);
|
|
}
|
|
|
|
sU8 *Add(sInt bytes)
|
|
{
|
|
if(Size+bytes>Max)
|
|
{
|
|
Max = (Max*2 < Size+bytes) ? Size+bytes : Max*2;
|
|
Data = (sU8 *) realloc(Data,Max);
|
|
}
|
|
|
|
sU8 *ret = Data+Size;
|
|
Size += bytes;
|
|
return ret;
|
|
}
|
|
};
|
|
|
|
/****************************************************************************/
|
|
|
|
// 1-byte opcodes
|
|
// Format descriptors (InstructionFormat bits) for 1-byte opcodes, indexed by
// opcode byte. Row = high nibble, column = low nibble.
// 0xce (JUMPTAB) and 0xf1 (ESCAPE) are fERR so they are never emitted as
// ordinary opcodes; 0xf6/0xf7/0xfe/0xff are refined through TableX.
sU8 Table1[256] =
{
  // 0       1       2       3       4       5       6       7       8       9       a       b       c       d       e       f
  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI, // 0
  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI, // 1
  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI, // 2
  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI, // 3

  fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI, // 4
  fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI, // 5
  fNM|fNI,fNM|fNI,fMR|fNI,fMR|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fDI,fMR|fDI,fNM|fBI,fMR|fBI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI, // 6
  fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR, // 7

  fMR|fBI,fMR|fDI,fMR|fBI,fMR|fBI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // 8
  fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fAM|fDA,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI, // 9
  fAM|fAD,fAM|fAD,fAM|fAD,fAM|fAD,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI, // a
  fNM|fBI,fNM|fBI,fNM|fBI,fNM|fBI,fNM|fBI,fNM|fBI,fNM|fBI,fNM|fBI,fNM|fDI,fNM|fDI,fNM|fDI,fNM|fDI,fNM|fDI,fNM|fDI,fNM|fDI,fNM|fDI, // b

  fMR|fBI,fMR|fBI,fNM|fWI,fNM|fNI,fMR|fNI,fMR|fNI,fMR|fBI,fMR|fDI,fNM|fBI,fNM|fNI,fNM|fWI,fNM|fNI,fNM|fNI,fNM|fBI,fERR   ,fNM|fNI, // c
  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fBI,fNM|fNI,fNM|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // d
  fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fNM|fBI,fNM|fBI,fNM|fBI,fNM|fBI,fAM|fDR,fAM|fDR,fAM|fAD,fAM|fBR,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI, // e
  fNM|fNI,fERR   ,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fMEXTRA,fMEXTRA,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fMEXTRA,fMEXTRA, // f
};
|
|
|
|
/****************************************************************************/
|
|
|
|
// 2-byte opcodes
|
|
// Format descriptors (InstructionFormat bits) for 2-byte opcodes (0x0f xx),
// indexed by the second opcode byte. Row = high nibble, column = low nibble.
sU8 Table2[256] =
{
  // 0       1       2       3       4       5       6       7       8       9       a       b       c       d       e       f
  fERR   ,fERR   ,fERR   ,fERR   ,fERR   ,fERR   ,fNM|fNI,fERR   ,fNM|fNI,fNM|fNI,fERR   ,fERR   ,fERR   ,fERR   ,fERR   ,fERR   , // 0
  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fERR   ,fERR   ,fERR   ,fERR   ,fERR   ,fERR   ,fERR   , // 1
  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fERR   ,fERR   ,fERR   ,fERR   ,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // 2
  fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fERR   ,fNM|fNI,fERR   ,fERR   ,fERR   ,fERR   ,fERR   ,fERR   ,fERR   ,fERR   , // 3

  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // 4
  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // 5
  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // 6
  fMR|fBI,fMR|fBI,fMR|fBI,fMR|fBI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fNI,fERR   ,fERR   ,fERR   ,fERR   ,fERR   ,fERR   ,fMR|fNI,fMR|fNI, // 7

  fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR, // 8
  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // 9
  fNM|fNI,fNM|fNI,fNM|fNI,fMR|fNI,fMR|fBI,fMR|fNI,fMR|fNI,fMR|fNI,fERR   ,fERR   ,fERR   ,fMR|fNI,fMR|fBI,fMR|fNI,fERR   ,fMR|fNI, // a
  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fERR   ,fERR   ,fERR   ,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // b

  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI, // c
  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // d
  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // e
  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fERR   , // f
};
|
|
|
|
/****************************************************************************/
|
|
|
|
// escape opcodes using ModRM byte to get more variants
|
|
// Format descriptors for the fMEXTRA opcodes (0xf6, 0xf7, 0xfe, 0xff), whose
// real format depends on the reg field of the ModRM byte.
// Index = ((modrm >> 3) & 7) | ((code & 0x01) << 3) | ((code & 0x08) << 1),
// i.e. one row of 8 entries per opcode, column = ModRM reg field.
sU8 TableX[32] =
{
  // 0       1       2       3       4       5       6       7
  fMR|fBI,fERR   ,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // escapes for 0xf6
  fMR|fDI,fERR   ,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // escapes for 0xf7
  fMR|fNI,fMR|fNI,fERR   ,fERR   ,fERR   ,fERR   ,fERR   ,fERR   , // escapes for 0xfe
  fMR|fNI,fMR|fNI,fMR|fNI,fERR   ,fMR|fNI,fERR   ,fMR|fNI,fERR   , // escapes for 0xff
};
|
|
|
|
/****************************************************************************/
|
|
/****************************************************************************/
|
|
|
|
struct DisFilterCtx
|
|
{
|
|
DataBuffer Buffer[ST_MAX];
|
|
sU32 FuncTable[256];
|
|
sBool NextIsFunc;
|
|
|
|
sU32 CodeStart,CodeEnd;
|
|
|
|
DisFilterCtx(sU32 codeStart,sU32 codeEnd)
|
|
{
|
|
ResetCtx(codeStart, codeEnd);
|
|
}
|
|
|
|
void ResetCtx(sU32 codeStart,sU32 codeEnd)
|
|
{
|
|
NextIsFunc = sTRUE;
|
|
for(sInt i=0;i<256;i++)
|
|
FuncTable[i] = 0;
|
|
|
|
CodeStart = codeStart;
|
|
CodeEnd = codeEnd;
|
|
for (sInt i=0; i<ST_MAX; i++)
|
|
Buffer[i].ResetBuffer();
|
|
}
|
|
|
|
sInt DetectJumpTable(sU8 *instr,sU32 addr)
|
|
{
|
|
assert(addr < CodeEnd);
|
|
sInt nMax = (CodeEnd - addr) / 4;
|
|
sInt count = 0;
|
|
|
|
while(count<nMax)
|
|
{
|
|
sU32 codedAddr = Load32(instr + count*4);
|
|
if(codedAddr >= CodeStart && codedAddr < CodeEnd)
|
|
count++;
|
|
else
|
|
break;
|
|
}
|
|
|
|
if(count < 3) // if it's less than 3 entries, it's probably not a jump table.
|
|
count = 0;
|
|
|
|
return count;
|
|
}
|
|
|
|
sInt ProcessInstr(sU8 *instr,sU32 memory)
|
|
{
|
|
if(sInt nJump = DetectJumpTable(instr,memory))
|
|
{
|
|
// probable jump table with nJump entries
|
|
sInt remaining = nJump;
|
|
|
|
while(remaining)
|
|
{
|
|
sInt count = (remaining < 256) ? remaining : 256;
|
|
Put8(ST_OP,JUMPTAB);
|
|
Put8(ST_JUMPTBL_COUNT,count-1);
|
|
|
|
for(sInt i=0;i<count;i++)
|
|
{
|
|
sU32 target = Fetch32(instr);
|
|
sInt ind = FindMTF(FuncTable,target);
|
|
Put8(ST_CALL_IDX,ind+1);
|
|
if(ind == -1)
|
|
Put32(ST_CALL32,target);
|
|
}
|
|
|
|
remaining -= count;
|
|
}
|
|
|
|
return nJump*4;
|
|
}
|
|
|
|
sU8 *start = instr;
|
|
sInt code = Fetch8(instr);
|
|
sInt code2 = 0;
|
|
sBool o16 = sFALSE;
|
|
sInt flags;
|
|
|
|
if(NextIsFunc && code != 0xcc)
|
|
{
|
|
AddMTF(FuncTable,memory);
|
|
NextIsFunc = sFALSE;
|
|
}
|
|
|
|
if(code == OP_OSIZE)
|
|
{
|
|
o16 = sTRUE;
|
|
code = Fetch8(instr);
|
|
}
|
|
|
|
if(code == OP_2BYTE)
|
|
{
|
|
code2 = Fetch8(instr);
|
|
flags = Table2[code2];
|
|
}
|
|
else
|
|
flags = Table1[code];
|
|
|
|
if(code == OP_RETNI || code == OP_RETN || code == OP_INT3) // return. function is going to start next.
|
|
NextIsFunc = sTRUE;
|
|
|
|
if(flags == fMEXTRA)
|
|
flags = TableX[((*instr >> 3) & 7) | ((code & 0x01) << 3) | ((code & 0x08) << 1)];
|
|
|
|
if(flags != fERR)
|
|
{
|
|
if(o16)
|
|
Put8(ST_OP,OP_OSIZE);
|
|
|
|
Put8(ST_OP,code);
|
|
if(code == OP_2BYTE)
|
|
Put8(ST_OP2,code2);
|
|
|
|
if(code == OP_CALLF || code == OP_JMPF || code == OP_ENTER)
|
|
{
|
|
// far call/jump have a *48-bit* immediate address. we deal with it here by copying the segment index
|
|
// manually and encoding the rest as a normal 32-bit direct address.
|
|
// similarly, enter has a word operand and a byte operand. again, we code the word here, and
|
|
// deal with the byte later during the normal flow.
|
|
Copy16(ST_IMM16,instr);
|
|
}
|
|
|
|
if((flags & fMODE) == fMR)
|
|
{
|
|
sInt modrm = Copy8(ST_MODRM,instr);
|
|
sInt sib = 0;
|
|
|
|
if((modrm & 0x07) == 4 && modrm < 0xc0)
|
|
sib = Copy8(ST_SIB,instr);
|
|
|
|
if((modrm & 0xc0) == 0x40) // register+byte displacement
|
|
Copy8(ST_DISP8_R0 + (modrm & 0x07),instr);
|
|
|
|
if((modrm & 0xc0) == 0x80 || (modrm & 0xc7) == 0x05 || (modrm < 0x40 && (sib & 0x07) == 5))
|
|
{
|
|
// register+dword displacement
|
|
Copy32((modrm & 0xc7) == 0x05 ? ST_ADDR32 : ST_DISP32,instr);
|
|
}
|
|
}
|
|
|
|
if((flags & fMODE) == fAM)
|
|
{
|
|
switch(flags & fTYPE)
|
|
{
|
|
case fAD: Copy32(ST_ADDR32,instr); break;
|
|
case fDA: Copy32(ST_AJUMP32,instr); break;
|
|
case fBR: Copy8(ST_JUMP8,instr); break;
|
|
|
|
case fDR:
|
|
{
|
|
sU32 target = Fetch32(instr);
|
|
target += (instr - start) + memory;
|
|
if(code != OP_CALLN) // not a near call
|
|
Put32(ST_JUMP32,target);
|
|
else
|
|
{
|
|
sInt ind = FindMTF(FuncTable,target);
|
|
Put8(ST_CALL_IDX,ind+1);
|
|
if(ind == -1)
|
|
Put32(ST_CALL32,target);
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
switch(flags & fTYPE)
|
|
{
|
|
case fBI: Copy8(ST_IMM8,instr); break;
|
|
case fWI: Copy16(ST_IMM16,instr); break;
|
|
|
|
case fDI:
|
|
if(!o16)
|
|
Copy32(ST_IMM32,instr);
|
|
else
|
|
Copy16(ST_IMM16,instr);
|
|
break;
|
|
}
|
|
}
|
|
|
|
return instr - start;
|
|
}
|
|
else // couldn't decode instruction
|
|
{
|
|
Put8(ST_OP,ESCAPE); // escape code
|
|
Put8(ST_OP,*start); // the unrecognized opcode
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
sU8 *Flush(sU8 *out, sU32 &sz)
|
|
{
|
|
sU32 size = 0;
|
|
|
|
if (sz < ST_MAX * 16)
|
|
return (NULL);
|
|
size = ST_MAX * 4; // 4 bytes per stream to encode the size
|
|
for(sInt i=0;i<ST_MAX;i++) {
|
|
size += Buffer[i].Size;
|
|
if (size >= sz) return (NULL); // Check for output overflow
|
|
}
|
|
|
|
// Output ptr is supplied by caller
|
|
sU8 *outPtr = out;
|
|
|
|
for(sInt i=0;i<ST_MAX;i++)
|
|
Write32(outPtr,Buffer[i].Size);
|
|
|
|
for(sInt i=0;i<ST_MAX;i++)
|
|
{
|
|
memcpy(outPtr,Buffer[i].Data,Buffer[i].Size);
|
|
outPtr += Buffer[i].Size;
|
|
}
|
|
|
|
assert(outPtr == out + size);
|
|
sz = size;
|
|
return out;
|
|
}
|
|
|
|
sU8 Put8(sInt stream,sU8 v) { Store8 (Buffer[stream].Add(1),v); return v; }
|
|
sU16 Put16(sInt stream,sU16 v) { Store16B(Buffer[stream].Add(2),v); return v; }
|
|
sU32 Put32(sInt stream,sU32 v) { Store32B(Buffer[stream].Add(4),v); return v; }
|
|
|
|
sU8 Copy8(sInt stream,sU8 *&s) { return Put8 (stream,Fetch8(s)); }
|
|
sU16 Copy16(sInt stream,sU8 *&s) { return Put16(stream,Fetch16(s)); }
|
|
sU32 Copy32(sInt stream,sU8 *&s) { return Put32(stream,Fetch32(s)); }
|
|
};
|
|
|
|
/****************************************************************************/
|
|
|
|
// Encodes `size` bytes of x86 code at `src` (located at address `origin`)
// through ctx and flushes the resulting streams into `dst`.
// outputSize: in = capacity of dst, out = bytes written (see DisFilterCtx::Flush).
// Returns dst on success or NULL if Flush rejects the output.
// ctx is expected to have been reset by the caller for this input.
static sU8 *
DisFilter(DisFilterCtx &ctx, sU8 *src, sU32 size, sU32 origin, sU8 *dst, sU32 &outputSize)
{
  // main loop: handle everything but the last few bytes.
  // size is unsigned, so "size - MAXINSTR" must only be formed when it cannot
  // wrap around; inputs shorter than MAXINSTR go straight to the careful tail.
  sU32 pos = 0;
  if(size > MAXINSTR)
  {
    const sU32 fastEnd = size - MAXINSTR;
    while(pos < fastEnd)
    {
      sInt bytes = ctx.ProcessInstr(src + pos,origin + pos);
      pos += bytes;
    }
  }

  // for the last few bytes, be very careful not to read past the end
  // of the input instruction stream. create a check point on every
  // instruction; if ProcessInstr would've read past the end of the input
  // stream, we undo the last step.
  while(pos < size)
  {
    // copy remaining instr bytes into a zero-padded buffer
    // (at most MAXINSTR bytes remain at this point)
    sU8 instrBuf[MAXINSTR] = { 0 };
    memcpy(instrBuf,src + pos,size - pos);

    // save current output size for all streams
    sInt checkpt[ST_MAX];
    for(sInt i=0;i<ST_MAX;i++)
      checkpt[i] = ctx.Buffer[i].Size;

    // process the instruction
    sInt bytes = ctx.ProcessInstr(instrBuf,origin + pos);

    if(pos + bytes <= size) // valid instruction
      pos += bytes;
    else
    {
      // we read past the end. restore to checkpoint!
      // (only the stream sizes are rolled back, not the function table)
      for(sInt i=0;i<ST_MAX;i++)
        ctx.Buffer[i].Size = checkpt[i];

      break;
    }
  }

  // if there's still bytes left, encode them as escapes.
  while(pos < size)
  {
    ctx.Put8(ST_OP,ESCAPE);
    ctx.Put8(ST_OP,src[pos]);
    pos++;
  }

  return ctx.Flush(dst, outputSize);
}
|
|
|
|
/****************************************************************************/
|
|
|
|
static inline sU8 Copy8(sU8 *&d,sU8 *&s) { sU8 v = Fetch8(s); Write8(d,v); return v; }
|
|
static inline sU16 Copy16(sU8 *&d,sU8 *&s) { sU16 v = Fetch16B(s); Write16(d,v); return v; }
|
|
static inline sU32 Copy32(sU8 *&d,sU8 *&s) { sU32 v = Fetch32B(s); Write32(d,v); return v; }
|
|
|
|
// some helpers for bounds checking. this really sucks, but I didn't see any
// better way to make this safe...
//
// All of these expect `stream`, `streamEnd`, `dest` and `destEnd` to be in
// scope and make the enclosing function return sFALSE when fewer than `size`
// bytes remain. They compare against the remaining distance (instead of
// advancing a pointer past the end first) and are wrapped in do/while(0) so
// each behaves as a single statement, including inside unbraced if/else.
#define CheckSrc(strm,size)     do { if((size) > streamEnd[strm] - stream[strm]) return sFALSE; } while(0)
#define CheckDst(size)          do { if((size) > destEnd - dest) return sFALSE; } while(0)
#define CheckSrcDst(strm,size)  do { if((size) > streamEnd[strm] - stream[strm] || (size) > destEnd - dest) return sFALSE; } while(0)

// Bounds-checked copies of one value from stream `strm` to the output.
#define Copy8Chk(strm)   do { CheckSrcDst(strm,1); Copy8 (dest,stream[strm]); } while(0)
#define Copy16Chk(strm)  do { CheckSrcDst(strm,2); Copy16(dest,stream[strm]); } while(0)
#define Copy32Chk(strm)  do { CheckSrcDst(strm,4); Copy32(dest,stream[strm]); } while(0)
|
|
|
|
// Reverses DisFilter: rebuilds the original code bytes from the split streams.
//   source/sourceSize: filtered data (ST_MAX 32-bit stream sizes + stream bytes)
//   dest/destSize:     output buffer and its capacity
//   memStart:          address of the first output byte (the encoder's origin)
// Returns sTRUE once the opcode stream is exhausted, sFALSE if the input is
// malformed or the output buffer is too small. The input is treated as
// untrusted: every read and write is bounds-checked.
static sBool
DisUnFilter(sU8 *source,sU32 sourceSize,sU8 *dest,sU32 destSize,sU32 memStart)
{
  sU8 *stream[ST_MAX];
  sU8 *streamEnd[ST_MAX];
  sU32 funcTable[256];

  // read header (list of stream sizes)
  if(sourceSize < ST_MAX*4)
    return sFALSE;

  sU8 *hdr = source;
  sU8 *cur = source + ST_MAX*4;
  sU32 remaining = sourceSize - ST_MAX*4;
  for(sInt i=0;i<ST_MAX;i++)
  {
    // validate each size before advancing so no pointer leaves the buffer
    sU32 len = Fetch32(hdr);
    if(len > remaining)
      return sFALSE; // size doesn't make sense

    stream[i] = cur;
    cur += len;
    remaining -= len;
    streamEnd[i] = cur;
  }

  if(remaining != 0)
    return sFALSE; // size doesn't make sense

  // start decoding
  for(sInt i=0;i<256;i++)
    funcTable[i] = 0;

  sBool nextIsFunc = sTRUE;

  sU8 *destStart = dest;
  sU8 *destEnd = destStart + destSize;

  while(stream[ST_OP]<streamEnd[ST_OP])
  {
    sU8 *start = dest;
    sU32 memory = memStart + (dest - destStart);

    sInt code = Fetch8(stream[ST_OP]);
    if(code == JUMPTAB) // jump table escape
    {
      CheckSrc(ST_JUMPTBL_COUNT,1);
      sInt count = Fetch8(stream[ST_JUMPTBL_COUNT]) + 1; // stored minus one

      for(sInt i=0;i<count;i++)
      {
        sU32 target;

        CheckSrc(ST_CALL_IDX,1);
        sInt ind = Fetch8(stream[ST_CALL_IDX]);
        if(ind) // 1..255: position in the function table, plus one
          target = MoveToFront(funcTable,ind-1,funcTable[ind-1]);
        else    // 0: explicit target follows
        {
          CheckSrc(ST_CALL32,4);
          target = Fetch32B(stream[ST_CALL32]);
          AddMTF(funcTable,target);
        }

        CheckDst(4);
        Write32(dest,target);
      }

      continue;
    }

    if(nextIsFunc && code != OP_INT3)
    {
      AddMTF(funcTable,memory);
      nextIsFunc = sFALSE;
    }

    if(code == ESCAPE) // escape
      Copy8Chk(ST_OP);
    else
    {
      CheckDst(1);
      Write8(dest,code);

      sInt flags = 0;
      sBool o16 = sFALSE;
      if(code == OP_OSIZE) // operand size prefix
      {
        o16 = sTRUE;
        CheckSrcDst(ST_OP,1);
        code = Copy8(dest,stream[ST_OP]);
      }

      if(code == OP_RETNI || code == OP_RETN || code == OP_INT3) // return/padding
        nextIsFunc = sTRUE; // next opcode is likely to be first of a new function

      if(code == OP_2BYTE) // two-byte opcode, additional opcode byte follows
      {
        CheckSrcDst(ST_OP2,1);
        flags = Table2[Copy8(dest,stream[ST_OP2])];
      }
      else
        flags = Table1[code];

      // the encoder escapes every opcode it cannot decode, so an invalid
      // opcode here means corrupt input; reject it in release builds too.
      if(flags == fERR)
        return sFALSE;

      if(code == OP_CALLF || code == OP_JMPF || code == OP_ENTER)
      {
        // far call/jump have a *48-bit* immediate address. we deal with it here by copying the segment
        // index manually and encoding the rest as a normal 32-bit direct address.
        // similarly, enter has a word operand and a byte operand. again, we code the word here, and
        // deal with the byte later during the normal flow.
        Copy16Chk(ST_IMM16);
      }

      if(flags & fMR) // true for both fMR and fMEXTRA
      {
        CheckSrcDst(ST_MODRM,1);
        sInt modrm = Copy8(dest,stream[ST_MODRM]);
        sInt sib = 0;

        if(flags == fMEXTRA)
        {
          flags = TableX[((modrm >> 3) & 7) | ((code & 0x01) << 3) | ((code & 0x08) << 1)];
          if(flags == fERR) // never produced by the encoder
            return sFALSE;
        }

        if((modrm & 0x07) == 4 && modrm < 0xc0)
        {
          CheckSrcDst(ST_SIB,1);
          sib = Copy8(dest,stream[ST_SIB]);
        }

        if((modrm & 0xc0) == 0x40) // register+byte displacement
        {
          sInt st = (modrm & 0x07) + ST_DISP8_R0;
          Copy8Chk(st);
        }

        if((modrm & 0xc0) == 0x80 || (modrm & 0xc7) == 0x05 || (modrm < 0x40 && (sib & 0x07) == 0x05))
        {
          // register+dword displacement
          sInt st = (modrm & 0xc7) == 5 ? ST_ADDR32 : ST_DISP32;
          Copy32Chk(st);
        }
      }

      if((flags & fMODE) == fAM)
      {
        switch(flags & fTYPE)
        {
        case fAD: Copy32Chk(ST_ADDR32); break;
        case fDA: Copy32Chk(ST_AJUMP32); break;
        case fBR: Copy8Chk(ST_JUMP8); break;

        case fDR:
          {
            sU32 target;
            if(code == OP_CALLN)
            {
              CheckSrc(ST_CALL_IDX,1);
              sInt ind = Fetch8(stream[ST_CALL_IDX]);
              if(ind)
                target = MoveToFront(funcTable,ind-1,funcTable[ind-1]);
              else
              {
                CheckSrc(ST_CALL32,4);
                target = Fetch32B(stream[ST_CALL32]);
                AddMTF(funcTable,target);
              }
            }
            else
            {
              CheckSrc(ST_JUMP32,4);
              target = Fetch32B(stream[ST_JUMP32]);
            }

            // convert the absolute target back to an offset relative to the
            // end of this instruction (the 4 offset bytes are not written yet)
            target -= (dest - start) + 4 + memory;
            CheckDst(4);
            Write32(dest,target);
          }
          break;
        }
      }
      else
      {
        switch(flags & fTYPE)
        {
        case fBI: Copy8Chk(ST_IMM8); break;
        case fWI: Copy16Chk(ST_IMM16); break;

        case fDI:
          if(!o16)
            Copy32Chk(ST_IMM32);
          else
            Copy16Chk(ST_IMM16);
          break;
        }
      }
    }
  }

  return sTRUE;
}
|
|
|
|
/*
|
|
* Try to estimate if the given data block contains 32-bit x86 instructions
|
|
* especially of the call and jmp variety.
|
|
* Estimator is adapted from CSC 3.2 Analyzer (Fu Siyuan).
|
|
*/
|
|
static int
|
|
is_x86_code(uchar_t *buf, int len)
|
|
{
|
|
uint32_t avgFreq, freq[256] = {0};
|
|
uint32_t freq0x80[2] = {0};
|
|
uint32_t ln = len;
|
|
int i;
|
|
|
|
for (i = 0; i < len; i++) {
|
|
freq[buf[i]]++;
|
|
}
|
|
|
|
for (i = 0; i< 256; i++) {
|
|
freq0x80[i>>7] += freq[i];
|
|
}
|
|
|
|
avgFreq = ln>>8;
|
|
return (freq[0x8b] > avgFreq && freq[0x00] > avgFreq * 2 && freq[0xE8] > 6);
|
|
}
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
/*
 * E8 E9 Call/Jmp transform routines. Convert relative Call and Jmp addresses
 * to absolute values to improve compression. A couple of tricks are employed:
 * 1) Avoid transforming zero addresses or where adding the current offset
 *    to the presumed address results in a zero result. This avoids a bunch of
 *    false positives.
 * 2) Store transformed values in big-endian format. This improves compression.
 */
|
|
/*
 * Forward E8/E9 transform, performed in place.
 *
 * src - buffer to transform
 * sz  - number of bytes in src
 *
 * Every position i whose byte is 0xE8 or 0xE9 and whose byte at i+4 is 0x00
 * or 0xFF is treated as a candidate. The three bytes at i+1..i+3 are read as
 * a little-endian 24-bit value; if it is non-zero, i is added (modulo 2^24)
 * and, if the sum is also non-zero, the result is written back big-endian.
 *
 * Returns 0 on success (including buffers too short to hold a candidate) and
 * -1 if sz does not fit in 32 bits.
 *
 * NOTE(review): when the sum wraps to zero the operand is left untouched in
 * little-endian form, whereas Inverse_E89 reads every candidate operand as
 * big-endian. Confirm that this case round-trips.
 */
int
Forward_E89(uint8_t *src, uint64_t sz)
{
	uint32_t i;
	uint32_t size;

	if (sz > UINT32_MAX) {
		return (-1);
	}

	/*
	 * A candidate needs an opcode byte plus four following bytes. Shorter
	 * buffers have nothing to transform, and bailing out here keeps the
	 * unsigned "size - 4" loop bound below from wrapping around.
	 */
	if (sz < 5) {
		return (0);
	}

	size = (uint32_t)sz;
	for (i = 0; i < size - 4; i++) {
		uint32_t off;

		if ((src[i] & 0xfe) != 0xe8)
			continue;
		if (src[i+4] != 0 && src[i+4] != 0xff)
			continue;

		off = (src[i+1] | (src[i+2] << 8) | (src[i+3] << 16));
		if (off == 0)
			continue;

		/* Relative -> absolute, kept within 24 bits. */
		off += i;
		off &= 0xffffff;
		if (off > 0) {
			/* Stored big-endian. */
			src[i+1] = (uint8_t)(off >> 16);
			src[i+2] = (uint8_t)(off >> 8);
			src[i+3] = (uint8_t)off;
		}
	}
	return (0);
}
|
|
|
|
/*
 * Inverse E8/E9 transform, performed in place. Undoes Forward_E89.
 *
 * src - buffer to transform
 * sz  - number of bytes in src
 *
 * Candidates are detected exactly as in the forward pass (byte 0xE8/0xE9 at i
 * and 0x00/0xFF at i+4) but the buffer is walked from the highest candidate
 * offset down to offset 0, so that overlapping candidates are undone in the
 * reverse of the order in which they were applied. For each candidate the
 * bytes at i+1..i+3 are read as a big-endian 24-bit value; if it is non-zero,
 * i is subtracted (modulo 2^24) and, if the difference is also non-zero, the
 * result is written back little-endian.
 *
 * Returns 0 on success (including buffers too short to hold a candidate) and
 * -1 if sz does not fit in 32 bits.
 */
int
Inverse_E89(uint8_t *src, uint64_t sz)
{
	uint32_t i;
	uint32_t size;

	if (sz > UINT32_MAX) {
		return (-1);
	}

	/*
	 * Fewer than 5 bytes cannot hold a candidate. Returning early also
	 * prevents the unsigned start offset below from wrapping around.
	 */
	if (sz < 5) {
		return (0);
	}

	size = (uint32_t)sz;

	/*
	 * Visit offsets size-5 down to 0 inclusive. Offset 0 must be included
	 * because the forward pass transforms a candidate located there.
	 */
	i = size - 4;
	while (i > 0) {
		uint32_t val;

		i--;
		if ((src[i] & 0xfe) != 0xe8)
			continue;
		if (src[i+4] != 0 && src[i+4] != 0xff)
			continue;

		val = (src[i+3] | (src[i+2] << 8) | (src[i+1] << 16));
		if (val == 0)
			continue;

		/* Absolute -> relative, kept within 24 bits. */
		val -= i;
		val &= 0xffffff;
		if (val > 0) {
			/* Restored little-endian. */
			src[i+1] = (uint8_t)val;
			src[i+2] = (uint8_t)(val >> 8);
			src[i+3] = (uint8_t)(val >> 16);
		}
	}
	return (0);
}
|
|
|
|
/*
|
|
* 32-bit x86 executable packer top-level routines. Detected x86 executable data
|
|
* are passed through these encoding routines. The data chunk is split into 32KB
|
|
* blocks and each block is separately Dispack-ed. The code tries to detect if
|
|
* a block contains valid x86 code by trying to estimate some instruction metrics.
|
|
*/
|
|
int
|
|
dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen, int stype)
|
|
{
|
|
uchar_t *pos, *hdr, type, *pos_to, *to_last;
|
|
sU32 len;
|
|
#ifdef DEBUG_STATS
|
|
double strt, en;
|
|
#endif
|
|
|
|
if (fromlen > UINT32_MAX)
|
|
return (-1);
|
|
|
|
if (fromlen < DISFILTER_BLOCK)
|
|
return (-1);
|
|
|
|
#ifdef DEBUG_STATS
|
|
strt = get_wtime_millis();
|
|
#endif
|
|
pos = from;
|
|
len = (sU32)fromlen;
|
|
pos_to = to;
|
|
to_last = to + *dstlen;
|
|
while (len > 0) {
|
|
DisFilterCtx ctx(0, DISFILTER_BLOCK);
|
|
sU32 sz;
|
|
sU16 origsize;
|
|
sU32 out;
|
|
sU8 *rv;
|
|
|
|
if (len > DISFILTER_BLOCK)
|
|
sz = DISFILTER_BLOCK;
|
|
else
|
|
sz = len;
|
|
|
|
hdr = pos_to;
|
|
type = 0;
|
|
origsize = sz;
|
|
if (sz < DISFILTER_BLOCK) {
|
|
type |= ORIGSIZE;
|
|
pos_to += EXTENDED_HDR;
|
|
U16_P(hdr + NORMAL_HDR) = LE16(origsize);
|
|
} else {
|
|
pos_to += NORMAL_HDR;
|
|
}
|
|
|
|
out = sz;
|
|
if (is_x86_code(pos, sz)) {
|
|
ctx.ResetCtx(0, sz);
|
|
rv = DisFilter(ctx, pos, sz, 0, pos_to, out);
|
|
} else {
|
|
rv = NULL;
|
|
}
|
|
if (rv != pos_to || sz == out) {
|
|
if (pos_to + origsize >= to_last) {
|
|
return (-1);
|
|
}
|
|
memcpy(pos_to, pos, origsize);
|
|
*hdr = type;
|
|
hdr++;
|
|
U16_P(hdr) = LE16(origsize);
|
|
pos_to += origsize;
|
|
} else {
|
|
sU16 csize;
|
|
|
|
if (pos_to + out >= to_last) {
|
|
return (-1);
|
|
}
|
|
type |= DISFILTERED;
|
|
*hdr = type;
|
|
hdr++;
|
|
csize = out;
|
|
U16_P(hdr) = LE16(csize);
|
|
pos_to += csize;
|
|
}
|
|
pos += sz;
|
|
len -= sz;
|
|
}
|
|
*dstlen = pos_to - to;
|
|
#ifdef DEBUG_STATS
|
|
en = get_wtime_millis();
|
|
cerr << "Dispack: Processed at " << get_mb_s(fromlen, strt, en) << " MB/s" << endl;
|
|
#endif
|
|
if ((fromlen - *dstlen) < DIS_MIN_REDUCE) {
|
|
#ifdef DEBUG_STATS
|
|
cerr << "Dispack: Failed, reduction too less" << endl;
|
|
#endif
|
|
return (-1);
|
|
}
|
|
#ifdef DEBUG_STATS
|
|
cerr << "Dispack: srclen: " << fromlen << ", dstlen: " << *dstlen << endl;
|
|
#endif
|
|
return (0);
|
|
}
|
|
|
|
int
|
|
dispack_decode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
|
|
{
|
|
uchar_t *pos, type, *pos_to, *to_last;
|
|
uint64_t len;
|
|
|
|
pos = from;
|
|
len = fromlen;
|
|
pos_to = to;
|
|
to_last = to + *dstlen;
|
|
while (len > 0) {
|
|
sU32 sz, cmpsz;
|
|
|
|
type = *pos++;
|
|
len--;
|
|
sz = DISFILTER_BLOCK;
|
|
cmpsz = LE16(U16_P(pos));
|
|
pos += 2;
|
|
len -= 2;
|
|
if (type & ORIGSIZE) {
|
|
sz = LE16(U16_P(pos));
|
|
pos += 2;
|
|
len -= 2;
|
|
}
|
|
|
|
if (type & DISFILTERED) {
|
|
if (pos_to + sz > to_last) {
|
|
return (-1);
|
|
}
|
|
if (DisUnFilter(pos, cmpsz, pos_to, sz, 0) != sTRUE) {
|
|
return (-1);
|
|
}
|
|
pos += cmpsz;
|
|
pos_to += sz;
|
|
len -= cmpsz;
|
|
} else {
|
|
if (pos_to + cmpsz > to_last) {
|
|
return (-1);
|
|
}
|
|
memcpy(pos_to, pos, cmpsz);
|
|
|
|
/*
|
|
* If E8E9 was applied on this block, apply the inverse transform.
|
|
* This only happens if this block was detected as x86 instruction
|
|
* stream and Dispack was tried but it failed.
|
|
*/
|
|
pos += cmpsz;
|
|
pos_to += cmpsz;
|
|
len -= cmpsz;
|
|
}
|
|
}
|
|
*dstlen = pos_to - to;
|
|
return (0);
|
|
}
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|