/* * This file is a part of Pcompress, a chunked parallel multi- * algorithm lossless compression and decompression program. * * Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved. * Use is subject to license terms. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 3 of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this program. * If not, see . * * moinakg@belenix.org, http://moinakg.wordpress.com/ */ #include "types.hpp" #include "dis.hpp" #include #include #include #include #ifndef __APPLE__ #include #endif #include #include using namespace std; /* Version history: * * 1.00 (Nov 2009) Initial release * 1.01 (Jan 2011) Don't assert on bytes > MAXINSTR when dealing with jump tables * 1.02 (Nov 2013) (Moinak Ghosh) Changes to integrate with Pcompress. * Adapted and modified from: * http://www.farbrausch.de/~fg/code/disfilter/ */ /****************************************************************************/ /* This is a filter for x86 binary code, intended to improve its compressibility * by standard algorithms. The basic ideas are quite old; for example, the LZX * algorithm used in Microsoft .CAB files uses a special preprocessor that * converts the target address in CALL opcodes from a relative offset to an * absolute address. This simple transforms greatly helps both LZ-based and * statistical coders: the same function being called repeatedly now results * in the same byte sequence for the call being repeated, instead of having * a different encoding every time. 
The preprocessor doesn't really understand * the instruction stream; it just looks for a 0xE8 byte (the opcode for near * call) and adds the current position to the 4 bytes that follow it. * * Most modern compressors include this filter or variations, to be used on .EXE * files; newer variants usually try to detect whether the target offset would be * within the executable image to reduce the number of false positives. Another * common modification stores the transformed offsets in big endian byte order: * this clusters the high bits (which are likely to be similar along a stretch of * code) together with the opcode, again yielding somewhat better compression. * * However, all this is based on a very limited understanding of x86 binary code. * It is possible to do significantly better with a more thorough understanding of the * bytestream and its underlying structure. This algorithm borrows heavily from the * Split-Stream^2 method described in [1] (or, more precisely, an earlier variant * published sometime in 2004; I don't remember the details anymore). It also introduces * some (to my knowledge) novel ideas, though. * * The basic idea behind Split-Stream is to disassemble the target program, * splitting it into several distinct streams that can be coded separately. Examples * of such streams are the opcodes themselves, 8 bit immediates, 32 bit immediates, * jump and call target addresses, and so on - the idea being that the individual * fields are highly correlated amongst themselves, but largely independent of each * other. Splitting the streams reduces the context dilution (the inclusion of * irrelevant values in the context used for prediction) that otherwise harms compression * in compiled code. 
Since the actual compressor in kkrunchy is a LZ-based dictionary * coder and not a context coder, there's no easy way to mix multiple models or use * alphabets with more than 256 symbols; hence the streams are simply stored sequentially, * with a small header denoting the size of each. This interface sacrifices some * compression potential, but has the advantage that the filter inputs and outputs * simple bytestreams; kkrunchy actually compresses the (several hundred bytes long) * unfiltering code along with the transformed code, so part of the decompressor is * stored in compressed form. This results in a somewhat peculiar "bootstrapping" * decompression process but saved roughly 200 bytes when it was originally written; * a big enough gain to be worth it when targeting 64k executables. * * The actual list of streams that are identified can be found below (the "Streams" enum). * To categorize which byte belongs where, the code needs to be disassembled. This * is simpler than it sounds, given the complexity of x86 instruction encoding; * luckily, there's no need to fully "understand" each instruction. We mainly need to * be able to identify the opcode, the addressing mode used, and the presence of * immediate data fields. This is implemented using a mostly table-driven disassembler. * Since the original decoder was heavily optimized for size and the tables need to be * included with the decoder, the encoding is very compact: It mainly consists of two * tables of 256 entries each with 4 bits per entry used - the first table describing * one-byte opcodes, the second for two-byte opcodes (when this code was written, there * were no three-byte opcodes yet). There are some simplifications present in the tables * and the disassembler, where doing so poses no problems. For example, all prefixes * are treated as one-byte opcodes with no operands; this is incorrect, but as long as * the encoder and decoder agree on it, there's no problem. 
There's also no need to * distinguish between different instructions when they all have the same addressing modes * and combination of immediate operands. All this gets rid of a lot of special cases. * There is one significant deviation from the PPMexe paper [1], though: the code * is very careful never to assume that its parsing of the instruction stream is correct, * and absolutely no irreversible transforms take place (such as the instruction * rescheduling in [1]). Unrecognizable and invalid opcodes are preserved. This is done * by using a very uncommon opcode as escape code, encapsulating otherwise invalid * sequences within the bytestream. This property is critical in practice: code sections * often contain jump tables and other data that isn't decodable as an x86 instruction * stream. Corrupting such data during the compression process is unacceptable. * * The target addresses of near jumps and calls of course still get converted from * relative to absolute; additionally, all values larger than 8 bit are stored in big * endian byte order. Both transforms are trivial to undo on the decoder side and yield * notable improvements in compression ratio. Additionally, the last 255 call targets * are kept in an array that's updated using the "move to front" heuristic. If a target * occurs repeatedly (as is common in practice), the offset doesn't need to be coded at * all; instead the position in the array is transmitted. (This is the ST_CALL_IDX * stream). Additionally, the instruction stream is analyzed to identify potential * call targets (i.e. start addresses of functions) even before they are first * referenced: if a RET or INT3 opcode is found in the instruction stream, the filter * assumes that the next instruction is likely to start a new function (MSVC++ uses * INT3 opcodes to fill the "no man's land" between functions) and adds its address to * the function table automatically. 
Typical overall hit rates for the function table * are between 70 and 80 per cent - so only a quarter of all call target addresses ever * needs to be stored explicitly. * * The most common type of data intermixed with code sections is jump tables and * virtual function tables. Generally speaking, any data inside the code section is * bad for the filter; its statistics are very different from the binary code being * encoded which hurts compression, and it causes the disassembler to lose sync * temporarily. To work around this problem, the encoder tries to identify jump * tables, using another escape code to identify them in the output stream. The * heuristic used here is rather simple, but works very well: When an instruction * is expected, the encoder looks at the next 12 bytes. If they evaluate to * addresses within the code section when interpreted as 3 dwords, the encoder assumes * that it has found a jump table (or vtable). Jump table entries are encoded the * same way that call targets are. * * [1] "PPMexe: Program Compression" * M. Drinic, D. Kirovski, and H. Vo, MS Research * ACM Transactions on Programming Languages and Systems, Vol.29, (no.1), 2007. 
* http://research.microsoft.com/en-us/um/people/darkok/papers/TOPLAS.pdf */ /* Block framing used by dispack_encode and dispack_decode: DISFILTER_BLOCK is the block size in bytes, DISFILTERED and ORIGSIZE are flag bits in the per-block type byte, and NORMAL_HDR / EXTENDED_HDR are the header lengths without and with the extra 16-bit size field. */ #define DISFILTER_BLOCK (32768) #define DISFILTERED 1 #define ORIGSIZE 2 #define NORMAL_HDR (1 + 2) #define EXTENDED_HDR (1 + 2 + 2) // Dispack min reduction should be 8%, otherwise we abort #define DIS_MIN_REDUCE (2622) #define MAXINSTR 15 // maximum size of a single instruction in bytes (actually, decodeable ones are shorter) enum Opcodes { // 1-byte opcodes of special interest (for one reason or another) OP_2BYTE = 0x0f, // start of 2-byte opcode OP_OSIZE = 0x66, // operand size prefix OP_CALLF = 0x9a, OP_RETNI = 0xc2, // ret near+immediate OP_RETN = 0xc3, OP_ENTER = 0xc8, OP_INT3 = 0xcc, OP_INTO = 0xce, OP_CALLN = 0xe8, OP_JMPF = 0xea, OP_ICEBP = 0xf1, // escape codes we use (these need to be 1-byte opcodes without an address or immediate operand!) ESCAPE = OP_ICEBP, JUMPTAB = OP_INTO }; // formats enum InstructionFormat { // encoding mode fNM = 0x0, // no ModRM fAM = 0x1, // no ModRM, "address mode" (jumps or direct addresses) fMR = 0x2, // ModRM present fMEXTRA = 0x3, // ModRM present, includes extra bits for opcode fMODE = 0x3, // bitmask for mode // no ModRM: size of immediate operand fNI = 0x0, // no immediate fBI = 0x4, // byte immediate fWI = 0x8, // word immediate fDI = 0xc, // dword immediate fTYPE = 0xc, // type mask // address mode: type of address operand fAD = 0x0, // absolute address fDA = 0x4, // dword absolute jump target fBR = 0x8, // byte relative jump target fDR = 0xc, // dword relative jump target // others fERR = 0xf // denotes invalid opcodes }; enum Streams { ST_OP, // prefixes, first byte of opcode ST_SIB, // SIB byte ST_CALL_IDX, // call table index ST_DISP8_R0, // byte displacement on ModRM, reg no. 
0 and following ST_DISP8_R1, ST_DISP8_R2, ST_DISP8_R3, ST_DISP8_R4, ST_DISP8_R5, ST_DISP8_R6, ST_DISP8_R7, ST_JUMP8, // short jump ST_IMM8, // 8-bit immediate ST_IMM16, // 16-bit immediate ST_IMM32, // 32-bit immediate ST_DISP32, // 32-bit displacement ST_ADDR32, // 32-bit direct address ST_CALL32, // 32-bit call target ST_JUMP32, // 32-bit jump target ST_MAX, // these components of the instruction stream are also identified // separately, but stored together with another stream since there's // high correlation between them (or just because one stream provides // good context to predict the other) ST_MODRM = ST_OP, // ModRM byte ST_OP2 = ST_OP, // second byte of opcode ST_AJUMP32 = ST_JUMP32, // absolute jump target ST_JUMPTBL_COUNT = ST_OP }; /****************************************************************************/ // These helper functions assume that this code is being compiled on a // little-endian platform with no alignment restrictions on data accesses. // If this isn't a safe assumption, change these functions appropriately. // All byte order dependent operations end up calling them. // // I also use the VC++ _byteswap intrinsics to implement big endian stores; // if your compiler doesn't have them, it should be trivial to get rid of them. 
/* LoadN and StoreN read or write an N-bit value through a cast of the byte pointer (host byte order); the B-suffixed variants additionally byte-swap with _byteswap_ushort or _byteswap_ulong. */ static inline sU8 Load8(const sU8 *s) { return *s; } static inline sU16 Load16(const sU8 *s) { return *((const sU16 *) s); } static inline sU16 Load16B(const sU8 *s) { return _byteswap_ushort(Load16(s)); } static inline sU32 Load32(const sU8 *s) { return *((const sU32 *) s); } static inline sU32 Load32B(const sU8 *s) { return _byteswap_ulong(Load32(s)); } static inline void Store8(sU8 *d,sU8 v) { *d = v; } static inline void Store16(sU8 *d,sU16 v) { *((sU16 *) d) = v; } static inline void Store16B(sU8 *d,sU16 v) { *((sU16 *) d) = _byteswap_ushort(v); } static inline void Store32(sU8 *d,sU32 v) { *((sU32 *) d) = v; } static inline void Store32B(sU8 *d,sU32 v) { *((sU32 *) d) = _byteswap_ulong(v); } /* FetchN: read a value at s, then advance s past it. */ static inline sU8 Fetch8(sU8 *&s) { return *s++; } static inline sU16 Fetch16(sU8 *&s) { sU16 v = Load16(s); s += 2; return v; } static inline sU16 Fetch16B(sU8 *&s) { sU16 v = Load16B(s); s += 2; return v; } static inline sU32 Fetch32(sU8 *&s) { sU32 v = Load32(s); s += 4; return v; } static inline sU32 Fetch32B(sU8 *&s) { sU32 v = Load32B(s); s += 4; return v; } /* WriteN: store v at d, advance d past it, and return v. */ static inline sU8 Write8(sU8 *&d,sU8 v) { Store8(d,v); d += 1; return v; } static inline sU16 Write16(sU8 *&d,sU16 v) { Store16(d,v); d += 2; return v; } static inline sU32 Write32(sU8 *&d,sU32 v) { Store32(d,v); d += 4; return v; } /****************************************************************************/ /* Shifts table[0..pos-1] up by one slot (overwriting table[pos]), stores val in table[0] and returns val. */ static sU32 MoveToFront(sU32 *table,sInt pos,sU32 val) { for(;pos > 0;pos--) table[pos] = table[pos-1]; table[0] = val; return val; } /* Pushes val onto the front of the table, shifting entries 0..254 up into slots 1..255; the previous entry 255 is dropped. */ static inline void AddMTF(sU32 *mtf,sU32 val) { MoveToFront(mtf,255,val); } /* Looks for val in entries 0..254. On a hit the entry is moved to the front and its previous index is returned; on a miss val is pushed onto the front and -1 is returned. */ static sInt FindMTF(sU32 *mtf,sU32 val) { for(sInt i=0;i<255;i++) { if(mtf[i] == val) { MoveToFront(mtf,i,val); return i; } } AddMTF(mtf,val); return -1; } /****************************************************************************/ /* Growable byte buffer managed with malloc/realloc/free, starting at 256 bytes. NOTE(review): the malloc and realloc results are not checked for NULL, and no copy constructor or assignment operator is declared although the destructor frees Data. */ struct DataBuffer { sInt Size,Max; sU8 *Data; DataBuffer() { Max = 256; Data = (sU8 *) malloc(Max); ResetBuffer(); } void ResetBuffer() { Size = 0; } ~DataBuffer() { 
free(Data); } /* Reserves room for `bytes` more bytes, growing Max to the larger of Max*2 and Size+bytes when needed, and returns a pointer to the start of the newly reserved region. */ sU8 *Add(sInt bytes) { if(Size+bytes>Max) { Max = (Max*2 < Size+bytes) ? Size+bytes : Max*2; Data = (sU8 *) realloc(Data,Max); } sU8 *ret = Data+Size; Size += bytes; return ret; } }; /****************************************************************************/ // 1-byte opcodes sU8 Table1[256] = { // 0 1 2 3 4 5 6 7 8 9 a b c d e f fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI, // 0 fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI, // 1 fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI, // 2 fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI, // 3 fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI, // 4 fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI, // 5 fNM|fNI,fNM|fNI,fMR|fNI,fMR|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fDI,fMR|fDI,fNM|fBI,fMR|fBI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI, // 6 fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR, // 7 fMR|fBI,fMR|fDI,fMR|fBI,fMR|fBI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // 8 fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fAM|fDA,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI, // 9 fAM|fAD,fAM|fAD,fAM|fAD,fAM|fAD,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI, // a fNM|fBI,fNM|fBI,fNM|fBI,fNM|fBI,fNM|fBI,fNM|fBI,fNM|fBI,fNM|fBI,fNM|fDI,fNM|fDI,fNM|fDI,fNM|fDI,fNM|fDI,fNM|fDI,fNM|fDI,fNM|fDI, // b 
fMR|fBI,fMR|fBI,fNM|fWI,fNM|fNI,fMR|fNI,fMR|fNI,fMR|fBI,fMR|fDI,fNM|fBI,fNM|fNI,fNM|fWI,fNM|fNI,fNM|fNI,fNM|fBI,fERR ,fNM|fNI, // c fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fBI,fNM|fNI,fNM|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // d fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fNM|fBI,fNM|fBI,fNM|fBI,fNM|fBI,fAM|fDR,fAM|fDR,fAM|fAD,fAM|fBR,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI, // e fNM|fNI,fERR ,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fMEXTRA,fMEXTRA,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fMEXTRA,fMEXTRA, // f }; /****************************************************************************/ // 2-byte opcodes sU8 Table2[256] = { // 0 1 2 3 4 5 6 7 8 9 a b c d e f fERR ,fERR ,fERR ,fERR ,fERR ,fERR ,fNM|fNI,fERR ,fNM|fNI,fNM|fNI,fERR ,fERR ,fERR ,fERR ,fERR ,fERR , // 0 fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fERR ,fERR ,fERR ,fERR ,fERR ,fERR ,fERR , // 1 fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fERR ,fERR ,fERR ,fERR ,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // 2 fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fERR ,fNM|fNI,fERR ,fERR ,fERR ,fERR ,fERR ,fERR ,fERR ,fERR , // 3 fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // 4 fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // 5 fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // 6 fMR|fBI,fMR|fBI,fMR|fBI,fMR|fBI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fNI,fERR ,fERR ,fERR ,fERR ,fERR ,fERR ,fMR|fNI,fMR|fNI, // 7 fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR, // 8 fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // 9 
fNM|fNI,fNM|fNI,fNM|fNI,fMR|fNI,fMR|fBI,fMR|fNI,fMR|fNI,fMR|fNI,fERR ,fERR ,fERR ,fMR|fNI,fMR|fBI,fMR|fNI,fERR ,fMR|fNI, // a fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fERR ,fERR ,fERR ,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // b fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI, // c fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // d fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // e fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fERR , // f }; /****************************************************************************/ // escape opcodes using ModRM byte to get more variants sU8 TableX[32] = { // 0 1 2 3 4 5 6 7 fMR|fBI,fERR ,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // escapes for 0xf6 fMR|fDI,fERR ,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // escapes for 0xf7 fMR|fNI,fMR|fNI,fERR ,fERR ,fERR ,fERR ,fERR ,fERR , // escapes for 0xfe fMR|fNI,fMR|fNI,fMR|fNI,fERR ,fMR|fNI,fERR ,fMR|fNI,fERR , // escapes for 0xff }; /****************************************************************************/ /****************************************************************************/ /* Encoder-side state: one DataBuffer per output stream, the call-target table searched with FindMTF, the NextIsFunc flag, and the CodeStart/CodeEnd bounds tested by the jump-table heuristic. */ struct DisFilterCtx { DataBuffer Buffer[ST_MAX]; sU32 FuncTable[256]; sBool NextIsFunc; sU32 CodeStart,CodeEnd; DisFilterCtx(sU32 codeStart,sU32 codeEnd) { ResetCtx(codeStart, codeEnd); } void ResetCtx(sU32 codeStart,sU32 codeEnd) { NextIsFunc = sTRUE; for(sInt i=0;i<256;i++) FuncTable[i] = 0; CodeStart = codeStart; CodeEnd = codeEnd; for (sInt i=0; i= CodeStart && codedAddr < CodeEnd) count++; else break; } if(count < 3) // if it's less than 3 entries, it's probably not a jump table. 
count = 0; return count; } /* Encodes the item starting at instr (a detected jump table or a single instruction) into the per-stream buffers. On the single-instruction paths it returns the number of input bytes consumed, or 1 after emitting an ESCAPE for an opcode it cannot decode. `memory` is added to the offset within the instruction when relative targets are made absolute - presumably the address of instr; confirm against the caller. */ sInt ProcessInstr(sU8 *instr,sU32 memory) { if(sInt nJump = DetectJumpTable(instr,memory)) { // probable jump table with nJump entries sInt remaining = nJump; while(remaining) { sInt count = (remaining < 256) ? remaining : 256; Put8(ST_OP,JUMPTAB); Put8(ST_JUMPTBL_COUNT,count-1); for(sInt i=0;i> 3) & 7) | ((code & 0x01) << 3) | ((code & 0x08) << 1)]; if(flags != fERR) { if(o16) Put8(ST_OP,OP_OSIZE); Put8(ST_OP,code); if(code == OP_2BYTE) Put8(ST_OP2,code2); if(code == OP_CALLF || code == OP_JMPF || code == OP_ENTER) { // far call/jump have a *48-bit* immediate address. we deal with it here by copying the segment index // manually and encoding the rest as a normal 32-bit direct address. // similarly, enter has a word operand and a byte operand. again, we code the word here, and // deal with the byte later during the normal flow. Copy16(ST_IMM16,instr); } if((flags & fMODE) == fMR) { sInt modrm = Copy8(ST_MODRM,instr); sInt sib = 0; if((modrm & 0x07) == 4 && modrm < 0xc0) sib = Copy8(ST_SIB,instr); if((modrm & 0xc0) == 0x40) // register+byte displacement Copy8(ST_DISP8_R0 + (modrm & 0x07),instr); if((modrm & 0xc0) == 0x80 || (modrm & 0xc7) == 0x05 || (modrm < 0x40 && (sib & 0x07) == 5)) { // register+dword displacement Copy32((modrm & 0xc7) == 0x05 ? 
ST_ADDR32 : ST_DISP32,instr); } } if((flags & fMODE) == fAM) { switch(flags & fTYPE) { case fAD: Copy32(ST_ADDR32,instr); break; case fDA: Copy32(ST_AJUMP32,instr); break; case fBR: Copy8(ST_JUMP8,instr); break; case fDR: { sU32 target = Fetch32(instr); target += (instr - start) + memory; if(code != OP_CALLN) // not a near call Put32(ST_JUMP32,target); else { sInt ind = FindMTF(FuncTable,target); Put8(ST_CALL_IDX,ind+1); if(ind == -1) Put32(ST_CALL32,target); } } break; } } else { switch(flags & fTYPE) { case fBI: Copy8(ST_IMM8,instr); break; case fWI: Copy16(ST_IMM16,instr); break; case fDI: if(!o16) Copy32(ST_IMM32,instr); else Copy16(ST_IMM16,instr); break; } } return instr - start; } else // couldn't decode instruction { Put8(ST_OP,ESCAPE); // escape code Put8(ST_OP,*start); // the unrecognized opcode return 1; } } sU8 *Flush(sU8 *out, sU32 &sz) { sU32 size = 0; if (sz < ST_MAX * 16) return (NULL); size = ST_MAX * 4; // 4 bytes per stream to encode the size for(sInt i=0;i= sz) return (NULL); // Check for output overflow } // Output ptr is supplied by caller sU8 *outPtr = out; for(sInt i=0;i streamEnd[strm]) return sFALSE #define CheckDst(size) if(dest+size > destEnd) return sFALSE #define CheckSrcDst(strm,size) if(stream[strm]+size > streamEnd[strm] || dest+size > destEnd) return sFALSE #define Copy8Chk(strm) do { CheckSrcDst(strm,1); Copy8 (dest,stream[strm]); } while(0) #define Copy16Chk(strm) do { CheckSrcDst(strm,2); Copy16(dest,stream[strm]); } while(0) #define Copy32Chk(strm) do { CheckSrcDst(strm,4); Copy32(dest,stream[strm]); } while(0) /* Decoder: rebuilds the original bytes in dest from the stream-split data in source. Returns sFALSE when the header is too short or when a stream or dest would be overrun (see the Check macros), sTRUE otherwise. */ static sBool DisUnFilter(sU8 *source,sU32 sourceSize,sU8 *dest,sU32 destSize,sU32 memStart) { sU8 *stream[ST_MAX]; sU8 *streamEnd[ST_MAX]; sU32 funcTable[256]; // read header (list of stream sizes) if(sourceSize < ST_MAX*4) return sFALSE; sU8 *hdr = source; sU8 *cur = source + ST_MAX*4; for(sInt i=0;i> 3) & 7) | ((code & 0x01) << 3) | ((code & 0x08) << 1)]; if((modrm & 0x07) == 4 && modrm < 0xc0) { CheckSrcDst(ST_SIB,1); 
sib = Copy8(dest,stream[ST_SIB]); } if((modrm & 0xc0) == 0x40) // register+byte displacement { sInt st = (modrm & 0x07) + ST_DISP8_R0; Copy8Chk(st); } if((modrm & 0xc0) == 0x80 || (modrm & 0xc7) == 0x05 || (modrm < 0x40 && (sib & 0x07) == 0x05)) { sInt st = (modrm & 0xc7) == 5 ? ST_ADDR32 : ST_DISP32; Copy32Chk(st); } } if((flags & fMODE) == fAM) { switch(flags & fTYPE) { case fAD: Copy32Chk(ST_ADDR32); break; case fDA: Copy32Chk(ST_AJUMP32); break; case fBR: Copy8Chk(ST_JUMP8); break; case fDR: { sU32 target; if(code == OP_CALLN) { CheckSrc(ST_CALL_IDX,1); sInt ind = Fetch8(stream[ST_CALL_IDX]); if(ind) target = MoveToFront(funcTable,ind-1,funcTable[ind-1]); else { CheckSrc(ST_CALL32,4); target = Fetch32B(stream[ST_CALL32]); AddMTF(funcTable,target); } } else { CheckSrc(ST_JUMP32,4); target = Fetch32B(stream[ST_JUMP32]); } target -= (dest - start) + 4 + memory; CheckDst(4); Write32(dest,target); } break; } } else { switch(flags & fTYPE) { case fBI: Copy8Chk(ST_IMM8); break; case fWI: Copy16Chk(ST_IMM16); break; case fDI: if(!o16) Copy32Chk(ST_IMM32); else Copy16Chk(ST_IMM16); break; } } } } return sTRUE; } /* * Try to estimate if the given data block contains 32-bit x86 instructions * especially of the call and jmp variety. * Estimator is adapted from CSC 3.2 Analyzer (Fu Siyuan). * Returns non-zero when byte 0x8b occurs more than len/256 times, byte 0x00 * more than twice that often, and byte 0xE8 more than 6 times. */ static int is_x86_code(uchar_t *buf, int len) { uint32_t avgFreq, freq[256] = {0}; uint32_t freq0x80[2] = {0}; /* NOTE(review): freq0x80 is filled in below but never read. */ uint32_t ln = len; int i; for (i = 0; i < len; i++) { freq[buf[i]]++; } for (i = 0; i< 256; i++) { freq0x80[i>>7] += freq[i]; } avgFreq = ln>>8; return (freq[0x8b] > avgFreq && freq[0x00] > avgFreq * 2 && freq[0xE8] > 6); } #ifdef __cplusplus extern "C" { #endif /* * E8 E9 Call/Jmp transform routines. Convert relative Call and Jmp addresses * to absolute values to improve compression. A couple of tricks are employed: * 1) Avoid transforming zero addresses or where adding the current offset * to the presumed address results in a zero result. 
This avoids a bunch of * false positives. * 2) Store transformed values in big-endian format. This improves compression. */ int Forward_E89(uint8_t *src, uint64_t sz) { uint32_t i; uint32_t size; if (sz > UINT32_MAX) { return (-1); } size = sz; i = 0; while (i < size-4) { if ((src[i] & 0xfe) == 0xe8 && (src[i+4] == 0 || src[i+4] == 0xff)) { uint32_t off; off = (src[i+1] | (src[i+2] << 8) | (src[i+3] << 16)); if (off > 0) { off += i; off &= 0xffffff; if (off > 0) { src[i+1] = (uint8_t)(off >> 16); src[i+2] = (uint8_t)(off >> 8); src[i+3] = (uint8_t)off; } } } i++; } return (0); } int Inverse_E89(uint8_t *src, uint64_t sz) { uint32_t i; uint32_t size; if (sz > UINT32_MAX) { return (-1); } size = sz; i = size-5;; while (i > 0) { if ((src[i] & 0xfe) == 0xe8 && (src[i+4] == 0 || src[i+4] == 0xff)) { uint32_t val; val = (src[i+3] | (src[i+2] << 8) | (src[i+1] << 16)); if (val > 0) { val -= i; val &= 0xffffff; if (val > 0) { src[i+1] = (uint8_t)val; src[i+2] = (uint8_t)(val >> 8); src[i+3] = (uint8_t)(val >> 16); } } } i--; } return (0); } /* * 32-bit x86 executable packer top-level routines. Detected x86 executable data * are passed through these encoding routines. The data chunk is split into 32KB * blocks and each block is separately Dispack-ed. The code tries to detect if * a block contains valid x86 code by trying to estimate some instruction metrics. 
*/ int dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen, int stype) { uchar_t *pos, *hdr, type, *pos_to, *to_last; sU32 len; #ifdef DEBUG_STATS double strt, en; #endif if (fromlen > UINT32_MAX) return (-1); if (fromlen < DISFILTER_BLOCK) return (-1); #ifdef DEBUG_STATS strt = get_wtime_millis(); #endif pos = from; len = (sU32)fromlen; pos_to = to; to_last = to + *dstlen; while (len > 0) { DisFilterCtx ctx(0, DISFILTER_BLOCK); sU32 sz; sU16 origsize; sU32 out; sU8 *rv; if (len > DISFILTER_BLOCK) sz = DISFILTER_BLOCK; else sz = len; hdr = pos_to; type = 0; origsize = sz; if (sz < DISFILTER_BLOCK) { type |= ORIGSIZE; pos_to += EXTENDED_HDR; U16_P(hdr + NORMAL_HDR) = LE16(origsize); } else { pos_to += NORMAL_HDR; } out = sz; if (is_x86_code(pos, sz)) { ctx.ResetCtx(0, sz); rv = DisFilter(ctx, pos, sz, 0, pos_to, out); } else { rv = NULL; } if (rv != pos_to || sz == out) { if (pos_to + origsize >= to_last) { return (-1); } memcpy(pos_to, pos, origsize); *hdr = type; hdr++; U16_P(hdr) = LE16(origsize); pos_to += origsize; } else { sU16 csize; if (pos_to + out >= to_last) { return (-1); } type |= DISFILTERED; *hdr = type; hdr++; csize = out; U16_P(hdr) = LE16(csize); pos_to += csize; } pos += sz; len -= sz; } *dstlen = pos_to - to; #ifdef DEBUG_STATS en = get_wtime_millis(); cerr << "Dispack: Processed at " << get_mb_s(fromlen, strt, en) << " MB/s" << endl; #endif if ((fromlen - *dstlen) < DIS_MIN_REDUCE) { #ifdef DEBUG_STATS cerr << "Dispack: Failed, reduction too less" << endl; #endif return (-1); } #ifdef DEBUG_STATS cerr << "Dispack: srclen: " << fromlen << ", dstlen: " << *dstlen << endl; #endif return (0); } int dispack_decode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen) { uchar_t *pos, type, *pos_to, *to_last; uint64_t len; pos = from; len = fromlen; pos_to = to; to_last = to + *dstlen; while (len > 0) { sU32 sz, cmpsz; type = *pos++; len--; sz = DISFILTER_BLOCK; cmpsz = LE16(U16_P(pos)); pos += 2; len -= 2; if (type 
& ORIGSIZE) { sz = LE16(U16_P(pos)); pos += 2; len -= 2; } if (type & DISFILTERED) { if (pos_to + sz > to_last) { return (-1); } if (DisUnFilter(pos, cmpsz, pos_to, sz, 0) != sTRUE) { return (-1); } pos += cmpsz; pos_to += sz; len -= cmpsz; } else { if (pos_to + cmpsz > to_last) { return (-1); } memcpy(pos_to, pos, cmpsz); /* * If E8E9 was applied on this block, apply the inverse transform. * This only happens if this block was detected as x86 instruction * stream and Dispack was tried but it failed. */ pos += cmpsz; pos_to += cmpsz; len -= cmpsz; } } *dstlen = pos_to - to; return (0); } #ifdef __cplusplus } #endif