From 0192790c02c64fb66e1e208c0681bc63e165a31c Mon Sep 17 00:00:00 2001
From: Moinak Ghosh <moinakg@gmail.com>
Date: Sun, 24 Nov 2013 19:45:58 +0530
Subject: [PATCH] Add Dispack filter with auto-detection of x86 executables in
 archive mode. More elaborate magic header based detection of 32-bit and
 64-bit x86 binaries. Always use fast-mode LZ4 in Adaptive modes.

---
 Makefile.in                |   12 +-
 adaptive_compress.c        |    7 +-
 archive/pc_archive.c       |   70 ++-
 filters/dispack/dis.cpp    | 1067 ++++++++++++++++++++++++++++++++++++
 filters/dispack/dis.hpp    |   41 ++
 filters/dispack/types.hpp  |   51 ++
 pcompress.c                |   39 +-
 pcompress.h                |    2 +
 utils/phash/extensions.h   |   12 +-
 utils/phash/extensions.txt |   12 +-
 utils/phash/phash.c        |   16 +-
 utils/phash/phash.h        |    2 +-
 utils/utils.h              |    7 +-
 13 files changed, 1293 insertions(+), 45 deletions(-)
 create mode 100644 filters/dispack/dis.cpp
 create mode 100644 filters/dispack/dis.hpp
 create mode 100644 filters/dispack/types.hpp

diff --git a/Makefile.in b/Makefile.in
index 0041d9f..7fa005a 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -30,7 +30,7 @@ MAINSRCS = utils/utils.c allocator.c lzma_compress.c ppmd_compress.c \
 	adaptive_compress.c lzfx_compress.c lz4_compress.c none_compress.c \
 	utils/xxhash_base.c utils/heap.c utils/cpuid.c pcompress.c
 MAINHDRS = allocator.h  pcompress.h  utils/utils.h utils/xxhash.h utils/heap.h \
-	utils/cpuid.h utils/xxhash.h archive/pc_archive.h
+	utils/cpuid.h utils/xxhash.h archive/pc_archive.h filters/dispack/dis.hpp
 MAINOBJS = $(MAINSRCS:.c=.o)
 
 PROGSRCS = main.c
@@ -142,6 +142,10 @@ PJPGHDRS = filters/packjpg/aricoder.h filters/packjpg/bitops.h filters/packjpg/d
 	filters/packjpg/packjpglib.h filters/packjpg/pjpgtbl.h
 PJPGOBJS = $(PJPGSRCS:.cpp=.o)
 
+DISPACKSRCS = filters/dispack/dis.cpp
+DISPACKHDRS = filters/dispack/dis.hpp filters/dispack/types.hpp
+DISPACKOBJS = $(DISPACKSRCS:.cpp=.o)
+
 SKEIN_BLOCK_C = crypto/skein/skein_block.c
 SKEIN_BLOCK_ASM = crypto/skein/skein_block_x64.s
 SKEIN_BLOCK_SRC = @SKEIN_BLOCK@
@@ -218,7 +222,7 @@ OBJS = $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(LZFXOBJS) $(LZ4OBJS) $(CRCOBJS) \
 $(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) $(DELTA2OBJS) @LIBBSCWRAPOBJ@ $(SKEINOBJS) \
 $(SKEIN_BLOCK_OBJ) @SHA2ASM_OBJS@ @SHA2_OBJS@ $(KECCAK_OBJS) $(KECCAK_OBJS_ASM) \
 $(TRANSP_OBJS) $(CRYPTO_OBJS) $(ZLIB_OBJS) $(BZLIB_OBJS) $(XXHASH_OBJS) $(BLAKE2_OBJS) \
-@CRYPTO_COMPAT_OBJS@ $(CRYPTO_ASM_OBJS) $(ARCHIVEOBJS) $(PJPGOBJS)
+@CRYPTO_COMPAT_OBJS@ $(CRYPTO_ASM_OBJS) $(ARCHIVEOBJS) $(PJPGOBJS) $(DISPACKOBJS)
 
 DEBUG_LINK = g++ -pthread @LIBBSCGEN_OPT@ @EXTRA_OPT_FLAGS@ -fopenmp -fPIC
 DEBUG_COMPILE = gcc -g -c @EXTRA_OPT_FLAGS@ -fPIC
@@ -296,6 +300,10 @@ $(PJPGOBJS): $(PJPGSRCS) $(PJPGHDRS)
 	$(COMPILE_cpp) $(COMMON_VEC_FLAGS) @SSE_OPT_FLAGS@ -O2 -fsched-spec-load \
 	$(VEC_FLAGS) -DBUILD_LIB $(COMMON_CPPFLAGS_cpp) $(@:.o=.cpp) -o $@
 
+$(DISPACKOBJS): $(DISPACKSRCS) $(DISPACKHDRS)
+	$(COMPILE_cpp) $(COMMON_VEC_FLAGS) @SSE_OPT_FLAGS@ -O2 -fsched-spec-load \
+	$(VEC_FLAGS) $(COMMON_CPPFLAGS_cpp) $(@:.o=.cpp) -o $@
+
 $(SKEIN_BLOCK_OBJ): $(SKEIN_BLOCK_SRC)
 	$(COMPILE) $(SKEIN_FLAGS) $(SKEIN_BLOCK_SRC) -o $@
 
diff --git a/adaptive_compress.c b/adaptive_compress.c
index 568006d..74b6f98 100644
--- a/adaptive_compress.c
+++ b/adaptive_compress.c
@@ -124,7 +124,7 @@ adapt_init(void **data, int *level, int nthreads, uint64_t chunksize,
 	   int file_version, compress_op_t op)
 {
 	struct adapt_data *adat = (struct adapt_data *)(*data);
-	int rv = 0;
+	int rv = 0, lv = 1;
 
 	if (!adat) {
 		adat = (struct adapt_data *)slab_alloc(NULL, sizeof (struct adapt_data));
@@ -137,7 +137,7 @@ adapt_init(void **data, int *level, int nthreads, uint64_t chunksize,
 		 * compression level.
 		 */
 		if (rv == 0)
-			rv = lz4_init(&(adat->lz4_data), 1, nthreads, chunksize, file_version, op);
+			rv = lz4_init(&(adat->lz4_data), &lv, nthreads, chunksize, file_version, op);
 		adat->lzma_data = NULL;
 		adat->bsc_data = NULL;
 		*data = adat;
@@ -178,8 +178,9 @@ adapt2_init(void **data, int *level, int nthreads, uint64_t chunksize,
 		 * otherwise incompressible data. So we always use it at the lowest and fastest
 		 * compression level.
 		 */
+		lv = 1;
 		if (rv == 0)
-			rv = lz4_init(&(adat->lz4_data), 1, nthreads, chunksize, file_version, op);
+			rv = lz4_init(&(adat->lz4_data), &lv, nthreads, chunksize, file_version, op);
 		*data = adat;
 		if (*level > 9) *level = 9;
 	}
diff --git a/archive/pc_archive.c b/archive/pc_archive.c
index 93ca24a..44e981f 100644
--- a/archive/pc_archive.c
+++ b/archive/pc_archive.c
@@ -915,6 +915,7 @@ do_map:
 
 		if (typ == TYPE_UNKNOWN) {
 			pctx->ctype = detect_type_by_data(src, len);
+			typ = pctx->ctype;
 			if (typ != TYPE_UNKNOWN) {
 				if (typetab[(typ >> 3)].filter_func != NULL) {
 					int64_t rv;
@@ -1428,6 +1429,9 @@ out:
 
 /* TTA1 packed into 32-bit integer. */
 #	define	TTA1	(0x54544131)
+
+/* Magic for different MSDOS COM file types. */
+#	define COM_MAGIC	(0xcd21)
 #else
 /* 0x7fELF packed into 32-bit integer. */
 #	define	ELFINT (0x464c457fU)
@@ -1443,6 +1447,9 @@ out:
 
 /* TTA1 packed into 32-bit integer. */
 #	define	TTA1	(0x31415454)
+
+/* Magic for different MSDOS COM file types. */
+#	define COM_MAGIC	(0x21cd)
 #endif
 
 /*
@@ -1454,12 +1461,63 @@ detect_type_by_data(uchar_t *buf, size_t len)
 	// At least a few bytes.
 	if (len < 16) return (TYPE_UNKNOWN);
 
-	if (U32_P(buf) == ELFINT)
-		return (TYPE_BINARY|TYPE_EXE); // Regular ELF
-	if ((buf[0] == 'M' || buf[0] == 'L') && buf[1] == 'Z')
-		return (TYPE_BINARY|TYPE_EXE); // MSDOS Exe
-	if (buf[0] == 0xe9)
-		return (TYPE_BINARY|TYPE_EXE); // MSDOS COM
+	if (U32_P(buf) == ELFINT) {  // Regular ELF, check for 32/64-bit, core dump
+		if (len <= 16 || *(buf + 16) != 4) {  // e_type low byte; 4 == ET_CORE
+			if (*(buf + 4) == 2) {
+				return (TYPE_BINARY|TYPE_EXE64);
+			} else {
+				return (TYPE_BINARY|TYPE_EXE32);
+			}
+		} else {
+			return (TYPE_BINARY);
+		}
+	}
+	if (buf[1] == 'Z') {
+		// Check for MSDOS/Windows Exe types
+		if (buf[0] == 'L') {
+			return (TYPE_BINARY|TYPE_EXE32);
+		} else if (buf[0] == 'M') {
+			// If relocation table is less than 0x40 bytes into file then it is a
+			// 32-bit MSDOS exe. Only len >= 16 is guaranteed, so bound every read.
+			if (len >= 0x1a && LE16(U16_P(buf + 0x18)) < 0x40) {
+				return (TYPE_BINARY|TYPE_EXE32);
+			} else if (len >= 0x40) {
+				uint32_t off = LE32(U32_P(buf + 0x3c));
+				// This is non-MSDOS, check whether PE
+				if (off < len - 3) {
+					if (buf[off] == 'P' && buf[off+1] == 'E' &&
+					    buf[off+2] == '\0' && buf[off+3] == '\0') {
+						// This is a PE executable. Check 32/64-bit via the
+						// Machine field, which must lie inside the buffer.
+						size_t mach = (size_t)off + 4;
+						if (mach + 2 <= len && LE16(U16_P(buf + mach)) == 0x8664) {
+							return (TYPE_BINARY|TYPE_EXE64);
+						} else {
+							return (TYPE_BINARY|TYPE_EXE32);
+						}
+					} else {
+						return (TYPE_BINARY|TYPE_EXE32);
+					}
+				}
+			}
+		}
+	}
+
+	// MSDOS COM types
+	if (buf[0] == 0xe9 || buf[0] == 0xeb) {
+		if (len >= 0x200 && LE16(U16_P(buf + 0x1fe)) == 0xaa55)
+			return (TYPE_BINARY|TYPE_EXE32); // MSDOS COM
+		else
+			return (TYPE_BINARY);
+	}
+	if (U16_P(buf + 2) == COM_MAGIC || U16_P(buf + 4) == COM_MAGIC ||
+	    U16_P(buf + 5) == COM_MAGIC || U16_P(buf + 13) == COM_MAGIC ||
+	    (len >= 20 && U16_P(buf + 18) == COM_MAGIC) ||
+	    (len >= 25 && U16_P(buf + 23) == COM_MAGIC) ||
+	    (len >= 32 && U16_P(buf + 30) == COM_MAGIC) || (len >= 72 && U16_P(buf + 70) == COM_MAGIC)) {
+			return (TYPE_BINARY|TYPE_EXE32); // MSDOS COM
+	}
+
 	if (U32_P(buf) == TZINT)
 		return (TYPE_BINARY); // Timezone data
 	if (U32_P(buf) == PPMINT)
diff --git a/filters/dispack/dis.cpp b/filters/dispack/dis.cpp
new file mode 100644
index 0000000..3e14378
--- /dev/null
+++ b/filters/dispack/dis.cpp
@@ -0,0 +1,1067 @@
+/*
+ * This file is a part of Pcompress, a chunked parallel multi-
+ * algorithm lossless compression and decompression program.
+ *
+ * Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ * moinakg@belenix.org, http://moinakg.wordpress.com/
+ */
+
+#include "types.hpp"
+#include "dis.hpp"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <malloc.h>
+#include <assert.h>
+
+/* Version history:
+ *
+ * 1.00  (Nov 2009)  Initial release
+ * 1.01  (Jan 2011)  Don't assert on bytes > MAXINSTR when dealing with jump tables
+ * 1.02  (Nov 2013)  (Moinak Ghosh) Changes to integrate with Pcompress.
+ *                   Adapted and modified from:
+ *                   http://www.farbrausch.de/~fg/code/disfilter/
+ */
+
+/****************************************************************************/
+
+/* This is a filter for x86 binary code, intended to improve its compressibility
+ * by standard algorithms. The basic ideas are quite old; for example, the LZX
+ * algorithm used in Microsoft .CAB files uses a special preprocessor that
+ * converts the target address in CALL opcodes from a relative offset to an
+ * absolute address. This simple transform greatly helps both LZ-based and
+ * statistical coders: the same function being called repeatedly now results
+ * in the same byte sequence for the call being repeated, instead of having
+ * a different encoding every time. The preprocessor doesn't really understand
+ * the instruction stream; it just looks for a 0xE8 byte (the opcode for near
+ * call) and adds the current position to the 4 bytes that follow it.
+ *
+ * Most modern compressors include this filter or variations, to be used on .EXE
+ * files; newer variants usually try to detect whether the target offset would be
+ * within the executable image to reduce the number of false positives. Another
+ * common modification stores the transformed offsets in big endian byte order:
+ * this clusters the high bits (which are likely to be similar along a stretch of
+ * code) together with the opcode, again yielding somewhat better compression.
+ *
+ * However, all this is based on a very limited understanding of x86 binary code.
+ * It is possible to do significantly better with a more thorough understanding of the
+ * bytestream and its underlying structure. This algorithm borrows heavily from the
+ * Split-Stream^2 method described in [1] (or, more precisely, an earlier variant
+ * published sometime in 2004; I don't remember the details anymore). It also introduces
+ * some (to my knowledge) novel ideas, though.
+ *
+ * The basic idea behind Split-Stream is to disassemble the target program,
+ * splitting it into several distinct streams that can be coded separately. Examples
+ * of such streams are the opcodes themselves, 8 bit immediates, 32 bit immediates,
+ * jump and call target addresses, and so on - the idea being that the individual
+ * fields are highly correlated amongst themselves, but largely independent of each
+ * other. Splitting the streams reduces the context dilution (the inclusion of
+ * irrelevant values in the context used for prediction) that otherwise harms compression
+ * in compiled code. Since the actual compressor in kkrunchy is a LZ-based dictionary
+ * coder and not a context coder, there's no easy way to mix multiple models or use
+ * alphabets with more than 256 symbols; hence the streams are simply stored sequentially,
+ * with a small header denoting the size of each. This interface sacrifices some
+ * compression potential, but has the advantage that the filter inputs and outputs
+ * simple bytestreams; kkrunchy actually compresses the (several hundred bytes long)
+ * unfiltering code along with the transformed code, so part of the decompressor is
+ * stored in compressed form. This results in a somewhat peculiar "bootstrapping"
+ * decompression process but saved roughly 200 bytes when it was originally written;
+ * a big enough gain to be worth it when targeting 64k executables.
+ *
+ * The actual list of streams that are identified can be found below (the "Streams" enum).
+ * To categorize which byte belongs where, the code needs to be disassembled. This
+ * is simpler than it sounds, given the complexity of x86 instruction encoding;
+ * luckily, there's no need to fully "understand" each instruction. We mainly need to
+ * be able to identify the opcode, the addressing mode used, and the presence of
+ * immediate data fields. This is implemented using a mostly table-driven disassembler.
+ * Since the original decoder was heavily optimized for size and the tables need to be
+ * included with the decoder, the encoding is very compact: It mainly consists of two
+ * tables of 256 entries each with 4 bits per entry used - the first table describing
+ * one-byte opcodes, the second for two-byte opcodes (when this code was written, there
+ * were no three-byte opcodes yet). There are some simplifications present in the tables
+ * and the disassembler, where doing so poses no problems. For example, all prefixes
+ * are treated as one-byte opcodes with no operands; this is incorrect, but as long as
+ * the encoder and decoder agree on it, there's no problem. There's also no need to
+ * distinguish between different instructions when they all have the same addressing modes
+ * and combination of immediate operands. All this gets rid of a lot of special cases.
+ * There is one significant deviation from the PPMexe paper [1], though: the code
+ * is very careful never to assume that its parsing of the instruction stream is correct,
+ * and absolutely no irreversible transforms take place (such as the instruction
+ * rescheduling in [1]). Unrecognizable and invalid opcodes are preserved. This is done
+ * by using a very uncommon opcode as escape code, encapsulating otherwise invalid
+ * sequences within the bytestream. This property is critical in practice: code sections
+ * often contain jump tables and other data that isn't decodable as x86 instruction
+ * stream. Corrupting such data during the compression process is unacceptable.
+ *
+ * The target addresses of near jumps and calls of course still get converted from
+ * relative to absolute; additionally, all values larger than 8 bit are stored in big
+ * endian byte order. Both transforms are trivial to undo on the decoder side and yield
+ * notable improvements in compression ratio. Additionally, the last 255 call targets
+ * are kept in an array that's updated using the "move to front" heuristic. If a target
+ * occurs repeatedly (as is common in practice), the offset doesn't need to be coded at
+ * all; instead the position in the array is transmitted. (This is the ST_CALL_IDX
+ * stream). Additionally, the instruction stream is analyzed to identify potential
+ * call targets (i.e. start addresses of functions) even before they are first
+ * referenced: if a RET or INT3 opcode is found in the instruction stream, the filter
+ * assumes that the next instruction is likely to start a new function (MSVC++ uses
+ * INT3 opcodes to fill the "no man's land" between functions) and adds its address to
+ * the function table automatically. Typical overall hit rates for the function table
+ * are between 70 and 80 per cent - so only a quarter of all call target addresses ever
+ * needs to be stored explicitly.
+ *
+ * The most common type of data intermixed with code sections is jump tables and
+ * virtual function tables. Generally speaking, any data inside the code section is
+ * bad for the filter; its statistics are very different from the binary code being
+ * encoded which hurts compression, and it causes the disassembler to lose sync
+ * temporarily. To work around this problem, the encoder tries to identify jump
+ * tables, using another escape code to identify them in the output stream. The
+ * heuristic used here is rather simple, but works very well: When an instruction
+ * is expected, the encoder looks at the next 12 bytes. If they evaluate to
+ * addresses within the code section when interpreted as 3 dwords, the encoder assumes
+ * that it has found a jump table (or vtable). Jump table entries are encoded the
+ * same way that call targets are.
+ *
+ * [1] "PPMexe: Program Compression"
+ *     M. Drinic, D. Kirovski, and H. Vo, MS Research
+ *     ACM Transactions on Programming Languages and Systems, Vol.29, (no.1), 2007.
+ *     http://research.microsoft.com/en-us/um/people/darkok/papers/TOPLAS.pdf
+ */
+
+#define	DISFILTER_BLOCK	(32768)
+#define	DISFILTERED	1
+#define	ORIGSIZE		2
+#define	CLEAR_DISFILTER	0xfe
+#define	NORMAL_HDR	(1 + 2)
+#define	EXTENDED_HDR	(1 + 2 + 2)
+// Dispack min reduction should be 8%, otherwise we abort (2622 ~= 0.08 * 32768; presumably per DISFILTER_BLOCK - TODO confirm)
+#define	DIS_MIN_REDUCE	(2622)
+
+#define	MAXINSTR 15     // maximum size of a single instruction in bytes (actually, decodeable ones are shorter)
+
+enum Opcodes
+{
+  // 1-byte opcodes of special interest (for one reason or another)
+  OP_2BYTE  = 0x0f,     // start of 2-byte opcode
+  OP_OSIZE  = 0x66,     // operand size prefix
+  OP_CALLF  = 0x9a,     // far call (segment word + 32-bit address operand)
+  OP_RETNI  = 0xc2,     // ret near+immediate
+  OP_RETN   = 0xc3,     // ret near
+  OP_ENTER  = 0xc8,     // enter (word operand + byte operand)
+  OP_INT3   = 0xcc,     // int3; treated like a return when guessing function starts
+  OP_INTO   = 0xce,     // repurposed as the JUMPTAB escape below
+  OP_CALLN  = 0xe8,     // near call (dword relative target)
+  OP_JMPF   = 0xea,     // far jump (segment word + 32-bit address operand)
+  OP_ICEBP  = 0xf1,     // repurposed as the ESCAPE code below
+
+  // escape codes we use (these need to be 1-byte opcodes without an address or immediate operand!)
+  ESCAPE = OP_ICEBP,    // precedes a byte that could not be decoded as an instruction
+  JUMPTAB = OP_INTO     // introduces a run of jump table entries
+};
+
+// instruction format flags: bits 0-1 select the encoding mode (fMODE), bits 2-3 the operand type (fTYPE)
+enum InstructionFormat
+{
+  // encoding mode
+  fNM = 0x0,      // no ModRM
+  fAM = 0x1,      // no ModRM, "address mode" (jumps or direct addresses)
+  fMR = 0x2,      // ModRM present
+  fMEXTRA = 0x3,  // ModRM present, includes extra bits for opcode (resolved through TableX)
+  fMODE = 0x3,    // bitmask for mode
+
+  // no ModRM: size of immediate operand
+  fNI = 0x0,      // no immediate
+  fBI = 0x4,      // byte immediate
+  fWI = 0x8,      // word immediate
+  fDI = 0xc,      // dword immediate (word immediate when the operand size prefix is present)
+  fTYPE = 0xc,    // type mask
+
+  // address mode: type of address operand
+  fAD = 0x0,      // absolute address
+  fDA = 0x4,      // dword absolute jump target
+  fBR = 0x8,      // byte relative jump target
+  fDR = 0xc,      // dword relative jump target
+
+  // others
+  fERR = 0xf      // denotes invalid opcodes
+};
+
+enum Streams
+{
+  ST_OP,                    // prefixes, first byte of opcode
+  ST_SIB,                   // SIB byte
+  ST_CALL_IDX,              // call table index
+  ST_DISP8_R0,              // byte displacement on ModRM, reg no. 0 and following
+  ST_DISP8_R1, ST_DISP8_R2, ST_DISP8_R3, ST_DISP8_R4, ST_DISP8_R5, ST_DISP8_R6, ST_DISP8_R7,
+  ST_JUMP8,                 // short jump
+  ST_IMM8,                  // 8-bit immediate
+  ST_IMM16,                 // 16-bit immediate
+  ST_IMM32,                 // 32-bit immediate
+  ST_DISP32,                // 32-bit displacement
+  ST_ADDR32,                // 32-bit direct address
+  ST_CALL32,                // 32-bit call target
+  ST_JUMP32,                // 32-bit jump target
+
+  ST_MAX,                   // number of physically stored streams (size of DisFilterCtx::Buffer)
+  
+  // these components of the instruction stream are also identified
+  // separately, but stored together with another stream since there's
+  // high correlation between them (or just because one stream provides
+  // good context to predict the other)
+  ST_MODRM = ST_OP,         // ModRM byte
+  ST_OP2 = ST_OP,           // second byte of opcode
+  ST_AJUMP32 = ST_JUMP32,   // absolute jump target
+  ST_JUMPTBL_COUNT = ST_OP  // jump table entry count (written as count-1)
+};
+
+/****************************************************************************/
+
+// These helper functions assume that this code is being compiled on a
+// little-endian platform. Multi-byte accesses go through memcpy, so they
+// are safe on unaligned addresses and do not break strict aliasing rules.
+// All byte order dependent operations end up calling them.
+//
+// I also use the VC++ _byteswap intrinsics to implement big endian stores;
+// if your compiler doesn't have them, it should be trivial to get rid of them.
+
+static inline sU8 Load8(const sU8 *s)       { return *s; }
+static inline sU16 Load16(const sU8 *s)     { sU16 v; memcpy(&v,s,sizeof(v)); return v; }
+static inline sU16 Load16B(const sU8 *s)    { return _byteswap_ushort(Load16(s)); }
+static inline sU32 Load32(const sU8 *s)     { sU32 v; memcpy(&v,s,sizeof(v)); return v; }
+static inline sU32 Load32B(const sU8 *s)    { return _byteswap_ulong(Load32(s)); }
+
+static inline void Store8(sU8 *d,sU8 v)     { *d = v; }
+static inline void Store16(sU8 *d,sU16 v)   { memcpy(d,&v,sizeof(v)); }
+static inline void Store16B(sU8 *d,sU16 v)  { Store16(d,_byteswap_ushort(v)); }
+static inline void Store32(sU8 *d,sU32 v)   { memcpy(d,&v,sizeof(v)); }
+static inline void Store32B(sU8 *d,sU32 v)  { Store32(d,_byteswap_ulong(v)); }
+
+static inline sU8 Fetch8(sU8 *&s)           { return *s++; }
+static inline sU16 Fetch16(sU8 *&s)         { sU16 v = Load16(s);   s += 2; return v; }
+static inline sU16 Fetch16B(sU8 *&s)        { sU16 v = Load16B(s);  s += 2; return v; }
+static inline sU32 Fetch32(sU8 *&s)         { sU32 v = Load32(s);   s += 4; return v; }
+static inline sU32 Fetch32B(sU8 *&s)        { sU32 v = Load32B(s);  s += 4; return v; }
+
+static inline sU8  Write8(sU8 *&d,sU8 v)    { Store8(d,v);  d += 1; return v; }
+static inline sU16 Write16(sU8 *&d,sU16 v)  { Store16(d,v); d += 2; return v; }
+static inline sU32 Write32(sU8 *&d,sU32 v)  { Store32(d,v); d += 4; return v; }
+
+/****************************************************************************/
+
+// Shifts table[0..pos-1] one slot towards the back, stores val in slot 0 and returns val.
+static sU32 MoveToFront(sU32 *table,sInt pos,sU32 val)
+{
+  if(pos > 0)
+    memmove(table + 1,table,pos * sizeof(sU32));
+  table[0] = val;
+  return val;
+}
+
+static inline void AddMTF(sU32 *mtf,sU32 val) // insert val at the front of a 256-entry table
+{
+  MoveToFront(mtf,255,val); // shifts slots 0..254 back by one; the previous slot 255 is overwritten
+}
+
+// Searches slots 0..254 for val: hit -> move it to the front, return its old index; miss -> insert at front, return -1.
+static sInt FindMTF(sU32 *mtf,sU32 val)
+{
+  sInt slot = 0;
+  while(slot < 255 && mtf[slot] != val)
+    slot++;
+  if(slot == 255)
+  {
+    AddMTF(mtf,val);
+    return -1;
+  }
+  MoveToFront(mtf,slot,val);
+  return slot;
+}
+
+/****************************************************************************/
+
+struct DataBuffer // growable byte buffer; DisFilterCtx keeps one per output stream
+{
+  sInt Size,Max; // Size: bytes currently in use, Max: bytes allocated
+  sU8 *Data;     // heap storage owned by this object (malloc/realloc/free)
+  // NOTE(review): no copy constructor/assignment is declared; a copy would free Data twice.
+  DataBuffer()
+  {
+    Max = 256;
+    Data = (sU8 *) malloc(Max); // NOTE(review): result is not checked for NULL
+    ResetBuffer();
+  }
+  // Discards the contents but keeps the allocation for reuse.
+  void ResetBuffer()
+  {
+    Size = 0;
+  }
+
+  ~DataBuffer()
+  {
+    free(Data);
+  }
+  // Appends room for `bytes` bytes and returns a pointer to it; a later Add() may move the storage.
+  sU8 *Add(sInt bytes)
+  {
+    if(Size+bytes>Max)
+    {
+      Max = (Max*2 < Size+bytes) ? Size+bytes : Max*2; // double the capacity, or fit the request if that is larger
+      Data = (sU8 *) realloc(Data,Max); // NOTE(review): on failure Data becomes NULL and the old block leaks
+    }
+
+    sU8 *ret = Data+Size;
+    Size += bytes;
+    return ret;
+  }
+};
+
+/****************************************************************************/
+
+// 1-byte opcodes: InstructionFormat flags indexed by the first opcode byte
+sU8 Table1[256] =
+{
+  // 0       1       2       3       4       5       6       7       8       9       a       b       c       d       e       f
+  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI, // 0
+  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI, // 1
+  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI, // 2
+  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI, // 3
+
+  fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI, // 4
+  fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI, // 5
+  fNM|fNI,fNM|fNI,fMR|fNI,fMR|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fDI,fMR|fDI,fNM|fBI,fMR|fBI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI, // 6
+  fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR, // 7
+
+  fMR|fBI,fMR|fDI,fMR|fBI,fMR|fBI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // 8
+  fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fAM|fDA,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI, // 9
+  fAM|fAD,fAM|fAD,fAM|fAD,fAM|fAD,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fBI,fNM|fDI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI, // a
+  fNM|fBI,fNM|fBI,fNM|fBI,fNM|fBI,fNM|fBI,fNM|fBI,fNM|fBI,fNM|fBI,fNM|fDI,fNM|fDI,fNM|fDI,fNM|fDI,fNM|fDI,fNM|fDI,fNM|fDI,fNM|fDI, // b
+
+  fMR|fBI,fMR|fBI,fNM|fWI,fNM|fNI,fMR|fNI,fMR|fNI,fMR|fBI,fMR|fDI,fNM|fBI,fNM|fNI,fNM|fWI,fNM|fNI,fNM|fNI,fNM|fBI,fERR   ,fNM|fNI, // c  (0xce is fERR: INTO doubles as the JUMPTAB escape)
+  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fBI,fNM|fBI,fNM|fNI,fNM|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // d
+  fAM|fBR,fAM|fBR,fAM|fBR,fAM|fBR,fNM|fBI,fNM|fBI,fNM|fBI,fNM|fBI,fAM|fDR,fAM|fDR,fAM|fAD,fAM|fBR,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI, // e
+  fNM|fNI,fERR   ,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fMEXTRA,fMEXTRA,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fMEXTRA,fMEXTRA, // f  (0xf1 is fERR: ICEBP doubles as the ESCAPE code)
+};
+
+/****************************************************************************/
+
+// 2-byte opcodes
+sU8 Table2[256] =
+{
+  // 0       1       2       3       4       5       6       7       8       9       a       b       c       d       e       f
+  fERR   ,fERR   ,fERR   ,fERR   ,fERR   ,fERR   ,fNM|fNI,fERR   ,fNM|fNI,fNM|fNI,fERR   ,fERR   ,fERR   ,fERR   ,fERR   ,fERR   , // 0
+  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fERR   ,fERR   ,fERR   ,fERR   ,fERR   ,fERR   ,fERR   , // 1
+  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fERR   ,fERR   ,fERR   ,fERR   ,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // 2
+  fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fERR   ,fNM|fNI,fERR   ,fERR   ,fERR   ,fERR   ,fERR   ,fERR   ,fERR   ,fERR   , // 3
+
+  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // 4
+  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // 5
+  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // 6
+  fMR|fBI,fMR|fBI,fMR|fBI,fMR|fBI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fNI,fERR   ,fERR   ,fERR   ,fERR   ,fERR   ,fERR   ,fMR|fNI,fMR|fNI, // 7
+
+  fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR,fAM|fDR, // 8
+  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // 9
+  fNM|fNI,fNM|fNI,fNM|fNI,fMR|fNI,fMR|fBI,fMR|fNI,fMR|fNI,fMR|fNI,fERR   ,fERR   ,fERR   ,fMR|fNI,fMR|fBI,fMR|fNI,fERR   ,fMR|fNI, // a
+  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fERR   ,fERR   ,fERR   ,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // b
+
+  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI,fNM|fNI, // c
+  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // d
+  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // e
+  fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fERR   , // f
+};
+
+/****************************************************************************/
+
+// escape opcodes using ModRM byte to get more variants; index = ModRM reg field | (opcode bit 0 << 3) | (opcode bit 3 << 1)
+sU8 TableX[32] =
+{
+  // 0       1       2       3       4       5       6       7
+  fMR|fBI,fERR   ,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // escapes for 0xf6
+  fMR|fDI,fERR   ,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI,fMR|fNI, // escapes for 0xf7
+  fMR|fNI,fMR|fNI,fERR   ,fERR   ,fERR   ,fERR   ,fERR   ,fERR   , // escapes for 0xfe
+  fMR|fNI,fMR|fNI,fMR|fNI,fERR   ,fMR|fNI,fERR   ,fMR|fNI,fERR   , // escapes for 0xff
+};
+
+/****************************************************************************/
+/****************************************************************************/
+
+struct DisFilterCtx
+{
+  DataBuffer Buffer[ST_MAX];
+  sU32 FuncTable[256];
+  sBool NextIsFunc;
+
+  sU32 CodeStart,CodeEnd;
+
+  DisFilterCtx(sU32 codeStart,sU32 codeEnd)
+  {
+    ResetCtx(codeStart, codeEnd);
+  }
+
+  void ResetCtx(sU32 codeStart,sU32 codeEnd)
+  {
+    NextIsFunc = sTRUE;
+    for(sInt i=0;i<256;i++)
+      FuncTable[i] = 0;
+
+    CodeStart = codeStart;
+    CodeEnd = codeEnd;
+    for (sInt i=0; i<ST_MAX; i++)
+      Buffer[i].ResetBuffer();
+  }
+
+  sInt DetectJumpTable(sU8 *instr,sU32 addr)
+  {
+    assert(addr < CodeEnd);
+    sInt nMax = (CodeEnd - addr) / 4;
+    sInt count = 0;
+
+    while(count<nMax)
+    {
+      sU32 codedAddr = Load32(instr + count*4);
+      if(codedAddr >= CodeStart && codedAddr < CodeEnd)
+        count++;
+      else
+        break;
+    }
+
+    if(count < 3) // if it's less than 3 entries, it's probably not a jump table.
+      count = 0;
+
+    return count;
+  }
+
+  sInt ProcessInstr(sU8 *instr,sU32 memory)
+  {
+    if(sInt nJump = DetectJumpTable(instr,memory))
+    {
+      // probable jump table with nJump entries
+      sInt remaining = nJump;
+
+      while(remaining)
+      {
+        sInt count = (remaining < 256) ? remaining : 256;
+        Put8(ST_OP,JUMPTAB);
+        Put8(ST_JUMPTBL_COUNT,count-1);
+
+        for(sInt i=0;i<count;i++)
+        {
+          sU32 target = Fetch32(instr);
+          sInt ind = FindMTF(FuncTable,target);
+          Put8(ST_CALL_IDX,ind+1);
+          if(ind == -1)
+            Put32(ST_CALL32,target);
+        }
+
+        remaining -= count;
+      }
+
+      return nJump*4;
+    }
+
+    sU8 *start = instr;
+    sInt code = Fetch8(instr);
+    sInt code2 = 0;
+    sBool o16 = sFALSE;
+    sInt flags;
+
+    if(NextIsFunc && code != 0xcc)
+    {
+      AddMTF(FuncTable,memory);
+      NextIsFunc = sFALSE;
+    }
+
+    if(code == OP_OSIZE)
+    {
+      o16 = sTRUE;
+      code = Fetch8(instr);
+    }
+
+    if(code == OP_2BYTE)
+    {
+      code2 = Fetch8(instr);
+      flags = Table2[code2];
+    }
+    else
+      flags = Table1[code];
+
+    if(code == OP_RETNI || code == OP_RETN || code == OP_INT3) // return. function is going to start next.
+      NextIsFunc = sTRUE;
+
+    if(flags == fMEXTRA)
+      flags = TableX[((*instr >> 3) & 7) | ((code & 0x01) << 3) | ((code & 0x08) << 1)];
+
+    if(flags != fERR)
+    {
+      if(o16)
+        Put8(ST_OP,OP_OSIZE);
+
+      Put8(ST_OP,code);
+      if(code == OP_2BYTE)
+        Put8(ST_OP2,code2);
+
+      if(code == OP_CALLF || code == OP_JMPF || code == OP_ENTER)
+      {
+        // far call/jump have a *48-bit* immediate address. we deal with it here by copying the segment index
+        // manually and encoding the rest as a normal 32-bit direct address.
+        // similarly, enter has a word operand and a byte operand. again, we code the word here, and
+        // deal with the byte later during the normal flow.
+        Copy16(ST_IMM16,instr);
+      }
+
+      if((flags & fMODE) == fMR)
+      {
+        sInt modrm = Copy8(ST_MODRM,instr);
+        sInt sib = 0;
+
+        if((modrm & 0x07) == 4 && modrm < 0xc0)
+          sib = Copy8(ST_SIB,instr);
+
+        if((modrm & 0xc0) == 0x40) // register+byte displacement
+          Copy8(ST_DISP8_R0 + (modrm & 0x07),instr);
+
+        if((modrm & 0xc0) == 0x80 || (modrm & 0xc7) == 0x05 || (modrm < 0x40 && (sib & 0x07) == 5))
+        {
+          // register+dword displacement
+          Copy32((modrm & 0xc7) == 0x05 ? ST_ADDR32 : ST_DISP32,instr);
+        }
+      }
+
+      if((flags & fMODE) == fAM)
+      {
+        switch(flags & fTYPE)
+        {
+        case fAD: Copy32(ST_ADDR32,instr);  break;
+        case fDA: Copy32(ST_AJUMP32,instr); break;
+        case fBR: Copy8(ST_JUMP8,instr);    break;
+
+        case fDR:
+          {
+            sU32 target = Fetch32(instr);
+            target += (instr - start) + memory;
+            if(code != OP_CALLN) // not a near call
+              Put32(ST_JUMP32,target);
+            else
+            {
+              sInt ind = FindMTF(FuncTable,target);
+              Put8(ST_CALL_IDX,ind+1);
+              if(ind == -1)
+                Put32(ST_CALL32,target);
+            }
+          }
+          break;
+        }
+      }
+      else
+      {
+        switch(flags & fTYPE)
+        {
+        case fBI: Copy8(ST_IMM8,instr);   break;
+        case fWI: Copy16(ST_IMM16,instr); break;
+
+        case fDI:
+          if(!o16)
+            Copy32(ST_IMM32,instr);
+          else
+            Copy16(ST_IMM16,instr);
+          break;
+        }
+      }
+
+      return instr - start;
+    }
+    else // couldn't decode instruction
+    {
+      Put8(ST_OP,ESCAPE); // escape code
+      Put8(ST_OP,*start); // the unrecognized opcode
+      return 1;
+    }
+  }
+
+  // Packs all streams into out: ST_MAX sizes (Write32), then each stream's bytes.
+  // sz is the capacity on entry and the total written on success; NULL if it won't fit.
+  sU8 *Flush(sU8 *out, sU32 &sz)
+  {
+    if (sz < ST_MAX * 16)
+      return (NULL);
+
+    // the size table takes 4 bytes per stream; stop as soon as the total cannot fit
+    sU32 total = ST_MAX * 4;
+    sInt idx = 0;
+    while(idx < ST_MAX) {
+      total += Buffer[idx++].Size;
+      if (total >= sz)  return (NULL);
+    }
+
+    sU8 *cursor = out; // caller-supplied output position
+    for(idx = 0; idx < ST_MAX; idx++)
+      Write32(cursor,Buffer[idx].Size);
+
+    for(idx = 0; idx < ST_MAX; idx++)
+    {
+      memcpy(cursor,Buffer[idx].Data,Buffer[idx].Size);
+      cursor += Buffer[idx].Size;
+    }
+    assert(cursor == out + total);
+    sz = total;
+    return out;
+  }
+
+  sU8  Put8(sInt stream,sU8 v)      { Store8  (Buffer[stream].Add(1),v); return v; } // store v at Buffer[stream].Add(1) (presumably 1 reserved byte); returns v
+  sU16 Put16(sInt stream,sU16 v)    { Store16B(Buffer[stream].Add(2),v); return v; } // same for 2 bytes, written with Store16B
+  sU32 Put32(sInt stream,sU32 v)    { Store32B(Buffer[stream].Add(4),v); return v; } // same for 4 bytes, written with Store32B
+  // CopyN: fetch a value through s with FetchN (s is passed by reference) and hand it to PutN.
+  sU8  Copy8(sInt stream,sU8 *&s)   { return Put8 (stream,Fetch8(s));  }
+  sU16 Copy16(sInt stream,sU8 *&s)  { return Put16(stream,Fetch16(s)); }
+  sU32 Copy32(sInt stream,sU8 *&s)  { return Put32(stream,Fetch32(s)); }
+};
+
+/****************************************************************************/
+
+static sU8 *
+DisFilter(DisFilterCtx &ctx, sU8 *src, sU32 size, sU32 origin, sU8 *dst, sU32 &outputSize)
+{
+  // Runs size bytes of src through ctx and flushes the streams to dst (result of ctx.Flush).
+  // main loop: handle everything but the last few bytes. size is unsigned, so it must be
+  // compared with MAXINSTR first; "size - MAXINSTR" alone wraps around for short inputs.
+  sU32 pos = 0;
+  while(size > MAXINSTR && pos < size - MAXINSTR)
+  {
+    sInt bytes = ctx.ProcessInstr(src + pos,origin + pos);
+    pos += bytes;
+  }
+
+  // for the last few bytes, be very careful not to read past the end
+  // of the input instruction stream. create a check point on every
+  // instruction; if ProcessInstr would've read past the end of the input
+  // stream, we undo the last step.
+  while(pos < size)
+  {
+    // copy remaining instr bytes into buffer
+    sU8 instrBuf[MAXINSTR] = { 0 };
+    memcpy(instrBuf,src + pos,size - pos);
+
+    // save current output size for all streams
+    sInt checkpt[ST_MAX];
+    for(sInt i=0;i<ST_MAX;i++)
+      checkpt[i] = ctx.Buffer[i].Size;
+
+    // process the instruction
+    sInt bytes = ctx.ProcessInstr(instrBuf,origin + pos);
+
+    if(pos + bytes <= size) // valid instruction
+      pos += bytes;
+    else
+    {
+      // we read past the end. restore to checkpoint!
+      for(sInt i=0;i<ST_MAX;i++)
+        ctx.Buffer[i].Size = checkpt[i];
+
+      break;
+    }
+  }
+
+  // if there's still bytes left, encode them as escapes.
+  while(pos < size)
+  {
+    ctx.Put8(ST_OP,ESCAPE);
+    ctx.Put8(ST_OP,src[pos]);
+    pos++;
+  }
+
+  return ctx.Flush(dst, outputSize);
+}
+
+/****************************************************************************/
+
+static inline sU8 Copy8(sU8 *&d,sU8 *&s)      { sU8 v = Fetch8(s); Write8(d,v); return v; }
+static inline sU16 Copy16(sU8 *&d,sU8 *&s)    { sU16 v = Fetch16B(s); Write16(d,v); return v; }
+static inline sU32 Copy32(sU8 *&d,sU8 *&s)    { sU32 v = Fetch32B(s); Write32(d,v); return v; }
+
+// bounds-checking helpers for DisUnFilter: they use its locals stream[], streamEnd[], dest and
+// destEnd, and make the enclosing function return sFALSE when fewer than size bytes remain.
+#define CheckSrc(strm,size)     do { if(streamEnd[strm]-stream[strm] < (size)) return sFALSE; } while(0)
+#define CheckDst(size)          do { if(destEnd-dest < (size)) return sFALSE; } while(0)
+#define CheckSrcDst(strm,size)  do { if(streamEnd[strm]-stream[strm] < (size) || destEnd-dest < (size)) return sFALSE; } while(0)
+
+#define Copy8Chk(strm)          do { CheckSrcDst(strm,1); Copy8 (dest,stream[strm]); } while(0)
+#define Copy16Chk(strm)         do { CheckSrcDst(strm,2); Copy16(dest,stream[strm]); } while(0)
+#define Copy32Chk(strm)         do { CheckSrcDst(strm,4); Copy32(dest,stream[strm]); } while(0)
+
+static sBool // rebuilds one DisFilter-ed block into dest; sFALSE on malformed input or overflow
+DisUnFilter(sU8 *source,sU32 sourceSize,sU8 *dest,sU32 destSize,sU32 memStart)
+{
+  sU8 *stream[ST_MAX];    // read cursor of every stream
+  sU8 *streamEnd[ST_MAX]; // one past the last byte of every stream
+  sU32 funcTable[256];    // move-to-front table of call targets
+  
+  // read header (list of stream sizes)
+  if(sourceSize < ST_MAX*4)
+    return sFALSE;
+
+  sU8 *hdr = source;
+  sU8 *cur = source + ST_MAX*4;
+  for(sInt i=0;i<ST_MAX;i++)
+  {
+    stream[i] = cur;
+    cur += Fetch32(hdr);
+    streamEnd[i] = cur;
+  }
+
+  if(cur != source + sourceSize)
+    return sFALSE; // size doesn't make sense
+
+  // start decoding
+  for(sInt i=0;i<256;i++)
+    funcTable[i] = 0;
+
+  sBool nextIsFunc = sTRUE;
+  
+  sU8 *destStart = dest;
+  sU8 *destEnd = destStart + destSize;
+
+  while(stream[ST_OP]<streamEnd[ST_OP])
+  {
+    sU8 *start = dest;
+    sU32 memory = memStart + (dest - destStart); // address of the instruction being rebuilt
+
+    sInt code = Fetch8(stream[ST_OP]);
+    if(code == JUMPTAB) // jump table escape
+    {
+      CheckSrc(ST_JUMPTBL_COUNT,1);
+      sInt count = Fetch8(stream[ST_JUMPTBL_COUNT]) + 1;
+      
+      for(sInt i=0;i<count;i++)
+      {
+        sU32 target;
+
+        CheckSrc(ST_CALL_IDX,1);
+        sInt ind = Fetch8(stream[ST_CALL_IDX]); // 0 = explicit target follows, else table index + 1
+        if(ind)
+          target = MoveToFront(funcTable,ind-1,funcTable[ind-1]);
+        else
+        {
+          CheckSrc(ST_CALL32,4);
+          target = Fetch32B(stream[ST_CALL32]);
+          AddMTF(funcTable,target);
+        }
+
+        CheckDst(4);
+        Write32(dest,target);
+      }
+
+      continue;
+    }
+
+    if(nextIsFunc && code != OP_INT3)
+    {
+      AddMTF(funcTable,memory);
+      nextIsFunc = sFALSE;
+    }
+
+    if(code == ESCAPE) // escape
+      Copy8Chk(ST_OP);
+    else
+    {
+      CheckDst(1);
+      Write8(dest,code);
+
+      sInt flags = 0;
+      sBool o16 = sFALSE;
+      if(code == OP_OSIZE) // operand size prefix
+      {
+        o16 = sTRUE;
+        CheckSrcDst(ST_OP,1);
+        code = Copy8(dest,stream[ST_OP]);
+      }
+
+      if(code == OP_RETNI || code == OP_RETN || code == OP_INT3) // return/padding
+        nextIsFunc = sTRUE; // next opcode is likely to be first of a new function
+
+      if(code == OP_2BYTE) // two-byte opcode, additional opcode byte follows
+      {
+        CheckSrcDst(ST_OP2,1);
+        flags = Table2[Copy8(dest,stream[ST_OP2])];
+      }
+      else
+        flags = Table1[code];
+
+      if(flags == fERR) return sFALSE; // corrupt stream: the encoder escapes opcodes it cannot decode
+
+      if(code == OP_CALLF || code == OP_JMPF || code == OP_ENTER)
+      {
+        // far call/jump have a *48-bit* immediate address. we deal with it here by copying the segment
+        // index manually and encoding the rest as a normal 32-bit direct address.
+        // similarly, enter has a word operand and a byte operand. again, we code the word here, and
+        // deal with the byte later during the normal flow.
+        Copy16Chk(ST_IMM16);
+      }
+
+      if(flags & fMR)
+      {
+        CheckSrcDst(ST_MODRM,1);
+        sInt modrm = Copy8(dest,stream[ST_MODRM]);
+        sInt sib = 0;
+
+        if(flags == fMEXTRA)
+          flags = TableX[((modrm >> 3) & 7) | ((code & 0x01) << 3) | ((code & 0x08) << 1)];
+
+        if((modrm & 0x07) == 4 && modrm < 0xc0)
+        {
+          CheckSrcDst(ST_SIB,1);
+          sib = Copy8(dest,stream[ST_SIB]);
+        }
+
+        if((modrm & 0xc0) == 0x40) // register+byte displacement
+        {
+          sInt st = (modrm & 0x07) + ST_DISP8_R0;
+          Copy8Chk(st);
+        }
+
+        if((modrm & 0xc0) == 0x80 || (modrm & 0xc7) == 0x05 || (modrm < 0x40 && (sib & 0x07) == 0x05))
+        {
+          sInt st = (modrm & 0xc7) == 5 ? ST_ADDR32 : ST_DISP32;
+          Copy32Chk(st);
+        }
+      }
+
+      if((flags & fMODE) == fAM)
+      {
+        switch(flags & fTYPE)
+        {
+        case fAD: Copy32Chk(ST_ADDR32);   break;
+        case fDA: Copy32Chk(ST_AJUMP32);  break;
+        case fBR: Copy8Chk(ST_JUMP8);     break;
+
+        case fDR:
+          {
+            sU32 target;
+            if(code == OP_CALLN)
+            {
+              CheckSrc(ST_CALL_IDX,1);
+              sInt ind = Fetch8(stream[ST_CALL_IDX]);
+              if(ind)
+                target = MoveToFront(funcTable,ind-1,funcTable[ind-1]);
+              else
+              {
+                CheckSrc(ST_CALL32,4);
+                target = Fetch32B(stream[ST_CALL32]);
+                AddMTF(funcTable,target);
+              }
+            }
+            else
+            {
+              CheckSrc(ST_JUMP32,4);
+              target = Fetch32B(stream[ST_JUMP32]);
+            }
+
+            target -= (dest - start) + 4 + memory; // stored target is absolute; make it relative again
+            CheckDst(4);
+            Write32(dest,target);
+          }
+          break;
+        }
+      }
+      else
+      {
+        switch(flags & fTYPE)
+        {
+        case fBI: Copy8Chk(ST_IMM8);    break;
+        case fWI: Copy16Chk(ST_IMM16);  break;
+
+        case fDI:
+          if(!o16)
+            Copy32Chk(ST_IMM32);
+          else
+            Copy16Chk(ST_IMM16);
+          break;
+        }
+      }
+    }
+  }
+
+  return sTRUE;
+}
+
+/*
+ * Try to estimate if the given data block contains 32-bit x86 instructions
+ * especially of the call and jmp variety.
+ * TODO: This is a very rough estimation and can probably be improved.
+ */
+static int
+is_x86_code(uchar_t *buf, int len)
+{
+	uchar_t *cur = buf, *end = buf + len - 4;
+	int branches = 0, backward = 0;
+
+	/*
+	 * Scan for E8/E9 opcode bytes. A hit is counted when the bytes at offsets 3 and 4
+	 * from the opcode are both 0x00 or both 0xff; the 0xff hits are tallied separately.
+	 */
+	for (; cur < end; cur++) {
+		int hi;
+
+		if (*cur != 0xe8 && *cur != 0xe9)
+			continue;
+		hi = cur[3];
+		if (hi != cur[4] || (hi != 0xff && hi != 0))
+			continue;
+		branches++;
+		if (hi == 0xff)
+			backward++;
+		cur += 3;	/* plus the loop increment: a counted hit advances 4 bytes */
+	}
+	/* True when hits are >= 0.3% of len and at least 10% of them are 0xff hits. */
+	return ((double)branches/len >= 0.003 && (double)backward/branches >= 0.1);
+}
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * 32-bit x86 executable packer top-level routines. Detected x86 executable data
+ * are passed through these encoding routines. The data chunk is split into 32KB
+ * blocks and each block is separately Dispack-ed. The code tries to detect if
+ * a block contains valid x86 code by trying to estimate some instruction metrics.
+ */
+int
+dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
+{
+	/*
+	 * On entry *dstlen is the capacity of `to`; on a 0 return it is the number of
+	 * bytes written. Returns -1 if the input is shorter than one block, the output
+	 * does not fit, or the result saves fewer than DIS_MIN_REDUCE bytes.
+	 */
+	uchar_t *pos = from, *pos_to = to, *to_last = to + *dstlen;
+	uint64_t len = fromlen;
+
+	if (fromlen < DISFILTER_BLOCK)
+		return (-1);
+
+	while (len > 0) {
+		DisFilterCtx ctx(0, DISFILTER_BLOCK);
+		uchar_t *hdr = pos_to, type = 0;
+		sU32 sz = (len > DISFILTER_BLOCK) ? DISFILTER_BLOCK : (sU32)len;
+		sU16 origsize = sz;
+		uint64_t avail;
+		sU32 out;
+		sU8 *rv = NULL;
+
+		/* Make sure even the larger header fits before anything is written to it. */
+		if ((uint64_t)(to_last - pos_to) < EXTENDED_HDR)
+			return (-1);
+		if (sz < DISFILTER_BLOCK) {
+			/* Short final block: its original size is stored after the normal header. */
+			type |= ORIGSIZE;
+			pos_to += EXTENDED_HDR;
+			U16_P(hdr + NORMAL_HDR) = LE16(origsize);
+		} else {
+			pos_to += NORMAL_HDR;
+		}
+		avail = to_last - pos_to;
+
+		/*
+		 * DisFilter writes straight into the output, so its limit is the smaller of
+		 * the block size (filtering must shrink the block) and the room that is left.
+		 */
+		out = (avail < sz) ? (sU32)avail : sz;
+		if (is_x86_code(pos, sz)) {
+			ctx.ResetCtx(0, sz);
+			rv = DisFilter(ctx, pos, sz, 0, pos_to, out);
+		}
+		if (rv != pos_to || sz == out) {
+			/* Not x86 code, or filtering did not help: store the block verbatim. */
+			if (avail <= origsize)
+				return (-1);
+			type &= CLEAR_DISFILTER;
+			*hdr = type;
+			hdr++;
+			U16_P(hdr) = LE16(origsize);
+			memcpy(pos_to, pos, origsize);
+			pos_to += origsize;
+		} else {
+			sU16 csize = out;
+
+			if (avail <= out)
+				return (-1);
+			type |= DISFILTERED;
+			*hdr = type;
+			hdr++;
+			U16_P(hdr) = LE16(csize);
+			pos_to += csize;
+		}
+		pos += sz;
+		len -= sz;
+	}
+	*dstlen = pos_to - to;
+	/* Test the sizes first: fromlen - *dstlen is unsigned and must not wrap. */
+	if (*dstlen >= fromlen || (fromlen - *dstlen) < DIS_MIN_REDUCE)
+		return (-1);
+	return (0);
+}
+
+int
+dispack_decode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
+{
+	/* Reverses dispack_encode. *dstlen: capacity of `to` on entry, bytes produced on a 0 return. */
+	/* Returns -1 on truncated input, output overflow, or a block that fails to unfilter. */
+	uchar_t *pos = from, *pos_to = to, *to_last = to + *dstlen;
+	uint64_t len = fromlen;
+
+	while (len > 0) {
+		sU32 sz = DISFILTER_BLOCK, cmpsz;
+		uchar_t type;
+
+		/* Block header: one type byte and a 16-bit stored size, both read below. */
+		if (len < 3)
+			return (-1);
+		type = *pos++;
+		cmpsz = LE16(U16_P(pos));
+		pos += 2;
+		len -= 3;
+		if (type & ORIGSIZE) {
+			if (len < 2)
+				return (-1);
+			sz = LE16(U16_P(pos));
+			pos += 2;
+			len -= 2;
+		}
+		/* The payload must lie inside the input; this also keeps len from wrapping. */
+		if (cmpsz > len)
+			return (-1);
+		if (type & DISFILTERED) {
+			if (sz > (uint64_t)(to_last - pos_to) || DisUnFilter(pos, cmpsz, pos_to, sz, 0) != sTRUE)
+				return (-1);
+			pos_to += sz;
+		} else {
+			if (cmpsz > (uint64_t)(to_last - pos_to))
+				return (-1);
+			memcpy(pos_to, pos, cmpsz);
+			pos_to += cmpsz;
+		}
+		pos += cmpsz;
+		len -= cmpsz;
+	}
+	*dstlen = pos_to - to;
+	return (0);
+}
+
+#ifdef	__cplusplus
+}
+#endif
+
diff --git a/filters/dispack/dis.hpp b/filters/dispack/dis.hpp
new file mode 100644
index 0000000..fc6bb23
--- /dev/null
+++ b/filters/dispack/dis.hpp
@@ -0,0 +1,41 @@
+/*
+ * This file is a part of Pcompress, a chunked parallel multi-
+ * algorithm lossless compression and decompression program.
+ *
+ * Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ * moinakg@belenix.org, http://moinakg.wordpress.com/
+ */
+
+#ifndef __DIS_HPP__
+#define __DIS_HPP__
+
+#include <utils.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+int dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen); /* 0 on success, -1 on failure */
+int dispack_decode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen); /* *dstlen: capacity of to on entry, bytes written on success */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif
diff --git a/filters/dispack/types.hpp b/filters/dispack/types.hpp
new file mode 100644
index 0000000..f672e1f
--- /dev/null
+++ b/filters/dispack/types.hpp
@@ -0,0 +1,51 @@
+/*
+ * This file is a part of Pcompress, a chunked parallel multi-
+ * algorithm lossless compression and decompression program.
+ *
+ * Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ * moinakg@belenix.org, http://moinakg.wordpress.com/
+ */
+
+#include <stdint.h>
+#include <inttypes.h>
+#include <arpa/inet.h>
+
+#ifndef __TYPES_HPP__
+#define __TYPES_HPP__
+/* Scalar aliases for the Dispack sources: sU = unsigned, sS = signed, sF = floating point. */
+typedef unsigned char             sU8;
+typedef signed char               sS8;
+typedef unsigned short            sU16;  /* assumes short is 16 bits - TODO confirm for the target ABI */
+typedef signed short              sS16;
+typedef unsigned int              sU32;  /* assumes int is 32 bits - TODO confirm for the target ABI */
+typedef signed int                sS32;
+typedef uint64_t                  sU64;
+typedef int64_t                   sS64;
+typedef int                       sInt;
+typedef char                      sChar;
+typedef bool                      sBool;
+typedef float                     sF32;
+typedef double                    sF64;
+/* Boolean constants that go with sBool. */
+#define sTRUE                     true
+#define sFALSE                    false
+/* NOTE(review): htons/htonl convert host to network order, so these swap only on little-endian hosts. */
+#define _byteswap_ushort          htons
+#define _byteswap_ulong           htonl
+#endif
diff --git a/pcompress.c b/pcompress.c
index abf4bfe..dac13d7 100644
--- a/pcompress.c
+++ b/pcompress.c
@@ -55,6 +55,7 @@
 #include <ctype.h>
 #include <errno.h>
 #include <pc_archive.h>
+#include <filters/dispack/dis.hpp>
 
 /*
  * We use 8MB chunks by default.
@@ -215,6 +216,23 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
 	fromlen = srclen;
 	result = 0;
 
+	/*
+	 * If Dispack is enabled it has to be done first since Dispack analyses the
+	 * x86 instruction stream in the raw data.
+	 */
+	if (pctx->dispack_preprocess && PC_SUBTYPE(btype) == TYPE_EXE32) {
+		_dstlen = fromlen;
+		result = dispack_encode((uchar_t *)from, fromlen, to, &_dstlen);
+		if (result != -1) {
+			uchar_t *tmp;
+			tmp = from;
+			from = to;
+			to = tmp;
+			fromlen = _dstlen;
+			type |= PREPROC_TYPE_DISPACK;
+		}
+	}
+
 	if (pctx->lzp_preprocess) {
 		int hashsize;
 
@@ -335,7 +353,23 @@ preproc_decompress(pc_ctx_t *pctx, compress_func_ptr dec_func, void *src, uint64
 		*dstlen = result;
 	}
 
-	if (!(type & (PREPROC_COMPRESSED | PREPROC_TYPE_DELTA2 | PREPROC_TYPE_LZP)) && type > 0) {
+	/*
+	 * Dispack is applied to the raw data before any other preprocessing when
+	 * compressing, so a chunk flagged PREPROC_TYPE_DISPACK is unfiltered here.
+	 */
+	if (type & PREPROC_TYPE_DISPACK) {
+		result = dispack_decode((uchar_t *)src, srclen, (uchar_t *)dst, &_dstlen);
+		if (result != -1) {
+			memcpy(src, dst, _dstlen);
+			srclen = _dstlen;
+			*dstlen = _dstlen;
+		} else {
+			return (result);
+		}
+	}
+
+	if (!(type & (PREPROC_COMPRESSED | PREPROC_TYPE_DELTA2 | PREPROC_TYPE_LZP | PREPROC_TYPE_DISPACK))
+	    && type > 0) {
 		log_msg(LOG_ERR, 0, "Invalid preprocessing flags: %d", type);
 		return (-1);
 	}
@@ -3153,8 +3187,9 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[])
 			if (pctx->level > 9) ff.enable_packjpg = 1;
 			init_filters(&ff);
 			pctx->enable_packjpg = ff.enable_packjpg;
+			if (pctx->level > 8) pctx->dispack_preprocess = 1;
 		}
-		if (pctx->lzp_preprocess || pctx->enable_delta2_encode) {
+		if (pctx->lzp_preprocess || pctx->enable_delta2_encode || pctx->dispack_preprocess) {
 			pctx->preprocess_mode = 1;
 		}
 	} else if (pctx->do_uncompress) {
diff --git a/pcompress.h b/pcompress.h
index 45d02bf..38172c1 100644
--- a/pcompress.h
+++ b/pcompress.h
@@ -60,6 +60,7 @@ extern "C" {
 
 #define	PREPROC_TYPE_LZP		1
 #define	PREPROC_TYPE_DELTA2	2
+#define	PREPROC_TYPE_DISPACK	4
 #define	PREPROC_COMPRESSED	128
 
 /*
@@ -205,6 +206,7 @@ typedef struct pc_ctx {
 	int enable_fixed_scan;
 	int preprocess_mode;
 	int lzp_preprocess;
+	int dispack_preprocess;
 	int encrypt_type;
 	int archive_mode;
 	int verbose;
diff --git a/utils/phash/extensions.h b/utils/phash/extensions.h
index c6c765f..98679c7 100644
--- a/utils/phash/extensions.h
+++ b/utils/phash/extensions.h
@@ -88,16 +88,6 @@ struct ext_entry {
 	{"upp"	, TYPE_TEXT, 3},
 	{"mom"	, TYPE_TEXT, 3},
 	{"tmac"	, TYPE_TEXT, 4},
-	{"exe"	, TYPE_BINARY|TYPE_EXE, 3},
-	{"dll"	, TYPE_BINARY|TYPE_EXE, 3},
-	{"bin"	, TYPE_BINARY|TYPE_EXE, 3},
-	{"o"	, TYPE_BINARY|TYPE_EXE, 1},
-	{"a"	, TYPE_BINARY|TYPE_EXE, 1},
-	{"obj"	, TYPE_BINARY|TYPE_EXE, 3},
-	{"so"	, TYPE_BINARY|TYPE_EXE, 2},
-	{"com"	, TYPE_BINARY|TYPE_EXE, 3},
-	{"xpi"	, TYPE_BINARY|TYPE_EXE, 3},
-	{"off"	, TYPE_BINARY|TYPE_EXE, 3},
 	{"pdf"	, TYPE_BINARY, 3},
 	{"jpg"	, TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG, 3},
 	{"jpeg"	, TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG, 4},
@@ -152,5 +142,7 @@ struct ext_entry {
 	{"SVGZ"	, TYPE_BINARY, 4},
 	{"ODT"	, TYPE_BINARY, 3},
 	{"3DM"	, TYPE_BINARY, 3},
+	{"chm"	, TYPE_BINARY, 3},
+	{"CHM"	, TYPE_BINARY, 3},
 };
 #endif
diff --git a/utils/phash/extensions.txt b/utils/phash/extensions.txt
index ccde73f..8c43b42 100644
--- a/utils/phash/extensions.txt
+++ b/utils/phash/extensions.txt
@@ -78,16 +78,6 @@ am,TYPE_TEXT
 upp,TYPE_TEXT
 mom,TYPE_TEXT
 tmac,TYPE_TEXT
-exe,TYPE_BINARY|TYPE_EXE
-dll,TYPE_BINARY|TYPE_EXE
-bin,TYPE_BINARY|TYPE_EXE
-o,TYPE_BINARY|TYPE_EXE
-a,TYPE_BINARY|TYPE_EXE
-obj,TYPE_BINARY|TYPE_EXE
-so,TYPE_BINARY|TYPE_EXE
-com,TYPE_BINARY|TYPE_EXE
-xpi,TYPE_BINARY|TYPE_EXE
-off,TYPE_BINARY|TYPE_EXE
 pdf,TYPE_BINARY
 jpg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG
 jpeg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG
@@ -142,3 +132,5 @@ swf,TYPE_BINARY
 SVGZ,TYPE_BINARY,TYPE_COMPRESSED|TYPE_COMPRESSED_GZ
 ODT,TYPE_BINARY,TYPE_COMPRESSED|TYPE_COMPRESSED_ZIP
 3DM,TYPE_BINARY
+chm,TYPE_BINARY
+CHM,TYPE_BINARY
diff --git a/utils/phash/phash.c b/utils/phash/phash.c
index 3196f2f..2c13753 100644
--- a/utils/phash/phash.c
+++ b/utils/phash/phash.c
@@ -12,14 +12,14 @@
 
 /* small adjustments to _a_ to make values distinct */
 ub1 tab[] = {
-125,0,0,220,235,125,82,0,113,0,0,7,0,0,82,0,
-0,0,7,124,0,0,82,0,0,125,0,7,0,220,125,120,
-0,0,0,0,22,0,0,113,0,113,113,0,0,125,85,0,
-113,0,11,113,125,7,0,0,0,40,0,113,85,0,0,125,
-0,113,0,0,113,0,125,183,40,27,7,15,58,183,113,0,
-124,0,0,22,125,220,0,40,0,87,87,125,113,0,183,125,
-0,125,87,7,0,85,0,0,59,229,85,7,135,116,0,146,
-0,0,82,0,0,0,200,0,56,125,0,0,61,202,0,0,
+125,0,0,87,7,113,82,120,113,0,0,113,0,0,113,125,
+0,0,7,113,0,113,0,0,0,7,0,131,0,85,0,22,
+0,113,0,0,85,0,0,113,0,113,125,113,0,7,22,0,
+82,0,0,113,125,125,0,0,0,0,0,113,22,0,0,125,
+0,87,0,0,113,0,125,183,82,0,124,88,40,125,0,0,
+124,0,168,125,0,125,0,40,0,82,125,113,113,125,116,0,
+0,0,113,85,0,88,0,0,42,27,0,0,0,40,183,61,
+0,0,0,0,0,111,17,0,87,125,0,0,166,91,0,0,
 };
 
 /* The hash function */
diff --git a/utils/phash/phash.h b/utils/phash/phash.h
index aa7445c..74bd726 100644
--- a/utils/phash/phash.h
+++ b/utils/phash/phash.h
@@ -8,7 +8,7 @@
 
 extern ub1 tab[];
 #define PHASHLEN 0x80  /* length of hash mapping table */
-#define PHASHNKEYS 141  /* How many keys were hashed */
+#define PHASHNKEYS 133  /* How many keys were hashed */
 #define PHASHRANGE 256  /* Range any input might map to */
 #define PHASHSALT 0x9e3779b9 /* internal, initialize normal hash */
 
diff --git a/utils/utils.h b/utils/utils.h
index 973b97d..7bb0d6f 100644
--- a/utils/utils.h
+++ b/utils/utils.h
@@ -245,8 +245,8 @@ typedef enum {
 	/*
 	 * Sub-types.
 	 */
-#define	NUM_SUB_TYPES	20
-	TYPE_EXE = 8,
+#define	NUM_SUB_TYPES	24
+	TYPE_EXE32 = 8,
 	TYPE_JPEG = 16,
 	TYPE_MARKUP = 24,
 	TYPE_COMPRESSED_GZ = 32,
@@ -268,7 +268,8 @@ typedef enum {
 	TYPE_PACKJPG = 160,
 	TYPE_DNA_SEQ = 168,
 	TYPE_MJPEG = 176,
-	TYPE_AUDIO_COMPRESSED = 184
+	TYPE_AUDIO_COMPRESSED = 184,
+	TYPE_EXE64 = 192
 } data_type_t;
 
 /*