/* * This file is a part of Pcompress, a chunked parallel multi- * algorithm lossless compression and decompression program. * * Copyright (C) 2012 Moinak Ghosh. All rights reserved. * Use is subject to license terms. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 3 of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * moinakg@belenix.org, http://moinakg.wordpress.com/ */ /* * These routines perform a kind of Adaptive Delta Encoding. * Initially the buffer is scanned to identify spans of values that * are monotonically increasing in arithmetic progression. These * values are not single bytes but consists of a stride of bytes * packed into an integer representation. Multiple stride lengths * (3, 5, 7, 8) are tried to find the one that gives the maximum * reduction. A span length threshold in bytes is used. Byte spans * less than this threshold are ignored. * Bytes are packed into integers in little-endian format. * * After an optimal stride length has been identified the encoder * performs a delta run length encoding on the spans. Two types of * objects are output by the encoder: * 1) A literal run of unmodified bytes. Header: * 64-bit encoded value of the following format * Most Significant Byte = 0 * Remaining Bytes = Length of literal span in bytes * 2) An encoded run length of a series in arithmetic progression. * Header: 64bit encoded value * 64bit starting value of series * 64bit delta value * 64-bit encoded value is of the following format * Most Significant Byte = Stride length * Remaining Bytes = Number of bytes in the span */ #include #include #include #include #include "delta2.h" // Size of original data. 64 bits. #define MAIN_HDR (sizeof (uint64_t)) // Literal text header block: // 64bit encoded value. #define LIT_HDR (sizeof (uint64_t)) // Delta encoded header block: // 64bit encoded value // 64bit initial value // 64bit delta value #define DELTA_HDR ((sizeof (uint64_t)) * 3) // Minimum span length #define MIN_THRESH (50) // Maximum data length (16TB) #define MAX_THRESH (0x100000000000ULL) #define MSB_SETZERO_MASK (0xffffffffffffffULL) #define MSB_SHIFT (56) /* * Delta2 algorithm processes data in blocks. The 4K size below is somewhat * adhoc but a couple of considerations were looked at: * 1) Balance between number of headers and delta runs. Too small chunks * will increase header counts for long delta runs spanning chunks. * Too large chunks will reduce effectiveness of locating more data * tables. * 2) Chunk size should ideally be small enough to fit into L1 cache. */ #define DELTA2_CHUNK (4096) /* * Byteswap macros. We optimize for little-endian, so values are stored * and interpreted in little-endian order. */ #if BYTE_ORDER == BIG_ENDIAN #define HTONLL htonll #define NTOHLL ntohll #else #define HTONLL #define NTOHLL #endif static int delta2_encode_real(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen, int rle_thresh, int last_encode, int *hdr_ovr); int delta2_encode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen, int rle_thresh) { if (srclen > MAX_THRESH) { DEBUG_STAT_EN(fprintf(stderr, "DELTA2: srclen: %" PRIu64 " is too big.\n", srclen)); return (-1); } if (*dstlen < DELTA2_CHUNK) { int hdr_ovr; int rv; hdr_ovr = 0; rv = delta2_encode_real(src, srclen, dst, dstlen, rle_thresh, 1, &hdr_ovr); DEBUG_STAT_EN(fprintf(stderr, "DELTA2: srclen: %" PRIu64 ", dstlen: %" PRIu64 "\n", srclen, *dstlen)); DEBUG_STAT_EN(fprintf(stderr, "DELTA2: header overhead: %d\n", hdr_ovr)); } else { uchar_t *srcpos, *dstpos, *lastdst, *lastsrc, *dstend; uint64_t slen, sz, dsz, pending; int rem, lenc, hdr_ovr; DEBUG_STAT_EN(double strt, en); srcpos = src; dstpos = dst; dstend = dst + *dstlen; slen = srclen; pending = 0; *((uint64_t *)dstpos) = HTONLL(srclen); dstpos += MAIN_HDR; lastdst = dstpos; lastsrc = srcpos; hdr_ovr = 0; DEBUG_STAT_EN(strt = get_wtime_millis()); while (slen > 0) { if (slen > DELTA2_CHUNK) { sz = DELTA2_CHUNK; lenc = 0; } else { sz = slen; lenc = 1; } dsz = sz; rem = delta2_encode_real(srcpos, sz, dstpos, &dsz, rle_thresh, lenc, &hdr_ovr); if (rem == -1) { if (pending == 0) { lastdst = dstpos; lastsrc = srcpos; dstpos += LIT_HDR; } pending += dsz; srcpos += dsz; dstpos += dsz; slen -= dsz; } else { if (pending) { pending &= MSB_SETZERO_MASK; *((uint64_t *)lastdst) = HTONLL(pending); lastdst += sizeof (uint64_t); memcpy(lastdst, lastsrc, pending); pending = 0; } srcpos += (sz - rem); slen -= (sz - rem); dstpos += dsz; if (dstpos > dstend) { DEBUG_STAT_EN(fprintf(stderr, "No Delta\n")); return (-1); } } } if (pending) { pending &= MSB_SETZERO_MASK; *((uint64_t *)lastdst) = HTONLL(pending); lastdst += sizeof (uint64_t); if (lastdst + pending > dstend) { DEBUG_STAT_EN(fprintf(stderr, "No Delta\n")); return (-1); } memcpy(lastdst, lastsrc, pending); } *dstlen = dstpos - dst; DEBUG_STAT_EN(en = get_wtime_millis()); DEBUG_STAT_EN(fprintf(stderr, "DELTA2: srclen: %" PRIu64 ", dstlen: %" PRIu64 "\n", srclen, *dstlen)); DEBUG_STAT_EN(fprintf(stderr, "DELTA2: header overhead: %d\n", hdr_ovr)); DEBUG_STAT_EN(fprintf(stderr, "DELTA2: Processed at %.3f MB/s\n", get_mb_s(srclen, strt, en))); } return (0); } static int delta2_encode_real(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen, int rle_thresh, int last_encode, int *hdr_ovr) { uint64_t snum, gtot1, gtot2, tot; uint64_t cnt, val, sval; uint64_t vl1, vl2, vld1, vld2; uchar_t *pos, *pos2, stride, st1; uchar_t strides[4] = {3, 5, 7, 8}; int st, sz; if (rle_thresh < MIN_THRESH) return (-1); gtot1 = ULL_MAX; stride = 0; val = 0; tot = 0; sz = sizeof (strides) / sizeof (strides[0]); /* * Estimate which stride length gives the max reduction given rle_thresh. */ for (st = 0; st < sz; st++) { int gt; snum = 0; gtot2 = LIT_HDR; vl1 = 0; vld1 = 0; tot = 0; pos = src; st1 = strides[st]; sval = st1; sval = ((sval << 3) - 1); sval = (1ULL << sval); sval |= (sval - 1); for (cnt = 0; cnt < (srclen - sizeof (cnt)); cnt += st1) { vl2 = *((uint64_t *)pos); vl2 = HTONLL(vl2); vl2 &= sval; vld2 = vl2 - vl1; if (vld1 != vld2) { if (snum > rle_thresh) { gt = (tot > 0); gtot2 += (LIT_HDR * gt); tot = 0; gtot2 += DELTA_HDR; } else { gtot2 += snum; tot += snum; } snum = 0; } snum += st1; vld1 = vld2; vl1 = vl2; pos += st1; } if (snum > rle_thresh) { gtot2 += DELTA_HDR; /* * If this ended into another table reset next scan * point to beginning of the table. */ val = cnt - snum; } else { gtot2 += snum; /* * If this ended into another table reset next scan * point to beginning of the table. */ if (snum >= (MIN_THRESH>>1)) val = cnt - snum; } if (gtot2 < gtot1) { gtot1 = gtot2; stride = st1; tot = val; } } if ( !(gtot1 < srclen && srclen - gtot1 > (DELTA_HDR + LIT_HDR + MAIN_HDR) && gtot1 < *dstlen) ) { if (srclen >= DELTA2_CHUNK) { if (tot > 0) *dstlen = tot; } return (-1); } /* * Now perform encoding using the stride length. */ snum = 0; vl1 = 0; vld1 = 0; gtot1 = 0; pos = src; pos2 = dst; vl2 = *((uint64_t *)pos); vl2 = HTONLL(vl2); val = stride; val = ((val << 3) - 1); val = (1ULL << val); val |= (val - 1); vl2 &= val; sval = vl2; for (cnt = 0; cnt < (srclen - sizeof (cnt)); cnt += stride) { vl2 = *((uint64_t *)pos); vl2 = HTONLL(vl2); vl2 &= val; vld2 = vl2 - vl1; if (vld1 != vld2) { if (snum > rle_thresh) { if (gtot1 > 0) { gtot1 &= MSB_SETZERO_MASK; *((uint64_t *)pos2) = HTONLL(gtot1); pos2 += sizeof (uint64_t); DEBUG_STAT_EN(*hdr_ovr += LIT_HDR); memcpy(pos2, pos - (gtot1+snum), gtot1); pos2 += gtot1; gtot1 = 0; } /* * RLE Encode delta series. */ gtot2 = stride; gtot2 <<= MSB_SHIFT; gtot2 |= (snum & MSB_SETZERO_MASK); *((uint64_t *)pos2) = HTONLL(gtot2); pos2 += sizeof (uint64_t); *((uint64_t *)pos2) = HTONLL(sval); pos2 += sizeof (uint64_t); *((uint64_t *)pos2) = HTONLL(vld1); pos2 += sizeof (uint64_t); DEBUG_STAT_EN(*hdr_ovr += DELTA_HDR); } else { gtot1 += snum; } snum = 0; sval = vl2; } snum += stride; vld1 = vld2; vl1 = vl2; pos += stride; } if (snum > 0) { if (snum > rle_thresh) { if (gtot1 > 0) { gtot1 &= MSB_SETZERO_MASK; *((uint64_t *)pos2) = HTONLL(gtot1); pos2 += sizeof (uint64_t); DEBUG_STAT_EN(*hdr_ovr += LIT_HDR); memcpy(pos2, pos - (gtot1+snum), gtot1); pos2 += gtot1; gtot1 = 0; } gtot2 = stride; gtot2 <<= MSB_SHIFT; gtot2 |= (snum & MSB_SETZERO_MASK); *((uint64_t *)pos2) = HTONLL(gtot2); pos2 += sizeof (uint64_t); *((uint64_t *)pos2) = HTONLL(sval); pos2 += sizeof (uint64_t); *((uint64_t *)pos2) = HTONLL(vld1); pos2 += sizeof (uint64_t); DEBUG_STAT_EN(*hdr_ovr += DELTA_HDR); } else if (last_encode) { gtot1 += snum; gtot1 &= MSB_SETZERO_MASK; *((uint64_t *)pos2) = HTONLL(gtot1); pos2 += sizeof (uint64_t); DEBUG_STAT_EN(*hdr_ovr += LIT_HDR); memcpy(pos2, pos - gtot1, gtot1); pos2 += gtot1; } else { gtot1 += snum; } } if (last_encode) { val = srclen - (pos - src); if (val > 0) { /* * Encode left over bytes, if any, at the end into a * literal run. */ val &= MSB_SETZERO_MASK; *((uint64_t *)pos2) = HTONLL(val); pos2 += sizeof (uint64_t); for (cnt = 0; cnt < val; cnt++) { *pos2 = *pos; pos2++; pos++; } DEBUG_STAT_EN(*hdr_ovr += LIT_HDR); } val = 0; } else { val = gtot1 + (srclen - (pos - src)); } *dstlen = pos2 - dst; return (val); } int delta2_decode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen) { uchar_t *pos, *pos1, *last; uint64_t olen, val, sval, delta, rcnt, cnt, out, vl; uchar_t stride, flags; DEBUG_STAT_EN(double strt, en); pos = src; pos1 = dst; DEBUG_STAT_EN(strt = get_wtime_millis()); last = pos + srclen; olen = NTOHLL(*((uint64_t *)pos)); if (*dstlen < olen) { fprintf(stderr, "DELTA2 Decode: Destination buffer too small.\n"); return (-1); } out = 0; pos += MAIN_HDR; while (pos < last) { val = *((uint64_t *)pos); val = NTOHLL(val); flags = (val >> MSB_SHIFT) & 0xff; if (flags == 0) { /* * Copy over literal run of bytes. */ rcnt = val & MSB_SETZERO_MASK; pos += sizeof (rcnt); if (out + rcnt > *dstlen) { fprintf(stderr, "DELTA2 Decode: Destination buffer overflow. Corrupt data.\n"); return (-1); } memcpy(pos1, pos, rcnt); pos += rcnt; pos1 += rcnt; out += rcnt; } else { stride = flags; rcnt = val & MSB_SETZERO_MASK; pos += sizeof (rcnt); sval = NTOHLL(*((uint64_t *)pos)); pos += sizeof (sval); delta = NTOHLL(*((uint64_t *)pos)); pos += sizeof (delta); if (out + rcnt > *dstlen) { fprintf(stderr, "DELTA2 Decode: Destination buffer overflow. Corrupt data.\n"); return (-1); } vl = stride; vl = (vl << 3) - 1; vl = (1ULL << vl); vl |= (vl - 1); /* * Recover original bytes from the arithmetic series using * length, starting value and delta. */ for (cnt = 0; cnt < rcnt/stride; cnt++) { val = (sval & vl); *((uint64_t *)pos1) = NTOHLL(val); out += stride; sval += delta; pos1 += stride; } } } *dstlen = out; DEBUG_STAT_EN(en = get_wtime_millis()); DEBUG_STAT_EN(fprintf(stderr, "DELTA2: Decoded at %.3f MB/s\n", get_mb_s(out, strt, en))); return (0); }