From 51249c858d442fc77f884a9da210e4961abbce53 Mon Sep 17 00:00:00 2001
From: Moinak Ghosh
Date: Mon, 3 Dec 2012 23:45:41 +0530
Subject: [PATCH] Work in progress Adaptive Delta Encoding.

---
 delta2/delta2.c | 387 ++++++++++++++++++++++++++++++++++++++++++++++++
 delta2/delta2.h |  43 ++++++
 main.c          |  17 +++
 3 files changed, 447 insertions(+)
 create mode 100644 delta2/delta2.c
 create mode 100644 delta2/delta2.h

diff --git a/delta2/delta2.c b/delta2/delta2.c
new file mode 100644
index 0000000..a8060ca
--- /dev/null
+++ b/delta2/delta2.c
@@ -0,0 +1,387 @@
+/*
+ * This file is a part of Pcompress, a chunked parallel multi-
+ * algorithm lossless compression and decompression program.
+ *
+ * Copyright (C) 2012 Moinak Ghosh. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * moinakg@belenix.org, http://moinakg.wordpress.com/
+ */
+
+/*
+ * These routines perform a kind of Adaptive Delta Encoding.
+ * Initially the buffer is scanned to identify spans of values that
+ * are monotonically increasing in arithmetic progression. These
+ * values are not single bytes but consist of a stride of bytes
+ * packed into an integer representation. Multiple stride lengths
+ * (3, 5, 7, 8) are tried to find the one that gives the maximum
+ * reduction. A span length threshold in bytes is used. Byte spans
+ * less than this threshold are ignored.
+ * Bytes are packed into integers in big-endian format.
+ *
+ * After an optimal stride length has been identified the encoder
+ * performs a delta run length encoding on the spans. Two types of
+ * objects are output by the encoder:
+ * 1) A literal run of unmodified bytes. Header: 1 zero byte followed
+ *    by a 64bit length in bytes.
+ * 2) An encoded run length of a series in arithmetic progression.
+ *    Header: 1 byte stride length
+ *            64bit length of span in bytes
+ *            64bit starting value of series
+ *            64bit delta value
+ *
+ * The output starts with the 64bit length of the original data. All
+ * 64bit header fields are stored in big-endian byte order.
+ */
+#include <stdio.h>
+#include <string.h>
+/*
+ * NOTE(review): the names of the system headers were lost from this patch
+ * and are reconstructed here. uchar_t and DEBUG_STAT_EN are assumed to be
+ * provided by the project's utils.h -- confirm.
+ */
+#include <utils.h>
+#include "delta2.h"
+
+// Size of original data. 64 bits.
+#define MAIN_HDR (sizeof (uint64_t))
+
+// Literal text header block:
+//	1-byte flag
+//	64bit length of run in bytes.
+#define LIT_HDR (1 + sizeof (uint64_t))
+
+// Delta encoded header block:
+//	1-byte flag indicating stride length
+//	64bit length of span in bytes
+//	64bit initial value
+//	64bit delta value
+#define DELTA_HDR (1 + (sizeof (uint64_t)) * 3)
+
+/*
+ * Pack the first nbytes (at most 8) bytes at p into an integer, most
+ * significant byte first. Reading bytewise keeps unaligned addresses safe
+ * and makes the result independent of the host byte order.
+ */
+static uint64_t
+get_val(const uchar_t *p, size_t nbytes)
+{
+	uint64_t v = 0;
+	size_t i;
+
+	for (i = 0; i < nbytes; i++) {
+		v = (v << 8) | p[i];
+	}
+	return (v);
+}
+
+/*
+ * Store the low nbytes (at most 8) bytes of v at p, most significant byte
+ * first. This is the inverse of get_val().
+ */
+static void
+put_val(uchar_t *p, uint64_t v, size_t nbytes)
+{
+	while (nbytes > 0) {
+		nbytes--;
+		p[nbytes] = (uchar_t)(v & 0xff);
+		v >>= 8;
+	}
+}
+
+/*
+ * Fill in the header of a literal run whose len data bytes already sit
+ * right behind the header slot at hdr. Returns the position after the run.
+ */
+static uchar_t *
+end_literal(uchar_t *hdr, uint64_t len)
+{
+	*hdr = 0;
+	put_val(hdr + 1, len, sizeof (len));
+	return (hdr + LIT_HDR + len);
+}
+
+/*
+ * Write a delta run header at hdr: stride flag, span length in bytes,
+ * starting value and delta. Returns the position after the header.
+ */
+static uchar_t *
+put_delta(uchar_t *hdr, uchar_t stride, uint64_t len, uint64_t sval, uint64_t delta)
+{
+	*hdr = stride;
+	hdr++;
+	put_val(hdr, len, sizeof (len));
+	hdr += sizeof (len);
+	put_val(hdr, sval, sizeof (sval));
+	hdr += sizeof (sval);
+	put_val(hdr, delta, sizeof (delta));
+	hdr += sizeof (delta);
+	return (hdr);
+}
+
+/*
+ * Delta-encode srclen bytes at src into dst.
+ *
+ * Every stride length is first scored by estimating the encoded size; the
+ * data is only encoded if the best estimate saves more than the size of
+ * one set of headers. Spans longer than rle_thresh bytes are emitted as
+ * delta runs, everything else as literal runs. A negative rle_thresh
+ * converts to a huge unsigned value and so disables delta runs.
+ *
+ * The capacity of dst is not checked; the value passed in through *dstlen
+ * is ignored. On success the encoded length is stored in *dstlen and 0 is
+ * returned. -1 is returned if the data is too short or would not shrink.
+ */
+int
+delta2_encode(uchar_t *src, size_t srclen, uchar_t *dst, size_t *dstlen, int rle_thresh)
+{
+	uint64_t snum, gtot1, gtot2, tot, thresh;
+	uint64_t cnt, rem, sval;
+	uint64_t vl1, vl2, vld1, vld2;
+	uchar_t *pos, *pos1, *pos2, stride, st1;
+	uchar_t strides[4] = {3, 5, 7, 8};
+	int st, sz;
+
+	/*
+	 * The scan loops below run up to srclen - 8. Bail out early for
+	 * shorter inputs, where that unsigned subtraction would wrap around.
+	 */
+	if (srclen < sizeof (uint64_t)) {
+		return (-1);
+	}
+
+	gtot1 = ULL_MAX;
+	stride = 0;
+	sz = sizeof (strides) / sizeof (strides[0]);
+	thresh = (uint64_t)rle_thresh;
+
+	/*
+	 * Estimate which stride length gives the max reduction given rle_thresh.
+	 */
+	for (st = 0; st < sz; st++) {
+		snum = 0;
+		gtot2 = MAIN_HDR + LIT_HDR;
+		vl1 = 0;
+		vld1 = 0;
+		tot = 0;
+		pos = src;
+		st1 = strides[st];
+		for (cnt = 0; cnt < (srclen - sizeof (cnt)); cnt += st1) {
+			vl2 = get_val(pos, st1);
+			vld2 = vl2 - vl1;
+			if (vld1 != vld2) {
+				if (snum > thresh) {
+					if (tot > 0) {
+						gtot2 += LIT_HDR;
+						tot = 0;
+					}
+					gtot2 += DELTA_HDR;
+				} else {
+					gtot2 += snum;
+					tot += snum;
+				}
+				snum = 0;
+			}
+			snum += st1;
+			vld1 = vld2;
+			vl1 = vl2;
+			pos += st1;
+		}
+		if (snum > 1) {
+			if (snum > thresh) {
+				gtot2 += DELTA_HDR;
+			} else {
+				gtot2 += snum;
+			}
+		}
+		if (gtot2 < gtot1) {
+			gtot1 = gtot2;
+			stride = st1;
+		}
+	}
+
+	if (!(gtot1 < srclen && srclen - gtot1 > (DELTA_HDR + LIT_HDR + MAIN_HDR))) {
+		DEBUG_STAT_EN(fprintf(stderr, "No Delta\n"));
+		return (-1);
+	}
+	DEBUG_STAT_EN(fprintf(stderr, "Found Delta: %llu (srclen: %llu), stride: %d\n",
+	    (unsigned long long)gtot1, (unsigned long long)srclen, stride));
+
+	/*
+	 * Now perform encoding using the stride length. pos2 is the slot of
+	 * the pending literal header, pos1 is where literal bytes are copied
+	 * and gtot1 counts the bytes of the pending literal run.
+	 */
+	snum = 0;
+	vl1 = 0;
+	vld1 = 0;
+	gtot1 = 0;
+	pos = src;
+	pos1 = dst;
+	put_val(pos1, srclen, MAIN_HDR);
+	pos1 += MAIN_HDR;
+	pos2 = pos1;
+	pos1 += LIT_HDR;
+
+	sval = get_val(pos, stride);
+
+	for (cnt = 0; cnt < (srclen - sizeof (cnt)); cnt += stride) {
+		vl2 = get_val(pos, stride);
+		vld2 = vl2 - vl1;
+		if (vld1 != vld2) {
+			if (snum > thresh) {
+				if (gtot1 > 0) {
+					/*
+					 * Encode previous literal run, if any.
+					 */
+					pos2 = end_literal(pos2, gtot1);
+					gtot1 = 0;
+				}
+				/*
+				 * RLE Encode delta series.
+				 */
+				pos2 = put_delta(pos2, stride, snum, sval, vld1);
+				pos1 = pos2 + LIT_HDR;
+			} else {
+				gtot1 += snum;
+			}
+			snum = 0;
+			sval = vl2;
+		}
+		/*
+		 * Copy the element as literal data. If it turns out to be
+		 * part of a delta span, the span header later replaces it.
+		 */
+		memcpy(pos1, pos, stride);
+		pos1 += stride;
+		snum += stride;
+		vld1 = vld2;
+		vl1 = vl2;
+		pos += stride;
+	}
+
+	if (snum > 0) {
+		if (snum > thresh) {
+			if (gtot1 > 0) {
+				pos2 = end_literal(pos2, gtot1);
+				gtot1 = 0;
+			}
+			pos2 = put_delta(pos2, stride, snum, sval, vld1);
+		} else {
+			gtot1 += snum;
+			pos2 = end_literal(pos2, gtot1);
+		}
+	}
+
+	rem = srclen - (pos - src);
+	if (rem > 0) {
+		/*
+		 * Encode left over bytes, if any, at the end into a
+		 * literal run.
+		 */
+		memcpy(pos2 + LIT_HDR, pos, (size_t)rem);
+		pos2 = end_literal(pos2, rem);
+	}
+	*dstlen = pos2 - dst;
+	return (0);
+}
+
+/*
+ * Reverse delta2_encode(): expand the srclen encoded bytes at src into dst.
+ *
+ * On entry *dstlen is the capacity of dst, which must hold the original
+ * length recorded in the stream. On success *dstlen is set to the number
+ * of bytes produced and 0 is returned. The input may be corrupt, so every
+ * header is validated before use; -1 is returned if dst is too small or
+ * the stream is truncated or inconsistent.
+ */
+int
+delta2_decode(uchar_t *src, size_t srclen, uchar_t *dst, size_t *dstlen)
+{
+	uchar_t *pos, *pos1, *last;
+	uint64_t olen, sval, delta, rcnt, cnt, out;
+	uchar_t stride;
+
+	if (srclen < MAIN_HDR) {
+		return (-1);
+	}
+
+	pos = src;
+	pos1 = dst;
+
+	last = pos + srclen;
+	olen = get_val(pos, MAIN_HDR);
+	if (*dstlen < olen) {
+		return (-1);
+	}
+
+	out = 0;
+	pos += MAIN_HDR;
+
+	while (pos < last) {
+		if (*pos == 0) {
+			/*
+			 * Copy over literal run of bytes.
+			 */
+			if ((size_t)(last - pos) < LIT_HDR) {
+				return (-1);
+			}
+			pos++;
+			rcnt = get_val(pos, sizeof (rcnt));
+			pos += sizeof (rcnt);
+			if (rcnt > (uint64_t)(last - pos) || rcnt > olen - out) {
+				return (-1);
+			}
+			memcpy(pos1, pos, (size_t)rcnt);
+			pos += rcnt;
+			pos1 += rcnt;
+			out += rcnt;
+		} else {
+			stride = *pos;
+			if (stride > sizeof (uint64_t) ||
+			    (size_t)(last - pos) < DELTA_HDR) {
+				return (-1);
+			}
+			pos++;
+			rcnt = get_val(pos, sizeof (rcnt));
+			pos += sizeof (rcnt);
+			sval = get_val(pos, sizeof (sval));
+			pos += sizeof (sval);
+			delta = get_val(pos, sizeof (delta));
+			pos += sizeof (delta);
+			if (rcnt > olen - out) {
+				return (-1);
+			}
+
+			/*
+			 * Recover original bytes from the arithmetic series using
+			 * length, starting value and delta.
+			 */
+			for (cnt = 0; cnt < rcnt/stride; cnt++) {
+				put_val(pos1, sval, stride);
+				out += stride;
+				sval += delta;
+				pos1 += stride;
+			}
+		}
+	}
+
+	/*
+	 * A valid stream reproduces exactly the recorded original length.
+	 */
+	if (out != olen) {
+		return (-1);
+	}
+	*dstlen = out;
+	return (0);
+}
diff --git a/delta2/delta2.h b/delta2/delta2.h
new file mode 100644
index 0000000..42225df
--- /dev/null
+++ b/delta2/delta2.h
@@ -0,0 +1,43 @@
+/*
+ * This file is a part of Pcompress, a chunked parallel multi-
+ * algorithm lossless compression and decompression program.
+ *
+ * Copyright (C) 2012 Moinak Ghosh. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * moinakg@belenix.org, http://moinakg.wordpress.com/
+ */
+
+#ifndef _DELTA2_H
+#define _DELTA2_H
+
+/* NOTE(review): header names were lost from this patch; reconstructed -- confirm. */
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <utils.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int delta2_encode(uchar_t *src, size_t srclen, uchar_t *dst, size_t *dstlen, int rle_thresh);
+int delta2_decode(uchar_t *src, size_t srclen, uchar_t *dst, size_t *dstlen);
+
+#define ULL_MAX (18446744073709551615ULL)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/main.c b/main.c
index ea3cceb..7da3a0e 100644
--- a/main.c
+++ b/main.c
@@ -210,6 +210,13 @@ preproc_compress(compress_func_ptr cmp_func, void *src, size_t srclen, void *dst
 		return (-1);
 	}
 
+	_dstlen = srclen;
+	result = delta2_encode(src, srclen, dst, &_dstlen, 150);
+	if (result != -1) {
+		memcpy(src, dst, _dstlen);
+		srclen = _dstlen;
+	}
+
 	*dest = type;
 	*((int64_t *)(dest + 1)) = htonll(srclen);
 	_dstlen = srclen;
@@ -220,6 +227,7 @@ preproc_compress(compress_func_ptr cmp_func, void *src, size_t srclen, void *dst
 	} else {
 		result = -1;
 	}
+	result = 0;
 	return (result);
 }
 
@@ -229,6 +237,7 @@ preproc_decompress(compress_func_ptr dec_func, void *src, size_t srclen, void *d
 {
 	uchar_t *sorc = (uchar_t *)src, type;
 	ssize_t result;
+	uint64_t _dstlen = *dstlen;
 
 	type = *sorc;
 	sorc++;
@@ -243,6 +252,14 @@ preproc_decompress(compress_func_ptr dec_func, void *src, size_t srclen, void *d
 		srclen = *dstlen;
 	}
 
+	result = delta2_decode(src, srclen, dst, &_dstlen);
+	if (result != -1) {
+		memcpy(src, dst, _dstlen);
+		srclen = _dstlen;
+	} else {
+		return (result);
+	}
+
 	if (type & PREPROC_TYPE_LZP) {
 		int hashsize;
 		hashsize = lzp_hash_size(level);