pcompress/delta2/delta2.c
Moinak Ghosh 36d95276ee Further improvements to Delta2 performance.
Fix the byteswap macros.
Start adding assertions.
2012-12-28 22:12:38 +05:30

480 lines
12 KiB
C

/*
* This file is a part of Pcompress, a chunked parallel multi-
* algorithm lossless compression and decompression program.
*
* Copyright (C) 2012 Moinak Ghosh. All rights reserved.
* Use is subject to license terms.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* moinakg@belenix.org, http://moinakg.wordpress.com/
*/
/*
* These routines perform a kind of Adaptive Delta Encoding.
* Initially the buffer is scanned to identify spans of values that
* are monotonically increasing in arithmetic progression. These
* values are not single bytes but consist of a stride of bytes
* packed into an integer representation. Multiple stride lengths
* (3, 5, 7, 8) are tried to find the one that gives the maximum
* reduction. A span length threshold in bytes is used. Byte spans
* less than this threshold are ignored.
* Bytes are packed into integers in little-endian format.
*
* After an optimal stride length has been identified the encoder
* performs a delta run length encoding on the spans. Two types of
* objects are output by the encoder:
* 1) A literal run of unmodified bytes. Header:
* 64-bit encoded value of the following format
* Most Significant Byte = 0
* Remaining Bytes = Length of literal span in bytes
* 2) An encoded run length of a series in arithmetic progression.
* Header: 64bit encoded value
* 64bit starting value of series
* 64bit delta value
* 64-bit encoded value is of the following format
* Most Significant Byte = Stride length
* Remaining Bytes = Number of bytes in the span
*
* We optimize for little-endian, so values are stored and interpreted
* in little-endian order.
*/
#include <stdio.h>
#include <string.h>
#include <utils.h>
#include <transpose.h>
#include "delta2.h"
// Main header written once at the start of the encoded output:
// size of the original data, as a 64-bit value.
#define MAIN_HDR (sizeof (uint64_t))
// Literal text header block:
// one 64-bit encoded value (most significant byte zero, remaining
// bytes hold the length of the literal run).
#define LIT_HDR (sizeof (uint64_t))
// Delta encoded header block:
// 64-bit encoded value (most significant byte = stride, rest = span bytes)
// 64-bit initial value
// 64-bit delta value
#define DELTA_HDR ((sizeof (uint64_t)) * 3)
// Minimum span length in bytes; delta2_encode() rejects a smaller rle_thresh.
#define MIN_THRESH (50)
// Maximum data length (16TB); delta2_encode() rejects larger inputs.
#define MAX_THRESH (0x100000000000ULL)
// Mask that keeps the low 56 bits of a 64-bit header word (the length field).
#define MSB_SETZERO_MASK (0xffffffffffffffULL)
// Shift that moves a value into / out of the most significant byte of a
// 64-bit header word (the stride field).
#define MSB_SHIFT (56)
/*
* The Delta2 algorithm processes data in blocks. The 4K size below is
* somewhat ad hoc, but a couple of considerations were looked at:
* 1) Balance between number of headers and delta runs. Chunks that are too
* small will increase header counts for long delta runs spanning chunks.
* Chunks that are too large will reduce the effectiveness of locating
* more data tables.
* 2) Chunk size should ideally be small enough to fit into L1 cache.
*/
#define DELTA2_CHUNK (4096)
/*
* Stride values to be checked. As of this implementation only strides of
* up to 8 bytes (uint64_t) are supported.
*/
#define NSTRIDES 4
static uchar_t strides[NSTRIDES] = {3, 5, 7, 8};
/*
* Forward declaration of the per-chunk worker used by delta2_encode().
*/
static int delta2_encode_real(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen,
int rle_thresh, int last_encode, int *hdr_ovr);
/*
 * Delta2-encode srclen bytes from src into dst.
 *
 * The output starts with a MAIN_HDR holding srclen as a little-endian
 * 64-bit value, followed by literal and delta records as described in the
 * comment at the top of this file. Large inputs are processed in
 * DELTA2_CHUNK sized pieces; pieces that do not benefit from delta encoding
 * are coalesced into a single pending literal run whose header is filled in
 * once the run ends.
 *
 *   src        - input buffer.
 *   srclen     - number of input bytes; must not exceed MAX_THRESH.
 *   dst        - output buffer. delta2_encode_real() performs no bounds
 *                checks of its own, so dst should provide at least srclen
 *                bytes.
 *   dstlen     - in: capacity of dst in bytes; out: number of bytes
 *                produced (valid only when 0 is returned).
 *   rle_thresh - minimum span length in bytes that is worth encoding as a
 *                delta run; must be at least MIN_THRESH.
 *
 * Returns 0 on success and -1 when the parameters are rejected, the data
 * cannot be encoded usefully, or the output would not fit into dst.
 */
int
delta2_encode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen, int rle_thresh)
{
	if (srclen > MAX_THRESH) {
		DEBUG_STAT_EN(fprintf(stderr, "DELTA2: srclen: %" PRIu64 " is too big.\n", srclen));
		return (-1);
	}

	if (rle_thresh < MIN_THRESH)
		return (-1);

	/*
	 * An input that is not larger than the combined headers can never
	 * shrink. delta2_encode_real() also computes srclen minus header
	 * sizes in unsigned arithmetic, which would wrap for such inputs.
	 */
	if (srclen <= (MAIN_HDR + LIT_HDR + DELTA_HDR))
		return (-1);

	if (*dstlen < DELTA2_CHUNK) {
		uint64_t dsz;
		int hdr_ovr;
		int rv;

		/*
		 * The worker writes without bounds checks and expects an
		 * output area as large as its input.
		 */
		if (*dstlen < srclen)
			return (-1);

		hdr_ovr = 0;
		/* The decoder reads the original length before any record. */
		*((uint64_t *)dst) = LE64(srclen);
		dsz = srclen;
		rv = delta2_encode_real(src, srclen, dst + MAIN_HDR, &dsz, rle_thresh, 1, &hdr_ovr);
		if (rv == -1)
			return (-1);
		*dstlen = dsz + MAIN_HDR;
		DEBUG_STAT_EN(fprintf(stderr, "DELTA2: srclen: %" PRIu64 ", dstlen: %" PRIu64 "\n", srclen, *dstlen));
		DEBUG_STAT_EN(fprintf(stderr, "DELTA2: header overhead: %d\n", hdr_ovr));
	} else {
		uchar_t *srcpos, *dstpos, *lastdst, *lastsrc, *dstend;
		uint64_t slen, sz, dsz, pending, need;
		int rem, lenc, hdr_ovr;
		DEBUG_STAT_EN(double strt, en);

		srcpos = src;
		dstpos = dst;
		dstend = dst + *dstlen;
		slen = srclen;
		pending = 0;
		*((uint64_t *)dstpos) = LE64(srclen);
		dstpos += MAIN_HDR;
		lastdst = dstpos;
		lastsrc = srcpos;
		hdr_ovr = 0;

		DEBUG_STAT_EN(strt = get_wtime_millis());
		while (slen > 0) {
			if (slen > DELTA2_CHUNK) {
				sz = DELTA2_CHUNK;
				lenc = 0;
			} else {
				sz = slen;
				lenc = 1;
			}
			dsz = sz;
			if (sz > (MAIN_HDR + LIT_HDR + DELTA_HDR)) {
				rem = delta2_encode_real(srcpos, sz, dstpos, &dsz, rle_thresh, lenc,
				    &hdr_ovr);
			} else {
				/*
				 * Tail too short to ever benefit (and too short
				 * for the worker's unsigned length arithmetic):
				 * emit it as literal bytes.
				 */
				rem = -1;
			}

			if (rem == -1) {
				/*
				 * No delta for this piece. dsz bytes of source
				 * join the pending literal run; a new run also
				 * needs room for its header.
				 */
				need = dsz;
				if (pending == 0)
					need += LIT_HDR;
				if (need > (uint64_t)(dstend - dstpos)) {
					DEBUG_STAT_EN(fprintf(stderr, "No Delta\n"));
					return (-1);
				}
				if (pending == 0) {
					lastdst = dstpos;
					lastsrc = srcpos;
					dstpos += LIT_HDR;
				}
				pending += dsz;
				srcpos += dsz;
				dstpos += dsz;
				slen -= dsz;
			} else {
				/* The worker produced dsz bytes at dstpos. */
				if (dsz > (uint64_t)(dstend - dstpos)) {
					DEBUG_STAT_EN(fprintf(stderr, "No Delta\n"));
					return (-1);
				}
				if (pending) {
					/* Close the literal run that precedes this piece. */
					pending &= MSB_SETZERO_MASK;
					*((uint64_t *)lastdst) = LE64(pending);
					lastdst += sizeof (uint64_t);
					memcpy(lastdst, lastsrc, pending);
					pending = 0;
				}
				/* rem trailing source bytes are rescanned with the next piece. */
				srcpos += (sz - rem);
				slen -= (sz - rem);
				dstpos += dsz;
			}
		}

		if (pending) {
			pending &= MSB_SETZERO_MASK;
			/* Verify the space before touching the output. */
			if (LIT_HDR + pending > (uint64_t)(dstend - lastdst)) {
				DEBUG_STAT_EN(fprintf(stderr, "No Delta\n"));
				return (-1);
			}
			*((uint64_t *)lastdst) = LE64(pending);
			lastdst += sizeof (uint64_t);
			memcpy(lastdst, lastsrc, pending);
		}
		*dstlen = dstpos - dst;
		DEBUG_STAT_EN(en = get_wtime_millis());
		DEBUG_STAT_EN(fprintf(stderr, "DELTA2: srclen: %" PRIu64 ", dstlen: %" PRIu64 "\n", srclen, *dstlen));
		DEBUG_STAT_EN(fprintf(stderr, "DELTA2: header overhead: %d\n", hdr_ovr));
		DEBUG_STAT_EN(fprintf(stderr, "DELTA2: Processed at %.3f MB/s\n", get_mb_s(srclen, strt, en)));
	}
	return (0);
}
/*
 * Encode one piece of input (at most one DELTA2_CHUNK when called from
 * delta2_encode()).
 *
 * Phase 1 estimates, for every stride in strides[], how many output bytes
 * the piece would need, and keeps the stride with the smallest estimate.
 * Phase 2 encodes the piece with that stride, emitting literal records and
 * delta records.
 *
 *   src, srclen - input piece.
 *   dst         - output position; nothing here checks its capacity.
 *   dstlen      - in: must equal srclen (asserted).
 *                 out on success: number of bytes written to dst.
 *                 out on -1: untouched, except that for a piece of exactly
 *                 DELTA2_CHUNK bytes it may be set to the number of leading
 *                 source bytes the caller should treat as literal.
 *   rle_thresh  - spans longer than this many bytes become delta records.
 *   last_encode - non-zero when this is the final piece: all trailing bytes
 *                 are flushed as literal records.
 *   hdr_ovr     - header byte counter, updated only via DEBUG_STAT_EN.
 *
 * Returns -1 when encoding is not worthwhile (nothing is written to dst).
 * Otherwise returns the number of trailing source bytes that were NOT
 * encoded; this is always 0 when last_encode is set.
 */
static int
delta2_encode_real(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen,
    int rle_thresh, int last_encode, int *hdr_ovr)
{
	uint64_t snum, gtot1, gtot2, tot;
	uint64_t cnt, val, sval;
	uint64_t vl1, vl2, vld1, vld2;
	uchar_t *pos, *pos2, stride, st1;
	int st;

	assert(srclen == *dstlen);

	/*
	 * The scans below evaluate (srclen - sizeof (cnt)) and
	 * (srclen - (DELTA_HDR + LIT_HDR + MAIN_HDR)) in unsigned arithmetic.
	 * For an input that is not larger than the combined headers those
	 * expressions wrap around, which leads to reads past the end of src
	 * or to an "encoding" that is larger than its input. Such an input
	 * can never shrink, so report "no delta" right away.
	 */
	if (srclen <= (DELTA_HDR + LIT_HDR + MAIN_HDR))
		return (-1);

	gtot1 = ULL_MAX;
	stride = 0;
	tot = 0;

	/*
	 * Estimate which stride length gives the max reduction given rle_thresh.
	 */
	for (st = 0; st < NSTRIDES; st++) {
		int gt;

		snum = 0;
		gtot2 = LIT_HDR;
		vl1 = 0;
		vld1 = 0;
		tot = 0;
		pos = src;
		st1 = strides[st];
		/* Build a mask covering the low (st1 * 8) bits. */
		sval = st1;
		sval = ((sval << 3) - 1);
		sval = (1ULL << sval);
		sval |= (sval - 1);
		val = 0;
		/* Each step loads 8 bytes, so stop 8 bytes before the end. */
		for (cnt = 0; cnt < (srclen - sizeof (cnt)); cnt += st1) {
			vl2 = *((uint64_t *)pos);
			vl2 = LE64(vl2);
			vl2 &= sval;
			vld2 = vl2 - vl1;
			if (vld1 != vld2) {
				/* The delta changed: the current span ends here. */
				if (snum > rle_thresh) {
					/* A literal run before this span costs a header. */
					gt = (tot > 0);
					gtot2 += (LIT_HDR * gt);
					tot = 0;
					gtot2 += DELTA_HDR;
				} else {
					gtot2 += snum;
					tot += snum;
				}
				snum = 0;
			}
			snum += st1;
			vld1 = vld2;
			vl1 = vl2;
			pos += st1;
		}
		if (snum > rle_thresh) {
			gtot2 += DELTA_HDR;
			/*
			 * If this ended into another table reset next scan
			 * point to beginning of the table.
			 */
			val = cnt - snum;
		} else {
			gtot2 += snum;
			/*
			 * If this ended into another table reset next scan
			 * point to beginning of the table.
			 */
			if (snum >= (MIN_THRESH>>1))
				val = cnt - snum;
		}
		if (gtot2 < gtot1) {
			gtot1 = gtot2;
			stride = st1;
			tot = val;
		}
	}

	/*
	 * No need to check for destination buffer overflow since
	 * dstlen == srclen always.
	 */
	if ( gtot1 > (srclen - (DELTA_HDR + LIT_HDR + MAIN_HDR)) ) {
		/*
		 * NOTE(review): tot holds the best stride's restart offset only
		 * when the last stride tried was the best one; otherwise it is
		 * whatever the last stride's scan left behind. Either value is
		 * within the piece, so the result stays valid - confirm whether
		 * this is intended.
		 */
		if (srclen == DELTA2_CHUNK) {
			if (tot > 0)
				*dstlen = tot;
		}
		return (-1);
	}

	/*
	 * Now perform encoding using the stride length.
	 */
	snum = 0;
	vl1 = 0;
	vld1 = 0;
	gtot1 = 0;	/* from here on: bytes of the pending literal run */
	pos = src;
	pos2 = dst;

	vl2 = *((uint64_t *)pos);
	vl2 = LE64(vl2);
	/* Mask covering the low (stride * 8) bits. */
	val = stride;
	val = ((val << 3) - 1);
	val = (1ULL << val);
	val |= (val - 1);
	vl2 &= val;
	sval = vl2;	/* starting value of the current span */

	for (cnt = 0; cnt < (srclen - sizeof (cnt)); cnt += stride) {
		vl2 = *((uint64_t *)pos);
		vl2 = LE64(vl2);
		vl2 &= val;
		vld2 = vl2 - vl1;
		if (vld1 != vld2) {
			if (snum > rle_thresh) {
				if (gtot1 > 0) {
					/* Flush the literal run that precedes the span. */
					gtot1 &= MSB_SETZERO_MASK;
					*((uint64_t *)pos2) = LE64(gtot1);
					pos2 += sizeof (uint64_t);
					DEBUG_STAT_EN(*hdr_ovr += LIT_HDR);
					memcpy(pos2, pos - (gtot1+snum), gtot1);
					pos2 += gtot1;
					gtot1 = 0;
				}
				/*
				 * RLE Encode delta series.
				 */
				gtot2 = stride;
				gtot2 <<= MSB_SHIFT;
				gtot2 |= (snum & MSB_SETZERO_MASK);
				*((uint64_t *)pos2) = LE64(gtot2);
				pos2 += sizeof (uint64_t);
				*((uint64_t *)pos2) = LE64(sval);
				pos2 += sizeof (uint64_t);
				*((uint64_t *)pos2) = LE64(vld1);
				pos2 += sizeof (uint64_t);
				DEBUG_STAT_EN(*hdr_ovr += DELTA_HDR);
			} else {
				/* Span too short: its bytes join the literal run. */
				gtot1 += snum;
			}
			snum = 0;
			sval = vl2;
		}
		snum += stride;
		vld1 = vld2;
		vl1 = vl2;
		pos += stride;
	}

	/* Handle the span that was still open when the scan stopped. */
	if (snum > 0) {
		if (snum > rle_thresh) {
			if (gtot1 > 0) {
				gtot1 &= MSB_SETZERO_MASK;
				*((uint64_t *)pos2) = LE64(gtot1);
				pos2 += sizeof (uint64_t);
				DEBUG_STAT_EN(*hdr_ovr += LIT_HDR);
				memcpy(pos2, pos - (gtot1+snum), gtot1);
				pos2 += gtot1;
				gtot1 = 0;
			}
			gtot2 = stride;
			gtot2 <<= MSB_SHIFT;
			gtot2 |= (snum & MSB_SETZERO_MASK);
			*((uint64_t *)pos2) = LE64(gtot2);
			pos2 += sizeof (uint64_t);
			*((uint64_t *)pos2) = LE64(sval);
			pos2 += sizeof (uint64_t);
			*((uint64_t *)pos2) = LE64(vld1);
			pos2 += sizeof (uint64_t);
			DEBUG_STAT_EN(*hdr_ovr += DELTA_HDR);

		} else if (last_encode) {
			/* Final piece: the short span and pending literal go out now. */
			gtot1 += snum;
			gtot1 &= MSB_SETZERO_MASK;
			*((uint64_t *)pos2) = LE64(gtot1);
			pos2 += sizeof (uint64_t);
			DEBUG_STAT_EN(*hdr_ovr += LIT_HDR);
			memcpy(pos2, pos - gtot1, gtot1);
			pos2 += gtot1;
		} else {
			/* Not the final piece: leave these bytes for the caller. */
			gtot1 += snum;
		}
	}

	if (last_encode) {
		val = srclen - (pos - src);
		if (val > 0) {
			/*
			 * Encode left over bytes, if any, at the end into a
			 * literal run.
			 */
			val &= MSB_SETZERO_MASK;
			*((uint64_t *)pos2) = LE64(val);
			pos2 += sizeof (uint64_t);
			for (cnt = 0; cnt < val; cnt++) {
				*pos2 = *pos;
				pos2++; pos++;
			}
			DEBUG_STAT_EN(*hdr_ovr += LIT_HDR);
		}
		val = 0;
	} else {
		/* Unencoded tail: pending literal bytes plus the unscanned rest. */
		val = gtot1 + (srclen - (pos - src));
	}
	*dstlen = pos2 - dst;
	return (val);
}
/*
 * Decode a buffer produced by delta2_encode().
 *
 * The source starts with a MAIN_HDR holding the original data length,
 * followed by records. Each record begins with a 64-bit little-endian word
 * whose most significant byte is 0 for a literal run or the stride for a
 * delta run, and whose remaining 56 bits hold the number of bytes the
 * record expands to. A delta record carries two more 64-bit words: the
 * starting value and the delta.
 *
 *   src, srclen - encoded data.
 *   dst         - output buffer.
 *   dstlen      - in: capacity of dst in bytes; out: number of bytes
 *                 decoded (valid only when 0 is returned).
 *
 * Returns 0 on success and -1 if dst is too small or the encoded data is
 * truncated or otherwise inconsistent.
 */
int
delta2_decode(uchar_t *src, uint64_t srclen, uchar_t *dst, uint64_t *dstlen)
{
	uchar_t *pos, *pos1, *last;
	uint64_t olen, val, sval, delta, rcnt, cnt, out, vl, nvals;
	uchar_t stride, flags;
	DEBUG_STAT_EN(double strt, en);

	if (srclen < MAIN_HDR) {
		fprintf(stderr, "DELTA2 Decode: Source buffer too small. Corrupt data.\n");
		return (-1);
	}

	pos = src;
	pos1 = dst;
	DEBUG_STAT_EN(strt = get_wtime_millis());
	last = pos + srclen;

	olen = LE64(*((uint64_t *)pos));
	if (*dstlen < olen) {
		fprintf(stderr, "DELTA2 Decode: Destination buffer too small.\n");
		return (-1);
	}

	out = 0;
	pos += MAIN_HDR;
	while (pos < last) {
		/* Every record starts with one full 64-bit header word. */
		if ((uint64_t)(last - pos) < LIT_HDR) {
			fprintf(stderr, "DELTA2 Decode: Truncated record header. Corrupt data.\n");
			return (-1);
		}
		val = *((uint64_t *)pos);
		val = LE64(val);
		flags = (val >> MSB_SHIFT) & 0xff;

		if (flags == 0) {
			/*
			 * Copy over literal run of bytes.
			 */
			rcnt = val & MSB_SETZERO_MASK;
			pos += sizeof (rcnt);
			if (rcnt > (uint64_t)(last - pos)) {
				fprintf(stderr, "DELTA2 Decode: Truncated literal run. Corrupt data.\n");
				return (-1);
			}
			/* out never exceeds *dstlen, so the subtraction cannot wrap. */
			if (rcnt > *dstlen - out) {
				fprintf(stderr, "DELTA2 Decode: Destination buffer overflow. Corrupt data.\n");
				return (-1);
			}
			memcpy(pos1, pos, rcnt);
			pos += rcnt;
			pos1 += rcnt;
			out += rcnt;
		} else {
			stride = flags;
			/*
			 * Values are packed into a uint64_t, so a stride above
			 * 8 bytes is invalid (and would make the mask shift
			 * below undefined).
			 */
			if (stride > sizeof (uint64_t)) {
				fprintf(stderr, "DELTA2 Decode: Invalid stride. Corrupt data.\n");
				return (-1);
			}
			if ((uint64_t)(last - pos) < DELTA_HDR) {
				fprintf(stderr, "DELTA2 Decode: Truncated delta header. Corrupt data.\n");
				return (-1);
			}
			rcnt = val & MSB_SETZERO_MASK;
			pos += sizeof (rcnt);
			sval = LE64(*((uint64_t *)pos));
			pos += sizeof (sval);
			delta = LE64(*((uint64_t *)pos));
			pos += sizeof (delta);
			if (rcnt > *dstlen - out) {
				fprintf(stderr, "DELTA2 Decode: Destination buffer overflow. Corrupt data.\n");
				return (-1);
			}

			/* Mask covering the low (stride * 8) bits. */
			vl = stride;
			vl = (vl << 3) - 1;
			vl = (1ULL << vl);
			vl |= (vl - 1);

			/*
			 * Recover original bytes from the arithmetic series using
			 * length, starting value and delta.
			 */
			nvals = rcnt / stride;
			for (cnt = 0; cnt < nvals; cnt++) {
				val = LE64(sval & vl);
				if (*dstlen - out >= sizeof (val)) {
					/*
					 * Fast path: store all 8 bytes; the bytes
					 * beyond the stride are overwritten by
					 * whatever is decoded next.
					 */
					*((uint64_t *)pos1) = val;
				} else {
					/*
					 * Fewer than 8 bytes of room are left, so
					 * copy only the stride bytes to stay inside
					 * the destination buffer.
					 */
					memcpy(pos1, &val, stride);
				}
				out += stride;
				sval += delta;
				pos1 += stride;
			}
		}
	}
	*dstlen = out;
	DEBUG_STAT_EN(en = get_wtime_millis());
	DEBUG_STAT_EN(fprintf(stderr, "DELTA2: Decoded at %.3f MB/s\n", get_mb_s(out, strt, en)));
	return (0);
}