Add Matrix Transpose of Dedupe index to compress it better.

Fix handling of Dedupe index compression failure.
This commit is contained in:
Moinak Ghosh 2012-12-13 00:00:47 +05:30
parent c7b960e72c
commit 375ebefa0d
4 changed files with 152 additions and 32 deletions

View file

@ -97,6 +97,10 @@ LIBBSCLIB = @LIBBSCLIB@
LIBBSCGEN_OPT = -fopenmp
LIBBSCCPPFLAGS = -I$(LIBBSCDIR)/libbsc -DENABLE_PC_LIBBSC
TRANSP_SRCS = transpose/transpose.c
TRANSP_HDRS = transpose/transpose.h
TRANSP_OBJS = $(TRANSP_SRCS:.c=.o)
KECCAK_SRC_COMMON = crypto/keccak/genKAT.c crypto/keccak/KeccakDuplex.c \
crypto/keccak/KeccakNISTInterface.c crypto/keccak/KeccakSponge.c
KECCAK_SRC_OPT64 = $(KECCAK_SRC_COMMON) crypto/keccak/KeccakF-1600-opt64.c
@ -120,7 +124,7 @@ KECCAK_OBJS_ASM = $(KECCAK_SRCS_ASM:.s=.o)
BAKFILES = *~ lzma/*~ lzfx/*~ lz4/*~ rabin/*~ bsdiff/*~ lzp/*~ utils/*~ crypto/sha2/*~ \
crypto/sha2/intel/*~ crypto/aes/*~ crypto/scrypt/*~ crypto/*~ rabin/global/*~ \
delta2/*~ crypto/keccak/*~
delta2/*~ crypto/keccak/*~ transpose/*~
RM = rm -f
RM_RF = rm -rf
@ -128,14 +132,15 @@ COMMON_CPPFLAGS = -I. -I./lzma -I./lzfx -I./lz4 -I./rabin -I./bsdiff -DNODEFAULT
-DFILE_OFFSET_BITS=64 -D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32 \
-I./lzp @LIBBSCCPPFLAGS@ -I./crypto/skein -I./utils -I@OPENSSL_INCDIR@ \
-I./crypto/sha2 -I./crypto/scrypt -I./crypto/aes -I./crypto @KEYLEN@ \
@LIBBZ2_INC@ @LIBZ_INC@ -I./crypto/keccak
@LIBBZ2_INC@ @LIBZ_INC@ -I./crypto/keccak -I./transpose
COMMON_VEC_FLAGS = -ftree-vectorize
COMMON_LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block
LDLIBS = -ldl -L@LIBBZ2_DIR@ -lbz2 -L@LIBZ_DIR@ -lz -lm @LIBBSCLFLAGS@ \
-L@OPENSSL_LIBDIR@ -lcrypto -lrt
OBJS = $(MAINOBJS) $(LZMAOBJS) $(PPMDOBJS) $(LZFXOBJS) $(LZ4OBJS) $(CRCOBJS) \
$(RABINOBJS) $(BSDIFFOBJS) $(LZPOBJS) $(DELTA2OBJS) @LIBBSCWRAPOBJ@ $(SKEINOBJS) \
$(SKEIN_BLOCK_OBJ) @SHA256ASM_OBJS@ @SHA256_OBJS@ $(KECCAK_OBJS) $(KECCAK_OBJS_ASM)
$(SKEIN_BLOCK_OBJ) @SHA256ASM_OBJS@ @SHA256_OBJS@ $(KECCAK_OBJS) $(KECCAK_OBJS_ASM) \
$(TRANSP_OBJS)
DEBUG_LINK = g++ -m64 -pthread -msse3 @LIBBSCGEN_OPT@
DEBUG_COMPILE = gcc -m64 -g -msse3 -c
@ -227,6 +232,9 @@ $(LIBBSCLIB):
$(LIBBSCWRAPOBJ): $(LIBBSCWRAP) $(LIBBSCLIB)
$(COMPILE) $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
$(TRANSP_OBJS): $(TRANSP_SRCS) $(TRANSP_HDRS)
$(COMPILE) $(GEN_OPT) $(VEC_FLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@
$(MAINOBJS): $(MAINSRCS) $(MAINHDRS)
$(COMPILE) $(GEN_OPT) $(LOOP_OPTFLAGS) $(CPPFLAGS) $(@:.o=.c) -o $@

76
main.c
View file

@ -46,6 +46,7 @@
#include <allocator.h>
#include <rabin_dedup.h>
#include <lzp.h>
#include <transpose.h>
/*
* We use 5MB chunks by default.
@ -452,13 +453,21 @@ redo:
rv = 0;
cmpbuf = cseg + RABIN_HDR_SIZE;
ubuf = tdat->uncompressed_chunk + RABIN_HDR_SIZE;
if (dedupe_index_sz >= 90) {
if (dedupe_index_sz >= 90 && dedupe_index_sz > dedupe_index_sz_cmp) {
/* Index should be at least 90 bytes to have been compressed. */
rv = lzma_decompress(cmpbuf, dedupe_index_sz_cmp, ubuf,
&dedupe_index_sz, tdat->rctx->level, 0, tdat->rctx->lzma_data);
} else {
memcpy(ubuf, cmpbuf, dedupe_index_sz);
}
/*
* Recover from transposed index.
*/
transpose(ubuf, cmpbuf, dedupe_index_sz, sizeof (uint32_t), COL);
memcpy(ubuf, cmpbuf, dedupe_index_sz);
} else {
if (HDR & COMPRESSED) {
if (HDR & CHUNK_FLAG_PREPROC) {
@ -1150,48 +1159,57 @@ redo:
index_size_cmp = dedupe_index_sz;
rv = 0;
/*
* Do a matrix transpose of the index table with the hope of improving
* compression ratio subsequently.
*/
transpose(tdat->uncompressed_chunk + RABIN_HDR_SIZE,
compressed_chunk + RABIN_HDR_SIZE, dedupe_index_sz,
sizeof (uint32_t), ROW);
memcpy(tdat->uncompressed_chunk + RABIN_HDR_SIZE,
compressed_chunk + RABIN_HDR_SIZE, dedupe_index_sz);
if (dedupe_index_sz >= 90) {
/* Compress index if it is at least 90 bytes. */
rv = lzma_compress(tdat->uncompressed_chunk + RABIN_HDR_SIZE,
dedupe_index_sz, compressed_chunk + RABIN_HDR_SIZE,
&index_size_cmp, tdat->rctx->level, 255, tdat->rctx->lzma_data);
/*
* If index compression fails or does not produce a smaller result
* retain it as is. In that case compressed size == original size
* and it will be handled correctly during decompression.
*/
if (rv != 0 || index_size_cmp >= dedupe_index_sz) {
index_size_cmp = dedupe_index_sz;
goto plain_index;
}
} else {
plain_index:
memcpy(compressed_chunk + RABIN_HDR_SIZE,
tdat->uncompressed_chunk + RABIN_HDR_SIZE, dedupe_index_sz);
}
index_size_cmp += RABIN_HDR_SIZE;
dedupe_index_sz += RABIN_HDR_SIZE;
if (rv == 0) {
memcpy(compressed_chunk, tdat->uncompressed_chunk, RABIN_HDR_SIZE);
/* Compress data chunk. */
if (lzp_preprocess) {
rv = preproc_compress(tdat->compress,
tdat->uncompressed_chunk + dedupe_index_sz,
_chunksize, compressed_chunk + index_size_cmp, &_chunksize,
tdat->level, 0, tdat->data, tdat->props);
} else {
rv = tdat->compress(tdat->uncompressed_chunk + dedupe_index_sz,
_chunksize, compressed_chunk + index_size_cmp, &_chunksize,
tdat->level, 0, tdat->data);
}
/* Can't compress data just retain as-is. */
if (rv < 0)
memcpy(compressed_chunk + index_size_cmp,
tdat->uncompressed_chunk + dedupe_index_sz, _chunksize);
/* Now update rabin header with the compressed sizes. */
update_dedupe_hdr(compressed_chunk, index_size_cmp - RABIN_HDR_SIZE,
_chunksize);
memcpy(compressed_chunk, tdat->uncompressed_chunk, RABIN_HDR_SIZE);
/* Compress data chunk. */
if (lzp_preprocess) {
rv = preproc_compress(tdat->compress, tdat->uncompressed_chunk + dedupe_index_sz,
_chunksize, compressed_chunk + index_size_cmp, &_chunksize,
tdat->level, 0, tdat->data, tdat->props);
} else {
/* If rabin index compression fails, we just drop down to plain
* compression and avoid dedup. Should be pretty rare case.
*/
tdat->rctx->valid = 0;
memcpy(tdat->uncompressed_chunk, tdat->cmp_seg, rbytes);
tdat->rbytes = rbytes;
goto plain_compress;
rv = tdat->compress(tdat->uncompressed_chunk + dedupe_index_sz, _chunksize,
compressed_chunk + index_size_cmp, &_chunksize, tdat->level, 0, tdat->data);
}
/* Can't compress data just retain as-is. */
if (rv < 0)
memcpy(compressed_chunk + index_size_cmp,
tdat->uncompressed_chunk + dedupe_index_sz, _chunksize);
/* Now update rabin header with the compressed sizes. */
update_dedupe_hdr(compressed_chunk, index_size_cmp - RABIN_HDR_SIZE, _chunksize);
_chunksize += index_size_cmp;
} else {
plain_compress:

50
transpose/transpose.c Normal file
View file

@ -0,0 +1,50 @@
/*
* This file is a part of Pcompress, a chunked parallel multi-
* algorithm lossless compression and decompression program.
*
* Copyright (C) 2012 Moinak Ghosh. All rights reserved.
* Use is subject to license terms.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* moinakg@belenix.org, http://moinakg.wordpress.com/
*/
#include "transpose.h"
/*
* Perform a simple matrix transpose of the given buffer in "from".
* If the buffer contains tables of numbers or structured data a
* transpose can potentially help improve compression ratio by
* bringing repeating values in columns into row ordering.
*/
void
transpose(unsigned char *from, unsigned char *to, uint64_t buflen, uint64_t stride, rowcol_t rc)
{
uint64_t rows, cols, i, j, k, l;
if (rc == ROW) {
rows = buflen / stride;
cols = stride;
} else {
cols = buflen / stride;
rows = stride;
}
k = 0;
for (j = 0; j < rows; j++) {
l = 0;
for (i = 0; i < cols; i++) {
to[j + l] = from[i + k];
l += rows;
}
k += cols;
}
}

44
transpose/transpose.h Normal file
View file

@ -0,0 +1,44 @@
/*
* This file is a part of Pcompress, a chunked parallel multi-
* algorithm lossless compression and decompression program.
*
* Copyright (C) 2012 Moinak Ghosh. All rights reserved.
* Use is subject to license terms.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* moinakg@belenix.org, http://moinakg.wordpress.com/
*/
#ifndef _TRANSP_H
#define _TRANSP_H
#include <sys/types.h>
#include <stdint.h>
#include <inttypes.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef enum {
ROW = 0,
COL = 1
} rowcol_t;
void transpose(unsigned char *from, unsigned char *to, uint64_t buflen,
uint64_t stride, rowcol_t rc);
#ifdef __cplusplus
}
#endif
#endif