Overhaul documentation part #1
Detect and handle uncompressed PDF files using libbsc. Force binary/text data detection for tar archives. Get rid of unnecessary CLI option. Add full pipeline mode check when archiving.
parent 35043a74b0
commit ea345a902a
7 changed files with 426 additions and 310 deletions
README.md (465 lines changed)
@@ -6,31 +6,32 @@ Use is subject to license terms.
 moinakg (_at) gma1l _dot com.
 Comments, suggestions, code, rants etc are welcome.
 
-Pcompress is a utility to do compression and decompression in parallel by
-splitting input data into chunks. It has a modular structure and includes
-support for multiple algorithms like LZMA, Bzip2, PPMD, etc, with SKEIN/
-SHA checksums for data integrity. It can also do Lempel-Ziv pre-compression
-(derived from libbsc) to improve compression ratios across the board. SSE
-optimizations for the bundled LZMA are included. It also implements
-Variable Block Deduplication and Delta Compression features based on a
-Semi-Rabin Fingerprinting scheme. Delta Compression is done via the widely
-popular bsdiff algorithm. Similarity is detected using a technique based
-on MinHashing. When doing Dedupe it attempts to merge adjacent non-
-duplicate block index entries into a single larger entry to reduce metadata.
-In addition to all these it can internally split chunks at rabin boundaries
-to help Dedupe and compression.
+Pcompress is an archiver that also does compression and decompression in
+parallel by splitting input data into chunks. It has a modular structure
+and includes support for multiple algorithms like LZMA, Bzip2, PPMD, etc,
+with SKEIN/SHA checksums for data integrity. Compression algorithms are
+selected based on the file type to maximize compression gains using a file
+and data analysis based adaptive technique. It also includes various data
+transformation filters to improve compression.
+
+It also implements Variable Block Deduplication and Delta Compression
+features based on a Polynomial Fingerprinting scheme. Delta Compression
+is done via the widely popular bsdiff algorithm. Similarity is detected
+using a technique based on MinHashing. Deduplication metadata is also
+compressed to reduce overheads. In addition to all these it can internally
+split chunks at file and rabin boundaries to help Dedupe and compression.
 
 It has low metadata overhead and overlaps I/O and compression to achieve
 maximum parallelism. It also bundles a simple slab allocator to speed
 repeated allocation of similar chunks. It can work in pipe mode, reading
-from stdin and writing to stdout. It also provides adaptive compression
-modes in which data analysis heuristics are used to identify near-optimal
-algorithms per chunk. Finally it supports 14 compression levels to allow
-for ultra compression parameters in some algorithms.
+from stdin and writing to stdout. SIMD vector optimizations using the x86
+SSE instruction set are used to speed up various operations. Finally it
+supports 14 compression levels to allow for ultra compression parameters
+in some algorithms.
 
-Pcompress also supports encryption via AES and uses Scrypt from Tarsnap
-for Password Based Key generation. A unique key is generated per session
-even if the same password is used and HMAC is used to do authentication.
+Pcompress also supports encryption via AES, Salsa20 and uses Scrypt from
+Tarsnap for Password Based Key generation. A unique key is generated per
+session even if the same password is used and HMAC is used to do authentication.
 
 Links of Interest
 =================
@@ -49,59 +50,229 @@ http://moinakg.wordpress.com/2013/06/11/architecture-for-a-deduplicated-archival
 
 http://moinakg.wordpress.com/2013/06/15/architecture-for-a-deduplicated-archival-store-part-2/
 
-Usage
-=====
+Standard Usage
+==============
+Standard usage consists of a few common options to control basic behavior. A variety of
+parameters including global deduplication are automatically set based on the compression
+level.
 
-To compress a file:
-pcompress -c <algorithm> [-l <compress level>] [-s <chunk size>] <file> [<target file>]
+Archiving
+---------
+pcompress -a [-v] [-l <compress level>] [-s <chunk size>] [-c <algorithm>]
+          [<file1> <directory1> <file2> ...] [-t <number>] [-S <chunk checksum>]
+          <archive filename or '-'>
 
-Where <algorithm> can be the folowing:
-lzfx   - Very fast and small algorithm based on LZF.
-lz4    - Ultra fast, high-throughput algorithm reaching RAM B/W at level1.
-zlib   - The base Zlib format compression (not Gzip).
-lzma   - The LZMA (Lempel-Ziv Markov) algorithm from 7Zip.
-lzmaMt - Multithreaded version of LZMA. This is a faster version but
-         uses more memory for the dictionary. Thread count is balanced
-         between chunk processing threads and algorithm threads.
-bzip2  - Bzip2 Algorithm from libbzip2.
-ppmd   - The PPMd algorithm excellent for textual data. PPMd requires
+Archives a given set of files and/or directories into a compressed PAX archive. The
+PAX datastream is encoded into a custom format compressed file that can only be
+handled by Pcompress.
+
+-a     Enables archive mode where pathnames specified in the command line are
+       archived using LibArchive and then compressed.
+
+-l <compress level>
+       Select a compression level from 1 (least compression, fastest) to 14
+       (ultra compression, slow). Default: 6
+
+-s <chunk size>
+       Archive data is split into chunks that are processed in parallel. This value
+       specifies the maximum chunk size. Blocks may be smaller than this. Values
+       can be in bytes or <number><suffix> format where suffix can be k - KB, m - MB,
+       g - GB. Default: 8m
+       Larger chunks can produce better compression at the cost of memory.
+
+-c <algorithm>
+       Specifies the compression algorithm to use. Default algorithm when archiving
+       is adapt2 (Second Adaptive Mode). This is the ideal mode for archiving giving
+       best compression gains. However adapt (Adaptive Mode) can be used which is a
+       little faster but gives lower compression gains.
+       Other algorithms can be used if all the files are of the same known type. For
+       example ppmd (slow) or libbsc (fast) can be used if all the files only have
+       ASCII text. See section "Compression Algorithms" for details.
+
+-v     Enables verbose mode where each file/directory is printed as it is processed.
+
+-t <number>
+       Sets the number of threads that Pcompress can use. Pcompress automatically
+       uses thread count = core count. However with larger chunk size (-s option)
+       and/or ultra compression levels, large amounts of memory can be used. In this
+       case thread count can be reduced to reduce memory consumption.
+
+-S <chunk checksum>
+       Specify the chunk checksum to use. Default: BLAKE256. The following checksums
+       are available:
+
+       CRC64     - Extremely Fast 64-bit CRC from LZMA SDK.
+       SHA256    - SHA512/256 version of Intel's optimized (SSE,AVX) SHA2 for x86.
+       SHA512    - SHA512 version of Intel's optimized (SSE,AVX) SHA2 for x86.
+       KECCAK256 - Official 256-bit NIST SHA3 optimized implementation.
+       KECCAK512 - Official 512-bit NIST SHA3 optimized implementation.
+       BLAKE256  - Very fast 256-bit BLAKE2, derived from the NIST SHA3
+                   runner-up BLAKE.
+       BLAKE512  - Very fast 512-bit BLAKE2, derived from the NIST SHA3
+                   runner-up BLAKE.
+
+       The fastest checksum is the BLAKE2 family.
+
+<archive filename>
+       Pathname of the resulting archive. A '.pz' extension is automatically added
+       if not already present. This can also be specified as '-' in order to send
+       the compressed archive stream to stdout.
+
+Single File Compression
+-----------------------
+pcompress -c <algorithm> [-l <compress level>] [-s <chunk size>] [-p] [<file>]
+          [-t <number>] [-S <chunk checksum>] [<target file or '-'>]
+
+Takes a single file as input and produces a compressed file. Archiving is not performed.
+This can also work as a compression pipeline.
+
+-c <algorithm>
+       See above. Also see section "Compression Algorithms" for details.
+
+-l <compress level>
+-s <chunk size>
+-t <number>
+-S <chunk checksum>
+       See above.
+       Note: In single file compression mode with adapt2 or adapt algorithm, larger
+             chunks may not produce better compression. Smaller chunks can result
+             in better data analysis here.
+
+-p     Make Pcompress work in full pipeline mode. Data is ingested via stdin,
+       compressed and output via stdout. No filenames are used.
+
+<target file>
+       Pathname of the compressed file to be created. This can be '-' to send the
+       compressed data to stdout.
 
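As a quick illustration of the '-p' pipeline mode described above, a single-file
compression pipe might look like this (filenames and algorithm choice are
hypothetical, not taken from the README):

    tar -cf - /some/dir | pcompress -c lzma -l6 -p > dir.tar.pz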
+Decompression and Archive extraction
+------------------------------------
+pcompress -d <compressed file or '-'> [-m] [-K] [<target file or directory>]
+
+-m     Enable restoring *all* permissions, ACLs, Extended Attributes etc.
+       Equivalent to the '-p' option in tar. Ownership is only extracted if run as
+       root user.
+
+-K     Do not overwrite newer files.
+
+-m and -K are only meaningful if the compressed file is an archive. For single file
+compressed mode these options are ignored.
+
+<compressed file>
+       Specifies the compressed file or archive. This can be '-' to indicate reading
+       from stdin while write goes to <target file>
+
+<target file or directory>
+       This can be a filename or a directory depending on how the archive was created.
+       If single file compression was used then this can be the name of the target
+       file that will hold the uncompressed data.
+       If this is omitted then an output file is created by appending '.out' to the
+       compressed filename.
+
+       If Archiving was done then this should be the name of a directory into which
+       extracted files are restored. The directory is created if it does not exist.
+       If this is omitted the files are extracted into the current directory.
 
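Illustrative decompression runs matching the synopsis above (all paths are
hypothetical):

    pcompress -d file.tar.pz file.tar       # single-file mode
    pcompress -d archive.pz /tmp/restore    # archive mode, restores into a directory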
+Compression Algorithms
+======================
+
+lzfx   - Fast, average compression. At high compression levels this can be faster
+         than LZ4.
+         Effective Levels: 1 - 5
+lz4    - Very Fast, sometimes better compression than LZFX.
+         Effective Levels: 1 - 3
+zlib   - Fast, better compression.
+         Effective Levels: 1 - 9
+bzip2  - Slow, much better compression than Zlib.
+         Effective Levels: 1 - 9
+
+lzma   - Very slow. Extreme compression. Recommended: Use lzmaMt variant mentioned
+         below.
+         Effective Levels: 1 - 14
+         Till level 9 it is standard LZMA parameters. Levels 10 - 12 use
+         more memory and higher match iterations so are slower. Levels
+         13 and 14 use larger dictionaries upto 256MB and really suck up
+         RAM. Use these levels only if you have at the minimum 4GB RAM on
+         your system.
+lzmaMt - This is the multithreaded variant of lzma and typically runs faster.
+         However in a few cases this can produce slightly lesser compression
+         gain.
+
+PPMD   - Slow. Extreme compression for Text, average compression for binary.
+         In addition PPMD decompression time is also high for large chunks.
+         This requires lots of RAM similar to LZMA. PPMd requires
          at least 64MB X core-count more memory than the other modes.
+         Effective Levels: 1 - 14.
+
-libbsc - A Block Sorting Compressor using the Burrows Wheeler Transform
-         like Bzip2 but runs faster and gives better compression than
-         Bzip2 (See: libbsc.com).
+Adapt  - Synthetic mode with text/binary detection. For pure text data PPMD is
+         used otherwise Bzip2 is selected per chunk.
+         Effective Levels: 1 - 14
+Adapt2 - Slower synthetic mode. For pure text data PPMD is used otherwise LZMA is
+         applied. Can give very good compression ratio when splitting file
+         into multiple chunks.
+         Effective Levels: 1 - 14
+         Since both LZMA and PPMD are used together memory requirements are
+         large especially if you are also using extreme levels above 10. For
+         example with 100MB chunks, Level 14, 2 threads and with or without
+         dedupe, it uses upto 2.5GB physical RAM (RSS).
+
-adapt  - Adaptive mode where ppmd or bzip2 will be used per chunk,
-         depending on heuristics. If at least 50% of the input data is
-         7-bit text then PPMd will be used otherwise Bzip2.
-adapt2 - Adaptive mode which includes ppmd and lzma. If at least 80% of
-         the input data is 7-bit text then PPMd will be used otherwise
-         LZMA. It has significantly more memory usage than adapt.
-none   - No compression. This is only meaningful with -D and -E so Dedupe
+none   - No compression. This is only meaningful with -G or -D. So Dedupe
          can be done for post-processing with an external utility.
 
-<chunk_size> - This can be in bytes or can use the following suffixes:
-        g - Gigabyte, m - Megabyte, k - Kilobyte.
-        Larger chunks produce better compression at the cost of memory.
-        In case of Global Deduplication (see below) this chunk size is
-        just a hint and may get automatically adjusted.
-<compress_level> - Can be a number from 0 meaning minimum and 14 meaning
-        maximum compression.
-<target file> - Optional argument specifying the destination compressed
-        file. The '.pz' extension is appended. If this is '-' then
-        compressed output goes to stdout. If this argument is omitted then
-        source filename is used with the extension '.pz' appended.
+Enabled features based on Compression Level
+===========================================
 
-To decompress a file compressed using above command:
-pcompress -d <compressed file> <target file>
+1 to 3   - No features, just compression and archiving, if needed.
+4        - Global Deduplication with avg block size of 8KB.
+5        - Global Dedup block size 8KB, Adaptive Delta Encoding.
+6 to 8   - Global Dedup block size reduced to 4KB, Adaptive Delta Encoding.
+9        - Global Dedup block size reduced to 2KB, Adaptive Delta Encoding, Dispack.
+10       - Global Dedup block size 2KB, Adaptive Delta Encoding with extra rounds,
+           Dispack, LZP Preprocessing
+11 to 14 - Global Dedup block size 2KB, Adaptive Delta Encoding with extra rounds,
+           Dispack, LZP Preprocessing, PackJPG filter for Jpegs.
 
-<compressed file> can be '-' to indicate reading from stdin while write goes
-to <target file>
+Encryption
+==========
+Pcompress supports encryption and authentication in both archive and single-file
+compression modes. Encryption options are discussed below.
 
-To operate as a full pipe, read from stdin and write to stdout:
-pcompress -p ...
+NOTE: When using pipe-mode via -p the only way to provide a password is to use '-w'.
+      See below.
 
-Attempt Rabin fingerprinting based deduplication on a per-chunk basis:
+-e <ALGO>
+       Encrypt chunks using the given encryption algorithm. The algo parameter
+       can be one of AES or SALSA20. Both are used in CTR stream encryption
+       mode.
+       The password can be prompted from the user or read from a file. Unique
+       keys are generated every time pcompress is run even when giving the same
+       password. Of course enough info is stored in the compressed file so that
+       the key used for the file can be re-created given the correct password.
+
+       Default key length is 256 bits but can be reduced to 128 bits using the
+       '-k' option.
+
+       The Scrypt algorithm from Tarsnap is used
+       (See: http://www.tarsnap.com/scrypt.html) for generating keys from
+       passwords. The CTR mode AES mechanism from Tarsnap is also utilized.
+
+-w <pathname>
+       Provide a file which contains the encryption password. This file must
+       be readable and writable since it is zeroed out after the password is
+       read.
+
+-k <key length>
+       Specify the key length. Can be 16 for 128 bit keys or 32 for 256 bit
+       keys. Default value is 32 for 256 bit keys.
 
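For illustration, a minimal sketch of the password-to-key step described above,
calling the crypto_scrypt() entry point from the Tarsnap scrypt distribution.
The header name, salt handling and the N/r/p cost parameters are assumptions
for the example; the commit does not show the values Pcompress actually uses.

    #include <stdint.h>
    #include <string.h>
    #include "crypto_scrypt.h"  /* assumed header from the Tarsnap scrypt code */

    /* Derive a 16- or 32-byte key (matching the -k option) from a password. */
    int
    derive_key(const char *pw, const uint8_t *salt, size_t saltlen,
        uint8_t *key, size_t keylen)
    {
        /* Illustrative cost parameters: N=16384, r=8, p=1. */
        return (crypto_scrypt((const uint8_t *)pw, strlen(pw),
            salt, saltlen, 16384, 8, 1, key, keylen));
    }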
+
+Advanced usage
+==============
+A variety of advanced options are provided if one wishes fine grained control
+as opposed to automatic settings. If advanced options are used then auto-setting
+of parameters gets disabled. The various advanced options are discussed below.
+
+Attempt Polynomial fingerprinting based deduplication on a per-chunk basis:
 pcompress -D ...
 
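To make the idea concrete, here is a minimal sketch of content-defined block
splitting with a polynomial rolling hash, in the spirit of the fingerprinting
scheme named above. The window size, multiplier and boundary mask are
illustrative constants, not the ones Pcompress uses.

    #include <stdint.h>
    #include <stddef.h>

    #define WINDOW  16
    #define POLY    153191ULL               /* illustrative multiplier */
    #define MASK    ((1ULL << 12) - 1)      /* ~4KB average block size */

    /* Length of the first content-defined block in buf (buf must be non-empty). */
    size_t
    next_block_len(const uint8_t *buf, size_t len)
    {
        uint64_t roll = 0, out = 1;
        size_t i;

        for (i = 1; i < WINDOW; i++)        /* out = POLY^(WINDOW-1) (mod 2^64) */
            out *= POLY;

        for (i = 0; i < len; i++) {
            if (i >= WINDOW)                /* slide: drop byte leaving the window */
                roll -= out * buf[i - WINDOW];
            roll = roll * POLY + buf[i];
            if (i >= WINDOW && (roll & MASK) == MASK)
                return (i + 1);             /* hash hit the magic pattern: boundary */
        }
        return (len);                       /* no boundary; one block */
    }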
 Perform Delta Encoding in addition to Identical Dedup:
@@ -114,61 +285,39 @@ Usage
        effect greater final compression ratio at the cost of
        higher processing overhead.
 
-Number of threads can optionally be specified: -t <1 - 256 count>
-Other flags:
-'-L' - Enable LZP pre-compression. This improves compression ratio of all
+-L     Enable LZP pre-compression. This improves compression ratio of all
        algorithms with some extra CPU and very low RAM overhead. Using
        delta encoding in conjunction with this may not always be beneficial.
        However Adaptive Delta Encoding is beneficial along with this.
 
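A hedged sketch of the LZP idea behind '-L': a hash of the last few bytes
predicts where the same context last occurred, and bytes that match the
prediction are replaced by a short length code. Escaping of the flag byte and
all sizing constants are simplified here and are not Pcompress's.

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    #define HSIZE   (1 << 16)
    #define MINLEN  4
    #define FLAG    0xA5        /* marks a match; literal escaping omitted */

    size_t
    lzp_encode(const uint8_t *in, size_t n, uint8_t *out)
    {
        static size_t tab[HSIZE];   /* context hash -> last position seen */
        size_t i = 0, o = 0;

        memset(tab, 0, sizeof (tab));
        while (i < n) {
            size_t pred = 0, len = 0;

            if (i >= 3) {           /* hash the 3 preceding bytes */
                uint32_t h = (in[i-1] | in[i-2] << 8 | in[i-3] << 16)
                    * 2654435761u >> 16;
                pred = tab[h & (HSIZE - 1)];
                tab[h & (HSIZE - 1)] = i;
            }
            if (pred > 0)           /* compare against the predicted position */
                while (i + len < n && len < 255 &&
                    in[pred + len] == in[i + len])
                    len++;
            if (len >= MINLEN) {    /* emit (flag, length) */
                out[o++] = FLAG;
                out[o++] = (uint8_t)len;
                i += len;
            } else {
                out[o++] = in[i++]; /* literal */
            }
        }
        return (o);
    }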
-'-P' - Enable Adaptive Delta Encoding. It can improve compresion ratio further
+-P     Enable Adaptive Delta Encoding. It can improve compression ratio further
        for data containing tables of numerical values especially if those are
        in an arithmetic series. In this implementation basic Delta Encoding is
        combined with Run-Length encoding and Matrix transpose
 NOTE - Both -L and -P can be used together to give maximum benefit on most
        datasets.
 
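For concreteness, a minimal sketch of the arithmetic-series detection that
Adaptive Delta Encoding builds on: find a run with a common difference and
collapse it to (start, delta, count), which is what the Run-Length step then
stores. Stride detection and the matrix transpose stage are omitted; names are
hypothetical.

    #include <stdint.h>
    #include <stddef.h>

    struct delta_run {
        uint64_t start;     /* first value of the run */
        int64_t  delta;     /* common difference */
        size_t   count;     /* values collapsed into this run */
    };

    /* Longest arithmetic run at the head of v[0..n), n >= 1. */
    struct delta_run
    find_run(const uint64_t *v, size_t n)
    {
        struct delta_run r = { v[0], 0, 1 };

        if (n >= 2) {
            r.delta = (int64_t)(v[1] - v[0]);
            while (r.count < n &&
                (int64_t)(v[r.count] - v[r.count - 1]) == r.delta)
                r.count++;
        }
        return (r);
    }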
-'-S' <cksum>
-     - Specify chunk checksum to use:
-
-       CRC64     - Extremely Fast 64-bit CRC from LZMA SDK.
-       SHA256    - SHA512/256 version of Intel's optimized (SSE,AVX) SHA2 for x86.
-       SHA512    - SHA512 version of Intel's optimized (SSE,AVX) SHA2 for x86.
-       KECCAK256 - Official 256-bit NIST SHA3 optimized implementation.
-       KECCAK512 - Official 512-bit NIST SHA3 optimized implementation.
-       BLAKE256  - Very fast 256-bit BLAKE2, derived from the NIST SHA3
-                   runner-up BLAKE.
-       BLAKE512  - Very fast 256-bit BLAKE2, derived from the NIST SHA3
-                   runner-up BLAKE.
-
-'-F' - Perform Fixed Block Deduplication. This is faster than fingerprinting
+-F     Perform Fixed Block Deduplication. This is faster than fingerprinting
        based content-aware deduplication in some cases. However this is mostly
        usable for disk dumps especially virtual machine images. This generally
        gives lower dedupe ratio than content-aware dedupe (-D) and does not
        support delta compression.
 
-'-B' <0..5>
-     - Specify an average Dedupe block size. 0 - 2K, 1 - 4K, 2 - 8K ... 5 - 64K.
+-B <0..5>
+       Specify an average Dedupe block size. 0 - 2K, 1 - 4K, 2 - 8K ... 5 - 64K.
        Default deduplication block size is 4KB for Global Deduplication and 2KB
        otherwise.
-'-B' 0
-     - This uses blocks as small as 2KB for deduplication. This option can be
+-B 0
+       This uses blocks as small as 2KB for deduplication. This option can be
        used for datasets of a few GBs to a few hundred TBs in size depending on
        available RAM.
 
-Caveats:
-In some cases like LZMA with extreme compression levels and with '-L' and
-'-P' preprocessing enabled, this can result in lower compression as compared
-to using '-B 1'.
-For fast compression algorithms like LZ4 and Zlib this should always benefit.
-However please test on your sample data with your desired compression
-algorithm to verify the results.
+-M     Display memory allocator statistics.
+-C     Display compression statistics.
 
-'-M' - Display memory allocator statistics
-'-C' - Display compression statistics
-
-Global Deduplication:
-'-G' - This flag enables Global Deduplication. This makes pcompress maintain an
+Global Deduplication
+--------------------
+-G     This flag enables Global Deduplication. This makes pcompress maintain an
        in-memory index to lookup cryptographic block hashes for duplicates. Once
        a duplicate is found it is replaced with a reference to the original block.
        This allows detecting and eliminating duplicate blocks across the entire
@@ -203,34 +352,6 @@ Usage
 In pipe mode Global Deduplication always uses a segmented similarity based
 index. It allows efficient network transfer of large data.
 
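A minimal sketch of the in-memory index that Global Deduplication maintains, as
described above: a block's cryptographic digest is looked up, and a hit turns
the block into a reference to its first occurrence. Open addressing, the fixed
table size and the missing table-full path are simplifications; names are
hypothetical.

    #include <stdint.h>
    #include <string.h>

    #define IDX_SLOTS   (1 << 20)

    struct idx_ent {
        uint8_t  digest[32];    /* e.g. BLAKE256 of the block */
        uint64_t offset;        /* where the first copy lives */
        int      used;
    };

    static struct idx_ent index_tab[IDX_SLOTS];

    /* Offset of the original block, or UINT64_MAX if the block is new. */
    uint64_t
    dedupe_lookup_insert(const uint8_t *digest, uint64_t offset)
    {
        uint64_t h;

        memcpy(&h, digest, sizeof (h));     /* use digest bits as the hash */
        for (uint64_t i = h % IDX_SLOTS; ; i = (i + 1) % IDX_SLOTS) {
            if (!index_tab[i].used) {
                index_tab[i].used = 1;
                index_tab[i].offset = offset;
                memcpy(index_tab[i].digest, digest, 32);
                return (UINT64_MAX);        /* first occurrence */
            }
            if (memcmp(index_tab[i].digest, digest, 32) == 0)
                return (index_tab[i].offset);   /* duplicate found */
        }
    }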
-Encryption flags:
-'-e <ALGO>'
-       Encrypt chunks using the given encryption algorithm. The algo parameter
-       can be one of AES or SALSA20. Both are used in CTR stream encryption
-       mode.
-       The password can be prompted from the user or read from a file. Unique
-       keys are generated every time pcompress is run even when giving the same
-       password. Of course enough info is stored in the compresse file so that
-       the key used for the file can be re-created given the correct password.
-
-       Default key length if 256 bits but can be reduced to 128 bits using the
-       '-k' option.
-
-       The Scrypt algorithm from Tarsnap is used
-       (See: http://www.tarsnap.com/scrypt.html) for generating keys from
-       passwords. The CTR mode AES mechanism from Tarsnap is also utilized.
-
-'-w <pathname>'
-       Provide a file which contains the encryption password. This file must
-       be readable and writable since it is zeroed out after the password is
-       read.
-
-'-k <key length>'
-       Specify the key length. Can be 16 for 128 bit keys or 32 for 256 bit
-       keys. Default value is 32 for 256 bit keys.
-
-NOTE: When using pipe-mode via -p the only way to provide a password is to use '-w'.
 
 Environment Variables
 =====================
 
@@ -275,72 +396,32 @@ than SHA2.
 Examples
 ========
 
-Simple compress "file.tar" using zlib(gzip) algorithm. Default chunk or per-thread
+Archive contents of directory /usr/include into usr.pz. Default chunk or per-thread
 segment size is 8MB and default compression level is 6.
 
+pcompress -a /usr/include usr
+
+Archive the given list of files into file.pz at max compression level and all features
+enabled. A maximum chunk size of 20MB is used. Also use verbose mode which lists each
+file as it is processed.
+
+pcompress -a -v -l14 -s20m file1 file2 file3 file
+
+Simple compress "file.tar" using zlib(gzip) algorithm. Default chunk or per-thread
+segment size is 8MB and default compression level is 6. Output file created will be
+file.tar.pz
+
 pcompress -c zlib file.tar
 
-Compress "file.tar" using bzip2 level 6, 64MB chunk size and use 4 threads. In
-addition perform identity deduplication and delta compression prior to compression.
+Simple compress "file.tar" using zlib(gzip) algorithm with output file file.compressed.pz
 
-pcompress -D -E -c bzip2 -l6 -s64m -t4 file.tar
+pcompress -c zlib file.tar file.compressed
 
-Compress "file.tar" using zlib and also perform Global Deduplication. Default block
-size used for deduplication is 4KB. Also redirect the compressed output to stdout and
-send it to a compressed file at a different path.
+Compress "file.tar" using Zlib and per-thread chunk or segment size of 10MB and
+Compression level 9. Compressed output is sent to stdout using '-' which is then
+redirected to a file.
 
-pcompress -G -c zlib -l9 -s10m file.tar - > /path/to/compress_file.tar.pz
+pcompress -c zlib -l9 -s10m file.tar - > /path/to/compress_file.tar.pz
 
-Perform the same as above but this time use a deduplication block size of 8KB.
-
-pcompress -G -c zlib -l9 -B2 -s10m file.tar - > /path/to/compress_file.tar.pz
-
-Compress "file.tar" using extreme compression mode of LZMA and a chunk size of
-1GB. Allow pcompress to detect the number of CPU cores and use as many threads.
-
-pcompress -c lzma -l14 -s1g file.tar
-
-Compress "file.tar" using lz4 at max compression with LZ-Prediction pre-processing
-and encryption enabled. Chunksize is 100M:
-
-pcompress -c lz4 -l3 -e -L -s100m file.tar
-
-Compression Algorithms
-======================
-
-LZFX   - Ultra Fast, average compression. This algorithm is the fastest overall.
-         Levels: 1 - 5
-LZ4    - Very Fast, better compression than LZFX.
-         Levels: 1 - 3
-Zlib   - Fast, better compression.
-         Levels: 1 - 9
-Bzip2  - Slow, much better compression than Zlib.
-         Levels: 1 - 9
-
-LZMA   - Very slow. Extreme compression.
-         Levels: 1 - 14
-         Till level 9 it is standard LZMA parameters. Levels 10 - 12 use
-         more memory and higher match iterations so are slower. Levels
-         13 and 14 use larger dictionaries upto 256MB and really suck up
-         RAM. Use these levels only if you have at the minimum 4GB RAM on
-         your system.
-
-PPMD   - Slow. Extreme compression for Text, average compression for binary.
-         In addition PPMD decompression time is also high for large chunks.
-         This requires lots of RAM similar to LZMA.
-         Levels: 1 - 14.
-
-Adapt  - Synthetic mode with text/binary detection. For pure text data PPMD is
-         used otherwise Bzip2 is selected per chunk.
-         Levels: 1 - 14
-Adapt2 - Slower synthetic mode. For pure text data PPMD is otherwise LZMA is
-         applied. Can give very good compression ratio when splitting file
-         into multiple chunks.
-         Levels: 1 - 14
-         Since both LZMA and PPMD are used together memory requirements are
-         large especially if you are also using extreme levels above 10. For
-         example with 100MB chunks, Level 14, 2 threads and with or without
-         dedupe, it uses upto 2.5GB physical RAM (RSS).
-
 It is possible for a single chunk to span the entire file if enough RAM is
 available. However for adaptive modes to be effective for large files, especially
@@ -356,21 +437,21 @@ algorithms can achieve by themselves. These are summarized below:
 1) Deduplication     : Per-Chunk (or per-segment) deduplication based on Rabin
                        fingerprinting.
 
-2) Delta Compression : A similarity based (minhash) comparison of Rabin blocks. Two
-                       blocks at least 60% similar with each other are diffed using
-                       bsdiff.
+2) Delta Compression : A similarity based (minhash) comparison of Rabin blocks.
+                       Two blocks at least 60% similar with each other are diffed
+                       using bsdiff.
 
 3) LZP               : LZ Prediction is a variant of LZ77 that replaces repeating
                        runs of text with shorter codes.
 
 4) Adaptive Delta    : This is a simple form of Delta Encoding where arithmetic
-                       progressions are detected in the data stream and collapsed
-                       via Run-Length encoding.
+                       progressions are detected in the data stream and
+                       collapsed via Run-Length encoding.
 
-4) Matrix Transpose  : This is used automatically in Delta Encoding and Deduplication.
-                       This attempts to transpose columnar repeating sequences of
-                       bytes into row-wise sequences so that compression algorithms
-                       can work better.
+5) Matrix Transpose  : This is used automatically in Delta Encoding and
+                       Deduplication. This attempts to transpose columnar
+                       repeating sequences of bytes into row-wise sequences so
+                       that compression algorithms can work better.
 
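A minimal sketch of the MinHash similarity test behind item 2 above: each block
keeps the K smallest fingerprints it produced, and the overlap of two such
sorted sketches estimates block similarity (around 0.6 and above, the pair
becomes a bsdiff candidate). K and the sketch construction are illustrative.

    #include <stdint.h>
    #include <stddef.h>

    #define K 20    /* sketch size; illustrative */

    /* a and b each hold K fingerprints sorted ascending. */
    double
    minhash_similarity(const uint64_t *a, const uint64_t *b)
    {
        size_t i = 0, j = 0, common = 0;

        while (i < K && j < K) {    /* sorted-merge intersection */
            if (a[i] == b[j]) { common++; i++; j++; }
            else if (a[i] < b[j]) i++;
            else j++;
        }
        return ((double)common / K);
    }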
 Memory Usage
 ============
@@ -230,8 +230,9 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
 	struct adapt_data *adat = (struct adapt_data *)(data);
 	uchar_t *src1 = (uchar_t *)src;
 	int rv = 0, bsc_type = 0;
+	int stype = PC_SUBTYPE(btype);
 
-	if (btype == TYPE_UNKNOWN) {
+	if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR) {
 		uint64_t i, tot8b, tag1, tag2, tag3;
 		double tagcnt, pct_tag;
 		uchar_t cur_byte, prev_byte;
@@ -267,6 +268,29 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
 		    tagcnt > (double)srclen * 0.001)
 			btype |= TYPE_MARKUP;
 		}
+
+	} else if (stype == TYPE_PDF) {
+		uint64_t i, tot8b;
+		uchar_t cur_byte;
+
+		/*
+		 * For PDF files we need to check for uncompressed PDFs. Those are
+		 * compressed using Libbsc.
+		 */
+		tot8b = 0;
+		for (i = 0; i < srclen; i++) {
+			cur_byte = src1[i];
+			tot8b += (cur_byte & 0x80);
+		}
+
+		tot8b /= 0x80;
+		if (adat->adapt_mode == 2 && tot8b > FORTY_PCT(srclen)) {
+			btype = TYPE_BINARY;
+		} else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) {
+			btype = TYPE_BINARY;
+		} else {
+			btype = TYPE_TEXT|TYPE_MARKUP;
+		}
 	}
 
 	/*
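Restated standalone, the heuristic this hunk applies to PDF data: sum the 0x80
bit over all bytes, divide by 0x80 to get the count of high-bit bytes, and
treat the buffer as binary past a percentage threshold (40% for adapt2, 50%
for adapt). The function name is hypothetical and the FORTY_PCT/FIFTY_PCT
macros are expanded into plain arithmetic.

    #include <stdint.h>
    #include <stddef.h>

    int
    looks_binary(const uint8_t *buf, size_t len, int adapt_mode)
    {
        uint64_t tot8b = 0;
        size_t i;

        for (i = 0; i < len; i++)
            tot8b += (buf[i] & 0x80);   /* adds 0x80 per high-bit byte */
        tot8b /= 0x80;                  /* now a count of such bytes */

        if (adapt_mode == 2)
            return (tot8b > (uint64_t)len * 40 / 100);
        return (tot8b > (uint64_t)len * 50 / 100);
    }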
@@ -46,6 +46,7 @@ struct filter_info {
 	struct archive_entry *entry;
 	int fd;
 	int compressing, block_size;
+	int *type_ptr;
 };
 
 struct filter_flags {
@@ -842,7 +842,7 @@ setup_extractor(pc_ctx_t *pctx)
 }
 
 static ssize_t
-process_by_filter(int fd, int typ, struct archive *target_arc,
+process_by_filter(int fd, int *typ, struct archive *target_arc,
     struct archive *source_arc, struct archive_entry *entry, int cmp)
 {
 	struct filter_info fi;
@@ -854,10 +854,11 @@ process_by_filter(int fd, int typ, struct archive *target_arc,
 	fi.fd = fd;
 	fi.compressing = cmp;
 	fi.block_size = AW_BLOCK_SIZE;
-	wrtn = (*(typetab[(typ >> 3)].filter_func))(&fi, typetab[(typ >> 3)].filter_private);
+	fi.type_ptr = typ;
+	wrtn = (*(typetab[(*typ >> 3)].filter_func))(&fi, typetab[(*typ >> 3)].filter_private);
 	if (wrtn == FILTER_RETURN_ERROR) {
 		log_msg(LOG_ERR, 0, "Error invoking filter module: %s",
-		    typetab[(typ >> 3)].filter_name);
+		    typetab[(*typ >> 3)].filter_name);
 	}
 	return (wrtn);
 }
@@ -890,7 +891,8 @@ copy_file_data(pc_ctx_t *pctx, struct archive *arc, struct archive_entry *entry,
 	if (typetab[(typ >> 3)].filter_func != NULL) {
 		int64_t rv;
 
-		rv = process_by_filter(fd, typ, arc, NULL, entry, 1);
+		pctx->ctype = typ;
+		rv = process_by_filter(fd, &(pctx->ctype), arc, NULL, entry, 1);
 		if (rv == FILTER_RETURN_ERROR) {
 			close(fd);
 			return (-1);
@@ -934,7 +936,7 @@ do_map:
 		int64_t rv;
 		munmap(mapbuf, len);
 
-		rv = process_by_filter(fd, typ, arc, NULL, entry, 1);
+		rv = process_by_filter(fd, &(pctx->ctype), arc, NULL, entry, 1);
 		if (rv == FILTER_RETURN_ERROR) {
 			return (-1);
 		} else if (rv == FILTER_RETURN_SKIP) {
@@ -1149,7 +1151,7 @@ copy_data_out(struct archive *ar, struct archive *aw, struct archive_entry *entr
 	if (typetab[(typ >> 3)].filter_func != NULL) {
 		int64_t rv;
 
-		rv = process_by_filter(-1, typ, aw, ar, entry, 0);
+		rv = process_by_filter(-1, &typ, aw, ar, entry, 0);
 		if (rv == FILTER_RETURN_ERROR) {
 			archive_set_error(ar, archive_errno(aw),
 			    "%s", archive_error_string(aw));
@@ -1231,6 +1233,7 @@ extractor_thread_func(void *dat) {
 	 * Extract all security attributes if we are root.
 	 */
 	if (pctx->force_archive_perms || geteuid() == 0) {
+		if (geteuid() == 0)
 			flags |= ARCHIVE_EXTRACT_OWNER;
 		flags |= ARCHIVE_EXTRACT_PERM;
 		flags |= ARCHIVE_EXTRACT_ACL;
@@ -1475,17 +1478,21 @@ out:
  * Detect a few file types from looking at magic signatures.
  * NOTE: Jpeg files must be detected via '.jpg' or '.jpeg' (case-insensitive)
  *       extensions. Do not add Jpeg header detection here. it will break
- *       context based PackJPG processing. Jpeg files not have proper
+ *       context based PackJPG processing. Jpeg files not having proper
  *       extension must not be processed via PackJPG.
  */
 static int
 detect_type_by_data(uchar_t *buf, size_t len)
 {
 	// At least a few bytes.
-	if (len < 16) return (TYPE_UNKNOWN);
+	if (len < 512) return (TYPE_UNKNOWN);
 
 	if (memcmp(buf, "!<arch>\n", 8) == 0)
 		return (TYPE_BINARY|TYPE_ARCHIVE_AR);
+	if (memcmp(&buf[257], "ustar\0", 6) == 0 || memcmp(&buf[257], "ustar\040\040\0", 8) == 0)
+		return (TYPE_BINARY|TYPE_ARCHIVE_TAR);
+	if (memcmp(buf, "%PDF-", 5) == 0)
+		return (TYPE_BINARY|TYPE_PDF);
 	if (U32_P(buf) == ELFINT) { // Regular ELF, check for 32/64-bit, core dump
 		if (*(buf + 16) != 4) {
 			if (*(buf + 4) == 2) {
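The two new signature checks, condensed into a standalone sketch: POSIX tar
places "ustar" at offset 257 of the first 512-byte header block (with a GNU
variant spelling), and PDF files start with "%PDF-". The function name is
hypothetical; the TYPE_* constants are the ones this commit uses from the
pcompress headers.

    #include <string.h>
    #include <stddef.h>

    static int
    detect_tar_or_pdf(const unsigned char *buf, size_t len)
    {
        if (len < 512)      /* need a whole tar header block */
            return (TYPE_UNKNOWN);
        if (memcmp(&buf[257], "ustar\0", 6) == 0 ||
            memcmp(&buf[257], "ustar\040\040\0", 8) == 0)
            return (TYPE_BINARY|TYPE_ARCHIVE_TAR);
        if (memcmp(buf, "%PDF-", 5) == 0)
            return (TYPE_BINARY|TYPE_PDF);
        return (TYPE_UNKNOWN);
    }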
pcompress.c (11 lines changed)
@@ -2832,7 +2832,7 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[])
 	ff.enable_packjpg = 0;
 
 	pthread_mutex_lock(&opt_parse);
-	while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDGEe:w:LPS:B:Fk:avnmK")) != -1) {
+	while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDGEe:w:LPS:B:Fk:avmK")) != -1) {
 		int ovr;
 		int64_t chunksize;
 
@@ -2982,10 +2982,6 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[])
 			pctx->verbose = 1;
 			break;
 
-		case 'n':
-			pctx->enable_archive_sort = -1;
-			break;
-
 		case 'm':
 			pctx->force_archive_perms = 1;
 			break;
@@ -3023,6 +3019,11 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[])
 		return (1);
 	}
 
+	if (pctx->archive_mode && pctx->pipe_mode) {
+		log_msg(LOG_ERR, 0, "Full pipeline mode is meaningless with archiver.");
+		return (1);
+	}
+
 	/*
 	 * Default compression algorithm during archiving is Adaptive2.
 	 */
@@ -236,7 +236,7 @@ typedef struct pc_ctx {
 	uchar_t *arc_buf;
 	uint64_t arc_buf_size, arc_buf_pos;
 	int arc_closed, arc_writing;
-	uchar_t btype, ctype;
+	int btype, ctype;
 	int min_chunk;
 	int enable_packjpg;
@@ -246,7 +246,7 @@ typedef enum {
 /*
  * Sub-types.
  */
-#define	NUM_SUB_TYPES	26
+#define	NUM_SUB_TYPES	28
 	TYPE_EXE32 = 8,
 	TYPE_JPEG = 16,
 	TYPE_MARKUP = 24,
@@ -272,7 +272,9 @@ typedef enum {
 	TYPE_AUDIO_COMPRESSED = 184,
 	TYPE_EXE64 = 192,
 	TYPE_BMP = 200,
-	TYPE_TIFF = 208
+	TYPE_TIFF = 208,
+	TYPE_PDF = 216,
+	TYPE_ARCHIVE_TAR = 224
 } data_type_t;
 
 /*