From ea345a902a212498b978b9fa445c1a63a3983027 Mon Sep 17 00:00:00 2001
From: Moinak Ghosh
Date: Mon, 30 Dec 2013 23:24:37 +0530
Subject: [PATCH] Overhaul documentation part #1

Detect and handle uncompressed PDF files using libbsc.
Force binary/text data detection for tar archives.
Get rid of unnecessary CLI option.
Add full pipeline mode check when archiving.
---
 README.md               | 665 ++++++++++++++++++++++------------------
 adaptive_compress.c     |  26 +-
 archive/pc_arc_filter.h |   1 +
 archive/pc_archive.c    |  25 +-
 pcompress.c             |  11 +-
 pcompress.h             |   2 +-
 utils/utils.h           |   6 +-
 7 files changed, 426 insertions(+), 310 deletions(-)

diff --git a/README.md b/README.md
index 4a6092b..a527035 100644
--- a/README.md
+++ b/README.md
@@ -6,31 +6,32 @@ Use is subject to license terms.
 moinakg (_at) gma1l _dot com.
 Comments, suggestions, code, rants etc are welcome.
 
-Pcompress is a utility to do compression and decompression in parallel by
-splitting input data into chunks. It has a modular structure and includes
-support for multiple algorithms like LZMA, Bzip2, PPMD, etc, with SKEIN/
-SHA checksums for data integrity. It can also do Lempel-Ziv pre-compression
-(derived from libbsc) to improve compression ratios across the board. SSE
-optimizations for the bundled LZMA are included. It also implements
-Variable Block Deduplication and Delta Compression features based on a
-Semi-Rabin Fingerprinting scheme. Delta Compression is done via the widely
-popular bsdiff algorithm. Similarity is detected using a technique based
-on MinHashing. When doing Dedupe it attempts to merge adjacent non-
-duplicate block index entries into a single larger entry to reduce metadata.
-In addition to all these it can internally split chunks at rabin boundaries
-to help Dedupe and compression.
+Pcompress is an archiver that also does compression and decompression in
+parallel by splitting input data into chunks. It has a modular structure
+and includes support for multiple algorithms like LZMA, Bzip2, PPMD, etc,
+with SKEIN/SHA checksums for data integrity. Compression algorithms are
+selected based on the file type to maximize compression gains using a file
+and data analysis based adaptive technique. It also includes various data
+transformation filters to improve compression.
+
+It also implements Variable Block Deduplication and Delta Compression
+features based on a Polynomial Fingerprinting scheme. Delta Compression
+is done via the widely popular bsdiff algorithm. Similarity is detected
+using a technique based on MinHashing. Deduplication metadata is also
+compressed to reduce overheads. In addition to all these it can internally
+split chunks at file and rabin boundaries to help Dedupe and compression.
 
 It has low metadata overhead and overlaps I/O and compression to achieve
 maximum parallelism. It also bundles a simple slab allocator to speed
 repeated allocation of similar chunks. It can work in pipe mode, reading
-from stdin and writing to stdout. It also provides adaptive compression
-modes in which data analysis heuristics are used to identify near-optimal
-algorithms per chunk. Finally it supports 14 compression levels to allow
-for ultra compression parameters in some algorithms.
+from stdin and writing to stdout. SIMD vector optimizations using the x86
+SSE instruction set are used to speed up various operations. Finally it
+supports 14 compression levels to allow for ultra compression parameters
+in some algorithms.
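To make the chunking idea above concrete, here is a small, self-contained sketch of
content-defined chunking driven by a rolling polynomial fingerprint. It is an
illustration only, not pcompress's implementation: the window size, multiplier, mask
and block-size limits are invented example values, and the real semi-rabin/polynomial
scheme uses its own parameters and arithmetic.

/*
 * Minimal content-defined chunking sketch: a rolling polynomial fingerprint
 * decides block boundaries, so identical data shifted by a few bytes still
 * produces the same blocks for deduplication.  All constants are example
 * values, not the ones pcompress uses.
 */
#include <stdio.h>
#include <stdint.h>

#define WIN_SZ   16                     /* sliding window length */
#define POLY     0x3DA3358B4DC173ULL    /* example multiplier */
#define AVG_MASK 0x0FFF                 /* boundary when low 12 bits are zero (~4KB avg) */
#define MIN_BLK  2048
#define MAX_BLK  (64 * 1024)

static void
chunk_buffer(const uint8_t *buf, size_t len)
{
	uint64_t fp = 0, msb_pow = 1;
	size_t last = 0, i;
	int j;

	for (j = 0; j < WIN_SZ - 1; j++)
		msb_pow *= POLY;                   /* POLY^(WIN_SZ-1), weight of the oldest byte */

	for (i = 0; i < len; i++) {
		if (i >= WIN_SZ)
			fp -= msb_pow * buf[i - WIN_SZ];   /* drop the byte leaving the window */
		fp = fp * POLY + buf[i];                   /* add the newest byte */

		if (i - last + 1 < MIN_BLK)
			continue;
		if ((fp & AVG_MASK) == 0 || i - last + 1 >= MAX_BLK) {
			printf("block at %zu, length %zu\n", last, i - last + 1);
			last = i + 1;
		}
	}
	if (last < len)
		printf("block at %zu, length %zu\n", last, len - last);
}

int
main(int argc, char **argv)
{
	static uint8_t buf[8 * 1024 * 1024];
	FILE *f = (argc > 1) ? fopen(argv[1], "rb") : stdin;
	size_t n;

	if (f == NULL) {
		perror("fopen");
		return (1);
	}
	n = fread(buf, 1, sizeof (buf), f);
	chunk_buffer(buf, n);
	if (f != stdin)
		fclose(f);
	return (0);
}

Because a boundary depends only on the bytes inside the sliding window, inserting or
deleting a few bytes near the start of a file shifts block offsets but leaves most
block contents, and therefore their hashes, unchanged. That is what makes
variable-block deduplication effective.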
-Pcompress also supports encryption via AES and uses Scrypt from Tarsnap
-for Password Based Key generation. A unique key is generated per session
-even if the same password is used and HMAC is used to do authentication.
+Pcompress also supports encryption via AES, Salsa20 and uses Scrypt from
+Tarsnap for Password Based Key generation. A unique key is generated per
+session even if the same password is used and HMAC is used to do authentication.
 
 Links of Interest
 =================
@@ -49,59 +50,229 @@ http://moinakg.wordpress.com/2013/06/11/architecture-for-a-deduplicated-archival
 http://moinakg.wordpress.com/2013/06/15/architecture-for-a-deduplicated-archival-store-part-2/
 
-Usage
-=====
+Standard Usage
+==============
+    Standard usage consists of a few common options to control basic behavior. A variety of
+    parameters including global deduplication are automatically set based on the compression
+    level.
 
-    To compress a file:
-       pcompress -c <algorithm> [-l <compress level>] [-s <chunk size>] <file> [<target file>]
+    Archiving
+    ---------
+       pcompress -a [-v] [-l <compress level>] [-s <chunk size>] [-c <algorithm>]
+                 <file/directory list ...> [-t <thread count>] [-S <chunk checksum>]
+                 <archive filename>
 
-    Where <algorithm> can be the folowing:
-    lzfx   - Very fast and small algorithm based on LZF.
-    lz4    - Ultra fast, high-throughput algorithm reaching RAM B/W at level1.
-    zlib   - The base Zlib format compression (not Gzip).
-    lzma   - The LZMA (Lempel-Ziv Markov) algorithm from 7Zip.
-    lzmaMt - Multithreaded version of LZMA. This is a faster version but
-             uses more memory for the dictionary. Thread count is balanced
-             between chunk processing threads and algorithm threads.
-    bzip2  - Bzip2 Algorithm from libbzip2.
-    ppmd   - The PPMd algorithm excellent for textual data. PPMd requires
-             at least 64MB X core-count more memory than the other modes.
+       Archives a given set of files and/or directories into a compressed PAX archive. The
+       PAX datastream is encoded into a custom format compressed file that can only be
+       handled by Pcompress.
 
-    libbsc - A Block Sorting Compressor using the Burrows Wheeler Transform
-             like Bzip2 but runs faster and gives better compression than
-             Bzip2 (See: libbsc.com).
+       -a       Enables archive mode where pathnames specified in the command line are
+                archived using LibArchive and then compressed.
 
-    adapt  - Adaptive mode where ppmd or bzip2 will be used per chunk,
-             depending on heuristics. If at least 50% of the input data is
-             7-bit text then PPMd will be used otherwise Bzip2.
-    adapt2 - Adaptive mode which includes ppmd and lzma. If at least 80% of
-             the input data is 7-bit text then PPMd will be used otherwise
-             LZMA. It has significantly more memory usage than adapt.
-    none   - No compression. This is only meaningful with -D and -E so Dedupe
-             can be done for post-processing with an external utility.
+       -l <compress level>
+                Select a compression level from 1 (least compression, fastest) to 14
+                (ultra compression, slow). Default: 6
 
-    <chunk size> - This can be in bytes or can use the following suffixes:
-            g - Gigabyte, m - Megabyte, k - Kilobyte.
-            Larger chunks produce better compression at the cost of memory.
-            In case of Global Deduplication (see below) this chunk size is
-            just a hint and may get automatically adjusted.
-    <compress level> - Can be a number from 0 meaning minimum and 14 meaning
-            maximum compression.
-    <target file> - Optional argument specifying the destination compressed
-            file. The '.pz' extension is appended. If this is '-' then
-            compressed output goes to stdout. If this argument is omitted then
-            source filename is used with the extension '.pz' appended.
+       -s <chunk size>
+                Archive data is split into chunks that are processed in parallel. This value
+                specifies the maximum chunk size.
+                Blocks may be smaller than this. Values can be in bytes or in
+                <number><suffix> format where suffix can be k - KB, m - MB, g - GB. Default: 8m
+                Larger chunks can produce better compression at the cost of memory.
 
-    To decompress a file compressed using above command:
-       pcompress -d <compressed file> <target file>
-
-       <compressed file> can be '-' to indicate reading from stdin while write goes to <target file>
+       -c <algorithm>
+                Specifies the compression algorithm to use. Default algorithm when archiving
+                is adapt2 (Second Adaptive Mode). This is the ideal mode for archiving giving
+                best compression gains. However adapt (Adaptive Mode) can be used which is a
+                little faster but gives lower compression gains.
+                Other algorithms can be used if all the files are of the same known type. For
+                example ppmd (slow) or libbsc (fast) can be used if all the files only have
+                ASCII text. See section "Compression Algorithms" for details.
 
-    To operate as a full pipe, read from stdin and write to stdout:
-       pcompress -p ...
+       -v       Enables verbose mode where each file/directory is printed as it is processed.
 
-    Attempt Rabin fingerprinting based deduplication on a per-chunk basis:
+       -t <thread count>
+                Sets the number of threads that Pcompress can use. Pcompress automatically
+                uses thread count = core count. However with larger chunk size (-s option)
+                and/or ultra compression levels, large amounts of memory can be used. In this
+                case thread count can be reduced to reduce memory consumption.
+
+       -S <chunk checksum>
+                Specify the chunk checksum to use. Default: BLAKE256. The following checksums
+                are available:
+
+                CRC64     - Extremely Fast 64-bit CRC from LZMA SDK.
+                SHA256    - SHA512/256 version of Intel's optimized (SSE,AVX) SHA2 for x86.
+                SHA512    - SHA512 version of Intel's optimized (SSE,AVX) SHA2 for x86.
+                KECCAK256 - Official 256-bit NIST SHA3 optimized implementation.
+                KECCAK512 - Official 512-bit NIST SHA3 optimized implementation.
+                BLAKE256  - Very fast 256-bit BLAKE2, derived from the NIST SHA3
+                            runner-up BLAKE.
+                BLAKE512  - Very fast 512-bit BLAKE2, derived from the NIST SHA3
+                            runner-up BLAKE.
+
+                The fastest checksum is the BLAKE2 family.
+
+       <archive filename>
+                Pathname of the resulting archive. A '.pz' extension is automatically added
+                if not already present. This can also be specified as '-' in order to send
+                the compressed archive stream to stdout.
+
+    Single File Compression
+    -----------------------
+       pcompress -c <algorithm> [-l <compress level>] [-s <chunk size>] [-p] [<file>]
+                 [-t <thread count>] [-S <chunk checksum>] [<target file>]
+
+       Takes a single file as input and produces a compressed file. Archiving is not performed.
+       This can also work as a compression pipeline.
+
+       -c <algorithm>
+                See above. Also see section "Compression Algorithms" for details.
+
+       -l <compress level>
+       -s <chunk size>
+       -t <thread count>
+       -S <chunk checksum>
+                See above.
+                Note: In single file compression mode with adapt2 or adapt algorithm, larger
+                      chunks may not produce better compression. Smaller chunks can result
+                      in better data analysis here.
+
+       -p       Make Pcompress work in full pipeline mode. Data is ingested via stdin,
+                compressed and output via stdout. No filenames are used.
+
+       <target file>
+                Pathname of the compressed file to be created. This can be '-' to send the
+                compressed data to stdout.
+
+    Decompression and Archive extraction
+    ------------------------------------
+       pcompress -d <compressed file or archive> [-m] [-K] [<target file or directory>]
+
+       -m       Enable restoring *all* permissions, ACLs, Extended Attributes etc.
+                Equivalent to the '-p' option in tar. Ownership is only extracted if run as
+                root user.
+
+       -K       Do not overwrite newer files.
+
+       -m and -K are only meaningful if the compressed file is an archive. For single file
+       compressed mode these options are ignored.
+
+       <compressed file or archive>
+                Specifies the compressed file or archive.
+                This can be '-' to indicate reading from stdin while write goes to
+                <target file or directory>.
+
+       <target file or directory>
+                This can be a filename or a directory depending on how the archive was created.
+                If single file compression was used then this can be the name of the target
+                file that will hold the uncompressed data.
+                If this is omitted then an output file is created by appending '.out' to the
+                compressed filename.
+
+                If Archiving was done then this should be the name of a directory into which
+                extracted files are restored. The directory is created if it does not exist.
+                If this is omitted the files are extracted into the current directory.
+
+Compression Algorithms
+======================
+
+       lzfx    - Fast, average compression. At high compression levels this can be faster
+                 than LZ4.
+                 Effective Levels: 1 - 5
+       lz4     - Very Fast, sometimes better compression than LZFX.
+                 Effective Levels: 1 - 3
+       zlib    - Fast, better compression.
+                 Effective Levels: 1 - 9
+       bzip2   - Slow, much better compression than Zlib.
+                 Effective Levels: 1 - 9
+
+       lzma    - Very slow. Extreme compression. Recommended: Use lzmaMt variant mentioned
+                 below.
+                 Effective Levels: 1 - 14
+                 Till level 9 it is standard LZMA parameters. Levels 10 - 12 use
+                 more memory and higher match iterations so are slower. Levels
+                 13 and 14 use larger dictionaries upto 256MB and really suck up
+                 RAM. Use these levels only if you have at the minimum 4GB RAM on
+                 your system.
+       lzmaMt  - This is the multithreaded variant of lzma and typically runs faster.
+                 However in a few cases this can produce slightly lesser compression
+                 gain.
+
+       PPMD    - Slow. Extreme compression for Text, average compression for binary.
+                 In addition PPMD decompression time is also high for large chunks.
+                 This requires lots of RAM similar to LZMA. PPMd requires
+                 at least 64MB X core-count more memory than the other modes.
+                 Effective Levels: 1 - 14.
+
+       Adapt   - Synthetic mode with text/binary detection. For pure text data PPMD is
+                 used otherwise Bzip2 is selected per chunk.
+                 Effective Levels: 1 - 14
+       Adapt2  - Slower synthetic mode. For pure text data PPMD is used otherwise LZMA is
+                 applied. Can give very good compression ratio when splitting file
+                 into multiple chunks.
+                 Effective Levels: 1 - 14
+                 Since both LZMA and PPMD are used together memory requirements are
+                 large especially if you are also using extreme levels above 10. For
+                 example with 100MB chunks, Level 14, 2 threads and with or without
+                 dedupe, it uses upto 2.5GB physical RAM (RSS).
+
+       none    - No compression. This is only meaningful with -G or -D so Dedupe
+                 can be done for post-processing with an external utility.
+
+Enabled features based on Compression Level
+===========================================
+
+       1 to 3  - No features, just compression and archiving, if needed.
+       4       - Global Deduplication with avg block size of 8KB.
+       5       - Global Dedup block size 8KB, Adaptive Delta Encoding.
+       6 to 8  - Global Dedup block size reduced to 4KB, Adaptive Delta Encoding.
+       9       - Global Dedup block size reduced to 2KB, Adaptive Delta Encoding, Dispack.
+       10      - Global Dedup block size 2KB, Adaptive Delta Encoding with extra rounds, Dispack,
+                 LZP Preprocessing.
+       11 - 14 - Global Dedup block size 2KB, Adaptive Delta Encoding with extra rounds, Dispack,
+                 LZP Preprocessing, PackJPG filter for Jpegs.
+
+Encryption
+==========
+       Pcompress supports encryption and authentication in both archive and single-file
+       compression modes. Encryption options are discussed below.
+
+       NOTE: When using pipe-mode via -p the only way to provide a password is to use '-w'.
+             See below.
+
+       -e <algorithm>
+                Encrypt chunks using the given encryption algorithm. The algo parameter
+                can be one of AES or SALSA20. Both are used in CTR stream encryption
+                mode.
+                The password can be prompted from the user or read from a file. Unique
+                keys are generated every time pcompress is run even when giving the same
+                password. Of course enough info is stored in the compressed file so that
+                the key used for the file can be re-created given the correct password.
+
+                Default key length is 256 bits but can be reduced to 128 bits using the
+                '-k' option.
+
+                The Scrypt algorithm from Tarsnap is used
+                (See: http://www.tarsnap.com/scrypt.html) for generating keys from
+                passwords. The CTR mode AES mechanism from Tarsnap is also utilized.
+
+       -w <pathname of password file>
+                Provide a file which contains the encryption password. This file must
+                be readable and writable since it is zeroed out after the password is
+                read.
+
+       -k <key length>
+                Specify the key length. Can be 16 for 128 bit keys or 32 for 256 bit
+                keys. Default value is 32 for 256 bit keys.
+
+
+Advanced usage
+==============
+       A variety of advanced options are provided if one wishes fine-grained control
+       as opposed to automatic settings. If advanced options are used then auto-setting
+       of parameters gets disabled. The various advanced options are discussed below.
+
+       Attempt Polynomial fingerprinting based deduplication on a per-chunk basis:
        pcompress -D ...
 
        Perform Delta Encoding in addition to Identical Dedup:
@@ -114,233 +285,143 @@ Usage
        effect greater final compression ratio at the cost of higher processing
        overhead.
 
-       Number of threads can optionally be specified: -t <1 - 256 count>
-       Other flags:
 
-       '-L' - Enable LZP pre-compression. This improves compression ratio of all
-              algorithms with some extra CPU and very low RAM overhead. Using
-              delta encoding in conjunction with this may not always be beneficial.
-              However Adaptive Delta Encoding is beneficial along with this.
+       -L     Enable LZP pre-compression. This improves compression ratio of all
+              algorithms with some extra CPU and very low RAM overhead. Using
+              delta encoding in conjunction with this may not always be beneficial.
+              However Adaptive Delta Encoding is beneficial along with this.
 
-       '-P' - Enable Adaptive Delta Encoding. It can improve compresion ratio further
-              for data containing tables of numerical values especially if those are
-              in an arithmetic series. In this implementation basic Delta Encoding is
-              combined with Run-Length encoding and Matrix transpose
-       NOTE - Both -L and -P can be used together to give maximum benefit on most
-              datasets.
+       -P     Enable Adaptive Delta Encoding. It can improve compression ratio further
+              for data containing tables of numerical values especially if those are
+              in an arithmetic series. In this implementation basic Delta Encoding is
+              combined with Run-Length encoding and Matrix transpose.
+       NOTE - Both -L and -P can be used together to give maximum benefit on most
+              datasets.
 
-       '-S' <checksum>
-            - Specify chunk checksum to use:
+       -F     Perform Fixed Block Deduplication. This is faster than fingerprinting
+              based content-aware deduplication in some cases. However this is mostly
+              usable for disk dumps especially virtual machine images. This generally
+              gives lower dedupe ratio than content-aware dedupe (-D) and does not
+              support delta compression.
 
-           CRC64     - Extremely Fast 64-bit CRC from LZMA SDK.
-           SHA256    - SHA512/256 version of Intel's optimized (SSE,AVX) SHA2 for x86.
-           SHA512    - SHA512 version of Intel's optimized (SSE,AVX) SHA2 for x86.
- KECCAK256 - Official 256-bit NIST SHA3 optimized implementation. - KECCAK512 - Official 512-bit NIST SHA3 optimized implementation. - BLAKE256 - Very fast 256-bit BLAKE2, derived from the NIST SHA3 - runner-up BLAKE. - BLAKE512 - Very fast 256-bit BLAKE2, derived from the NIST SHA3 - runner-up BLAKE. - - '-F' - Perform Fixed Block Deduplication. This is faster than fingerprinting - based content-aware deduplication in some cases. However this is mostly - usable for disk dumps especially virtual machine images. This generally - gives lower dedupe ratio than content-aware dedupe (-D) and does not - support delta compression. - - '-B' <0..5> - - Specify an average Dedupe block size. 0 - 2K, 1 - 4K, 2 - 8K ... 5 - 64K. - Default deduplication block size is 4KB for Global Deduplication and 2KB - otherwise. - '-B' 0 - - This uses blocks as small as 2KB for deduplication. This option can be - used for datasets of a few GBs to a few hundred TBs in size depending on - available RAM. + -B <0..5> + Specify an average Dedupe block size. 0 - 2K, 1 - 4K, 2 - 8K ... 5 - 64K. + Default deduplication block size is 4KB for Global Deduplication and 2KB + otherwise. + -B 0 + This uses blocks as small as 2KB for deduplication. This option can be + used for datasets of a few GBs to a few hundred TBs in size depending on + available RAM. - Caveats: - In some cases like LZMA with extreme compression levels and with '-L' and - '-P' preprocessing enabled, this can result in lower compression as compared - to using '-B 1'. - For fast compression algorithms like LZ4 and Zlib this should always benefit. - However please test on your sample data with your desired compression - algorithm to verify the results. + -M Display memory allocator statistics. + -C Display compression statistics. - '-M' - Display memory allocator statistics - '-C' - Display compression statistics + Global Deduplication + -------------------- + -G This flag enables Global Deduplication. This makes pcompress maintain an + in-memory index to lookup cryptographic block hashes for duplicates. Once + a duplicate is found it is replaced with a reference to the original block. + This allows detecting and eliminating duplicate blocks across the entire + dataset. In contrast using only '-D' or '-F' flags does deduplication only + within the chunk but uses less memory and is much faster than Global Dedupe. - Global Deduplication: - '-G' - This flag enables Global Deduplication. This makes pcompress maintain an - in-memory index to lookup cryptographic block hashes for duplicates. Once - a duplicate is found it is replaced with a reference to the original block. - This allows detecting and eliminating duplicate blocks across the entire - dataset. In contrast using only '-D' or '-F' flags does deduplication only - within the chunk but uses less memory and is much faster than Global Dedupe. + The '-G' flag can be combined with either '-D' or '-F' flags to indicate + rabin chunking or fixed chunking respectively. If these flags are not + specified then the default is to assume rabin chunking via '-D'. + All other Dedupe flags have the same meanings in this context. - The '-G' flag can be combined with either '-D' or '-F' flags to indicate - rabin chunking or fixed chunking respectively. If these flags are not - specified then the default is to assume rabin chunking via '-D'. - All other Dedupe flags have the same meanings in this context. + Delta Encoding is not supported with Global Deduplication at this time. 
The + in-memory hashtable index can use upto 75% of free RAM depending on the size + of the dataset. In Pipe mode the index will always use 75% of free RAM since + the dataset size is not known. This is the simple full block index mode. If + the available RAM is not enough to hold all block checksums then older block + entries are discarded automatically from the matching hash slots. - Delta Encoding is not supported with Global Deduplication at this time. The - in-memory hashtable index can use upto 75% of free RAM depending on the size - of the dataset. In Pipe mode the index will always use 75% of free RAM since - the dataset size is not known. This is the simple full block index mode. If - the available RAM is not enough to hold all block checksums then older block - entries are discarded automatically from the matching hash slots. - - If pipe mode is not used and the given dataset is a file then Pcompress - checks whether the index size will exceed three times of 75% of the available - free RAM. In such a case it automatically switches to a Segmented Deduplication - mode. Here data is first split into blocks as above. Then upto 2048 blocks are - grouped together to form a larger segment. The individual block hashes for a - segment are stored on a tempfile on disk. A few min-values hashes are then - computed from the block hashes of the segment which are then loaded into the - index. These hashes are used to detect segments that are approximately similar - to each other. Once found the block hashes of the matching segments are loaded - from the temp file and actual deduplication is performed. This allows the - in-memory index size to be approximately 0.0025% of the total dataset size and - requires very few disk reads for every 2048 blocks processed. + If pipe mode is not used and the given dataset is a file then Pcompress + checks whether the index size will exceed three times of 75% of the available + free RAM. In such a case it automatically switches to a Segmented Deduplication + mode. Here data is first split into blocks as above. Then upto 2048 blocks are + grouped together to form a larger segment. The individual block hashes for a + segment are stored on a tempfile on disk. A few min-values hashes are then + computed from the block hashes of the segment which are then loaded into the + index. These hashes are used to detect segments that are approximately similar + to each other. Once found the block hashes of the matching segments are loaded + from the temp file and actual deduplication is performed. This allows the + in-memory index size to be approximately 0.0025% of the total dataset size and + requires very few disk reads for every 2048 blocks processed. - In pipe mode Global Deduplication always uses a segmented similarity based - index. It allows efficient network transfer of large data. - - Encryption flags: - '-e ' - Encrypt chunks using the given encryption algorithm. The algo parameter - can be one of AES or SALSA20. Both are used in CTR stream encryption - mode. - The password can be prompted from the user or read from a file. Unique - keys are generated every time pcompress is run even when giving the same - password. Of course enough info is stored in the compresse file so that - the key used for the file can be re-created given the correct password. - - Default key length if 256 bits but can be reduced to 128 bits using the - '-k' option. 
- - The Scrypt algorithm from Tarsnap is used - (See: http://www.tarsnap.com/scrypt.html) for generating keys from - passwords. The CTR mode AES mechanism from Tarsnap is also utilized. - - '-w ' - Provide a file which contains the encryption password. This file must - be readable and writable since it is zeroed out after the password is - read. - - '-k ' - Specify the key length. Can be 16 for 128 bit keys or 32 for 256 bit - keys. Default value is 32 for 256 bit keys. - -NOTE: When using pipe-mode via -p the only way to provide a password is to use '-w'. + In pipe mode Global Deduplication always uses a segmented similarity based + index. It allows efficient network transfer of large data. Environment Variables ===================== -Set ALLOCATOR_BYPASS=1 in the environment to avoid using the the built-in -allocator. Due to the the way it rounds up an allocation request to the nearest -slab the built-in allocator can allocate extra unused memory. In addition you -may want to use a different allocator in your environment. + Set ALLOCATOR_BYPASS=1 in the environment to avoid using the the built-in + allocator. Due to the the way it rounds up an allocation request to the nearest + slab the built-in allocator can allocate extra unused memory. In addition you + may want to use a different allocator in your environment. -The variable PCOMPRESS_INDEX_MEM can be set to limit memory used by the Global -Deduplication Index. The number specified is in multiples of a megabyte. + The variable PCOMPRESS_INDEX_MEM can be set to limit memory used by the Global + Deduplication Index. The number specified is in multiples of a megabyte. -The variable PCOMPRESS_CACHE_DIR can point to a directory where some temporary -files relating to the Global Deduplication process can be stored. This for example -can be a directory on a Solid State Drive to speed up Global Deduplication. The -space used in this directory is proportional to the size of the dataset being -processed and is slightly more than 8KB for every 1MB of data. + The variable PCOMPRESS_CACHE_DIR can point to a directory where some temporary + files relating to the Global Deduplication process can be stored. This for example + can be a directory on a Solid State Drive to speed up Global Deduplication. The + space used in this directory is proportional to the size of the dataset being + processed and is slightly more than 8KB for every 1MB of data. -The default checksum used for block hashes during Global Deduplication is SHA256. -However this can be changed by setting the PCOMPRESS_CHUNK_HASH_GLOBAL environment -variable. The list of allowed checksums for this is: + The default checksum used for block hashes during Global Deduplication is SHA256. + However this can be changed by setting the PCOMPRESS_CHUNK_HASH_GLOBAL environment + variable. The list of allowed checksums for this is: -SHA256 , SHA512 -KECCAK256, KECCAK512 -BLAKE256 , BLAKE512 -SKEIN256 , SKEIN512 + SHA256 , SHA512 + KECCAK256, KECCAK512 + BLAKE256 , BLAKE512 + SKEIN256 , SKEIN512 -Even though SKEIN is not supported as a chunk checksum (not deemed necessary -because BLAKE2 is available) it can be used as a dedupe block checksum. One may -ask why? The reasoning is we depend on hashes to find duplicate blocks. Now SHA256 -is the default because it is known to be robust and unbroken till date. Proven as -yet in the field. 
However one may want a faster alternative so we have choices -from the NIST SHA3 finalists in the form of SKEIN and BLAKE which are neck to -neck with SKEIN getting an edge. SKEIN and BLAKE have seen extensive cryptanalysis -in the intervening years and are unbroken with only marginal theoretical issues -determined. BLAKE2 is a derivative of BLAKE and is tremendously fast but has not -seen much specific cryptanalysis as yet, even though it is not new but just a -performance optimized derivate. So cryptanalysis that applies to BLAKE should -also apply and justify BLAKE2. However the paranoid may well trust SKEIN a bit -more than BLAKE2 and SKEIN while not being as fast as BLAKE2 is still a lot faster -than SHA2. + Even though SKEIN is not supported as a chunk checksum (not deemed necessary + because BLAKE2 is available) it can be used as a dedupe block checksum. One may + ask why? The reasoning is we depend on hashes to find duplicate blocks. Now SHA256 + is the default because it is known to be robust and unbroken till date. Proven as + yet in the field. However one may want a faster alternative so we have choices + from the NIST SHA3 finalists in the form of SKEIN and BLAKE which are neck to + neck with SKEIN getting an edge. SKEIN and BLAKE have seen extensive cryptanalysis + in the intervening years and are unbroken with only marginal theoretical issues + determined. BLAKE2 is a derivative of BLAKE and is tremendously fast but has not + seen much specific cryptanalysis as yet, even though it is not new but just a + performance optimized derivate. So cryptanalysis that applies to BLAKE should + also apply and justify BLAKE2. However the paranoid may well trust SKEIN a bit + more than BLAKE2 and SKEIN while not being as fast as BLAKE2 is still a lot faster + than SHA2. Examples ======== -Simple compress "file.tar" using zlib(gzip) algorithm. Default chunk or per-thread +Archive contents of directory /usr/include into usr.pz. Default chunk or per-thread segment size is 8MB and default compression level is 6. + pcompress -a /usr/include usr + +Archive the given listr of files into file.pz and max compresion level and all features +enabled. A maximum chunk size of 20MB is used. Also use verbose mode which lists each +file as it is processed. + + pcompress -a -v -l14 -s20m file1 file2 file3 file + +Simple compress "file.tar" using zlib(gzip) algorithm. Default chunk or per-thread +segment size is 8MB and default compression level is 6. Output file created will be +file.tar.pz + pcompress -c zlib file.tar -Compress "file.tar" using bzip2 level 6, 64MB chunk size and use 4 threads. In -addition perform identity deduplication and delta compression prior to compression. +Simple compress "file.tar" using zlib(gzip) algorithm with output file file.compressed.pz - pcompress -D -E -c bzip2 -l6 -s64m -t4 file.tar - -Compress "file.tar" using zlib and also perform Global Deduplication. Default block -size used for deduplication is 4KB. Also redirect the compressed output to stdout and -send it to a compressed file at a different path. + pcompress -c zlib file.tar file.compressed - pcompress -G -c zlib -l9 -s10m file.tar - > /path/to/compress_file.tar.pz - -Perform the same as above but this time use a deduplication block size of 8KB. +Compress "file.tar" using Zlib and per-thread chunk or segment size of 10MB and +Compression level 9. Compressed output is sent to stdout using '-' which is then +redirected to a file. 
- pcompress -G -c zlib -l9 -B2 -s10m file.tar - > /path/to/compress_file.tar.pz - -Compress "file.tar" using extreme compression mode of LZMA and a chunk size of -of 1GB. Allow pcompress to detect the number of CPU cores and use as many threads. - - pcompress -c lzma -l14 -s1g file.tar - -Compress "file.tar" using lz4 at max compression with LZ-Prediction pre-processing -and encryption enabled. Chunksize is 100M: - - pcompress -c lz4 -l3 -e -L -s100m file.tar - -Compression Algorithms -====================== - -LZFX - Ultra Fast, average compression. This algorithm is the fastest overall. - Levels: 1 - 5 -LZ4 - Very Fast, better compression than LZFX. - Levels: 1 - 3 -Zlib - Fast, better compression. - Levels: 1 - 9 -Bzip2 - Slow, much better compression than Zlib. - Levels: 1 - 9 - -LZMA - Very slow. Extreme compression. - Levels: 1 - 14 - Till level 9 it is standard LZMA parameters. Levels 10 - 12 use - more memory and higher match iterations so are slower. Levels - 13 and 14 use larger dictionaries upto 256MB and really suck up - RAM. Use these levels only if you have at the minimum 4GB RAM on - your system. - -PPMD - Slow. Extreme compression for Text, average compression for binary. - In addition PPMD decompression time is also high for large chunks. - This requires lots of RAM similar to LZMA. - Levels: 1 - 14. - -Adapt - Synthetic mode with text/binary detection. For pure text data PPMD is - used otherwise Bzip2 is selected per chunk. - Levels: 1 - 14 -Adapt2 - Slower synthetic mode. For pure text data PPMD is otherwise LZMA is - applied. Can give very good compression ratio when splitting file - into multiple chunks. - Levels: 1 - 14 - Since both LZMA and PPMD are used together memory requirements are - large especially if you are also using extreme levels above 10. For - example with 100MB chunks, Level 14, 2 threads and with or without - dedupe, it uses upto 2.5GB physical RAM (RSS). + pcompress -c zlib -l9 -s10m file.tar - > /path/to/compress_file.tar.pz It is possible for a single chunk to span the entire file if enough RAM is available. However for adaptive modes to be effective for large files, especially @@ -349,46 +430,46 @@ algorithm can be selected for textual and binary portions. Pre-Processing Algorithms ========================= -As can be seen above a multitude of pre-processing algorithms are available that -provide further compression effectiveness beyond what the usual compression -algorithms can achieve by themselves. These are summarized below: + As can be seen above a multitude of pre-processing algorithms are available that + provide further compression effectiveness beyond what the usual compression + algorithms can achieve by themselves. These are summarized below: -1) Deduplication : Per-Chunk (or per-segment) deduplication based on Rabin - fingerprinting. + 1) Deduplication : Per-Chunk (or per-segment) deduplication based on Rabin + fingerprinting. -2) Delta Compression : A similarity based (minhash) comparison of Rabin blocks. Two - blocks at least 60% similar with each other are diffed using - bsdiff. + 2) Delta Compression : A similarity based (minhash) comparison of Rabin blocks. + Two blocks at least 60% similar with each other are diffed + using bsdiff. -3) LZP : LZ Prediction is a variant of LZ77 that replaces repeating - runs of text with shorter codes. + 3) LZP : LZ Prediction is a variant of LZ77 that replaces repeating + runs of text with shorter codes. 
-4) Adaptive Delta : This is a simple form of Delta Encoding where arithmetic - progressions are detected in the data stream and collapsed - via Run-Length encoding. + 4) Adaptive Delta : This is a simple form of Delta Encoding where arithmetic + progressions are detected in the data stream and + collapsed via Run-Length encoding. -4) Matrix Transpose : This is used automatically in Delta Encoding and Deduplication. - This attempts to transpose columnar repeating sequences of - bytes into row-wise sequences so that compression algorithms - can work better. + 4) Matrix Transpose : This is used automatically in Delta Encoding and + Deduplication. This attempts to transpose columnar + repeating sequences of bytes into row-wise sequences so + that compression algorithms can work better. Memory Usage ============ -As can be seen from above memory usage can vary greatly based on compression/ -pre-processing algorithms and chunk size. A variety of configurations are possible -depending on resource availability in the system. + As can be seen from above memory usage can vary greatly based on compression/ + pre-processing algorithms and chunk size. A variety of configurations are possible + depending on resource availability in the system. -The minimum possible meaningful settings while still giving about 50% compression -ratio and very high speed is with the LZFX algorithm with 1MB chunk size and 2 -threads: + The minimum possible meaningful settings while still giving about 50% compression + ratio and very high speed is with the LZFX algorithm with 1MB chunk size and 2 + threads: pcompress -c lzfx -l2 -s1m -t2 -This uses about 6MB of physical RAM (RSS). Earlier versions of the utility before -the 0.9 release comsumed much more memory. This was improved in the later versions. -When using Linux the virtual memory consumption may appear to be very high but it -is just address space usage rather than actual RAM and should be ignored. It is only -the RSS that matters. This is a result of the memory arena mechanism in Glibc that -improves malloc() performance for multi-threaded applications. + This uses about 6MB of physical RAM (RSS). Earlier versions of the utility before + the 0.9 release comsumed much more memory. This was improved in the later versions. + When using Linux the virtual memory consumption may appear to be very high but it + is just address space usage rather than actual RAM and should be ignored. It is only + the RSS that matters. This is a result of the memory arena mechanism in Glibc that + improves malloc() performance for multi-threaded applications. diff --git a/adaptive_compress.c b/adaptive_compress.c index 01b8dd3..edbb6d2 100644 --- a/adaptive_compress.c +++ b/adaptive_compress.c @@ -230,8 +230,9 @@ adapt_compress(void *src, uint64_t srclen, void *dst, struct adapt_data *adat = (struct adapt_data *)(data); uchar_t *src1 = (uchar_t *)src; int rv = 0, bsc_type = 0; + int stype = PC_SUBTYPE(btype); - if (btype == TYPE_UNKNOWN) { + if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR) { uint64_t i, tot8b, tag1, tag2, tag3; double tagcnt, pct_tag; uchar_t cur_byte, prev_byte; @@ -267,6 +268,29 @@ adapt_compress(void *src, uint64_t srclen, void *dst, tagcnt > (double)srclen * 0.001) btype |= TYPE_MARKUP; } + + } else if (stype == TYPE_PDF) { + uint64_t i, tot8b; + uchar_t cur_byte; + + /* + * For PDF files we need to check for uncompressed PDFs. Those are compressed + * using Libbsc. 
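+	 * The heuristic below counts bytes with the high bit set (tot8b). A mostly
+	 * 8-bit stream is taken to be an already-compressed PDF and tagged
+	 * TYPE_BINARY, while a mostly 7-bit stream is an uncompressed, text-like
+	 * PDF that is tagged TYPE_TEXT|TYPE_MARKUP so it takes the Libbsc path.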
+ */ + tot8b = 0; + for (i = 0; i < srclen; i++) { + cur_byte = src1[i]; + tot8b += (cur_byte & 0x80); + } + + tot8b /= 0x80; + if (adat->adapt_mode == 2 && tot8b > FORTY_PCT(srclen)) { + btype = TYPE_BINARY; + } else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) { + btype = TYPE_BINARY; + } else { + btype = TYPE_TEXT|TYPE_MARKUP; + } } /* diff --git a/archive/pc_arc_filter.h b/archive/pc_arc_filter.h index fc5076c..211d087 100644 --- a/archive/pc_arc_filter.h +++ b/archive/pc_arc_filter.h @@ -46,6 +46,7 @@ struct filter_info { struct archive_entry *entry; int fd; int compressing, block_size; + int *type_ptr; }; struct filter_flags { diff --git a/archive/pc_archive.c b/archive/pc_archive.c index 042e5f3..01264b1 100644 --- a/archive/pc_archive.c +++ b/archive/pc_archive.c @@ -842,7 +842,7 @@ setup_extractor(pc_ctx_t *pctx) } static ssize_t -process_by_filter(int fd, int typ, struct archive *target_arc, +process_by_filter(int fd, int *typ, struct archive *target_arc, struct archive *source_arc, struct archive_entry *entry, int cmp) { struct filter_info fi; @@ -854,10 +854,11 @@ process_by_filter(int fd, int typ, struct archive *target_arc, fi.fd = fd; fi.compressing = cmp; fi.block_size = AW_BLOCK_SIZE; - wrtn = (*(typetab[(typ >> 3)].filter_func))(&fi, typetab[(typ >> 3)].filter_private); + fi.type_ptr = typ; + wrtn = (*(typetab[(*typ >> 3)].filter_func))(&fi, typetab[(*typ >> 3)].filter_private); if (wrtn == FILTER_RETURN_ERROR) { log_msg(LOG_ERR, 0, "Error invoking filter module: %s", - typetab[(typ >> 3)].filter_name); + typetab[(*typ >> 3)].filter_name); } return (wrtn); } @@ -890,7 +891,8 @@ copy_file_data(pc_ctx_t *pctx, struct archive *arc, struct archive_entry *entry, if (typetab[(typ >> 3)].filter_func != NULL) { int64_t rv; - rv = process_by_filter(fd, typ, arc, NULL, entry, 1); + pctx->ctype = typ; + rv = process_by_filter(fd, &(pctx->ctype), arc, NULL, entry, 1); if (rv == FILTER_RETURN_ERROR) { close(fd); return (-1); @@ -934,7 +936,7 @@ do_map: int64_t rv; munmap(mapbuf, len); - rv = process_by_filter(fd, typ, arc, NULL, entry, 1); + rv = process_by_filter(fd, &(pctx->ctype), arc, NULL, entry, 1); if (rv == FILTER_RETURN_ERROR) { return (-1); } else if (rv == FILTER_RETURN_SKIP) { @@ -1149,7 +1151,7 @@ copy_data_out(struct archive *ar, struct archive *aw, struct archive_entry *entr if (typetab[(typ >> 3)].filter_func != NULL) { int64_t rv; - rv = process_by_filter(-1, typ, aw, ar, entry, 0); + rv = process_by_filter(-1, &typ, aw, ar, entry, 0); if (rv == FILTER_RETURN_ERROR) { archive_set_error(ar, archive_errno(aw), "%s", archive_error_string(aw)); @@ -1231,7 +1233,8 @@ extractor_thread_func(void *dat) { * Extract all security attributes if we are root. */ if (pctx->force_archive_perms || geteuid() == 0) { - flags |= ARCHIVE_EXTRACT_OWNER; + if (geteuid() == 0) + flags |= ARCHIVE_EXTRACT_OWNER; flags |= ARCHIVE_EXTRACT_PERM; flags |= ARCHIVE_EXTRACT_ACL; flags |= ARCHIVE_EXTRACT_XATTR; @@ -1475,17 +1478,21 @@ out: * Detect a few file types from looking at magic signatures. * NOTE: Jpeg files must be detected via '.jpg' or '.jpeg' (case-insensitive) * extensions. Do not add Jpeg header detection here. it will break - * context based PackJPG processing. Jpeg files not have proper + * context based PackJPG processing. Jpeg files not having proper * extension must not be processed via PackJPG. */ static int detect_type_by_data(uchar_t *buf, size_t len) { // At least a few bytes. 
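	// (512 bytes covers one full tar header block, so the ustar magic at offset 257 checked below is always in range.)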
- if (len < 16) return (TYPE_UNKNOWN); + if (len < 512) return (TYPE_UNKNOWN); if (memcmp(buf, "!\n", 8) == 0) return (TYPE_BINARY|TYPE_ARCHIVE_AR); + if (memcmp(&buf[257], "ustar\0", 6) == 0 || memcmp(&buf[257], "ustar\040\040\0", 8) == 0) + return (TYPE_BINARY|TYPE_ARCHIVE_TAR); + if (memcmp(buf, "%PDF-", 5) == 0) + return (TYPE_BINARY|TYPE_PDF); if (U32_P(buf) == ELFINT) { // Regular ELF, check for 32/64-bit, core dump if (*(buf + 16) != 4) { if (*(buf + 4) == 2) { diff --git a/pcompress.c b/pcompress.c index a53f6b8..fa02f32 100644 --- a/pcompress.c +++ b/pcompress.c @@ -2832,7 +2832,7 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[]) ff.enable_packjpg = 0; pthread_mutex_lock(&opt_parse); - while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDGEe:w:LPS:B:Fk:avnmK")) != -1) { + while ((opt = getopt(argc, argv, "dc:s:l:pt:MCDGEe:w:LPS:B:Fk:avmK")) != -1) { int ovr; int64_t chunksize; @@ -2982,10 +2982,6 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[]) pctx->verbose = 1; break; - case 'n': - pctx->enable_archive_sort = -1; - break; - case 'm': pctx->force_archive_perms = 1; break; @@ -3023,6 +3019,11 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[]) return (1); } + if (pctx->archive_mode && pctx->pipe_mode) { + log_msg(LOG_ERR, 0, "Full pipeline mode is meaningless with archiver."); + return (1); + } + /* * Default compression algorithm during archiving is Adaptive2. */ diff --git a/pcompress.h b/pcompress.h index 2561248..9adf74b 100644 --- a/pcompress.h +++ b/pcompress.h @@ -236,7 +236,7 @@ typedef struct pc_ctx { uchar_t *arc_buf; uint64_t arc_buf_size, arc_buf_pos; int arc_closed, arc_writing; - uchar_t btype, ctype; + int btype, ctype; int min_chunk; int enable_packjpg; diff --git a/utils/utils.h b/utils/utils.h index bdf5021..358d07d 100644 --- a/utils/utils.h +++ b/utils/utils.h @@ -246,7 +246,7 @@ typedef enum { /* * Sub-types. */ -#define NUM_SUB_TYPES 26 +#define NUM_SUB_TYPES 28 TYPE_EXE32 = 8, TYPE_JPEG = 16, TYPE_MARKUP = 24, @@ -272,7 +272,9 @@ typedef enum { TYPE_AUDIO_COMPRESSED = 184, TYPE_EXE64 = 192, TYPE_BMP = 200, - TYPE_TIFF = 208 + TYPE_TIFF = 208, + TYPE_PDF = 216, + TYPE_ARCHIVE_TAR = 224 } data_type_t; /*
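A closing note on the type encoding touched by this patch: the enum above keeps every
subtype at a multiple of 8, and pc_archive.c indexes its filter table with
typetab[(typ >> 3)], which implies the base type (text/binary flags) lives in the low
three bits. The sketch below only illustrates that layout; the EX_* base-type values
and the macro bodies are assumptions for the example, not copies of utils.h.

#include <stdio.h>

/* Illustrative values: subtypes mirror the enum above, base-type values are assumed. */
#define EX_TYPE_TEXT         1
#define EX_TYPE_BINARY       2
#define EX_TYPE_PDF          216
#define EX_TYPE_ARCHIVE_TAR  224

#define EX_PC_TYPE(x)     ((x) & 0x7)   /* low 3 bits: text/binary class */
#define EX_PC_SUBTYPE(x)  ((x) & ~0x7)  /* multiple-of-8 subtype */

int
main(void)
{
	/* Mirrors the shape of the TYPE_BINARY|TYPE_ARCHIVE_TAR value returned for tar data. */
	int typ = EX_TYPE_BINARY | EX_TYPE_ARCHIVE_TAR;

	printf("base type   = %d\n", EX_PC_TYPE(typ));     /* 2   (binary)           */
	printf("subtype     = %d\n", EX_PC_SUBTYPE(typ));  /* 224 (TYPE_ARCHIVE_TAR) */
	printf("filter slot = %d\n", typ >> 3);            /* 28, the typetab index  */
	return (0);
}

Assuming PC_SUBTYPE() masks off the low bits in the same way, this is also why the
PC_SUBTYPE(btype) == TYPE_ARCHIVE_TAR comparison in adaptive_compress.c works directly
on the combined value.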