pcompress/archive/pc_archive.c
Moinak Ghosh 4c75a2da48 Fix issue #12.
Fix issue #13.
Create output directory with correct mode.
Fix the flow where pathname list is not sorted.
Fix ppmd decompression bug introduced in previous commit.
Reduce compression level for automatic pathname sorting.
Change to extraction directory only after opening archive.
2013-12-27 23:49:47 +05:30

1562 lines
40 KiB
C

/*
* This file is a part of Pcompress, a chunked parallel multi-
* algorithm lossless compression and decompression program.
*
* Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
* Use is subject to license terms.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program.
* If not, see <http://www.gnu.org/licenses/>.
*
* moinakg@belenix.org, http://moinakg.wordpress.com/
*
*/
/*
* This file includes all the archiving related functions. Pathnames are sorted
* based on extension (or first 4 chars of name if no extension) and size. A simple
* external merge sort is used. This sorting yields better compression ratio.
*
* Sorting is enabled for compression levels greater than 6.
*/
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <limits.h>
#include <utils.h>
#include <pthread.h>
#include <sys/mman.h>
#include <ctype.h>
#include <archive.h>
#include <archive_entry.h>
#include <phash/phash.h>
#include <phash/extensions.h>
#include <phash/standard.h>
#include "pc_archive.h"
#undef _FEATURES_H
#define _XOPEN_SOURCE 700
#include <ftw.h>
#include <stdint.h>
/* One-time initialization flags, set by init_archive_mod() and init_filters(). */
static int inited = 0, filters_inited = 0;
/* Serializes init_archive_mod() and init_filters(). */
static pthread_mutex_t init_mutex = PTHREAD_MUTEX_INITIALIZER;
/*
* Table of known extensions, indexed by the slot returned by phash(). Each
* extension string is packed into a 64-bit integer (see init_archive_mod()).
*/
static struct ext_hash_entry {
uint64_t extnum;
int type;
} *exthtab = NULL;
/* Per-type table; filter callbacks are looked up here using (typ >> 3). */
static struct type_data typetab[NUM_SUB_TYPES];
/*
AE_IFREG Regular file
AE_IFLNK Symbolic link
AE_IFSOCK Socket
AE_IFCHR Character device
AE_IFBLK Block device
AE_IFDIR Directory
AE_IFIFO Named pipe (fifo)
*/
/* Added to each member's size in add_pathname() when estimating the archive size. */
#define ARC_ENTRY_OVRHEAD 1024
/* Largest span of a member file mapped at once by copy_file_data(). */
#define MMAP_SIZE (1024 * 1024)
/* Number of member entries held by one sort buffer. */
#define SORT_BUF_SIZE (65536)
/* Number of name/extension characters used as the primary sort key. */
#define NAMELEN 4
/* Largest window of the pathlist file mapped at once. */
#define TEMP_MMAP_SIZE (128 * 1024)
/* Block size handed to filter modules by process_by_filter(). */
#define AW_BLOCK_SIZE (256 * 1024)
/* Sort key for one pathname plus the offset of its record in the pathlist file. */
typedef struct member_entry {
char name[NAMELEN];
uint32_t file_pos; // 32-bit file position to limit memory usage.
uint64_t size;
} member_entry_t;
/*
* One sorted run of member entries. 'pos' is the next entry to pop during the
* merge and 'max' is the index of the last valid entry.
*/
struct sort_buf {
member_entry_t members[SORT_BUF_SIZE]; // Use 1MB per sorted buffer
int pos, max;
struct sort_buf *next;
};
/*
* State shared with the add_pathname() callback while scanning. It is global
* because the nftw() callback receives no user context argument.
*/
static struct arc_list_state {
uchar_t *pbuf;
uint64_t bufsiz, bufpos, arc_size, pathlist_size;
uint32_t fcount;
int fd;
struct sort_buf *srt, *head;
int srt_pos;
} a_state;
/* Held by setup_archiver() for the duration of a scan to protect a_state. */
pthread_mutex_t nftw_mutex = PTHREAD_MUTEX_INITIALIZER;
static int detect_type_by_ext(const char *path, int pathlen);
static int detect_type_by_data(uchar_t *buf, size_t len);
/*
* Archive writer callback routines for archive creation operation.
*/
/*
 * Open callback used for both archive creation and extraction. Creates the
 * two hand-off semaphores with an initial count of zero and clears the
 * shared buffer description.
 */
static int
arc_open_callback(struct archive *arc, void *ctx)
{
	pc_ctx_t *pc = ctx;

	sem_init(&pc->read_sem, 0, 0);
	sem_init(&pc->write_sem, 0, 0);

	pc->arc_buf_size = 0;
	pc->arc_buf_pos = 0;
	pc->arc_buf = NULL;

	return (ARCHIVE_OK);
}
/*
 * Close callback for archive creation. Marks the archive as closed and, when
 * a reader buffer is currently registered, posts read_sem; otherwise the
 * buffer position is reset to zero.
 */
static int
creat_close_callback(struct archive *arc, void *ctx)
{
	pc_ctx_t *pc = ctx;

	pc->arc_closed = 1;
	if (pc->arc_buf == NULL) {
		pc->arc_buf_pos = 0;
		return (ARCHIVE_OK);
	}

	sem_post(&pc->read_sem);
	return (ARCHIVE_OK);
}
/*
* Write callback for archive creation. Copies 'len' bytes from 'buf' into the
* buffer registered by archiver_read() (pctx->arc_buf), handing the buffer back
* through read_sem whenever it fills up or the detected data type changes.
* Returns the number of bytes consumed, or -1 once the archive is closed or no
* buffer is available.
*/
static ssize_t
creat_write_callback(struct archive *arc, void *ctx, const void *buf, size_t len)
{
uchar_t *buff = (uchar_t *)buf;
pc_ctx_t *pctx = (pc_ctx_t *)ctx;
size_t remaining;
if (pctx->arc_closed) {
archive_set_error(arc, ARCHIVE_EOF, "End of file when writing archive.");
return (-1);
}
/* Not in the middle of filling a buffer: wait for archiver_read() to supply one. */
if (!pctx->arc_writing) {
sem_wait(&(pctx->write_sem));
}
/* archiver_close() posts write_sem after clearing the buffer; treat that as EOF. */
if (pctx->arc_buf == NULL || pctx->arc_buf_size == 0) {
archive_set_error(arc, ARCHIVE_EOF, "End of file when writing archive.");
return (-1);
}
pctx->arc_writing = 1;
remaining = len;
while (remaining && !pctx->arc_closed) {
uchar_t *tbuf;
tbuf = pctx->arc_buf + pctx->arc_buf_pos;
/*
* Determine if we should return the accumulated data to the caller.
* This is done if the data type changes and at least some minimum amount
* of data has accumulated in the buffer.
*/
if (pctx->btype != pctx->ctype) {
if (pctx->btype == TYPE_UNKNOWN || pctx->arc_buf_pos == 0) {
/* Nothing of a known type buffered yet: adopt the incoming type. */
pctx->btype = pctx->ctype;
} else {
if (pctx->arc_buf_pos < pctx->min_chunk) {
/*
* Below the minimum chunk size: keep accumulating. The buffer takes
* the incoming type only if this write is larger than the shortfall.
*/
uint32_t diff = pctx->min_chunk - pctx->arc_buf_pos;
if (len > diff)
pctx->btype = pctx->ctype;
else
pctx->ctype = pctx->btype;
} else {
/* Hand over what has accumulated and wait for a fresh buffer. */
pctx->arc_writing = 0;
sem_post(&(pctx->read_sem));
sem_wait(&(pctx->write_sem));
tbuf = pctx->arc_buf + pctx->arc_buf_pos;
pctx->arc_writing = 1;
if (remaining > 0)
pctx->btype = pctx->ctype;
}
}
}
if (remaining > pctx->arc_buf_size - pctx->arc_buf_pos) {
/* Data does not fit: fill the buffer, hand it over and wait for the next one. */
size_t nlen = pctx->arc_buf_size - pctx->arc_buf_pos;
memcpy(tbuf, buff, nlen);
remaining -= nlen;
pctx->arc_buf_pos += nlen;
buff += nlen;
pctx->arc_writing = 0;
sem_post(&(pctx->read_sem));
sem_wait(&(pctx->write_sem));
pctx->arc_writing = 1;
} else {
/* Everything fits; hand the buffer over only if it is now exactly full. */
memcpy(tbuf, buff, remaining);
pctx->arc_buf_pos += remaining;
remaining = 0;
if (pctx->arc_buf_pos == pctx->arc_buf_size) {
pctx->arc_writing = 0;
sem_post(&(pctx->read_sem));
}
break;
}
}
return (len - remaining);
}
/*
 * Consumer side of archive creation. Registers 'buf' (of 'count' bytes) as
 * the buffer for creat_write_callback(), posts write_sem and then blocks on
 * read_sem. Returns the buffer position reached, 0 if the archive is already
 * closed, or -1 if a buffer is still registered.
 */
int64_t
archiver_read(void *ctx, void *buf, uint64_t count)
{
	pc_ctx_t *pc = ctx;

	if (pc->arc_closed) {
		return (0);
	}
	if (pc->arc_buf != NULL) {
		log_msg(LOG_ERR, 0, "Incorrect sequencing of archiver_read() call.");
		return (-1);
	}

	/* Publish the buffer before posting write_sem. */
	pc->arc_buf = buf;
	pc->arc_buf_size = count;
	pc->arc_buf_pos = 0;
	pc->btype = TYPE_UNKNOWN;

	sem_post(&pc->write_sem);
	sem_wait(&pc->read_sem);

	pc->arc_buf = NULL;
	return (pc->arc_buf_pos);
}
/*
 * Marks the archive stream as closed, clears the registered buffer and posts
 * both semaphores. Always returns 0.
 */
int
archiver_close(void *ctx)
{
	pc_ctx_t *pc = ctx;

	pc->arc_closed = 1;
	pc->arc_buf = NULL;
	pc->arc_buf_size = 0;

	sem_post(&pc->write_sem);
	sem_post(&pc->read_sem);

	return (0);
}
/*
 * Close callback for extraction. Marks the archive as closed and, when a
 * buffer is currently registered, posts write_sem; otherwise the buffer size
 * is reset to zero.
 */
static int
extract_close_callback(struct archive *arc, void *ctx)
{
	pc_ctx_t *pc = ctx;

	pc->arc_closed = 1;
	if (pc->arc_buf == NULL) {
		pc->arc_buf_size = 0;
		return (ARCHIVE_OK);
	}

	sem_post(&pc->write_sem);
	return (ARCHIVE_OK);
}
/*
 * Read callback for extraction. Posts write_sem if a previously delivered
 * buffer is outstanding, waits on read_sem for the buffer registered by
 * archiver_write() and exposes it through *buf. Returns the buffer size, or
 * -1 when the archive is closed or no buffer is available.
 */
static ssize_t
extract_read_callback(struct archive *arc, void *ctx, const void **buf)
{
	pc_ctx_t *pc = ctx;

	if (pc->arc_closed) {
		pc->arc_buf_size = 0;
		log_msg(LOG_WARN, 0, "End of file.");
		archive_set_error(arc, ARCHIVE_EOF, "End of file.");
		return (-1);
	}

	/* Give the previous buffer back before waiting for the next one. */
	if (pc->arc_writing)
		sem_post(&pc->write_sem);
	sem_wait(&pc->read_sem);

	if (pc->arc_buf == NULL || pc->arc_buf_size == 0) {
		pc->arc_buf_size = 0;
		log_msg(LOG_ERR, 0, "End of file when extracting archive.");
		archive_set_error(arc, ARCHIVE_EOF, "End of file when extracting archive.");
		return (-1);
	}

	pc->arc_writing = 1;
	*buf = pc->arc_buf;
	return (pc->arc_buf_size);
}
/*
 * Producer side of extraction. Registers 'buf' (of 'count' bytes) for
 * extract_read_callback(), posts read_sem and blocks on write_sem. Returns the
 * buffer size recorded in the context afterwards, 0 if the extractor has
 * already closed, or -1 if a buffer is still registered.
 */
int64_t
archiver_write(void *ctx, void *buf, uint64_t count)
{
	pc_ctx_t *pctx = (pc_ctx_t *)ctx;

	if (pctx->arc_closed) {
		log_msg(LOG_WARN, 0, "Archive extractor closed unexpectedly");
		return (0);
	}
	if (pctx->arc_buf != NULL) {
		/* The message previously named archiver_read(), which was misleading. */
		log_msg(LOG_ERR, 0, "Incorrect sequencing of archiver_write() call.");
		return (-1);
	}

	pctx->arc_buf = buf;
	pctx->arc_buf_size = count;
	sem_post(&(pctx->read_sem));
	sem_wait(&(pctx->write_sem));
	pctx->arc_buf = NULL;
	return (pctx->arc_buf_size);
}
/*
* Comparison function for sorting pathname members. Sort by name/extension and then
* by size.
*/
static int
compare_members(const void *a, const void *b) {
int rv, i;
member_entry_t *mem1 = (member_entry_t *)a;
member_entry_t *mem2 = (member_entry_t *)b;
rv = 0;
for (i = 0; i < NAMELEN; i++) {
rv = mem1->name[i] - mem2->name[i];
if (rv != 0)
return (rv);
}
if (mem1->size > mem2->size)
return (1);
else if (mem1->size < mem2->size)
return (-1);
return (0);
}
/*
 * Merge-phase predicate: returns 1 when mem1 sorts strictly before mem2
 * (key characters first, then size) and 0 otherwise.
 */
static int
compare_members_lt(member_entry_t *mem1, member_entry_t *mem2) {
	int idx;

	for (idx = 0; idx < NAMELEN; idx++) {
		int delta = mem1->name[idx] - mem2->name[idx];

		if (delta != 0)
			return (delta < 0);
	}

	return (mem1->size < mem2->size);
}
/*
 * Return a pointer to the last component of the path held in fpath (of
 * length 'len'), ignoring trailing '/' characters. A path without any
 * separator is its own last component.
 */
static char *
path_name_start(char *fpath, int len)
{
	int n = len - 1;

	while (fpath[n] == '/' && n > 0) n--;
	while (fpath[n] != '/' && fpath[n] != '\\' && n > 0) n--;
	if (fpath[n] == '/' || fpath[n] == '\\')
		n++;
	return (&fpath[n]);
}

/*
 * Fetch the next entry from the pathlist file. If we are doing sorting then this
 * fetches the next entry in ascending order of the predetermined sort keys.
 *
 * fpath receives the NUL-terminated pathname, *fpathlen its length and
 * *namechars a pointer to the last path component inside fpath.
 *
 * Returns the number of pathname bytes fetched, 0 when there are no more
 * entries and -1 on error.
 */
static int
read_next_path(pc_ctx_t *pctx, char *fpath, char **namechars, int *fpathlen)
{
	short namelen;
	ssize_t rbytes;
	uchar_t *buf;

	if (pctx->enable_archive_sort) {
		member_entry_t *mem1, *mem2;
		struct sort_buf *srt, *srt1, *psrt, *psrt1;

		/*
		 * Here we have a set of sorted buffers and we do the external merge phase where
		 * we pop the buffer entry that is smallest.
		 */
		srt = (struct sort_buf *)pctx->archive_sort_buf;
		if (!srt) return (0);
		srt1 = srt;
		psrt = srt;
		psrt1 = psrt;
		mem1 = &(srt->members[srt->pos]);
		srt = srt->next;
		while (srt) {
			mem2 = &(srt->members[srt->pos]);
			if (compare_members_lt(mem2, mem1)) {
				mem1 = mem2;
				srt1 = srt;
				psrt1 = psrt;
			}
			psrt = srt;
			srt = srt->next;
		}

		/*
		 * If we are not using mmap then seek to the position of the current entry, otherwise
		 * just note the entry position.
		 */
		if (pctx->temp_mmap_len == 0) {
			if (lseek(pctx->archive_members_fd, mem1->file_pos, SEEK_SET) == (off_t)-1) {
				log_msg(LOG_ERR, 1, "Error seeking in archive members file.");
				return (-1);
			}
		} else {
			pctx->temp_file_pos = mem1->file_pos;
		}

		/*
		 * Increment popped position of the current buffer and check if it is empty.
		 * The empty buffer is freed and is taken out of the linked list of buffers.
		 */
		srt1->pos++;
		if (srt1->pos > srt1->max) {
			if (srt1 == pctx->archive_sort_buf)
				pctx->archive_sort_buf = srt1->next;
			else
				psrt1->next = srt1->next;
			free(srt1);
		}
	}

	/*
	 * Mmap handling. If requested entry is in current mmap region read it. Otherwise attempt
	 * new mmap.
	 */
	if (pctx->temp_mmap_len > 0) {
		/* Must be initialized before the do_mmap label so a retry is remembered. */
		int retried = 0;

		if (pctx->temp_file_pos < pctx->temp_mmap_pos ||
		    pctx->temp_file_pos - pctx->temp_mmap_pos > pctx->temp_mmap_len ||
		    pctx->temp_mmap_len - (pctx->temp_file_pos - pctx->temp_mmap_pos) < 3) {
			uint32_t adj;

do_mmap:
			munmap(pctx->temp_mmap_buf, pctx->temp_mmap_len);
			/* mmap offsets must be page aligned. */
			adj = pctx->temp_file_pos % pctx->pagesize;
			pctx->temp_mmap_pos = pctx->temp_file_pos - adj;
			pctx->temp_mmap_len = pctx->archive_temp_size - pctx->temp_mmap_pos;
			if (pctx->temp_mmap_len > TEMP_MMAP_SIZE)
				pctx->temp_mmap_len = TEMP_MMAP_SIZE;
			pctx->temp_mmap_buf = mmap(NULL, pctx->temp_mmap_len, PROT_READ,
			    MAP_SHARED, pctx->archive_members_fd, pctx->temp_mmap_pos);
			/* mmap() reports failure with MAP_FAILED, not NULL. */
			if (pctx->temp_mmap_buf == MAP_FAILED) {
				log_msg(LOG_ERR, 1, "Error mmap-ing archive members file.");
				/* Leave no stale mapping for the caller to unmap. */
				pctx->temp_mmap_buf = NULL;
				pctx->temp_mmap_len = 0;
				return (-1);
			}
		}

		buf = pctx->temp_mmap_buf + (pctx->temp_file_pos - pctx->temp_mmap_pos);
		/*
		 * The length prefix is stored as a 2-byte short by add_pathname().
		 * Copy exactly that many bytes; the record may be unaligned.
		 */
		memcpy(&namelen, buf, sizeof (namelen));
		pctx->temp_file_pos += 2;

		/*
		 * If length of pathname entry exceeds current mmap region, repeat mmap
		 * at the entry offset. Only one repeat attempt is made. If there is a
		 * failure then we give up.
		 */
		if (pctx->temp_mmap_len - (pctx->temp_file_pos - pctx->temp_mmap_pos) < namelen) {
			if (!retried) {
				pctx->temp_file_pos -= 2;
				retried = 1;
				goto do_mmap;
			} else {
				log_msg(LOG_ERR, 0, "Unable to mmap after retry.");
				return (-1);
			}
		}

		buf = pctx->temp_mmap_buf + (pctx->temp_file_pos - pctx->temp_mmap_pos);
		memcpy(fpath, buf, namelen);
		fpath[namelen] = '\0';
		*fpathlen = namelen;
		*namechars = path_name_start(fpath, namelen);
		pctx->temp_file_pos += namelen;
		return (namelen);
	}

	/*
	 * This code is used if mmap is not being used for the pathlist file.
	 */
	if ((rbytes = Read(pctx->archive_members_fd, &namelen, sizeof (namelen))) != 0) {
		if (rbytes < (ssize_t)sizeof (namelen)) {
			log_msg(LOG_ERR, 1, "Error reading archive members file.");
			return (-1);
		}
		rbytes = Read(pctx->archive_members_fd, fpath, namelen);
		if (rbytes < namelen) {
			log_msg(LOG_ERR, 1, "Error reading archive members file.");
			return (-1);
		}
		fpath[namelen] = '\0';
		*fpathlen = namelen;
		*namechars = path_name_start(fpath, namelen);
	}
	return (rbytes);
}
/*
 * Free every sort buffer on the list starting at a_state.head and clear the
 * list pointers, which disables sorting for the rest of the scan.
 */
static void
discard_sort_buffers(void)
{
	struct sort_buf *cur = a_state.head;

	while (cur != NULL) {
		struct sort_buf *nxt = cur->next;

		free(cur);
		cur = nxt;
	}
	a_state.head = NULL;
	a_state.srt = NULL;
}

/*
 * Build list of pathnames in a temp file.
 *
 * nftw() callback (also called directly by setup_archiver() for non-directory
 * arguments). Appends a record of the form <2-byte length><pathname> to the
 * in-memory buffer in a_state, flushing the buffer to a_state.fd when it is
 * full, and records a sort key for the entry when sorting is active.
 *
 * Returns 0 to continue the walk, -1 if flushing the buffer fails.
 */
static int
add_pathname(const char *fpath, const struct stat *sb,
    int tflag, struct FTW *ftwbuf)
{
	short len;
	uchar_t *buf;
	const char *bname;

	if (tflag == FTW_DP) return (0);
	if (tflag == FTW_DNR || tflag == FTW_NS) {
		log_msg(LOG_WARN, 0, "Cannot access %s\n", fpath);
		return (0);
	}

	/*
	 * Pathname entries are pushed into a memory buffer till buffer is full. The
	 * buffer is then flushed to disk. This is for decent performance.
	 */
	a_state.arc_size += (sb->st_size + ARC_ENTRY_OVRHEAD);
	len = strlen(fpath);
	if (a_state.bufpos + len + 14 > a_state.bufsiz) {
		ssize_t wrtn = Write(a_state.fd, a_state.pbuf, a_state.bufpos);

		/*
		 * Test for a negative result explicitly: comparing it directly
		 * against the unsigned bufpos would convert -1 to a huge value.
		 */
		if (wrtn < 0 || (uint64_t)wrtn < a_state.bufpos) {
			log_msg(LOG_ERR, 1, "Write: ");
			return (-1);
		}
		a_state.bufpos = 0;
		a_state.pathlist_size += wrtn;
	}

	/*
	 * If we are sorting path entries then sort per buffer and then merge when iterating
	 * through all the path entries.
	 */
	if (a_state.srt) {
		member_entry_t *member;
		int i;
		char *dot;

		bname = &fpath[ftwbuf->base];
		if (a_state.srt_pos == SORT_BUF_SIZE) {
			struct sort_buf *srt;

			/*
			 * Sort Buffer is full so sort it. Sorting is done by file extension and size.
			 * If file has no extension then first 4 chars of the filename are used.
			 */
			srt = (struct sort_buf *)malloc(sizeof (struct sort_buf));
			if (srt == NULL) {
				log_msg(LOG_WARN, 0, "Out of memory for sort buffer. Continuing without sorting.");
				/* Release the whole list, not just its first buffer. */
				discard_sort_buffers();
				goto cont;
			}
			log_msg(LOG_INFO, 0, "Sorting ...");
			a_state.srt->max = a_state.srt_pos - 1;
			qsort(a_state.srt->members, SORT_BUF_SIZE, sizeof (member_entry_t), compare_members);
			srt->next = NULL;
			srt->pos = 0;
			a_state.srt->next = srt;
			a_state.srt = srt;
			a_state.srt_pos = 0;
		}

		/*
		 * The total size of path list file that can be handled when sorting is 4GB to
		 * limit memory usage. If total accumulated path entries exceed 4GB in bytes,
		 * we abort sorting. This is large enough to handle all practical scenarios
		 * except in the case of millions of pathname entries each having PATH_MAX length!
		 */
		if (a_state.pathlist_size + a_state.bufpos >= UINT_MAX) {
			log_msg(LOG_WARN, 0, "Too many pathnames. Continuing without sorting.");
			discard_sort_buffers();
			goto cont;
		}

		member = &(a_state.srt->members[a_state.srt_pos++]);
		member->size = sb->st_size;
		member->file_pos = a_state.pathlist_size + a_state.bufpos;
		dot = strrchr(bname, '.');

		// Small NAMELEN so these loops will be unrolled by compiler.
		for (i = 0; i < NAMELEN; i++) member->name[i] = 0;

		i = 0;
		if (!dot) {
			while (bname[i] != '\0' && i < NAMELEN) {
				member->name[i] = bname[i]; i++;
			}
		} else {
			dot++;
			while (dot[i] != '\0' && i < NAMELEN) {
				member->name[i] = dot[i]; i++;
			}
		}
	}

cont:
	buf = a_state.pbuf + a_state.bufpos;
	/* bufpos can be odd, so store the length prefix bytewise. */
	memcpy(buf, &len, sizeof (len));
	buf += 2;
	memcpy(buf, fpath, len);
	a_state.bufpos += (len + 2);
	a_state.fcount++;
	return (0);
}
/*
 * Archiving related functions.
 * This one creates a list of files to be included into the archive and
 * sets up the libarchive context.
 *
 * On success the estimated archive size is stored in sbuf->st_size (with
 * uid, gid and mode filled in), the pathlist file descriptor and libarchive
 * writer are stored in pctx, and 0 is returned. Returns -1 on failure.
 */
int
setup_archiver(pc_ctx_t *pctx, struct stat *sbuf)
{
	char *tmpfile, *tmp;
	int err, fd;
	uchar_t *pbuf;
	struct archive *arc;
	struct fn_list *fn;

	/*
	 * If sorting is enabled create the initial sort buffer.
	 */
	if (pctx->enable_archive_sort) {
		struct sort_buf *srt;

		srt = (struct sort_buf *)malloc(sizeof (struct sort_buf));
		if (srt == NULL) {
			log_msg(LOG_ERR, 0, "Out of memory.");
			return (-1);
		}
		srt->next = NULL;
		srt->pos = 0;
		pctx->archive_sort_buf = srt;
	}

	/*
	 * Create a temporary file to hold the generated list of pathnames to be archived.
	 * Storing in a file saves memory usage and allows scalability.
	 */
	tmpfile = pctx->archive_members_file;
	tmp = get_temp_dir();
	strcpy(tmpfile, tmp);
	free(tmp);
	strcat(tmpfile, "/.pcompXXXXXX");
	if ((fd = mkstemp(tmpfile)) == -1) {
		log_msg(LOG_ERR, 1, "mkstemp errored.");
		goto fail_early;
	}

	add_fname(tmpfile);
	pbuf = malloc(pctx->chunksize);
	if (pbuf == NULL) {
		log_msg(LOG_ERR, 0, "Out of memory.");
		close(fd); unlink(tmpfile);
		goto fail_early;
	}

	/*
	 * Use nftw() to scan all the directory hierarchies provided on the command
	 * line and generate a consolidated list of pathnames to be archived. By
	 * doing this we can sort the pathnames and estimate the total archive size.
	 * Total archive size is needed by the subsequent compression stages.
	 */
	log_msg(LOG_INFO, 0, "Scanning files.");
	sbuf->st_size = 0;
	pctx->archive_size = 0;
	pctx->archive_members_count = 0;

	/*
	 * nftw requires using global state variable. So we lock to be mt-safe.
	 * This means only one directory tree scan can happen at a time.
	 */
	pthread_mutex_lock(&nftw_mutex);
	fn = pctx->fn;
	a_state.pbuf = pbuf;
	a_state.bufsiz = pctx->chunksize;
	a_state.bufpos = 0;
	a_state.fd = fd;
	a_state.srt = pctx->archive_sort_buf;
	a_state.srt_pos = 0;
	a_state.head = a_state.srt;
	a_state.pathlist_size = 0;

	while (fn) {
		struct stat sb;

		if (lstat(fn->filename, &sb) == -1) {
			log_msg(LOG_ERR, 1, "Ignoring %s.", fn->filename);
			fn = fn->next;
			continue;
		}

		a_state.arc_size = 0;
		a_state.fcount = 0;
		if (S_ISDIR(sb.st_mode)) {
			err = nftw(fn->filename, add_pathname, 1024, FTW_PHYS);
		} else {
			int tflag;
			struct FTW ftwbuf;
			char *pos;

			if (S_ISLNK(sb.st_mode))
				tflag = FTW_SL;
			else
				tflag = FTW_F;

			/*
			 * Find out basename to mimic FTW.
			 */
			pos = strrchr(fn->filename, PATHSEP_CHAR);
			if (pos)
				ftwbuf.base = pos - fn->filename + 1;
			else
				ftwbuf.base = 0;
			err = add_pathname(fn->filename, &sb, tflag, &ftwbuf);
			a_state.arc_size = sb.st_size;
		}
		/* Report a failed scan instead of dropping the result silently. */
		if (err != 0) {
			log_msg(LOG_WARN, 0, "Error while scanning %s. Some entries may be missing.",
			    fn->filename);
		}

		if (a_state.bufpos > 0) {
			ssize_t wrtn = Write(a_state.fd, a_state.pbuf, a_state.bufpos);

			/* Check the sign first; bufpos is unsigned. */
			if (wrtn < 0 || (uint64_t)wrtn < a_state.bufpos) {
				log_msg(LOG_ERR, 1, "Write failed.");
				/* Do not return with the scan lock held. */
				pthread_mutex_unlock(&nftw_mutex);
				free(pbuf);
				close(fd); unlink(tmpfile);
				return (-1);
			}
			a_state.bufpos = 0;
			a_state.pathlist_size += wrtn;
		}
		pctx->archive_size += a_state.arc_size;
		pctx->archive_members_count += a_state.fcount;
		fn = fn->next;
	}

	if (a_state.srt == NULL) {
		/* Sorting was never enabled, or add_pathname() abandoned it. */
		pctx->enable_archive_sort = 0;
		pctx->archive_sort_buf = NULL;
	} else {
		log_msg(LOG_INFO, 0, "Sorting ...");
		a_state.srt->max = a_state.srt_pos - 1;
		qsort(a_state.srt->members, a_state.srt_pos, sizeof (member_entry_t), compare_members);
		pctx->archive_temp_size = a_state.pathlist_size;
	}
	pthread_mutex_unlock(&nftw_mutex);

	sbuf->st_size = pctx->archive_size;
	lseek(fd, 0, SEEK_SET);
	free(pbuf);
	sbuf->st_uid = geteuid();
	sbuf->st_gid = getegid();
	sbuf->st_mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;

	arc = archive_write_new();
	if (!arc) {
		log_msg(LOG_ERR, 1, "Unable to create libarchive context.\n");
		close(fd);
		unlink(tmpfile);
		return (-1);
	}
	archive_write_set_format_pax_restricted(arc);
	archive_write_set_bytes_per_block(arc, 0);
	archive_write_open(arc, pctx, arc_open_callback,
	    creat_write_callback, creat_close_callback);
	pctx->archive_ctx = arc;
	pctx->archive_members_fd = fd;

	if (pctx->enable_archive_sort) {
		pctx->temp_mmap_len = TEMP_MMAP_SIZE;
		pctx->temp_mmap_buf = mmap(NULL, pctx->temp_mmap_len, PROT_READ,
		    MAP_SHARED, pctx->archive_members_fd, 0);
		/* mmap() reports failure with MAP_FAILED, not NULL. */
		if (pctx->temp_mmap_buf == MAP_FAILED) {
			log_msg(LOG_WARN, 1, "Unable to mmap pathlist file, switching to read().");
			pctx->temp_mmap_buf = NULL;
			pctx->temp_mmap_len = 0;
		}
	} else {
		pctx->temp_mmap_buf = NULL;
		pctx->temp_mmap_len = 0;
	}
	pctx->temp_mmap_pos = 0;
	pctx->arc_writing = 0;

	return (0);

fail_early:
	/* Failure before the scan started: release the initial sort buffer. */
	if (pctx->enable_archive_sort) {
		free(pctx->archive_sort_buf);
		pctx->archive_sort_buf = NULL;
	}
	return (-1);
}
/*
 * This creates a libarchive context for extracting members to disk.
 *
 * The reader, with support for all archive formats enabled, is stored in
 * pctx->archive_ctx. Returns 0 on success and -1 if the libarchive context
 * cannot be created.
 */
int
setup_extractor(pc_ctx_t *pctx)
{
	struct archive *arc;

	/*
	 * Data reaches the extractor through the read callbacks. A pipe used to
	 * be created here but was never used, leaking two descriptors per call.
	 */
	arc = archive_read_new();
	if (!arc) {
		log_msg(LOG_ERR, 1, "Unable to create libarchive context.\n");
		return (-1);
	}
	archive_read_support_format_all(arc);
	pctx->archive_ctx = arc;
	pctx->arc_writing = 0;

	return (0);
}
/*
 * Invoke the filter registered in typetab for type 'typ'. The filter gets a
 * filter_info holding the descriptor, both archive handles, the entry, the
 * direction flag 'cmp' and AW_BLOCK_SIZE as block size. The filter's result
 * is returned unchanged; FILTER_RETURN_ERROR is also logged.
 */
static ssize_t
process_by_filter(int fd, int typ, struct archive *target_arc,
    struct archive *source_arc, struct archive_entry *entry, int cmp)
{
	int slot = typ >> 3;
	struct filter_info fi;
	int64_t result;

	fi.source_arc = source_arc;
	fi.target_arc = target_arc;
	fi.entry = entry;
	fi.fd = fd;
	fi.compressing = cmp;
	fi.block_size = AW_BLOCK_SIZE;

	result = (*(typetab[slot].filter_func))(&fi, typetab[slot].filter_private);
	if (result == FILTER_RETURN_ERROR) {
		log_msg(LOG_ERR, 0, "Error invoking filter module: %s",
		    typetab[slot].filter_name);
	}
	return (result);
}
/*
* Routines to archive members and write the file data to the callback. Portions of
* the following code are adapted from some of the Libarchive bsdtar code.
*/
/*
 * Copy the data of the file named by the entry's source path into the
 * archive 'arc'. If a filter is registered for the file type (given in 'typ'
 * or detected from the first mapped chunk) the filter is given the chance to
 * write the data instead. Returns 0 on success and -1 on failure.
 */
static int
copy_file_data(pc_ctx_t *pctx, struct archive *arc, struct archive_entry *entry, int typ)
{
	size_t sz, offset, len;
	ssize_t bytes_to_write;
	uchar_t *mapbuf;
	int rv, fd;
	const char *fpath;

	offset = 0;
	rv = 0;
	sz = archive_entry_size(entry);
	bytes_to_write = sz;
	fpath = archive_entry_sourcepath(entry);
	fd = open(fpath, O_RDONLY);
	if (fd == -1) {
		log_msg(LOG_ERR, 1, "Failed to open %s.", fpath);
		return (-1);
	}

	if (typ != TYPE_UNKNOWN && typetab[(typ >> 3)].filter_func != NULL) {
		int64_t frv;

		frv = process_by_filter(fd, typ, arc, NULL, entry, 1);
		if (frv == FILTER_RETURN_ERROR) {
			close(fd);
			return (-1);
		} else if (frv != FILTER_RETURN_SKIP) {
			close(fd);
			return (ARCHIVE_OK);
		}
		/* FILTER_RETURN_SKIP: fall through to the plain copy below. */
	}

	/*
	 * Use mmap for copying file data. Not necessarily for performance, but it saves on
	 * resident memory use.
	 */
	while (bytes_to_write > 0) {
		uchar_t *src;
		size_t wlen;
		ssize_t wrtn;

		if (bytes_to_write < MMAP_SIZE)
			len = bytes_to_write;
		else
			len = MMAP_SIZE;
do_map:
		mapbuf = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, offset);
		/* mmap() reports failure with MAP_FAILED, not NULL. */
		if (mapbuf == MAP_FAILED) {
			/* Mmap failed; this is bad. */
			log_msg(LOG_ERR, 1, "Mmap failed for %s.", fpath);
			rv = -1;
			break;
		}
		offset += len;
		src = mapbuf;
		wlen = len;

		if (typ == TYPE_UNKNOWN) {
			pctx->ctype = detect_type_by_data(src, len);
			typ = pctx->ctype;
			if (typ != TYPE_UNKNOWN && typetab[(typ >> 3)].filter_func != NULL) {
				int64_t frv;

				munmap(mapbuf, len);
				frv = process_by_filter(fd, typ, arc, NULL, entry, 1);
				if (frv == FILTER_RETURN_ERROR) {
					/* Leave through the common exit so fd is closed. */
					rv = -1;
					break;
				} else if (frv != FILTER_RETURN_SKIP) {
					rv = ARCHIVE_OK;
					break;
				}
				/* Filter declined: restart the plain copy from offset 0. */
				lseek(fd, 0, SEEK_SET);
				typ = TYPE_COMPRESSED;
				offset = 0;
				goto do_map;
			}
		}
		typ = TYPE_COMPRESSED; // Need to avoid calling detect_type_by_data subsequently.

		/*
		 * Write the entire mmap-ed buffer. Since we are writing to the compressor
		 * stage there is no need for blocking.
		 */
		wrtn = archive_write_data(arc, src, wlen);
		/* Unmap on every path, including the error path below. */
		munmap(mapbuf, len);
		/* Check the sign first; wlen is unsigned. */
		if (wrtn < 0 || (size_t)wrtn < wlen) {
			/* Write failed; this is bad */
			log_msg(LOG_ERR, 0, "Data write error: %s", archive_error_string(arc));
			rv = -1;
			break;
		}
		bytes_to_write -= wrtn;
	}
	close(fd);
	return (rv);
}
/*
 * Write the header for 'entry' and, when the entry has a non-zero size, its
 * file data. ARCHIVE_FATAL and ARCHIVE_FAILED header results are logged as
 * errors and return -1; other non-OK header results are logged as warnings.
 */
static int
write_entry(pc_ctx_t *pctx, struct archive *arc, struct archive_entry *entry, int typ)
{
	int hdr_rv = archive_write_header(arc, entry);

	if (hdr_rv == ARCHIVE_FATAL || hdr_rv == ARCHIVE_FAILED) {
		log_msg(LOG_ERR, 0, "%s: %s",
		    archive_entry_sourcepath(entry), archive_error_string(arc));
		return (-1);
	}
	if (hdr_rv != ARCHIVE_OK) {
		log_msg(LOG_WARN, 0, "%s: %s",
		    archive_entry_sourcepath(entry), archive_error_string(arc));
	}

	if (archive_entry_size(entry) <= 0)
		return (0);
	return (copy_file_data(pctx, arc, entry, typ));
}
/*
* Thread function. Archive members and write to pipe. The dispatcher thread
* reads from the other end and compresses.
*/
static void *
archiver_thread_func(void *dat) {
pc_ctx_t *pctx = (pc_ctx_t *)dat;
char fpath[PATH_MAX], *name, *bnchars = NULL; // Silence compiler
int warn, rbytes, fpathlen = 0; // Silence compiler
uint32_t ctr;
struct archive_entry *entry, *spare_entry, *ent;
struct archive *arc, *ard;
struct archive_entry_linkresolver *resolver;
int readdisk_flags;
warn = 1;
entry = archive_entry_new();
arc = (struct archive *)(pctx->archive_ctx);
if ((resolver = archive_entry_linkresolver_new()) != NULL) {
archive_entry_linkresolver_set_strategy(resolver, archive_format(arc));
} else {
log_msg(LOG_WARN, 0, "Cannot create link resolver, hardlinks will be duplicated.");
}
ctr = 1;
readdisk_flags = ARCHIVE_READDISK_NO_TRAVERSE_MOUNTS;
readdisk_flags |= ARCHIVE_READDISK_HONOR_NODUMP;
ard = archive_read_disk_new();
archive_read_disk_set_behavior(ard, readdisk_flags);
archive_read_disk_set_standard_lookup(ard);
archive_read_disk_set_symlink_physical(ard);
/*
* Read next path entry from list file. read_next_path() also handles sorted reading.
*/
while ((rbytes = read_next_path(pctx, fpath, &bnchars, &fpathlen)) != 0) {
int typ;
if (rbytes == -1) break;
archive_entry_copy_sourcepath(entry, fpath);
if (archive_read_disk_entry_from_file(ard, entry, -1, NULL) != ARCHIVE_OK) {
log_msg(LOG_WARN, 1, "archive_read_disk_entry_from_file:\n %s", archive_error_string(ard));
archive_entry_clear(entry);
continue;
}
typ = TYPE_UNKNOWN;
if (archive_entry_filetype(entry) == AE_IFREG) {
if ((typ = detect_type_by_ext(fpath, fpathlen)) != TYPE_UNKNOWN)
pctx->ctype = typ;
}
/*
* Strip leading '/' or '../' or '/../' from member name.
*/
name = fpath;
while (name[0] == '/' || name[0] == '\\') {
if (warn) {
log_msg(LOG_WARN, 0, "Converting absolute paths.");
warn = 0;
}
if (name[1] == '.' && name[2] == '.' && (name[3] == '/' || name[3] == '\\')) {
name += 3; /* /.. is removed here and / is removed next. */
} else {
name += 1;
}
}
#ifndef __APPLE__
/*
* Workaround for libarchive weirdness on Non MAC OS X platforms. The files
* with names matching pattern: ._* are MAC OS X resource forks which contain
* extended attributes, ACLs etc. They should be handled accordingly on MAC
* platforms and treated as normal files on others. For some reason beyond me
* libarchive refuses to extract these files on Linux, no matter what I try.
* Bug?
*
* In this case the file basename is changed and a custom flag is set to
* indicate extraction to change it back.
*/
if (bnchars[0] == '.' && bnchars[1] == '_' && archive_entry_filetype(entry) == AE_IFREG) {
char *pos = strstr(name, "._");
char name[] = "@.", value[] = "m";
if (pos) {
*pos = '|';
archive_entry_xattr_add_entry(entry, name, value, strlen(value));
}
}
#endif
if (name != archive_entry_pathname(entry))
archive_entry_copy_pathname(entry, name);
if (archive_entry_filetype(entry) != AE_IFREG) {
archive_entry_set_size(entry, 0);
} else {
archive_entry_set_size(entry, archive_entry_size(entry));
}
if (pctx->verbose)
log_msg(LOG_INFO, 0, "%5d/%5d %8d %s", ctr, pctx->archive_members_count,
archive_entry_size(entry), name);
archive_entry_linkify(resolver, &entry, &spare_entry);
ent = entry;
while (ent != NULL) {
if (write_entry(pctx, arc, ent, typ) != 0) {
goto done;
}
ent = spare_entry;
spare_entry = NULL;
}
archive_write_finish_entry(arc);
archive_entry_clear(entry);
ctr++;
}
done:
if (pctx->temp_mmap_len > 0)
munmap(pctx->temp_mmap_buf, pctx->temp_mmap_len);
archive_entry_free(entry);
archive_entry_linkresolver_free(resolver);
archive_read_free(ard);
archive_write_free(arc);
close(pctx->archive_members_fd);
unlink(pctx->archive_members_file);
return (NULL);
}
/*
 * Start the archiver thread for this context. Returns the result of
 * pthread_create().
 */
int
start_archiver(pc_ctx_t *pctx) {
	int rc;

	rc = pthread_create(&(pctx->archive_thread), NULL, archiver_thread_func, (void *)pctx);
	return (rc);
}
/*
* The next two functions are from libArchive source/example:
* https://github.com/libarchive/libarchive/wiki/Examples#wiki-A_Complete_Extractor
*
* We have to use low-level APIs to extract entries to disk. Normally one would use
* archive_read_extract2() but LibArchive has no option to set user-defined filter
* routines, so we have to handle here.
*/
/*
 * Copy the data of the current entry from reader 'ar' to disk writer 'aw'.
 * When a filter is registered for 'typ' the filter performs the copy and its
 * result is mapped to ARCHIVE_FATAL, ARCHIVE_WARN or ARCHIVE_OK. Otherwise
 * data blocks are copied until ARCHIVE_EOF.
 */
static int
copy_data_out(struct archive *ar, struct archive *aw, struct archive_entry *entry,
    int typ)
{
	int64_t offset;
	const void *buff;
	size_t size;
	int status;

	if (typ != TYPE_UNKNOWN && typetab[(typ >> 3)].filter_func != NULL) {
		int64_t frv = process_by_filter(-1, typ, aw, ar, entry, 0);

		if (frv == FILTER_RETURN_ERROR) {
			archive_set_error(ar, archive_errno(aw),
			    "%s", archive_error_string(aw));
			return (ARCHIVE_FATAL);
		}
		if (frv == FILTER_RETURN_SKIP) {
			log_msg(LOG_WARN, 0, "Filter function failed for entry.");
			return (ARCHIVE_WARN);
		}
		return (ARCHIVE_OK);
	}

	while (1) {
		status = archive_read_data_block(ar, &buff, &size, &offset);
		if (status == ARCHIVE_EOF)
			return (ARCHIVE_OK);
		if (status != ARCHIVE_OK)
			return (status);

		status = (int)archive_write_data_block(aw, buff, size, offset);
		/* Anything worse than a warning is reported as a warning. */
		if (status < ARCHIVE_WARN)
			status = ARCHIVE_WARN;
		if (status != ARCHIVE_OK) {
			archive_set_error(ar, archive_errno(aw),
			    "%s", archive_error_string(aw));
			return (status);
		}
	}
}
/*
 * Extract one entry: write its header to the disk writer 'ad', copy its data
 * if the size is unset or positive, then finish the entry. Results below
 * ARCHIVE_WARN are clamped to ARCHIVE_WARN and the lower of the two stage
 * results is returned. Errors from 'ad' are copied onto 'a'.
 */
static int
archive_extract_entry(struct archive *a, struct archive_entry *entry,
    struct archive *ad, int typ)
{
	int body_rv, finish_rv;

	body_rv = archive_write_header(ad, entry);
	if (body_rv < ARCHIVE_WARN)
		body_rv = ARCHIVE_WARN;

	if (body_rv != ARCHIVE_OK) {
		/* If _write_header failed, copy the error. */
		archive_copy_error(a, ad);
	} else if (!archive_entry_size_is_set(entry) || archive_entry_size(entry) > 0) {
		/* Otherwise, pour data into the entry. */
		body_rv = copy_data_out(a, ad, entry, typ);
	}

	finish_rv = archive_write_finish_entry(ad);
	if (finish_rv < ARCHIVE_WARN)
		finish_rv = ARCHIVE_WARN;

	/* Use the first message. */
	if (finish_rv != ARCHIVE_OK && body_rv == ARCHIVE_OK)
		archive_copy_error(a, ad);

	/* Use the worst error return. */
	return ((finish_rv < body_rv) ? finish_rv : body_rv);
}
/*
 * Extract Thread function. Read an uncompressed archive from the decompressor stage
 * and extract members to disk.
 *
 * Extraction happens inside pctx->to_filename; the previous working directory
 * is restored afterwards when it could be determined. Always returns NULL.
 */
static void *
extractor_thread_func(void *dat) {
	pc_ctx_t *pctx = (pc_ctx_t *)dat;
	char cwd[PATH_MAX], got_cwd;
	int flags, rv;
	uint32_t ctr;
	struct archive_entry *entry;
	struct archive *awd, *arc;

	flags = ARCHIVE_EXTRACT_TIME;
	flags |= ARCHIVE_EXTRACT_SECURE_SYMLINKS;
	flags |= ARCHIVE_EXTRACT_SECURE_NODOTDOT;
	flags |= ARCHIVE_EXTRACT_SPARSE;

	/*
	 * Extract all security attributes if we are root.
	 */
	if (pctx->force_archive_perms || geteuid() == 0) {
		flags |= ARCHIVE_EXTRACT_OWNER;
		flags |= ARCHIVE_EXTRACT_PERM;
		flags |= ARCHIVE_EXTRACT_ACL;
		flags |= ARCHIVE_EXTRACT_XATTR;
		flags |= ARCHIVE_EXTRACT_FFLAGS;
		flags |= ARCHIVE_EXTRACT_MAC_METADATA;
	}

	if (pctx->no_overwrite_newer)
		flags |= ARCHIVE_EXTRACT_NO_OVERWRITE_NEWER;

	got_cwd = 1;
	if (getcwd(cwd, PATH_MAX) == NULL) {
		log_msg(LOG_WARN, 1, "Cannot get current directory.");
		got_cwd = 0;
	}

	ctr = 1;
	awd = archive_write_disk_new();
	archive_write_disk_set_options(awd, flags);
	archive_write_disk_set_standard_lookup(awd);
	arc = (struct archive *)(pctx->archive_ctx);
	archive_read_open(arc, pctx, arc_open_callback, extract_read_callback, extract_close_callback);

	/*
	 * Change directory after opening the archive, otherwise archive_read_open() can fail
	 * for relative paths.
	 */
	if (chdir(pctx->to_filename) == -1) {
		log_msg(LOG_ERR, 1, "Cannot change to dir: %s", pctx->to_filename);
		goto done;
	}

	/*
	 * Read archive entries and extract to disk.
	 */
	while ((rv = archive_read_next_header(arc, &entry)) != ARCHIVE_EOF) {
		const char *xt_name, *xt_value;
		size_t xt_size;
		int typ;

		if (rv != ARCHIVE_OK)
			log_msg(LOG_WARN, 0, "%s", archive_error_string(arc));

		if (rv == ARCHIVE_FATAL) {
			log_msg(LOG_ERR, 0, "Fatal error aborting extraction.");
			break;
		}

		if (rv == ARCHIVE_RETRY) {
			log_msg(LOG_INFO, 0, "Retrying extractor read ...");
			continue;
		}

		typ = TYPE_UNKNOWN;
		if (archive_entry_filetype(entry) == AE_IFREG) {
			const char *fpath = archive_entry_pathname(entry);
			typ = detect_type_by_ext(fpath, strlen(fpath));
		}

		/*
		 * Workaround for libarchive weirdness on Non MAC OS X platforms for filenames
		 * starting with '._'. See above ...
		 */
#ifndef __APPLE__
		if (archive_entry_xattr_reset(entry) > 0) {
			while (archive_entry_xattr_next(entry, &xt_name, (const void **)&xt_value,
			    &xt_size) == ARCHIVE_OK) {
				if (xt_name[0] == '@' && xt_name[1] == '.' && xt_value[0] == 'm') {
					const char *name;
					char *pos;

					/* Undo the '|' substitution made by the archiver thread. */
					name = archive_entry_pathname(entry);
					pos = strstr(name, "|_");
					if (pos) {
						*pos = '.';
						archive_entry_set_pathname(entry, name);
					}
					archive_entry_xattr_clear(entry);
					break;
				}
			}
		}
#endif

		rv = archive_extract_entry(arc, entry, awd, typ);
		if (rv != ARCHIVE_OK) {
			log_msg(LOG_WARN, 0, "%s: %s", archive_entry_pathname(entry),
			    archive_error_string(arc));

		} else if (pctx->verbose) {
			/* The entry size is 64-bit; print it with a matching conversion. */
			log_msg(LOG_INFO, 0, "%5d %8lld %s", ctr,
			    (long long)archive_entry_size(entry),
			    archive_entry_pathname(entry));
		}

		if (rv == ARCHIVE_FATAL) {
			log_msg(LOG_ERR, 0, "Fatal error aborting extraction.");
			break;
		}
		ctr++;
	}

	if (got_cwd) {
		if (chdir(cwd) == -1)
			log_msg(LOG_WARN, 1, "Cannot change back to dir: %s", cwd);
	}

done:
	/* Also reached when chdir() fails, so both handles are always released. */
	archive_read_free(arc);
	archive_write_free(awd);
	return (NULL);
}
/*
 * Launch the extractor in its own thread. The thread runs extractor_thread_func()
 * with pctx as its argument and its id is stored in pctx->archive_thread.
 * Returns whatever pthread_create() returns.
 */
int
start_extractor(pc_ctx_t *pctx) {
	int rc;

	rc = pthread_create(&(pctx->archive_thread), NULL, extractor_thread_func,
	    (void *)pctx);
	return (rc);
}
/*
 * Build the hash table of known extensions and their types. A perfect hash function
 * generated with Bob Jenkins' Minimal Perfect Hash tool is used for the set of known
 * extensions. See:
 * http://burtleburtle.net/bob/hash/perfect.html
 *
 * The work is done once, under init_mutex; later calls are no-ops.
 * Returns 0 on success (or if already initialized) and 1 if the table cannot be
 * allocated.
 */
int
init_archive_mod() {
	int status = 0;
	int idx, pos;

	pthread_mutex_lock(&init_mutex);
	if (inited)
		goto unlock;

	exthtab = malloc(PHASHNKEYS * sizeof (struct ext_hash_entry));
	if (exthtab == NULL) {
		status = 1;
		goto unlock;
	}

	for (idx = 0; idx < PHASHNKEYS; idx++) {
		uint64_t packed = 0;
		ub4 slot = phash(extlist[idx].ext, extlist[idx].len);

		/*
		 * Since extensions are less than 8 bytes (or truncated otherwise),
		 * the extension string is packed into a 64-bit integer so that a
		 * lookup needs a single integer comparison.
		 */
		for (pos = 0; pos < extlist[idx].len; pos++)
			packed = (packed << 8) | extlist[idx].ext[pos];

		exthtab[slot].extnum = packed;
		exthtab[slot].type = extlist[idx].type;
	}
	memset(typetab, 0, sizeof (typetab));
	inited = 1;

unlock:
	pthread_mutex_unlock(&init_mutex);
	return (status);
}
/*
 * One-time filter setup: passes typetab and the given filter flags to
 * add_filters_by_type(). Serialized by init_mutex; calls after the first
 * one return without doing anything.
 */
void
init_filters(struct filter_flags *ff)
{
	pthread_mutex_lock(&init_mutex);
	if (filters_inited) {
		pthread_mutex_unlock(&init_mutex);
		return;
	}

	add_filters_by_type(typetab, ff);
	filters_inited = 1;
	pthread_mutex_unlock(&init_mutex);
}
/*
 * Identify file type based on extension. Lookup is fast as we have a perfect hash function.
 * If the given extension maps to a slot which has a different extension or maps to a slot
 * outside the hash table range then the function returns unknown type.
 *
 * path is the pathname and pathlen the number of bytes in it. The extension is the text
 * after the last '.' of the final path component; it is lower-cased before lookup.
 * TYPE_UNKNOWN is also returned when the path is empty, has no '.', has its only '.' at
 * index 0, or has an extension that is empty or longer than 8 bytes.
 */
static int
detect_type_by_ext(const char *path, int pathlen)
{
	const char *ext;
	ub4 slot;
	int i, len;
	uint64_t extnum;
	char extl[8];

	/* Walk back from the end until a '.' or a path separator is found. */
	i = pathlen - 1;
	while (i > 0 && path[i] != '.' && path[i] != PATHSEP_CHAR)
		i--;

	/*
	 * i <= 0 also covers an empty path, where i starts at -1 and path[i]
	 * must not be read.
	 */
	if (i <= 0 || path[i] != '.') goto out; // If extension not found give up

	len = pathlen - i - 1;
	if (len == 0 || len > 8) goto out; // Empty, or too long for extl[] and the 64-bit key

	ext = &path[i+1];
	extnum = 0;

	/*
	 * Lower-case the extension once, keeping both the string form for phash()
	 * and the 64-bit packed form for the table comparison. The cast to
	 * unsigned char keeps tolower() defined for bytes >= 0x80 and stops such
	 * bytes from being sign-extended into the packed value.
	 */
	for (i = 0; i < len; i++) {
		unsigned char c = (unsigned char)tolower((unsigned char)ext[i]);

		extl[i] = (char)c;
		extnum = (extnum << 8) | c;
	}

	slot = phash(extl, len);
	if (slot >= PHASHNKEYS) goto out; // Extension maps outside hash table range, give up

	if (exthtab[slot].extnum == extnum)
		return (exthtab[slot].type);
out:
	return (TYPE_UNKNOWN);
}
/*
 * Magic signatures packed into integers in host byte order, one set per
 * endianness. Both branches must define the same set of names since the
 * detection code refers to them unconditionally.
 */
#ifdef WORDS_BIGENDIAN
/* 0x7fELF packed into 32-bit integer. */
#	define	ELFINT		(0x7f454c46U)

/* TZif packed into 32-bit integer. */
#	define	TZINT		(0x545a6966U)

/* PPMZ packed into 32-bit integer. */
#	define	PPMINT		(0x50504d5aU)

/* wvpk packed into 32-bit integer. */
#	define	WVPK		(0x7776706b)

/* TTA1 packed into 32-bit integer. */
#	define	TTA1		(0x54544131)

/* Magic for different MSDOS COM file types. */
#	define	COM_MAGIC	(0xcd21)
#else
/* 0x7fELF packed into 32-bit integer. */
#	define	ELFINT		(0x464c457fU)

/* TZif packed into 32-bit integer. */
#	define	TZINT		(0x66695a54U)

/* PPMZ packed into 32-bit integer. */
#	define	PPMINT		(0x5a4d5050U)

/* wvpk packed into 32-bit integer. */
#	define	WVPK		(0x6b707677)

/* TTA1 packed into 32-bit integer. */
#	define	TTA1		(0x31415454)

/* Magic for different MSDOS COM file types. */
#	define	COM_MAGIC	(0x21cd)
#endif
/*
 * Detect a few file types from looking at magic signatures.
 * NOTE: Jpeg files must be detected via '.jpg' or '.jpeg' (case-insensitive)
 *       extensions. Do not add Jpeg header detection here. It will break
 *       context based PackJPG processing. Jpeg files that do not have a proper
 *       extension must not be processed via PackJPG.
 *
 * buf points to the data to examine and len is the number of valid bytes in it.
 * Returns a combination of TYPE_* flags, or TYPE_UNKNOWN if nothing matches or
 * fewer than 16 bytes are available. Only the first 16 bytes are guaranteed to
 * exist, so every field beyond that is read only after checking it against len.
 */
static int
detect_type_by_data(uchar_t *buf, size_t len)
{
	/* Offsets at which the two byte MSDOS COM magic is looked for. */
	static const size_t com_offsets[] = { 2, 4, 5, 13, 18, 23, 30, 70 };
	size_t i;

	// At least a few bytes.
	if (len < 16) return (TYPE_UNKNOWN);

	if (memcmp(buf, "!<arch>\n", 8) == 0)
		return (TYPE_BINARY|TYPE_ARCHIVE_AR);

	if (U32_P(buf) == ELFINT) { // Regular ELF, check for 32/64-bit, core dump
		// Byte 16 is one past the guaranteed minimum. If it is missing,
		// or holds 4, report a plain binary without an exe class.
		if (len <= 16 || *(buf + 16) == 4)
			return (TYPE_BINARY);
		if (*(buf + 4) == 2)
			return (TYPE_BINARY|TYPE_EXE64);
		return (TYPE_BINARY|TYPE_EXE32);
	}

	if (buf[1] == 'Z') {
		// Check for MSDOS/Windows Exe types
		if (buf[0] == 'L') {
			return (TYPE_BINARY|TYPE_EXE32);

		} else if (buf[0] == 'M' && len >= 0x18 + 2) {
			// If relocation table is less than 0x40 bytes into file then
			// it is a 32-bit MSDOS exe.
			if (LE16(U16_P(buf + 0x18)) < 0x40) {
				return (TYPE_BINARY|TYPE_EXE32);

			} else if (len >= 0x3c + 4) {
				uint32_t off = LE32(U32_P(buf + 0x3c));

				// This is non-MSDOS, check whether PE. The 4 byte
				// signature must lie inside the buffer (len >= 0x40
				// here, so the subtraction cannot wrap). If it does not,
				// fall through to the remaining checks.
				if (off <= len - 4) {
					if (buf[off] == 'P' && buf[off+1] == 'E' &&
					    buf[off+2] == '\0' && buf[off+3] == '\0') {
						// This is a PE executable.
						// Check 32/64-bit using the 16-bit field that
						// follows the signature, when it is available.
						if (off <= len - 6 &&
						    LE16(U16_P(buf + off + 4)) == 0x8664) {
							return (TYPE_BINARY|TYPE_EXE64);
						}
						return (TYPE_BINARY|TYPE_EXE32);
					} else {
						return (TYPE_BINARY|TYPE_EXE32);
					}
				}
			}
		}
	}

	// BMP Files
	if (buf[0] == 'B' && buf[1] == 'M') {
		uint16_t typ = LE16(U16_P(buf + 14));

		if (typ == 12 || typ == 64 || typ == 40 || typ == 128)
			return (TYPE_BINARY|TYPE_BMP);
	}

	if (U32_P(buf) == TZINT)
		return (TYPE_BINARY); // Timezone data

	if (U32_P(buf) == PPMINT)
		return (TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_PPMD); // PPM Compressed archive

	if (U32_P(buf) == WVPK || U32_P(buf) == TTA1)
		return (TYPE_BINARY|TYPE_COMPRESSED|TYPE_AUDIO_COMPRESSED);

	// MSDOS COM types, two byte and one byte magic numbers are checked
	// after all other multi-byte magic number checks.
	if (buf[0] == 0xe9 || buf[0] == 0xeb) {
		// The 0xaa55 marker sits at offset 0x1fe and can only be tested
		// when at least 0x200 bytes are present.
		if (len >= 0x1fe + 2 && LE16(U16_P(buf + 0x1fe)) == 0xaa55)
			return (TYPE_BINARY|TYPE_EXE32); // MSDOS COM
		else
			return (TYPE_BINARY);
	}

	// Offsets whose two bytes do not fit within len are skipped.
	for (i = 0; i < sizeof (com_offsets) / sizeof (com_offsets[0]); i++) {
		size_t off = com_offsets[i];

		if (off + 2 <= len && U16_P(buf + off) == COM_MAGIC)
			return (TYPE_BINARY|TYPE_EXE32); // MSDOS COM
	}
	return (TYPE_UNKNOWN);
}