Add file type detection based on magic values.
Add more comments. Add more extensions.
This commit is contained in:
parent
991482403b
commit
b7facc929e
5 changed files with 103 additions and 12 deletions
|
@ -42,6 +42,7 @@
|
|||
#include <utils.h>
|
||||
#include <pthread.h>
|
||||
#include <sys/mman.h>
|
||||
#include <ctype.h>
|
||||
#include <archive.h>
|
||||
#include <archive_entry.h>
|
||||
#include "pc_archive.h"
|
||||
|
@ -102,6 +103,7 @@ static struct arc_list_state {
|
|||
pthread_mutex_t nftw_mutex = PTHREAD_MUTEX_INITIALIZER;
|
||||
|
||||
static int detect_type_by_ext(char *path, int pathlen);
|
||||
static int detect_type_by_data(uchar_t *buf, size_t len);
|
||||
|
||||
/*
|
||||
* Archive writer callback routines for archive creation operation.
|
||||
|
@ -847,8 +849,8 @@ copy_file_data(pc_ctx_t *pctx, struct archive *arc,
|
|||
src = mapbuf;
|
||||
wlen = len;
|
||||
|
||||
/* if (typ == TYPE_UNKNOWN)
|
||||
pctx->ctype = detect_type_by_data(src, len);*/
|
||||
if (typ == TYPE_UNKNOWN)
|
||||
pctx->ctype = detect_type_by_data(src, len);
|
||||
|
||||
/*
|
||||
* Write the entire mmap-ed buffer. Since we are writing to the compressor
|
||||
|
@ -1043,6 +1045,9 @@ extractor_thread_func(void *dat) {
|
|||
flags |= ARCHIVE_EXTRACT_SECURE_NODOTDOT;
|
||||
flags |= ARCHIVE_EXTRACT_SPARSE;
|
||||
|
||||
/*
|
||||
* Extract all security attributes if we are root.
|
||||
*/
|
||||
if (pctx->force_archive_perms || geteuid() == 0) {
|
||||
flags |= ARCHIVE_EXTRACT_OWNER;
|
||||
flags |= ARCHIVE_EXTRACT_PERM;
|
||||
|
@ -1148,6 +1153,11 @@ start_extractor(pc_ctx_t *pctx) {
|
|||
return (pthread_create(&(pctx->archive_thread), NULL, extractor_thread_func, (void *)pctx));
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize the hash table of known extensions and types. Bob Jenkins' Minimal Perfect Hash
|
||||
* is used to get a perfect hash function for the set of known extensions. See:
|
||||
* http://burtleburtle.net/bob/hash/perfect.html
|
||||
*/
|
||||
int
|
||||
init_archive_mod() {
|
||||
int rv = 0;
|
||||
|
@ -1162,6 +1172,12 @@ init_archive_mod() {
|
|||
uint64_t extnum;
|
||||
ub4 slot = phash(extlist[i].ext, extlist[i].len);
|
||||
extnum = 0;
|
||||
|
||||
/*
|
||||
* Since extensions are less than 8 bytes (or truncated otherwise),
|
||||
* each extension string is packed into a 64-bit integer for quick
|
||||
* comparison.
|
||||
*/
|
||||
for (j = 0; j < extlist[i].len; j++)
|
||||
extnum = (extnum << 1) | extlist[i].ext[j];
|
||||
exthtab[slot].extnum = extnum;
|
||||
|
@ -1176,6 +1192,11 @@ init_archive_mod() {
|
|||
return (rv);
|
||||
}
|
||||
|
||||
/*
|
||||
* Identify file type based on extension. Lookup is fast as we have a perfect hash function.
|
||||
* If the given extension maps to a slot which has a different extension or maps to a slot
|
||||
* outside the hash table range then the function returns unknown type.
|
||||
*/
|
||||
static int
|
||||
detect_type_by_ext(char *path, int pathlen)
|
||||
{
|
||||
|
@ -1185,17 +1206,53 @@ detect_type_by_ext(char *path, int pathlen)
|
|||
uint64_t extnum;
|
||||
|
||||
for (i = pathlen-1; i > 0 && path[i] != '.' && path[i] != PATHSEP_CHAR; i--);
|
||||
if (i == 0 || path[i] != '.') goto out;
|
||||
if (i == 0 || path[i] != '.') goto out; // If extension not found give up
|
||||
len = pathlen - i - 1;
|
||||
if (len == 0) goto out;
|
||||
if (len == 0) goto out; // If extension is empty give up
|
||||
ext = &path[i+1];
|
||||
slot = phash(ext, len);
|
||||
if (slot > NUM_EXT) goto out;
|
||||
if (slot > NUM_EXT) goto out; // Extension maps outside hash table range, give up
|
||||
extnum = 0;
|
||||
|
||||
/*
|
||||
* Pack given extension into 64-bit integer.
|
||||
*/
|
||||
for (i = 0; i < len; i++)
|
||||
extnum = (extnum << 1) | ext[i];
|
||||
extnum = (extnum << 1) | tolower(ext[i]);
|
||||
if (exthtab[slot].extnum == extnum)
|
||||
return (exthtab[slot].type);
|
||||
out:
|
||||
return (TYPE_UNKNOWN);
|
||||
}
|
||||
|
||||
/* 0x7fELF packed into 32-bit integer. */
|
||||
#define ELFSHORT (0x7f454c46U)
|
||||
|
||||
/* TZif packed into 32-bit integer. */
|
||||
#define TZSHORT (0x545a6966U)
|
||||
|
||||
/* PPMZ packed into 32-bit integer. */
|
||||
#define PPMSHORT (0x50504d5aU)
|
||||
|
||||
/*
|
||||
* Detect a few file types from looking at magic signatures.
|
||||
*/
|
||||
static int
|
||||
detect_type_by_data(uchar_t *buf, size_t len)
|
||||
{
|
||||
// At least a few bytes.
|
||||
if (len < 16) return (TYPE_UNKNOWN);
|
||||
|
||||
if (U32_P(buf) == ELFSHORT)
|
||||
return (TYPE_EXE); // Regular ELF
|
||||
if ((buf[0] == 'M' || buf[0] == 'L') && buf[1] == 'Z')
|
||||
return (TYPE_EXE); // MSDOS Exe
|
||||
if (buf[0] == 0xe9)
|
||||
return (TYPE_EXE); // MSDOS COM
|
||||
if (U32_P(buf) == TZSHORT)
|
||||
return (TYPE_BINARY); // Timezone data
|
||||
if (U32_P(buf) == PPMSHORT)
|
||||
return (TYPE_COMPRESSED); // PPM Compressed archive
|
||||
|
||||
return (TYPE_UNKNOWN);
|
||||
}
|
||||
|
|
|
@ -45,6 +45,7 @@ struct ext_entry {
|
|||
{"m4" , TYPE_TEXT, 2},
|
||||
{"vb" , TYPE_TEXT, 2},
|
||||
{"xslt" , TYPE_TEXT, 4},
|
||||
{"xsl" , TYPE_TEXT, 3},
|
||||
{"yacc" , TYPE_TEXT, 4},
|
||||
{"lex" , TYPE_TEXT, 3},
|
||||
{"csv" , TYPE_TEXT, 3},
|
||||
|
@ -67,6 +68,16 @@ struct ext_entry {
|
|||
{"bib" , TYPE_TEXT, 3},
|
||||
{"lua" , TYPE_TEXT, 3},
|
||||
{"qml" , TYPE_TEXT, 3},
|
||||
{"fa" , TYPE_TEXT, 2},
|
||||
{"tcc" , TYPE_TEXT, 3},
|
||||
{"css" , TYPE_TEXT, 3},
|
||||
{"pod" , TYPE_TEXT, 3},
|
||||
{"al" , TYPE_TEXT, 2},
|
||||
{"vim" , TYPE_TEXT, 3},
|
||||
{"am" , TYPE_TEXT, 2},
|
||||
{"upp" , TYPE_TEXT, 3},
|
||||
{"mom" , TYPE_TEXT, 3},
|
||||
{"tmac" , TYPE_TEXT, 4},
|
||||
{"exe" , TYPE_EXE, 3},
|
||||
{"dll" , TYPE_EXE, 3},
|
||||
{"bin" , TYPE_EXE, 3},
|
||||
|
@ -109,7 +120,13 @@ struct ext_entry {
|
|||
{"uha" , TYPE_COMPRESSED, 3},
|
||||
{"alz" , TYPE_COMPRESSED, 3},
|
||||
{"ace" , TYPE_COMPRESSED, 3},
|
||||
{"rar" , TYPE_COMPRESSED, 3},
|
||||
{"xz" , TYPE_COMPRESSED, 2},
|
||||
{"xcf" , TYPE_BINARY, 3},
|
||||
{"mo" , TYPE_BINARY, 2},
|
||||
{"bmp" , TYPE_BINARY, 3},
|
||||
{"pyo" , TYPE_BINARY, 3},
|
||||
{"pyc" , TYPE_BINARY, 3},
|
||||
};
|
||||
#define NUM_EXT (99)
|
||||
#define NUM_EXT (116)
|
||||
#endif
|
||||
|
|
|
@ -32,6 +32,7 @@ java,TYPE_TEXT
|
|||
m4,TYPE_TEXT
|
||||
vb,TYPE_TEXT
|
||||
xslt,TYPE_TEXT
|
||||
xsl,TYPE_TEXT
|
||||
yacc,TYPE_TEXT
|
||||
lex,TYPE_TEXT
|
||||
csv,TYPE_TEXT
|
||||
|
@ -54,6 +55,16 @@ ps,TYPE_TEXT
|
|||
bib,TYPE_TEXT
|
||||
lua,TYPE_TEXT
|
||||
qml,TYPE_TEXT
|
||||
fa,TYPE_TEXT
|
||||
tcc,TYPE_TEXT
|
||||
css,TYPE_TEXT
|
||||
pod,TYPE_TEXT
|
||||
al,TYPE_TEXT
|
||||
vim,TYPE_TEXT
|
||||
am,TYPE_TEXT
|
||||
upp,TYPE_TEXT
|
||||
mom,TYPE_TEXT
|
||||
tmac,TYPE_TEXT
|
||||
exe,TYPE_EXE
|
||||
dll,TYPE_EXE
|
||||
bin,TYPE_EXE
|
||||
|
@ -96,4 +107,10 @@ dmg,TYPE_COMPRESSED
|
|||
uha,TYPE_COMPRESSED
|
||||
alz,TYPE_COMPRESSED
|
||||
ace,TYPE_COMPRESSED
|
||||
rar,TYPE_COMPRESSED
|
||||
xz,TYPE_COMPRESSED
|
||||
xcf,TYPE_BINARY
|
||||
mo,TYPE_BINARY
|
||||
bmp,TYPE_BINARY
|
||||
pyo,TYPE_BINARY
|
||||
pyc,TYPE_BINARY
|
||||
|
|
|
@ -12,10 +12,10 @@
|
|||
|
||||
/* small adjustments to _a_ to make values distinct */
|
||||
ub1 tab[] = {
|
||||
20,70,0,4,61,76,0,119,0,0,16,4,10,1,61,76,
|
||||
61,0,0,16,1,61,0,76,0,123,32,70,28,34,119,51,
|
||||
0,76,4,122,70,0,0,43,0,106,20,83,0,0,28,66,
|
||||
79,0,1,47,79,122,0,0,71,75,85,26,0,103,0,76,
|
||||
10,76,0,76,70,42,0,1,0,0,119,1,61,1,70,79,
|
||||
0,0,0,4,70,1,0,122,0,119,47,76,76,34,110,101,
|
||||
0,76,70,70,42,28,0,66,0,108,0,109,28,4,28,4,
|
||||
70,0,1,20,4,123,123,0,79,75,34,76,69,77,0,69,
|
||||
};
|
||||
|
||||
/* The hash function */
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
|
||||
extern ub1 tab[];
|
||||
#define PHASHLEN 0x40 /* length of hash mapping table */
|
||||
#define PHASHNKEYS 99 /* How many keys were hashed */
|
||||
#define PHASHNKEYS 116 /* How many keys were hashed */
|
||||
#define PHASHRANGE 128 /* Range any input might map to */
|
||||
#define PHASHSALT 0x9e3779b9 /* internal, initialize normal hash */
|
||||
|
||||
|
|
Loading…
Reference in a new issue