Add file type detection based on magic values.
Add more comments. Add more extensions.
This commit is contained in:
parent
991482403b
commit
b7facc929e
5 changed files with 103 additions and 12 deletions
|
@ -42,6 +42,7 @@
|
||||||
#include <utils.h>
|
#include <utils.h>
|
||||||
#include <pthread.h>
|
#include <pthread.h>
|
||||||
#include <sys/mman.h>
|
#include <sys/mman.h>
|
||||||
|
#include <ctype.h>
|
||||||
#include <archive.h>
|
#include <archive.h>
|
||||||
#include <archive_entry.h>
|
#include <archive_entry.h>
|
||||||
#include "pc_archive.h"
|
#include "pc_archive.h"
|
||||||
|
@ -102,6 +103,7 @@ static struct arc_list_state {
|
||||||
pthread_mutex_t nftw_mutex = PTHREAD_MUTEX_INITIALIZER;
|
pthread_mutex_t nftw_mutex = PTHREAD_MUTEX_INITIALIZER;
|
||||||
|
|
||||||
static int detect_type_by_ext(char *path, int pathlen);
|
static int detect_type_by_ext(char *path, int pathlen);
|
||||||
|
static int detect_type_by_data(uchar_t *buf, size_t len);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Archive writer callback routines for archive creation operation.
|
* Archive writer callback routines for archive creation operation.
|
||||||
|
@ -847,8 +849,8 @@ copy_file_data(pc_ctx_t *pctx, struct archive *arc,
|
||||||
src = mapbuf;
|
src = mapbuf;
|
||||||
wlen = len;
|
wlen = len;
|
||||||
|
|
||||||
/* if (typ == TYPE_UNKNOWN)
|
if (typ == TYPE_UNKNOWN)
|
||||||
pctx->ctype = detect_type_by_data(src, len);*/
|
pctx->ctype = detect_type_by_data(src, len);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Write the entire mmap-ed buffer. Since we are writing to the compressor
|
* Write the entire mmap-ed buffer. Since we are writing to the compressor
|
||||||
|
@ -1043,6 +1045,9 @@ extractor_thread_func(void *dat) {
|
||||||
flags |= ARCHIVE_EXTRACT_SECURE_NODOTDOT;
|
flags |= ARCHIVE_EXTRACT_SECURE_NODOTDOT;
|
||||||
flags |= ARCHIVE_EXTRACT_SPARSE;
|
flags |= ARCHIVE_EXTRACT_SPARSE;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Extract all security attributes if we are root.
|
||||||
|
*/
|
||||||
if (pctx->force_archive_perms || geteuid() == 0) {
|
if (pctx->force_archive_perms || geteuid() == 0) {
|
||||||
flags |= ARCHIVE_EXTRACT_OWNER;
|
flags |= ARCHIVE_EXTRACT_OWNER;
|
||||||
flags |= ARCHIVE_EXTRACT_PERM;
|
flags |= ARCHIVE_EXTRACT_PERM;
|
||||||
|
@ -1148,6 +1153,11 @@ start_extractor(pc_ctx_t *pctx) {
|
||||||
return (pthread_create(&(pctx->archive_thread), NULL, extractor_thread_func, (void *)pctx));
|
return (pthread_create(&(pctx->archive_thread), NULL, extractor_thread_func, (void *)pctx));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Initialize the hash table of known extensions and types. Bob Jenkins Minimal Perfect Hash
|
||||||
|
* is used to get a perfect hash function for the set of known extensions. See:
|
||||||
|
* http://burtleburtle.net/bob/hash/perfect.html
|
||||||
|
*/
|
||||||
int
|
int
|
||||||
init_archive_mod() {
|
init_archive_mod() {
|
||||||
int rv = 0;
|
int rv = 0;
|
||||||
|
@ -1162,6 +1172,12 @@ init_archive_mod() {
|
||||||
uint64_t extnum;
|
uint64_t extnum;
|
||||||
ub4 slot = phash(extlist[i].ext, extlist[i].len);
|
ub4 slot = phash(extlist[i].ext, extlist[i].len);
|
||||||
extnum = 0;
|
extnum = 0;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Since extensions are less than 8 bytes (or truncated otherwise),
|
||||||
|
* each extension string is packed into a 64-bit integer for quick
|
||||||
|
* comparison.
|
||||||
|
*/
|
||||||
for (j = 0; j < extlist[i].len; j++)
|
for (j = 0; j < extlist[i].len; j++)
|
||||||
extnum = (extnum << 1) | extlist[i].ext[j];
|
extnum = (extnum << 1) | extlist[i].ext[j];
|
||||||
exthtab[slot].extnum = extnum;
|
exthtab[slot].extnum = extnum;
|
||||||
|
@ -1176,6 +1192,11 @@ init_archive_mod() {
|
||||||
return (rv);
|
return (rv);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Identify file type based on extension. Lookup is fast as we have a perfect hash function.
|
||||||
|
* If the given extension maps to a slot which has a different extension or maps to a slot
|
||||||
|
* outside the hash table range then the function returns unknown type.
|
||||||
|
*/
|
||||||
static int
|
static int
|
||||||
detect_type_by_ext(char *path, int pathlen)
|
detect_type_by_ext(char *path, int pathlen)
|
||||||
{
|
{
|
||||||
|
@ -1185,17 +1206,53 @@ detect_type_by_ext(char *path, int pathlen)
|
||||||
uint64_t extnum;
|
uint64_t extnum;
|
||||||
|
|
||||||
for (i = pathlen-1; i > 0 && path[i] != '.' && path[i] != PATHSEP_CHAR; i--);
|
for (i = pathlen-1; i > 0 && path[i] != '.' && path[i] != PATHSEP_CHAR; i--);
|
||||||
if (i == 0 || path[i] != '.') goto out;
|
if (i == 0 || path[i] != '.') goto out; // If extension not found give up
|
||||||
len = pathlen - i - 1;
|
len = pathlen - i - 1;
|
||||||
if (len == 0) goto out;
|
if (len == 0) goto out; // If extension is empty give up
|
||||||
ext = &path[i+1];
|
ext = &path[i+1];
|
||||||
slot = phash(ext, len);
|
slot = phash(ext, len);
|
||||||
if (slot > NUM_EXT) goto out;
|
if (slot > NUM_EXT) goto out; // Extension maps outside hash table range, give up
|
||||||
extnum = 0;
|
extnum = 0;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Pack given extension into 64-bit integer.
|
||||||
|
*/
|
||||||
for (i = 0; i < len; i++)
|
for (i = 0; i < len; i++)
|
||||||
extnum = (extnum << 1) | ext[i];
|
extnum = (extnum << 1) | tolower(ext[i]);
|
||||||
if (exthtab[slot].extnum == extnum)
|
if (exthtab[slot].extnum == extnum)
|
||||||
return (exthtab[slot].type);
|
return (exthtab[slot].type);
|
||||||
out:
|
out:
|
||||||
return (TYPE_UNKNOWN);
|
return (TYPE_UNKNOWN);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ELF magic bytes (0x7f 'E' 'L' 'F') packed into a 32-bit integer, first byte most significant. */
|
||||||
|
#define ELFSHORT (0x7f454c46U)
|
||||||
|
|
||||||
|
/* "TZif" magic bytes packed into a 32-bit integer, first byte most significant. */
|
||||||
|
#define TZSHORT (0x545a6966U)
|
||||||
|
|
||||||
|
/* "PPMZ" magic bytes packed into a 32-bit integer, first byte most significant. */
|
||||||
|
#define PPMSHORT (0x50504d5aU)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Detect a few file types from looking at magic signatures.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
detect_type_by_data(uchar_t *buf, size_t len)
|
||||||
|
{
|
||||||
|
// At least a few bytes.
|
||||||
|
if (len < 16) return (TYPE_UNKNOWN);
|
||||||
|
|
||||||
|
if (U32_P(buf) == ELFSHORT)
|
||||||
|
return (TYPE_EXE); // Regular ELF
|
||||||
|
if ((buf[0] == 'M' || buf[0] == 'L') && buf[1] == 'Z')
|
||||||
|
return (TYPE_EXE); // MSDOS Exe
|
||||||
|
if (buf[0] == 0xe9)
|
||||||
|
return (TYPE_EXE); // MSDOS COM
|
||||||
|
if (U32_P(buf) == TZSHORT)
|
||||||
|
return (TYPE_BINARY); // Timezone data
|
||||||
|
if (U32_P(buf) == PPMSHORT)
|
||||||
|
return (TYPE_COMPRESSED); // PPM Compressed archive
|
||||||
|
|
||||||
|
return (TYPE_UNKNOWN);
|
||||||
|
}
|
||||||
|
|
|
@ -45,6 +45,7 @@ struct ext_entry {
|
||||||
{"m4" , TYPE_TEXT, 2},
|
{"m4" , TYPE_TEXT, 2},
|
||||||
{"vb" , TYPE_TEXT, 2},
|
{"vb" , TYPE_TEXT, 2},
|
||||||
{"xslt" , TYPE_TEXT, 4},
|
{"xslt" , TYPE_TEXT, 4},
|
||||||
|
{"xsl" , TYPE_TEXT, 3},
|
||||||
{"yacc" , TYPE_TEXT, 4},
|
{"yacc" , TYPE_TEXT, 4},
|
||||||
{"lex" , TYPE_TEXT, 3},
|
{"lex" , TYPE_TEXT, 3},
|
||||||
{"csv" , TYPE_TEXT, 3},
|
{"csv" , TYPE_TEXT, 3},
|
||||||
|
@ -67,6 +68,16 @@ struct ext_entry {
|
||||||
{"bib" , TYPE_TEXT, 3},
|
{"bib" , TYPE_TEXT, 3},
|
||||||
{"lua" , TYPE_TEXT, 3},
|
{"lua" , TYPE_TEXT, 3},
|
||||||
{"qml" , TYPE_TEXT, 3},
|
{"qml" , TYPE_TEXT, 3},
|
||||||
|
{"fa" , TYPE_TEXT, 2},
|
||||||
|
{"tcc" , TYPE_TEXT, 3},
|
||||||
|
{"css" , TYPE_TEXT, 3},
|
||||||
|
{"pod" , TYPE_TEXT, 3},
|
||||||
|
{"al" , TYPE_TEXT, 2},
|
||||||
|
{"vim" , TYPE_TEXT, 3},
|
||||||
|
{"am" , TYPE_TEXT, 2},
|
||||||
|
{"upp" , TYPE_TEXT, 3},
|
||||||
|
{"mom" , TYPE_TEXT, 3},
|
||||||
|
{"tmac" , TYPE_TEXT, 4},
|
||||||
{"exe" , TYPE_EXE, 3},
|
{"exe" , TYPE_EXE, 3},
|
||||||
{"dll" , TYPE_EXE, 3},
|
{"dll" , TYPE_EXE, 3},
|
||||||
{"bin" , TYPE_EXE, 3},
|
{"bin" , TYPE_EXE, 3},
|
||||||
|
@ -109,7 +120,13 @@ struct ext_entry {
|
||||||
{"uha" , TYPE_COMPRESSED, 3},
|
{"uha" , TYPE_COMPRESSED, 3},
|
||||||
{"alz" , TYPE_COMPRESSED, 3},
|
{"alz" , TYPE_COMPRESSED, 3},
|
||||||
{"ace" , TYPE_COMPRESSED, 3},
|
{"ace" , TYPE_COMPRESSED, 3},
|
||||||
|
{"rar" , TYPE_COMPRESSED, 3},
|
||||||
|
{"xz" , TYPE_COMPRESSED, 2},
|
||||||
{"xcf" , TYPE_BINARY, 3},
|
{"xcf" , TYPE_BINARY, 3},
|
||||||
|
{"mo" , TYPE_BINARY, 2},
|
||||||
|
{"bmp" , TYPE_BINARY, 3},
|
||||||
|
{"pyo" , TYPE_BINARY, 3},
|
||||||
|
{"pyc" , TYPE_BINARY, 3},
|
||||||
};
|
};
|
||||||
#define NUM_EXT (99)
|
#define NUM_EXT (116)
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -32,6 +32,7 @@ java,TYPE_TEXT
|
||||||
m4,TYPE_TEXT
|
m4,TYPE_TEXT
|
||||||
vb,TYPE_TEXT
|
vb,TYPE_TEXT
|
||||||
xslt,TYPE_TEXT
|
xslt,TYPE_TEXT
|
||||||
|
xsl,TYPE_TEXT
|
||||||
yacc,TYPE_TEXT
|
yacc,TYPE_TEXT
|
||||||
lex,TYPE_TEXT
|
lex,TYPE_TEXT
|
||||||
csv,TYPE_TEXT
|
csv,TYPE_TEXT
|
||||||
|
@ -54,6 +55,16 @@ ps,TYPE_TEXT
|
||||||
bib,TYPE_TEXT
|
bib,TYPE_TEXT
|
||||||
lua,TYPE_TEXT
|
lua,TYPE_TEXT
|
||||||
qml,TYPE_TEXT
|
qml,TYPE_TEXT
|
||||||
|
fa,TYPE_TEXT
|
||||||
|
tcc,TYPE_TEXT
|
||||||
|
css,TYPE_TEXT
|
||||||
|
pod,TYPE_TEXT
|
||||||
|
al,TYPE_TEXT
|
||||||
|
vim,TYPE_TEXT
|
||||||
|
am,TYPE_TEXT
|
||||||
|
upp,TYPE_TEXT
|
||||||
|
mom,TYPE_TEXT
|
||||||
|
tmac,TYPE_TEXT
|
||||||
exe,TYPE_EXE
|
exe,TYPE_EXE
|
||||||
dll,TYPE_EXE
|
dll,TYPE_EXE
|
||||||
bin,TYPE_EXE
|
bin,TYPE_EXE
|
||||||
|
@ -96,4 +107,10 @@ dmg,TYPE_COMPRESSED
|
||||||
uha,TYPE_COMPRESSED
|
uha,TYPE_COMPRESSED
|
||||||
alz,TYPE_COMPRESSED
|
alz,TYPE_COMPRESSED
|
||||||
ace,TYPE_COMPRESSED
|
ace,TYPE_COMPRESSED
|
||||||
|
rar,TYPE_COMPRESSED
|
||||||
|
xz,TYPE_COMPRESSED
|
||||||
xcf,TYPE_BINARY
|
xcf,TYPE_BINARY
|
||||||
|
mo,TYPE_BINARY
|
||||||
|
bmp,TYPE_BINARY
|
||||||
|
pyo,TYPE_BINARY
|
||||||
|
pyc,TYPE_BINARY
|
||||||
|
|
|
@ -12,10 +12,10 @@
|
||||||
|
|
||||||
/* small adjustments to _a_ to make values distinct */
|
/* small adjustments to _a_ to make values distinct */
|
||||||
ub1 tab[] = {
|
ub1 tab[] = {
|
||||||
20,70,0,4,61,76,0,119,0,0,16,4,10,1,61,76,
|
10,76,0,76,70,42,0,1,0,0,119,1,61,1,70,79,
|
||||||
61,0,0,16,1,61,0,76,0,123,32,70,28,34,119,51,
|
0,0,0,4,70,1,0,122,0,119,47,76,76,34,110,101,
|
||||||
0,76,4,122,70,0,0,43,0,106,20,83,0,0,28,66,
|
0,76,70,70,42,28,0,66,0,108,0,109,28,4,28,4,
|
||||||
79,0,1,47,79,122,0,0,71,75,85,26,0,103,0,76,
|
70,0,1,20,4,123,123,0,79,75,34,76,69,77,0,69,
|
||||||
};
|
};
|
||||||
|
|
||||||
/* The hash function */
|
/* The hash function */
|
||||||
|
|
|
@ -8,7 +8,7 @@
|
||||||
|
|
||||||
extern ub1 tab[];
|
extern ub1 tab[];
|
||||||
#define PHASHLEN 0x40 /* length of hash mapping table */
|
#define PHASHLEN 0x40 /* length of hash mapping table */
|
||||||
#define PHASHNKEYS 99 /* How many keys were hashed */
|
#define PHASHNKEYS 116 /* How many keys were hashed */
|
||||||
#define PHASHRANGE 128 /* Range any input might map to */
|
#define PHASHRANGE 128 /* Range any input might map to */
|
||||||
#define PHASHSALT 0x9e3779b9 /* internal, initialize normal hash */
|
#define PHASHSALT 0x9e3779b9 /* internal, initialize normal hash */
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue