diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..75109dd --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.gitignore +.idea +cmake-build* diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..a1e5220 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,24 @@ +cmake_minimum_required(VERSION 3.17) +project(att_regex_tester) + +set(CMAKE_CXX_STANDARD 14) + +add_executable(att_regex_tester + man/man1/testregex.html + basic.dat + categorize.dat + forcedassoc.dat + leftassoc.dat + nullsubexpr.dat + re-assoc.html + re-categorize.html + re-interpretation.html + re-nullsubexpr.html + re-repetition.html + re2-exhaustive.txt.bz2 + re2-search.txt + README.md + repetition.dat + rightassoc.dat + testregex.c + testregex.html) diff --git a/README.md b/README.md new file mode 100644 index 0000000..e69de29 diff --git a/man/man1/testregex.html b/docs/man/man1/testregex.html similarity index 100% rename from man/man1/testregex.html rename to docs/man/man1/testregex.html diff --git a/re-assoc.html b/docs/re-assoc.html similarity index 100% rename from re-assoc.html rename to docs/re-assoc.html diff --git a/re-categorize.html b/docs/re-categorize.html similarity index 100% rename from re-categorize.html rename to docs/re-categorize.html diff --git a/re-interpretation.html b/docs/re-interpretation.html similarity index 100% rename from re-interpretation.html rename to docs/re-interpretation.html diff --git a/re-nullsubexpr.html b/docs/re-nullsubexpr.html similarity index 100% rename from re-nullsubexpr.html rename to docs/re-nullsubexpr.html diff --git a/re-repetition.html b/docs/re-repetition.html similarity index 100% rename from re-repetition.html rename to docs/re-repetition.html diff --git a/testregex.html b/docs/testregex.html similarity index 100% rename from testregex.html rename to docs/testregex.html diff --git a/repetition.dat b/repetition.dat index b54a2c6..8e855ac 100644 --- a/repetition.dat +++ b/repetition.dat @@ -77,3 +77,65 @@ E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?) E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?) E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?) + +NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02 + +# These test a bug in OS X / FreeBSD / NetBSD, and libtree. +# Linux/GLIBC gets the {8,} and {8,8} wrong. + +:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8) +:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8) +:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8) +:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8) +:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8) +:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8) +:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8) +:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8) +:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8) +:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8) +:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8) +:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8) +:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8) +:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8) +:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8) +:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8) +:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8) +:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8) + +# These test a fixed bug in my regex-tdfa that did not keep the expanded +# form properly grouped, so right association did the wrong thing with +# these ambiguous patterns (crafted just to test my code when I became +# suspicious of my implementation). The first subexpression should use +# "ab" then "a" then "bcd". + +# OS X / FreeBSD / NetBSD badly fail many of these, with impossible +# results like (0,6)(4,5)(6,6). + +:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH +:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH +:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) +:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) + +# The above worked on Linux/GLIBC but the following often fail. +# They also trip up OS X / FreeBSD / NetBSD: + +:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH +:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH +:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) +:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) diff --git a/testregex.c b/testregex.c index 7b86ab7..b09ec05 100644 --- a/testregex.c +++ b/testregex.c @@ -9,10 +9,12 @@ * then supply #define REG_foo REG_foo for each enum REG_foo * * Glenn Fowler - * AT&T Labs Research + * AT&T Research * * PLEASE: publish your tests so everyone can benefit * + * The following license covers testregex.c and all associated test data. + * * Permission is hereby granted, free of charge, to any person obtaining a * copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software * without restriction, including without limitation the rights to use, @@ -32,7 +34,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -static const char id[] = "\n@(#)$Id: testregex (AT&T Research) 2005-05-20 $\0\n"; +static const char id[] = "\n@(#)$Id: testregex (AT&T Research) 2010-06-10 $\0\n"; #if _PACKAGE_ast #include @@ -53,6 +55,10 @@ static const char id[] = "\n@(#)$Id: testregex (AT&T Research) 2005-05-20 $\0\n" #include #endif +#ifndef RE_DUP_MAX +#define RE_DUP_MAX 32767 +#endif + #if !_PACKAGE_ast #undef REG_DISCIPLINE #endif @@ -68,15 +74,15 @@ static const char id[] = "\n@(#)$Id: testregex (AT&T Research) 2005-05-20 $\0\n" #define TEST_LRE 0x00000010 #define TEST_SRE 0x00000020 -#define TEST_EXPAND 0x00000040 -#define TEST_LENIENT 0x00000080 +#define TEST_EXPAND 0x00000100 +#define TEST_LENIENT 0x00000200 -#define TEST_QUERY 0x00000100 -#define TEST_SUB 0x00000200 -#define TEST_UNSPECIFIED 0x00000400 -#define TEST_VERIFY 0x00000800 -#define TEST_AND 0x00001000 -#define TEST_OR 0x00002000 +#define TEST_QUERY 0x00000400 +#define TEST_SUB 0x00000800 +#define TEST_UNSPECIFIED 0x00001000 +#define TEST_VERIFY 0x00002000 +#define TEST_AND 0x00004000 +#define TEST_OR 0x00008000 #define TEST_DELIMIT 0x00010000 #define TEST_OK 0x00020000 @@ -95,6 +101,8 @@ static const char id[] = "\n@(#)$Id: testregex (AT&T Research) 2005-05-20 $\0\n" #define TEST_CATCH 0x10000000 #define TEST_VERBOSE 0x20000000 +#define TEST_DECOMP 0x40000000 + #define TEST_GLOBAL (TEST_ACTUAL|TEST_AND|TEST_BASELINE|TEST_CATCH|TEST_FAIL|TEST_IGNORE_ERROR|TEST_IGNORE_OVER|TEST_IGNORE_POSITION|TEST_OR|TEST_PASS|TEST_SUMMARY|TEST_VERBOSE) #ifdef REG_DISCIPLINE @@ -114,7 +122,7 @@ compf(const regex_t* re, const char* xstr, size_t xlen, regdisc_t* disc) { Disc_t* dp = (Disc_t*)disc; - return (void*)++dp->ordinal; + return (void*)((char*)0 + ++dp->ordinal); } static int @@ -122,7 +130,7 @@ execf(const regex_t* re, void* data, const char* xstr, size_t xlen, const char* { Disc_t* dp = (Disc_t*)disc; - sfprintf(dp->sp, "{%-.*s}(%d:%d)", xlen, xstr, (int)data, slen); + sfprintf(dp->sp, "{%-.*s}(%lu:%d)", xlen, xstr, (char*)data - (char*)0, slen); return atoi(xstr); } @@ -195,8 +203,8 @@ T(" 0 pointer.\n"); T("\n"); T(" Field 1: the regex(3) flags to apply, one character per REG_feature\n"); T(" flag. The test is skipped if REG_feature is not supported by the\n"); -T(" implementation. If the first character is not [BEASKL] then the\n"); -T(" specification is a global control line. One or more of [BEASKL] may be\n"); +T(" implementation. If the first character is not [BEASKLP] then the\n"); +T(" specification is a global control line. One or more of [BEASKLP] may be\n"); T(" specified; the test will be repeated for each mode.\n"); T("\n"); T(" B basic BRE (grep, ed, sed)\n"); @@ -227,12 +235,14 @@ T(" r REG_RIGHT implicit ...$\n"); T(" s REG_SHELL_ESCAPED \\ not special\n"); T(" t REG_MUSTDELIM all delimiters must be specified\n"); T(" u standard unspecified behavior -- errors not counted\n"); +T(" v REG_CLASS_ESCAPE \\ special inside [...]\n"); T(" w REG_NOSUB no subexpression match array\n"); T(" x REG_LENIENT let some errors slide\n"); T(" y REG_LEFT regexec() implicit ^...\n"); T(" z REG_NULL NULL subexpressions ok\n"); T(" $ expand C \\c escapes in fields 2 and 3\n"); T(" / field 2 is a regsubcomp() expression\n"); +T(" = field 3 is a regdecomp() expression\n"); T("\n"); T(" Field 1 control lines:\n"); T("\n"); @@ -253,9 +263,11 @@ T("\n"); T(" number use number for nmatch (20 by default)\n"); T("\n"); T(" Field 2: the regular expression pattern; SAME uses the pattern from\n"); -T(" the previous specification.\n"); +T(" the previous specification. RE_DUP_MAX inside {...} expands to the\n"); +T(" value from .\n"); T("\n"); -T(" Field 3: the string to match.\n"); +T(" Field 3: the string to match. X...{RE_DUP_MAX} expands to RE_DUP_MAX\n"); +T(" copies of X.\n"); T("\n"); T(" Field 4: the test outcome. This is either one of the posix error\n"); T(" codes (with REG_ omitted) or the match array, a list of (m,n)\n"); @@ -270,7 +282,7 @@ T(" matched (?{...}) expression, where x is the text enclosed by {...},\n"); T(" o is the expression ordinal counting from 1, and n is the length of\n"); T(" the unmatched portion of the subject string. If x starts with a\n"); T(" number then that is the return value of re_execf(), otherwise 0 is\n"); -T(" returned.\n"); +T(" returned. RE_DUP_MAX[-+]N expands to the value -+N.\n"); T("\n"); T(" Field 5: optional comment appended to the report.\n"); T("\n"); @@ -324,6 +336,9 @@ static const char* unsupported[] = "SHELL", #endif +#ifndef REG_CLASS_ESCAPE + "CLASS_ESCAPE", +#endif #ifndef REG_COMMENT "COMMENT", #endif @@ -398,10 +413,16 @@ static const char* unsupported[] = #endif #if !_REG_subcomp "regsubcomp", +#endif +#if !_REG_decomp + "redecomp", #endif 0 }; +#ifndef REG_CLASS_ESCAPE +#define REG_CLASS_ESCAPE NOTEST +#endif #ifndef REG_COMMENT #define REG_COMMENT NOTEST #endif @@ -555,6 +576,9 @@ quote(char* s, int len, unsigned long test) unsigned char* u = (unsigned char*)s; unsigned char* e; int c; +#ifdef MB_CUR_MAX + int w; +#endif if (!u) printf("NIL"); @@ -604,6 +628,15 @@ quote(char* s, int len, unsigned long test) printf("\\v"); break; default: +#ifdef MB_CUR_MAX + s = (char*)u - 1; + if ((w = mblen(s, (char*)e - s)) > 1) + { + u += w - 1; + fwrite(s, 1, w, stdout); + } + else +#endif if (!iscntrl(c) && isprint(c)) putchar(c); else @@ -862,7 +895,8 @@ matchcheck(regmatch_t* match, int nmatch, int nsub, char* ans, char* re, char* s #ifdef REG_DISCIPLINE char* x; - x = sfstruse(state.disc.sp); + if (!(x = sfstruse(state.disc.sp))) + bad("out of space [discipline string]\n", NiL, NiL, 0, 0); if (strcmp(p, x)) { if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) @@ -883,6 +917,13 @@ matchcheck(regmatch_t* match, int nmatch, int nsub, char* ans, char* re, char* s m = -1; p++; } + else if (*p == 'R' && !memcmp(p, "RE_DUP_MAX", 10)) + { + m = RE_DUP_MAX; + p += 10; + if (*p == '+' || *p == '-') + m += strtol(p, &p, 10); + } else m = strtol(p, &p, 10); if (*p++ != ',') @@ -892,6 +933,13 @@ matchcheck(regmatch_t* match, int nmatch, int nsub, char* ans, char* re, char* s n = -1; p++; } + else if (*p == 'R' && !memcmp(p, "RE_DUP_MAX", 10)) + { + n = RE_DUP_MAX; + p += 10; + if (*p == '+' || *p == '-') + n += strtol(p, &p, 10); + } else n = strtol(p, &p, 10); if (*p++ != ')') @@ -985,7 +1033,7 @@ gotcha(int sig) } static char* -getline(FILE* fp) +nextline(FILE* fp) { static char buf[32 * 1024]; @@ -1112,6 +1160,59 @@ catchfree(regex_t* preg, int flags, int* tabs, char* spec, char* re, char* s, ch return eret; } +static char* +expand(char* os, char* ot) +{ + char* s = os; + char* t; + int n = 0; + int r; + long m; + + for (;;) + { + switch (*s++) + { + case 0: + break; + case '{': + n++; + continue; + case '}': + n--; + continue; + case 'R': + if (n == 1 && !memcmp(s, "E_DUP_MAX", 9)) + { + s--; + for (t = ot; os < s; *t++ = *os++); + r = ((t - ot) >= 5 && t[-1] == '{' && t[-2] == '.' && t[-3] == '.' && t[-4] == '.') ? t[-5] : 0; + os = ot; + m = RE_DUP_MAX; + if (*(s += 10) == '+' || *s == '-') + m += strtol(s, &s, 10); + if (r) + { + t -= 5; + while (m-- > 0) + *t++ = r; + while (*s && *s++ != '}'); + } + else + t += snprintf(t, 32, "%ld", m); + while (*t = *s++) + t++; + break; + } + continue; + default: + continue; + } + break; + } + return os; +} + int main(int argc, char** argv) { @@ -1123,7 +1224,7 @@ main(int argc, char** argv) int nstr; int cret; int eret; - int nsub; + size_t nsub; int i; int j; int expected; @@ -1153,6 +1254,8 @@ main(int argc, char** argv) regex_t preg; static char pat[32 * 1024]; + static char patbuf[32 * 1024]; + static char strbuf[32 * 1024]; int nonosub = REG_NOSUB == 0; int nonexec = 0; @@ -1343,7 +1446,7 @@ main(int argc, char** argv) signal(SIGBUS, gotcha); signal(SIGSEGV, gotcha); } - while (p = getline(fp)) + while (p = nextline(fp)) { /* parse: */ @@ -1460,7 +1563,13 @@ main(int argc, char** argv) s = field[1]; if (!s || streq(s, "POSIX")) s = "C"; - if (!(ans = setlocale(LC_COLLATE, s)) || streq(ans, "C") || streq(ans, "POSIX") || !(ans = setlocale(LC_CTYPE, s)) || streq(ans, "C") || streq(ans, "POSIX")) + if ((ans = setlocale(LC_COLLATE, s)) && streq(ans, "POSIX")) + ans = "C"; + if (!ans || !streq(ans, s) && streq(s, "C")) + ans = 0; + else if ((ans = setlocale(LC_CTYPE, s)) && streq(ans, "POSIX")) + ans = "C"; + if (!ans || !streq(ans, s) && streq(s, "C")) skip = note(level, s, skip, test); else { @@ -1550,6 +1659,9 @@ main(int argc, char** argv) case 'u': test |= TEST_UNSPECIFIED; continue; + case 'v': + cflags |= REG_CLASS_ESCAPE; + continue; case 'w': cflags |= REG_NOSUB; continue; @@ -1574,6 +1686,10 @@ main(int argc, char** argv) test |= TEST_SUB; continue; + case '=': + test |= TEST_DECOMP; + continue; + case '?': test |= TEST_VERIFY; test &= ~(TEST_AND|TEST_OR); @@ -1675,7 +1791,7 @@ main(int argc, char** argv) continue; state.passed = state.verify; } - if (i < 4) + if (i < ((test & TEST_DECOMP) ? 3 : 4)) bad("too few fields\n", NiL, NiL, 0, test); while (i < elementsof(field)) field[i++] = 0; @@ -1690,21 +1806,26 @@ main(int argc, char** argv) { if (test & TEST_EXPAND) escape(re); + re = expand(re, patbuf); strcpy(ppat = pat, re); } } else ppat = 0; nstr = -1; - if ((s = field[2]) && (test & TEST_EXPAND)) + if (s = field[2]) { - nstr = escape(s); + s = expand(s, strbuf); + if (test & TEST_EXPAND) + { + nstr = escape(s); #if _REG_nexec - if (nstr != strlen(s)) - nexec = nstr; + if (nstr != strlen(s)) + nexec = nstr; #endif + } } - if (!(ans = field[3])) + if (!(ans = field[(test & TEST_DECOMP) ? 2 : 3])) bad("NIL answer\n", NiL, NiL, 0, test); msg = field[4]; fflush(stdout); @@ -1714,6 +1835,10 @@ main(int argc, char** argv) #else continue; #endif +#if !_REG_decomp + if (test & TEST_DECOMP) + continue; +#endif compile: @@ -1839,6 +1964,46 @@ main(int argc, char** argv) cret = REG_EFLAGS; } } +#endif +#if _REG_decomp + if (!cret && (test & TEST_DECOMP)) + { + char buf[128]; + + if ((j = nmatch) > sizeof(buf)) + j = sizeof(buf); + fun = "regdecomp"; + p = re + preg.re_npat; + if (!(test & TEST_CATCH)) + i = regdecomp(&preg, -1, buf, j); + else if (!(cret = setjmp(state.gotcha))) + { + alarm(HUNG); + i = regdecomp(&preg, -1, buf, j); + alarm(0); + } + if (!cret) + { + catchfree(&preg, flags, tabs, line, re, s, ans, msg, NiL, NiL, 0, 0, skip, level, test); + if (i > j) + { + if (i != (strlen(ans) + 1)) + { + report("failed", fun, re, s, nstr, msg, flags, test); + printf(" %d byte buffer supplied, %d byte buffer required\n", j, i); + } + } + else if (strcmp(buf, ans)) + { + report("failed", fun, re, s, nstr, msg, flags, test); + quote(ans, -1, test|TEST_DELIMIT); + printf(" expected, "); + quote(buf, -1, test|TEST_DELIMIT); + printf(" returned\n"); + } + continue; + } + } #endif if (!cret) { @@ -1865,7 +2030,7 @@ main(int argc, char** argv) else { report("re_nsub incorrect", fun, re, NiL, -1, msg, flags, test); - printf("at least %d expected, %d returned\n", nsub, preg.re_nsub); + printf("at least %ld expected, %ld returned\n", nsub, preg.re_nsub); state.errors++; } } @@ -1874,7 +2039,7 @@ main(int argc, char** argv) } } } - if (!(test & TEST_SUB) && *ans && *ans != '(' && !streq(ans, "OK") && !streq(ans, "NOMATCH")) + if (!(test & (TEST_DECOMP|TEST_SUB)) && *ans && *ans != '(' && !streq(ans, "OK") && !streq(ans, "NOMATCH")) { if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) skip = extract(tabs, line, re, s, ans, msg, "OK", NiL, 0, 0, skip, level, test|TEST_DELIMIT); @@ -2080,7 +2245,7 @@ main(int argc, char** argv) goto execute; } #endif - if (!(test & (TEST_SUB|TEST_VERIFY)) && !nonosub) + if (!(test & (TEST_DECOMP|TEST_SUB|TEST_VERIFY)) && !nonosub) { if (catchfree(&preg, flags, tabs, line, re, s, ans, msg, NiL, NiL, 0, 0, skip, level, test)) continue;