From 786fb2e904283c5356382f7eb98e595eae2df3f4 Mon Sep 17 00:00:00 2001 From: Greg Burd Date: Fri, 14 Aug 2020 21:45:38 -0400 Subject: [PATCH] Initial import from Archive.org Wayback Machine. (wget -mkEpnp http://web.archive.org/web/20130420020035id_/http://www2.research.att.com/~gsf/testregex/ and wget -mkEpnp http://web.archive.org/web/20130420020018id_/http://www2.research.att.com/~gsf/man/man1/testregex.html) --- basic.dat | 216 ++++ categorize.dat | 62 ++ forcedassoc.dat | 30 + leftassoc.dat | 16 + man/man1/testregex.html | 142 +++ nullsubexpr.dat | 73 ++ re-assoc.html | 64 ++ re-categorize.html | 209 ++++ re-interpretation.html | 997 ++++++++++++++++++ re-nullsubexpr.html | 62 ++ re-repetition.html | 60 ++ repetition.dat | 79 ++ rightassoc.dat | 16 + testregex.c | 2121 +++++++++++++++++++++++++++++++++++++++ testregex.html | 241 +++++ 15 files changed, 4388 insertions(+) create mode 100644 basic.dat create mode 100644 categorize.dat create mode 100644 forcedassoc.dat create mode 100644 leftassoc.dat create mode 100644 man/man1/testregex.html create mode 100644 nullsubexpr.dat create mode 100644 re-assoc.html create mode 100644 re-categorize.html create mode 100644 re-interpretation.html create mode 100644 re-nullsubexpr.html create mode 100644 re-repetition.html create mode 100644 repetition.dat create mode 100644 rightassoc.dat create mode 100644 testregex.c create mode 100644 testregex.html diff --git a/basic.dat b/basic.dat new file mode 100644 index 0000000..5c50f37 --- /dev/null +++ b/basic.dat @@ -0,0 +1,216 @@ +NOTE all standard compliant implementations should pass these : 2002-05-31 + +BE abracadabra$ abracadabracadabra (7,18) +BE a...b abababbb (2,7) +BE XXXXXX ..XXXXXX (2,8) +E \) () (1,2) +BE a] a]a (0,2) +B } } (0,1) +E \} } (0,1) +BE \] ] (0,1) +B ] ] (0,1) +E ] ] (0,1) +B { { (0,1) +B } } (0,1) +BE ^a ax (0,1) +BE \^a a^a (1,3) +BE a\^ a^ (0,2) +BE a$ aa (1,2) +BE a\$ a$ (0,2) +BE ^$ NULL (0,0) +E $^ NULL (0,0) +E a($) aa (1,2)(2,2) +E a*(^a) aa 
(0,1)(0,1) +E (..)*(...)* a (0,0) +E (..)*(...)* abcd (0,4)(2,4) +E (ab|a)(bc|c) abc (0,3)(0,2)(2,3) +E (ab)c|abc abc (0,3)(0,2) +E a{0}b ab (1,2) +E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) +E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) +E a{9876543210} NULL BADBR +E ((a|a)|a) a (0,1)(0,1)(0,1) +E (a*)(a|aa) aaaa (0,4)(0,3)(3,4) +E a*(a.|aa) aaaa (0,4)(2,4) +E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2) +E (a|b)?.* b (0,1)(0,1) +E (a|b)c|a(b|c) ac (0,2)(0,1) +E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2) +E (a|b)*c|(a|ab)*c abc (0,3)(1,2) +E (a|b)*c|(a|ab)*c xc (1,2) +E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2) +E a?(ab|ba)ab abab (0,4)(0,2) +E a?(ac{0}b|ba)ab abab (0,4)(0,2) +E ab|abab abbabab (0,2) +E aba|bab|bba baaabbbaba (5,8) +E aba|bab baaabbbaba (6,9) +E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2) +E (a.|.a.)*|(a|.a...) aa (0,2)(0,2) +E ab|a xabc (1,3) +E ab|a xxabc (2,4) +Ei (Ab|cD)* aBcD (0,4)(2,4) +BE [^-] --a (2,3) +BE [a-]* --a (0,3) +BE [a-m-]* --amoma-- (0,4) +E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17) +E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17) +{E [[:upper:]] A (0,1) [[]] not supported +E [[:lower:]]+ `az{ (1,3) +E [[:upper:]]+ @AZ[ (1,3) +BE [[-]] [[-]] (2,4) +BE [[.NIL.]] NULL ECOLLATE +BE [[=aleph=]] NULL ECOLLATE +} +BE$ \n \n (0,1) +BEn$ \n \n (0,1) +BE$ [^a] \n (0,1) +BE$ \na \na (0,2) +E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3) +BE xxx xxx (0,3) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11) +E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1) +E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2) +E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81) +E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25) +E abaa|abbaa|abbbaa|abbbbaa 
ababbabbbabbbabbbbabaa (18,22) +E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11) +BE$ .* \x01\xff (0,2) +E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57) +L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH +E a*a*a*a*a*b aaaaaaaaab (0,10) +BE ^ NULL (0,0) +BE $ NULL (0,0) +BE ^$ NULL (0,0) +BE ^a$ a (0,1) +BE abc abc (0,3) +BE abc xabcy (1,4) +BE abc ababc (2,5) +BE ab*c abc (0,3) +BE ab*bc abc (0,3) +BE ab*bc abbc (0,4) +BE ab*bc abbbbc (0,6) +E ab+bc abbc (0,4) +E ab+bc abbbbc (0,6) +E ab?bc abbc (0,4) +E ab?bc abc (0,3) +E ab?c abc (0,3) +BE ^abc$ abc (0,3) +BE ^abc abcc (0,3) +BE abc$ aabc (1,4) +BE ^ abc (0,0) +BE $ abc (3,3) +BE a.c abc (0,3) +BE a.c axc (0,3) +BE a.*c axyzc (0,5) +BE a[bc]d abd (0,3) +BE a[b-d]e ace (0,3) +BE a[b-d] aac (1,3) +BE a[-b] a- (0,2) +BE a[b-] a- (0,2) +BE a] a] (0,2) +BE a[]]b a]b (0,3) +BE a[^bc]d aed (0,3) +BE a[^-b]c adc (0,3) +BE a[^]b]c adc (0,3) +E ab|cd abc (0,2) +E ab|cd abcd (0,2) +E a\(b a(b (0,3) +E a\(*b ab (0,2) +E a\(*b a((b (0,4) +E ((a)) abc (0,1)(0,1)(0,1) +E (a)b(c) abc (0,3)(0,1)(2,3) +E a+b+c aabbabc (4,7) +E a* aaa (0,3) +E (a*)* - (0,0)(0,0) +E (a*)+ - (0,0)(0,0) +E (a*|b)* - (0,0)(0,0) +E (a+|b)* ab (0,2)(1,2) +E (a+|b)+ ab (0,2)(1,2) +E (a+|b)? 
ab (0,1)(0,1) +BE [^ab]* cde (0,3) +E (^)* - (0,0)(0,0) +BE a* NULL (0,0) +E ([abc])*d abbbcd (0,6)(4,5) +E ([abc])*bcd abcd (0,4)(0,1) +E a|b|c|d|e e (0,1) +E (a|b|c|d|e)f ef (0,2)(0,1) +E ((a*|b))* - (0,0)(0,0)(0,0) +BE abcd*efg abcdefg (0,7) +BE ab* xabyabbbz (1,3) +BE ab* xayabbbz (1,2) +E (ab|cd)e abcde (2,5)(2,4) +BE [abhgefdc]ij hij (0,3) +E (a|b)c*d abcd (1,4)(1,2) +E (ab|ab*)bc abc (0,3)(0,1) +E a([bc]*)c* abc (0,3)(1,3) +E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4) +E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4) +E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4) +E a[bcd]*dcdcde adcdcde (0,7) +E (ab|a)b*c abc (0,3)(0,2) +E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4) +BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5) +E ^a(bc+|b[eh])g|.h$ abh (1,3) +E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5) +E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2) +E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6) +E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1) +BE multiple words multiple words yeah (0,14) +E (.*)c(.*) abcde (0,5)(0,2)(3,5) +BE abcd abcd (0,4) +E a(bc)d abcd (0,4)(1,3) +E a[-]?c ac (0,3) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi 
(0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12) +E a+(b|c)*d+ aabcdd (0,6)(3,4) +E ^.+$ vivi (0,4) +E ^(.+)$ vivi (0,4)(0,4) +E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19) +E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3) +E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7) +E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7) +E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11) +E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3) +E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7) +E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3) +E ((foo)|bar)!bas bar!bas (0,7)(0,3) +E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7) +E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3) +E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3) +E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7) +E (foo|(bar))!bas foo!bas (0,7)(0,3) +E (foo|bar)!bas bar!bas (0,7)(0,3) +E (foo|bar)!bas foo!bar!bas (4,11)(4,7) +E (foo|bar)!bas foo!bas (0,7)(0,3) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas 
(0,11)(?,?)(?,?)(4,8)(8,11) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7) +E .*(/XXX).* /XXX (0,4)(0,4) +E .*(\\XXX).* \XXX (0,4)(0,4) +E \\XXX \XXX (0,4) +E .*(/000).* /000 (0,4)(0,4) +E .*(\\000).* \000 (0,4)(0,4) +E \\000 \000 (0,4) diff --git a/categorize.dat b/categorize.dat new file mode 100644 index 0000000..d348512 --- /dev/null +++ b/categorize.dat @@ -0,0 +1,62 @@ +NOTE regex implementation categorization 2004-05-31 + +?E aa* xaxaax (1,2) POSITION=leftmost +; POSITION=bug + +?E (a*)(ab)*(b*) abc (0,2)(0,1)(?,?)(1,2) ASSOCIATIVITY=right +|E (a*)(ab)*(b*) abc (0,2)(0,0)(0,2)(2,2) ASSOCIATIVITY=left +; ASSOCIATIVITY=bug + +?E ((a*)(ab)*)((b*)(a*)) aba (0,3)(0,2)(0,0)(0,2)(2,3)(2,2)(2,3) SUBEXPRESSION=precedence +|E ((a*)(ab)*)((b*)(a*)) aba (0,3)(0,1)(0,1)(?,?)(1,3)(1,2)(2,3) SUBEXPRESSION=grouping +; SUBEXPRESSION=bug + +?E (...?.?)* xxxxxx (0,6)(4,6) REPEAT_LONGEST=first +|E (...?.?)* xxxxxx (0,6)(2,6) REPEAT_LONGEST=last +|E (...?.?)* xxxxxx OK REPEAT_LONGEST=unknown +; REPEAT_LONGEST=bug + +?E (a|ab)(bc|c) abcabc (0,3)(0,2)(2,3) EXPECTED +|E (a|ab)(bc|c) abcabc (0,3)(0,1)(1,3) BUG=alternation-order +; BUG=alternation-order-UNKNOWN + +?E (aba|a*b)(aba|a*b) ababa (0,5)(0,2)(2,5) EXPECTED +|E (aba|a*b)(aba|a*b) ababa (0,4)(0,3)(3,4) BUG=first-match +; BUG=unknown-match + +?B a\(b\)*\1 a NOMATCH EXPECTED +|B a\(b\)*\1 a (0,1) BUG=nomatch-match +|B a\(b\)*\1 abab (0,2)(1,2) # BUG=repeat-any +; BUG=nomatch-match-UNKNOWN + +?E (a*){2} xxxxx (0,0)(0,0) EXPECTED +|E (a*){2} xxxxx (5,5)(5,5) BUG=range-null +; BUG=range-null-UNKNOWN + +?B a\(b\)*\1 abab NOMATCH EXPECTED +|B a\(b\)*\1 abab (0,1) # BUG=nomatch-match +|B a\(b\)*\1 abab (0,2)(1,2) 
BUG=repeat-any +; BUG=repeat-any-UNKNOWN + +?E (a*)* a (0,1)(0,1) EXPECTED +|E (a*)* ax (0,1)(0,1) BUG=repeat-null-unknown +|E (a*)* a (0,1)(1,1) BUG=repeat-null +; BUG=repeat-null-UNKNOWN + +?E (aba|a*b)* ababa (0,5)(2,5) EXPECTED +|E (aba|a*b)* ababa (0,5)(3,4) BUG=repeat-short +|E (aba|a*b)* ababa (0,4)(3,4) # LENGTH=first +; BUG=repeat-short-UNKNOWN + +?E (a(b)?)+ aba (0,3)(2,3) EXPECTED +|E (a(b)?)+ aba (0,3)(2,3)(1,2) BUG=repeat-artifact +; BUG=repeat-artifact-UNKNOWN + +?B \(a\(b\)*\)*\2 abab NOMATCH EXPECTED +|B \(a\(b\)*\)*\2 abab (0,4)(2,3)(1,2) BUG=repeat-artifact-nomatch +; BUG=repeat-artifact-nomatch-UNKNOWN + +?E (a?)((ab)?)(b?)a?(ab)?b? abab (0,4)(0,1)(1,1)(?,?)(1,2)(2,4) BUG=subexpression-first +|E .*(.*) ab (0,2)(2,2) EXPECTED +|E .*(.*) ab (0,2)(0,2) BUG=subexpression-first +; BUG=subexpression-first-UNKNOWN diff --git a/forcedassoc.dat b/forcedassoc.dat new file mode 100644 index 0000000..39f3111 --- /dev/null +++ b/forcedassoc.dat @@ -0,0 +1,30 @@ +NOTE left-assoc:pass-all right-assoc:pass-all : 2002-04-29 + +E (a|ab)(c|bcd) abcd (0,4)(0,1)(1,4) +E (a|ab)(bcd|c) abcd (0,4)(0,1)(1,4) +E (ab|a)(c|bcd) abcd (0,4)(0,1)(1,4) +E (ab|a)(bcd|c) abcd (0,4)(0,1)(1,4) +E ((a|ab)(c|bcd))(d*) abcd (0,4)(0,4)(0,1)(1,4)(4,4) +E ((a|ab)(bcd|c))(d*) abcd (0,4)(0,4)(0,1)(1,4)(4,4) +E ((ab|a)(c|bcd))(d*) abcd (0,4)(0,4)(0,1)(1,4)(4,4) +E ((ab|a)(bcd|c))(d*) abcd (0,4)(0,4)(0,1)(1,4)(4,4) +E (a|ab)((c|bcd)(d*)) abcd (0,4)(0,2)(2,4)(2,3)(3,4) +E (a|ab)((bcd|c)(d*)) abcd (0,4)(0,2)(2,4)(2,3)(3,4) +E (ab|a)((c|bcd)(d*)) abcd (0,4)(0,2)(2,4)(2,3)(3,4) +E (ab|a)((bcd|c)(d*)) abcd (0,4)(0,2)(2,4)(2,3)(3,4) +E (a*)(b|abc) abc (0,3)(0,0)(0,3) +E (a*)(abc|b) abc (0,3)(0,0)(0,3) +E ((a*)(b|abc))(c*) abc (0,3)(0,3)(0,0)(0,3)(3,3) +E ((a*)(abc|b))(c*) abc (0,3)(0,3)(0,0)(0,3)(3,3) +E (a*)((b|abc)(c*)) abc (0,3)(0,1)(1,3)(1,2)(2,3) +E (a*)((abc|b)(c*)) abc (0,3)(0,1)(1,3)(1,2)(2,3) +E (a*)(b|abc) abc (0,3)(0,0)(0,3) +E (a*)(abc|b) abc (0,3)(0,0)(0,3) +E ((a*)(b|abc))(c*) abc 
(0,3)(0,3)(0,0)(0,3)(3,3) +E ((a*)(abc|b))(c*) abc (0,3)(0,3)(0,0)(0,3)(3,3) +E (a*)((b|abc)(c*)) abc (0,3)(0,1)(1,3)(1,2)(2,3) +E (a*)((abc|b)(c*)) abc (0,3)(0,1)(1,3)(1,2)(2,3) +E (a|ab) ab (0,2)(0,2) +E (ab|a) ab (0,2)(0,2) +E (a|ab)(b*) ab (0,2)(0,2)(2,2) +E (ab|a)(b*) ab (0,2)(0,2)(2,2) diff --git a/leftassoc.dat b/leftassoc.dat new file mode 100644 index 0000000..9c068c6 --- /dev/null +++ b/leftassoc.dat @@ -0,0 +1,16 @@ +NOTE left-assoc:pass-all right-assoc:pass-none : 2002-04-29 + +E (a|ab)(c|bcd)(d*) abcd (0,4)(0,1)(1,4)(4,4) +E (a|ab)(bcd|c)(d*) abcd (0,4)(0,1)(1,4)(4,4) +E (ab|a)(c|bcd)(d*) abcd (0,4)(0,1)(1,4)(4,4) +E (ab|a)(bcd|c)(d*) abcd (0,4)(0,1)(1,4)(4,4) + +E (a*)(b|abc)(c*) abc (0,3)(0,0)(0,3)(3,3) +E (a*)(abc|b)(c*) abc (0,3)(0,0)(0,3)(3,3) +E (a*)(b|abc)(c*) abc (0,3)(0,0)(0,3)(3,3) +E (a*)(abc|b)(c*) abc (0,3)(0,0)(0,3)(3,3) + +E (a|ab)(c|bcd)(d|.*) abcd (0,4)(0,1)(1,4)(4,4) +E (a|ab)(bcd|c)(d|.*) abcd (0,4)(0,1)(1,4)(4,4) +E (ab|a)(c|bcd)(d|.*) abcd (0,4)(0,1)(1,4)(4,4) +E (ab|a)(bcd|c)(d|.*) abcd (0,4)(0,1)(1,4)(4,4) diff --git a/man/man1/testregex.html b/man/man1/testregex.html new file mode 100644 index 0000000..fe7f321 --- /dev/null +++ b/man/man1/testregex.html @@ -0,0 +1,142 @@ + + + +testregex man document + + +
+NAME
+  testregex - regex(3) test harness
+
+SYNOPSIS
+  testregex [ options ]
+
+DESCRIPTION
+  testregex reads regex(3) test specifications, one per line, from the
+  standard input and writes one output line for each failed test. A
+  summary line is written after all tests are done. Each successful
+  test is run again with REG_NOSUB. Unsupported features are noted
+  before the first test, and tests requiring these features are
+  silently ignored.
+
+OPTIONS
+  -c	catch signals and non-terminating calls
+  -e	ignore error return mismatches
+  -h	list help on standard error
+  -n	do not repeat successful tests with regnexec()
+  -o	ignore match[] overrun errors
+  -p	ignore negative position mismatches
+  -s	use stack instead of malloc
+  -x	do not repeat successful tests with REG_NOSUB
+  -v	list each test line
+  -A	list failed test lines with actual answers
+  -B	list all test lines with actual answers
+  -F	list failed test lines
+  -P	list passed test lines
+  -S	output one summary line
+
+INPUT FORMAT
+  Input lines may be blank, a comment beginning with #, or a test
+  specification. A specification is five fields separated by one
+  or more tabs. NULL denotes the empty string and NIL denotes the
+  0 pointer.
+
+  Field 1: the regex(3) flags to apply, one character per REG_feature
+  flag. The test is skipped if REG_feature is not supported by the
+  implementation. If the first character is not [BEASKLP] then the
+  specification is a global control line. One or more of [BEASKLP] may be
+  specified; the test will be repeated for each mode.
+
+    B 	basic			BRE	(grep, ed, sed)
+    E 	REG_EXTENDED		ERE	(egrep)
+    A	REG_AUGMENTED		ARE	(egrep with negation)
+    S	REG_SHELL		SRE	(sh glob)
+    K	REG_SHELL|REG_AUGMENTED	KRE	(ksh glob)
+    L	REG_LITERAL		LRE	(fgrep)
+
+    a	REG_LEFT|REG_RIGHT	implicit ^...$
+    b	REG_NOTBOL		lhs does not match ^
+    c	REG_COMMENT		ignore space and #...\n
+    d	REG_SHELL_DOT		explicit leading . match
+    e	REG_NOTEOL		rhs does not match $
+    f	REG_MULTIPLE		multiple \n separated patterns
+    g	FNM_LEADING_DIR		testfnmatch only -- match until /
+    h	REG_MULTIREF		multiple digit backref
+    i	REG_ICASE		ignore case
+    j	REG_SPAN		. matches \n
+    k	REG_ESCAPE		\ to escape [...] delimiter
+    l	REG_LEFT		implicit ^...
+    m	REG_MINIMAL		minimal match
+    n	REG_NEWLINE		explicit \n match
+    o	REG_ENCLOSED		(|&) magic inside [@|&](...)
+    p	REG_SHELL_PATH		explicit / match
+    q	REG_DELIMITED		delimited pattern
+    r	REG_RIGHT		implicit ...$
+    s	REG_SHELL_ESCAPED	\ not special
+    t	REG_MUSTDELIM		all delimiters must be specified
+    u	standard unspecified behavior -- errors not counted
+    v	REG_CLASS_ESCAPE	\ special inside [...]
+    w	REG_NOSUB		no subexpression match array
+    x	REG_LENIENT		let some errors slide
+    y	REG_LEFT		regexec() implicit ^...
+    z	REG_NULL		NULL subexpressions ok
+    $	                        expand C \c escapes in fields 2 and 3
+    /	                        field 2 is a regsubcomp() expression
+    =	                        field 3 is a regdecomp() expression
+
+  Field 1 control lines:
+
+    C		set LC_COLLATE and LC_CTYPE to locale in field 2
+
+    ?test ...	output field 5 if passed and != EXPECTED, silent otherwise
+    &test ...	output field 5 if current and previous passed
+    |test ...	output field 5 if current passed and previous failed
+    ; ...	output field 2 if previous failed
+    {test ...	skip if failed until }
+    }		end of skip
+
+    : comment		comment copied as output NOTE
+    :comment:test	:comment: ignored
+    N[OTE] comment	comment copied as output NOTE
+    T[EST] comment	comment
+
+    number		use number for nmatch (20 by default)
+
+  Field 2: the regular expression pattern; SAME uses the pattern from
+    the previous specification.
+
+  Field 3: the string to match.
+
+  Field 4: the test outcome. This is either one of the posix error
+    codes (with REG_ omitted) or the match array, a list of (m,n)
+    entries with m and n being first and last+1 positions in the
+    field 3 string, or NULL if REG_NOSUB is in effect and success
+    is expected. BADPAT is acceptable in place of any regcomp(3)
+    error code. The match[] array is initialized to (-2,-2) before
+    each test. All array elements from 0 to nmatch-1 must be specified
+    in the outcome. Unspecified endpoints (offset -1) are denoted by ?.
+    Unset endpoints (offset -2) are denoted by X. {x}(o:n) denotes a
+    matched (?{...}) expression, where x is the text enclosed by {...},
+    o is the expression ordinal counting from 1, and n is the length of
+    the unmatched portion of the subject string. If x starts with a
+    number then that is the return value of re_execf(), otherwise 0 is
+    returned.
+
+  Field 5: optional comment appended to the report.
+
+CAVEAT
+    If a regex implementation misbehaves with memory then all bets are off.
+
+CONTRIBUTORS
+  Glenn Fowler    gsf@research.att.com        (ksh strmatch, regex extensions)
+  David Korn      dgk@research.att.com        (ksh glob matcher)
+  Doug McIlroy    mcilroy@dartmouth.edu       (ast regex/testre in C++)
+  Tom Lord        lord@regexps.com            (rx tests)
+  Henry Spencer   henry@zoo.toronto.edu       (original public regex)
+  Andrew Hume     andrew@research.att.com     (gre tests)
+  John Maddock    John_Maddock@compuserve.com (regex++ tests)
+  Philip Hazel    ph10@cam.ac.uk              (pcre tests)
+  Ville Laurikari vl@iki.fi                   (libtre tests)
+
+ + diff --git a/nullsubexpr.dat b/nullsubexpr.dat new file mode 100644 index 0000000..c73d8f0 --- /dev/null +++ b/nullsubexpr.dat @@ -0,0 +1,73 @@ +NOTE null subexpression matches : 2002-06-06 + +E (a*)* a (0,1)(0,1) +E SAME x (0,0)(0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a*)+ a (0,1)(0,1) +E SAME x (0,0)(0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a+)* a (0,1)(0,1) +E SAME x (0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a+)+ a (0,1)(0,1) +E SAME x NOMATCH +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) + +E ([a]*)* a (0,1)(0,1) +E SAME x (0,0)(0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E ([a]*)+ a (0,1)(0,1) +E SAME x (0,0)(0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E ([^b]*)* a (0,1)(0,1) +E SAME b (0,0)(0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaab (0,6)(0,6) +E ([ab]*)* a (0,1)(0,1) +E SAME aaaaaa (0,6)(0,6) +E SAME ababab (0,6)(0,6) +E SAME bababa (0,6)(0,6) +E SAME b (0,1)(0,1) +E SAME bbbbbb (0,6)(0,6) +E SAME aaaabcde (0,5)(0,5) +E ([^a]*)* b (0,1)(0,1) +E SAME bbbbbb (0,6)(0,6) +E SAME aaaaaa (0,0)(0,0) +E ([^ab]*)* ccccxx (0,6)(0,6) +E SAME ababab (0,0)(0,0) + +E ((z)+|a)* zabcde (0,2)(1,2) + +{E a+? aaaaaa (0,1) no *? +? mimimal match ops +E (a) aaa (0,1)(0,1) +E (a*?) aaa (0,0)(0,0) +E (a)*? aaa (0,0) +E (a*?)*? 
aaa (0,0) +} + +B \(a*\)*\(x\) x (0,1)(0,0)(0,1) +B \(a*\)*\(x\) ax (0,2)(0,1)(1,2) +B \(a*\)*\(x\) axa (0,2)(0,1)(1,2) +B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1) +B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2) +B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3) +B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4) +B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3) + +E (a*)*(x) x (0,1)(0,0)(0,1) +E (a*)*(x) ax (0,2)(0,1)(1,2) +E (a*)*(x) axa (0,2)(0,1)(1,2) + +E (a*)+(x) x (0,1)(0,0)(0,1) +E (a*)+(x) ax (0,2)(0,1)(1,2) +E (a*)+(x) axa (0,2)(0,1)(1,2) + +E (a*){2}(x) x (0,1)(0,0)(0,1) +E (a*){2}(x) ax (0,2)(1,1)(1,2) +E (a*){2}(x) axa (0,2)(1,1)(1,2) diff --git a/re-assoc.html b/re-assoc.html new file mode 100644 index 0000000..2bbc14b --- /dev/null +++ b/re-assoc.html @@ -0,0 +1,64 @@ + + + + + + ../re/re-assoc.mm mm document + + + + +
+ +

+


+
+

regex catenation associativity tests

+
Glenn Fowler <gsf@research.att.com> +

AT&T Labs Research - Florham Park NJ +

+


+The +regex +tests in +{ + leftassoc.dat + rightassoc.dat + forcedassoc.dat +} +exercise the associativity of catenation. +

+


+ + + + + + + + + + + + + + + + + + + + + + + + + + +
Glenn Fowler
Information and Software Systems Research
AT&T Labs Research
Florham Park NJ
August 04, 2002
+

+ +

+ + + diff --git a/re-categorize.html b/re-categorize.html new file mode 100644 index 0000000..eb473d6 --- /dev/null +++ b/re-categorize.html @@ -0,0 +1,209 @@ + + + + + + ../re/re-categorize.mm mm document + + + + +
+ +

+


+
+

regex implementation categorization

+
Glenn Fowler <gsf@research.att.com> +

AT&T Labs Research - Florham Park NJ +

+


+The +regex +tests in + categorize.dat +attempt to categorize +regex +implementations. +The tests do not address internationalization. +All implementations report the leftmost match; this is omitted from the table. +

+ + + + + + + + + + + + + + + + + + + + + + + + + +
LABEL    ASSOC    SUBEXPR    REP_LONGEST    BUGS
+A    right    precedence    first    -
+B    right    grouping    first    repeat-null  repeat-short  repeat-artifact-nomatch
+D    right    grouping    first    -
+G    right    grouping    first    alternation-order  repeat-null  repeat-artifact  repeat-artifact-nomatch
+H    right    grouping    first    alternation-order  nomatch-match  repeat-null  repeat-artifact  repeat-artifact-nomatch
+I    right    grouping    first    repeat-any  repeat-short  repeat-artifact-nomatch
+J    right    precedence    last    nomatch-match  repeat-artifact  repeat-artifact-nomatch  subexpression-first
+M    right    precedence    last    range-null  repeat-artifact  repeat-artifact-nomatch  subexpression-first
+O    right    grouping    first    repeat-null  repeat-short  repeat-artifact-nomatch
+P    right    grouping    first    alternation-order  first-match  repeat-null  repeat-artifact
+R    left    precedence    last    -
+S    right    grouping    first    repeat-null  repeat-short  repeat-artifact-nomatch
+T    left    precedence    last    -
+U    right    precedence    first    repeat-null  subexpression-first
+darwin.ppc    right    grouping    first    repeat-null  repeat-short
+freebsd.i386    right    grouping    first    repeat-null  repeat-short
+hp.pa    right    grouping    first    repeat-artifact
+ibm.risc    right    grouping    first    alternation-order  nomatch-match  repeat-artifact  repeat-artifact-nomatch
+linux.i386    right    grouping    first    alternation-order  repeat-artifact  repeat-null
+sgi.mips3    right    grouping    first    repeat-short
+sol8.sun4    right    grouping    first    alternation-order  nomatch-match  repeat-artifact
+unixware.i386    right    precedence    first    repeat-null  subexpression-first
+

+The categories are: +

+
+
LABEL
+The implementation label from + testregex. +
ASSOC
+Subpattern (or atom) associativity: either +left +or +right. +The subexpression match rule in the rationale requires +right +for expressions where each concatenated part is a subexpression. +There is no definition for +subpattern, +but it would be inconsistent for any definition to require different +associativity than that for subexpressions. +Some claim that the BRE and ERE grammars specify +left +associativity, but this interpretation disregards +the subexpression match rule in the rationale. +The grammar can also be interpreted to support +right +associativity, and this interpretation is in accord with the rationale. +
SUBEXPR
+Subexpression semantics: +precedence +if subexpressions can override the default associativity; +grouping +if subexpressions are for repetition and +regmatch_t +grouping only. +The subexpression match rule in the rationale requires +precedence. +
REP_LONGEST
+How repeated subexpressions that match more than once are handled: +first +if the longest possible matches occur first; +last +if the longest possible matches occur last; +unknown +otherwise. +The subexpression match rule in the rationale requires +first. +
BUGS
+Miscellaneous bugs (see + categorize.dat +for specific examples): +
+
+
alternation-order
+A change in the order of subexpression alternation operands, +not involved in a tie, +changes +regmatch_t +values. +Some implementations with this bug can be coaxed into missing the +overall longest match. +
first-match
+The first of the leftmost matches, instead of the longest of the +leftmost matches, is returned. +
nomatch-match
+A back-reference to a +regmatch_t +(-1,-1) value is treated as matching. +
range-null
+A range-repeated subexpression that matches null does not report the match +at offset (0,0). +
repeat-artifact
+A +regmatch_t +value is reported for a repeated match that is not the last match. +
repeat-artifact-nomatch
+To prevent not matching, +a +regmatch_t +value is reported for a repeated match that is not the last match. +
repeat-null
+A repeated subexpression matches the null string even though it is not +the only match and is not necessary to satisfy the exact or minimum +number of occurrences for an interval expression. +
repeat-short
+Incorrect +regmatch_t +values for a repeated subexpression. +This may be a variant of +repeat-artifact. +
subexpression-first
+A subexpression match takes precedence over a subpattern +to its left. +
+
+
+
+

+


+ + + + + + + + + + + + + + + + + + + + + + + + + + +
Glenn Fowler
Information and Software Systems Research
AT&T Labs Research
Florham Park NJ
June 01, 2004
+

+ +

+ + + diff --git a/re-interpretation.html b/re-interpretation.html new file mode 100644 index 0000000..cb603b1 --- /dev/null +++ b/re-interpretation.html @@ -0,0 +1,997 @@ + + + + + + ../re/re-interpretation.mm mm document + + + + +
+ + + + + + + + + + + +
AbstractBackgroundNotationregex GlossaryA subexpression is A subpattern is The Dark Corners Conclusion
+
+

+


+
+

An Interpretation of the POSIX regex Standard

+
Glenn Fowler <gsf@research.att.com> +

AT&T Research - Florham Park NJ +

+

+

Abstract

+Many passages in the POSIX +regex +standard seem to be open for interpretation. +Differences between several published + implementations +of the +regex +API bear this out. +Instead of relegating these differences to the +undefined behavior +bucket, this paper proposes a resolution to each +by direct application of the standard text. + +

+


Background

+The POSIX +regex +standard is spread across four documents: +

+ + + + + + +
+glossary    G    http://www.opengroup.org/onlinepubs/007904975/basedefs/xbd_chap03.html
+api    A    http://www.opengroup.org/onlinepubs/007904975/functions/regcomp.html
+definition    D    http://www.opengroup.org/onlinepubs/007904975/basedefs/xbd_chap09.html
+rationale    R    http://www.opengroup.org/onlinepubs/007904975/xrat/xbd_chap09.html
+

+It describes +BREs +(basic regular expressions, a.k.a., +grep(1) +style) and +EREs +(extended regular expressions, a.k.a., +egrep(1) +style) +and how an RE of each type matches subject strings. +The standard also provides an API: +regcomp(3) +for compiling an RE, and +regexec(3) +for matching a compiled RE against a subject string. +The +regexec +API +

+
+   int regexec(const regex_t* restrict preg, const char* restrict string,
+               size_t nmatch, regmatch_t pmatch[restrict], int eflags);
+
+ +is at the center of multiple, conflicting interpretations of the standard. +These interpretations differ on the setting of the +pmatch[] +array for index values > 0. +This note presents examples that demonstrate interpretation conflicts, +and then provides standard references that, +when taken as a whole, +resolve the conflicts. + +

+


Notation

+Standard references use the notation +[document:begin[-end]] +where +document +is the document letter, { A D G R }, from the table above, +begin +is the beginning line number, and +end +is the ending line number. +Line numbers are taken from the 2001 X/Open printing. +Unfortunately the online links do not display line numbers. +For example, [A:37179-37180] is the reference for the +regexec +API prototype above. +

+Example patterns, subject strings, and +pmatch[] +array values use the regression test notation of + testregex. +You can download the source and compile it against your favorite regex +implementation. +All of the examples in this note have been placed in the file + interpretation.dat; +you can download this file and use it as input to +testregex. +For example, the +testregex +input +

+
+:RE#01:E	a+			xaax	(1,3)
+
+ +specifies that the ERE pattern "a+" matched against the +subject string "xaax" yields +pmatch[0].rm_so==1 +and +pmatch[0].rm_eo==3. +The example is labeled RE#01 for indexing and referencing. +
+
+:RE#02:B	.\(a*\).		xaax	(0,4)(1,3)
+
+ +specifies that the BRE pattern ".\(a*\)." matched against the subject +string "xaax" yields +pmatch[0].rm_so==0, +pmatch[0].rm_eo==4, +pmatch[1].rm_so==1, +pmatch[1].rm_eo==3. +(?,?) denotes +rm_so +and +rm_eo +values of -1, i.e., a non-match. +The first field allows additional flags that exercise all of the +REG_* +regcomp +and +regexec +flags; see +testregex(1) +or +testregex --man +for details. +Note that +tab +is the field separator in the +testregex +syntax; if you mouse snarf then make sure that +tabs +are preserved. + +

+


regex Glossary

+
+
+
[G:41]Basic Regular Expression (BRE)
+A regular expression used by the majority of utilities that select strings +from a set of character strings. +
[G:148]Entire Regular Expression
+The concatenated set of one or more basic regular expressions or extended +regular expressions that make up the pattern specified for string selection. +
[G:158]Extended Regular Expression (ERE)
+A regular expression that is an alternative to the Basic Regular +Expression using a more extensive syntax, occasionally used by some utilities. +
[G:269]Pattern
+A sequence of characters used either with regular expression notation or for +pathname expansion, as a means of selecting various character strings or +pathnames, respectively. +
[G:316]Regular Expression
+A pattern that selects specific strings from a set of character strings. +
+
+ +

+


A subexpression is

+The +regex +standard is surprisingly cavalier with terminology: +some terms are used interchangeably, some are used in a general context +in one section and a specific context in another, and some are +used without any definition whatsoever. +Acutely subject to this abuse are: +RE, +pattern, +subpattern, +expression, +and +subexpression. +In particular, +subpattern +and +subexpression +are central to the description of the matching algorithm and how +pmatch[] +is assigned. +Any interpretation of the +regex +standard involving these terms, absent a precise and accurate definition +for each, is useless. +

+subexpression +appears 70 times, and each reference is in the context of parenthesis grouping: +

+
+
[D:5909-5911]
+For example, matching the BRE "\(.*\).*" against "abcdef" , the +subexpression "(\1)" is "abcdef" , and matching the BRE +"\(a*\)*" against "bc" , the subexpression "(\1)" is the null +string. +
[D:5984-5988]
+The asterisk shall be special except when used: As the first +character of a subexpression (after an initial '^' , if any); +
[D:6094-6097]
+A subexpression can be defined within a BRE by enclosing it +between the character pairs "\(" and "\)" . Subexpressions can +be arbitrarily nested. +
[D:6100-6109]
+The character 'n' shall be a digit from 1 through 9, specifying +the nth subexpression (the one that begins with the nth "\(" +from the beginning of the pattern and ends with the +corresponding paired "\)" ). The expression is invalid if less +than n subexpressions precede the '\n' . For example, the +expression "\(.*\)\1$" matches a line consisting of two +adjacent appearances of the same string, and the expression +"\(a\)*\1" fails to match 'a' . When the referenced +subexpression matched more than one string, the back-referenced +expression shall refer to the last matched string. If the +subexpression referenced by the back-reference matches more +than one string because of an asterisk ( '*' ) or an interval +expression (see item (5)), the back-reference shall match the +last (rightmost) of these strings. +
[D:6110-6112]
+When a BRE matching a single character, a subexpression, or a +back-reference is followed by the special character asterisk ('*' ), +together with that asterisk it shall match what zero or +more consecutive occurrences of the BRE would match. +
[D:6114-6117]
+When a BRE matching a single character, a subexpression, or a +back-reference is followed by an interval expression of the +format "\{m\}" , "\{m,\}" , or "\{m,n\}" , together with that +interval expression it shall match what repeated consecutive +occurrences of the BRE would match. +
[D:6127-6129]
+A subexpression repeated by an asterisk ('*') or an interval expression +shall not match a null expression unless this is the only match for the +repetition or it is necessary to satisfy the exact or minimum number of +occurrences for the interval expression. +
[D:6136]
+Subexpressions/back-references \(\) \n +
[D:6145-6151]
+The implementation may treat the circumflex as an anchor when +used as the first character of a subexpression. The circumflex +shall anchor the +expression (or optionally subexpression) to the beginning of a +string; only sequences starting at the first character of a +string shall be matched by the BRE. For example, the BRE "^ab" +matches "ab" in the string "abcdef" , but fails to match in the +string "cdefab" . The BRE "\(^ab\)" may match the former +string. A portable BRE shall escape a leading circumflex in a +subexpression to match a literal circumflex. +
[D:6152-6156]
+A dollar sign ( '$' ) shall be an anchor when used as the last +character of an entire BRE. The implementation may treat a +dollar sign as an anchor when used as the last character of a +subexpression. The dollar sign shall anchor the expression (or +optionally subexpression) to the end of the string being matched; +the dollar sign can be said to match the end-of-string following +the last character. +
[D:6265-6270]
+A circumflex ( '^' ) outside a bracket expression shall anchor +the expression or subexpression it begins to the beginning of a +string; such an expression or subexpression can match only a +sequence starting at the first character of a string. For +example, the EREs "^ab" and "(^ab)" match "ab" in the string +"abcdef" , but fail to match in the string "cdefab" , and the +ERE "a^b" is valid, but can never match because the 'a' +prevents the expression "^b" from matching starting at the +first character. +
[D:6271-6276]
+A dollar sign ( '$' ) outside a bracket expression shall anchor +the expression or subexpression it ends to the end of a string; +such an expression or subexpression can match only a sequence +ending at the last character of a string. For example, the EREs +"ef$" and "(ef$)" match "ef" in the string "abcdef" , but fail +to match in the string "cdefab" , and the ERE "e$f" is valid, +but can never match because the 'f' prevents the expression +"e$" from matching ending at the last character. +
[R:2359-2370]
+It is possible to determine what strings correspond to +subexpressions by recursively applying the leftmost longest +rule to each subexpression, but only with the proviso that the +overall match is leftmost longest. For example, matching +"\(ac*\)c*d[ac]*\1" against acdacaaa matches acdacaaa (with +\1=a); simply matching the longest match for "\(ac*\)" would +yield \1=ac, but the overall match would be smaller (acdac). +Conceptually, the implementation must examine every possible +match and among those that yield the leftmost longest total +matches, pick the one that does the longest match for the +leftmost subexpression, and so on. Note that this means that +matching by subexpressions is context-dependent: a +subexpression within a larger RE may match a different string +from the one it would match as an independent RE, and two +instances of the same subexpression within the same larger RE +may match different lengths even in similar sequences of +characters. For example, in the ERE "(a.*b)(a.*b)" , the two +identical subexpressions would match four and six characters, +respectively, of accbaccccb. +
[R:2512-2520]
+The limit of nine back-references to subexpressions in the RE +is based on the use of a single-digit identifier; increasing +this to multiple digits would break historical applications. +This does not imply that only nine subexpressions are allowed +in REs. The following is a valid BRE with ten subexpressions: +
+
+\(\(\(ab\)*c\)*d\)\(ef\)*\(gh\)\{2\}\(ij\)*\(kl\)*\(mn\)*\(op\)*\(qr\)*
+
+ +The standard developers regarded the common historical +behavior, which supported "\n*" , but not "\n\{min,max\}" , +"\(...\)*" , or "\(...\)\{min,max\}" , as a non-intentional +result of a specific implementation, and they supported both +duplication and interval expressions following subexpressions +and back-references. +
[R:2537-2544]
+However, one relatively uncommon case was changed to allow an +extension used on some implementations. Historically, the BREs +"^foo" and "\(^foo\)" did not match the same string, despite +the general rule that subexpressions and entire BREs match the +same strings. To increase consensus, IEEE Std 1003.1-2001 has +allowed an extension on some implementations to treat these two +cases in the same way by declaring that anchoring may occur at +the beginning or end of a subexpression. Therefore, portable +BREs that require a literal circumflex at the beginning or a +dollar sign at the end of a subexpression must escape them. +Note that a BRE such as "a\(^bc\)" will either match "a^bc" or +nothing on different systems under the rules. +
[R:2549-2554]
+Some implementations have extended the BRE syntax to add +alternation. For example, the subexpression "\(foo$\|bar\)" +would match either "foo" at the end of the string or "bar" +anywhere. The extension is triggered by the use of the +undefined "\|" sequence. Because the BRE is undefined for +portable scripts, the extending system is free to make other +assumptions, such that the '$' represents the end-of-line +anchor in the middle of a subexpression. If it were not for the +extension, the '$' would match a literal dollar sign under the +rules. +
[R:2617-2620]
+The removal of the Back_open_paren Back_close_paren option from +the nondupl_RE specification is the result of PASC +Interpretation 1003.2-92 #43 submitted for the ISO POSIX-2:1993 +standard. Although the grammar required support for null +subexpressions, this section does not describe the meaning of, +and historical practice did not support, this construct. +
[A:37188]
+size_t re_nsub Number of parenthesized subexpressions +
[A:37206-37208]
+If the REG_NOSUB flag was not set in cflags, then regcomp() +shall set re_nsub to the number of parenthesized subexpressions +(delimited by "\(\)" in basic regular expressions or "()" in +extended regular expressions) found in pattern. +
[A:37220-37257]
+If nmatch is 0 or REG_NOSUB was set in the cflags argument to +regcomp(), then regexec() shall ignore the pmatch argument. +Otherwise, the application shall ensure that the pmatch +argument points to an array with at least nmatch elements, and +regexec() shall fill in the elements of that array with offsets +of the substrings of string that correspond to the +parenthesized subexpressions of pattern: pmatch[i].rm_so +shall be the byte offset of the beginning and pmatch[i].rm_eo +shall be one greater than the byte offset of the end of +substring i. (Subexpression i begins at the ith matched open +parenthesis, counting from 1.) Offsets in pmatch[0] identify +the substring that corresponds to the entire regular +expression. Unused elements of pmatch up to pmatch[nmatch-1] +shall be filled with -1. If there are more than nmatch +subexpressions in pattern ( pattern itself counts as a +subexpression), then regexec() shall still do the match, but +shall record only the first nmatch substrings. +

+When matching a basic or extended regular expression, any given +parenthesized subexpression of pattern might participate in the +match of several different substrings of string, or it might +not match any substring even though the pattern as a whole did +match. The following rules shall be used to determine which +substrings to report in pmatch when matching regular +expressions: +

+
    +
  1. +If subexpression i in a regular expression is not contained +within another subexpression, and it participated in the match +several times, then the byte offsets in pmatch[i] shall +delimit the last such match. +
  2. +If subexpression i is not contained within another +subexpression, and it did not participate in an otherwise +successful match, the byte offsets in pmatch[i] shall be -1. A +subexpression does not participate in the match when: +
    + '*' or "\{\}" appears immediately after the
    +subexpression in a basic regular expression, or '*' ,
    + '?' , or "{}" appears immediately after the
    +subexpression in an extended regular expression, and
    +the subexpression did not match (matched 0 times)
    +

    +or: +

    + '|' is used in an extended regular expression to select +this subexpression or another, and the other +subexpression matched. +

    +
  3. +If subexpression i is contained within another subexpression +j, and i is not contained within any other subexpression that +is contained within j, and a match of subexpression j is +reported in pmatch[j], then the match or non-match of +subexpression i reported in pmatch[i] shall be as described in +1. and 2. above, but within the substring reported in pmatch[ +j] rather than the whole string. The offsets in pmatch[i] are +still relative to the start of string. +
  4. +If subexpression i is contained in subexpression j, and the +byte offsets in pmatch[j] are -1, then the pointers in pmatch[ +i] shall also be -1. +
  5. +If subexpression i matched a zero-length string, then both +byte offsets in pmatch[i] shall be the byte offset of the +character or null terminator immediately following the +zero-length string. +
+
+
[A:37363-37366]
+The regexec() function must fill in all nmatch elements of +pmatch, where nmatch and pmatch are supplied by the +application, even if some elements of pmatch do not correspond +to subexpressions in pattern. The application writer should +note that there is probably no reason for using a value of +nmatch that is larger than preg-> re_nsub+1. +
[A:37407-37413]
+The number of subexpressions in the RE is reported in re_nsub +in preg. With this change to regexec(), consideration was given +to dropping the REG_NOSUB flag since the user can now specify +this with a zero nmatch argument to regexec(). However, keeping +REG_NOSUB allows an implementation to use a different (perhaps +more efficient) algorithm if it knows in regcomp() that no +subexpressions need be reported. The implementation is only +required to fill in pmatch if nmatch is not zero and if +REG_NOSUB is not specified. +
+
+

+This sentence is as close as the standard gets to a definition: +

+
+
[A:37225-37226]
+Subexpression i begins at the ith matched open parenthesis, counting from 1. +
+
+

+Using nonterminals from the BRE [D:6371-6731] and ERE [D:6452-6452] grammar +productions (text not listed in this document) yields the following: +

+
+
DEFINITION
+A +subexpression +corresponds to the +Back_open_paren RE_expression Back_close_paren +form of the +nondupl_RE +BRE grammar production or +the +'(' extended_reg_exp ')' +form of the +ERE_expression +ERE grammar production. +Subexpression i begins at the ith matched open parenthesis +(Back_open_paren +for BREs and '(' for EREs), +starting from the left and counting from 1. +Subexpression 0 is the entire RE. +
+
+

+This definition and the subexpression match rule [R:2359-2370] can be used +to examine a class of EREs where the top level catenation operands are +subexpressions. +(A top level subexpression is not contained in any other subexpression +except subexpression 0.) +The subexpression match rule in pseudo code is: +

    +
  • +determine the longest of the leftmost matches for subexpression-0 +[R:2359-2361] +
  • +for 1<=i<=re_nsub +determine the longest match for +subexpression-i +consistent with the matches already determined for +subexpression-j, +0<=j<i. +[R:2359-2370] [A:37235-37257] +
+For example, given +
+
+:RE#03:E	(a?)((ab)?)		ab	(0,2)(0,0)(0,2)(0,2)
+
+ +the subexpressions are: +
+
+subexpression-0	(a?)((ab)?)
+subexpression-1	(a?)
+subexpression-2	((ab)?)
+subexpression-3	(ab)
+
+ +The longest of the leftmost matches for subexpression-0 is (0,2). +The longest match for subexpression-1, consistent with the match +for subexpression-0, is (0,0); otherwise if it had matched (0,1) then +subexpression-2 would not match and the subexpression-0 match would be +limited to (0,1). +The longest match for subexpression-2, consistent with the matches +for subexpression-0 and subexpression-1, is (0,2). +The longest match for subexpression-3, consistent with the matches +for subexpression-0, subexpression-1 and subexpression-2, is (0,2). +This table illustrates the matching: +
+
+subexpr	pattern			match
+   0	(a?)((ab)?)		(0,2)
+   1	(a?)			(0,0)
+   2	((ab)?)			(0,2)
+   3	(ab)			(0,2)
+
+ +RE#04 is a similar example that exposes the associativity of subexpression +concatenation: +
+
+:RE#04:E	(a?)((ab)?)(b?)		ab	(0,2)(0,1)(1,1)(?,?)(1,2)
+
+subexpr	pattern			match
+   0	(a?)((ab)?)(b?)		(0,2)
+   1	(a?)			(0,1)
+   2	((ab)?)			(1,1)
+   3	(ab)			(?,?)
+   4	(b?)			(1,2)
+
+ +[R:2363-2365] also shows that parentheses can be used to alter the +order of matching: +
+
+:RE#05:E	((a?)((ab)?))(b?)	ab	(0,2)(0,2)(0,0)(0,2)(0,2)(2,2)
+
+subexpr	pattern			match
+   0	((a?)((ab)?))(b?)	(0,2)
+   1	((a?)((ab)?))		(0,2)
+   2	(a?)			(0,0)
+   3	((ab)?)			(0,2)
+   4	(ab)			(0,2)
+   5	(b?)			(2,2)
+
+ +In RE#05 the extra parentheses (around subexpression-1 and subexpression-2 in +RE#04) form a new subexpression-1, and change the +match for the last subexpression +(b?) +to (2,2) (from (1,2) in RE#04.) +
+
+:RE#06:E	(a?)(((ab)?)(b?))	ab	(0,2)(0,1)(1,2)(1,1)(?,?)(1,2)
+
+subexpr	pattern			match
+   0	(a?)(((ab)?)(b?))	(0,2)
+   1	(a?)			(0,1)
+   2	(((ab)?)(b?))		(1,2)
+   3	((ab)?)			(1,1)
+   4	(ab)			(?,?)
+   5	(b?)			(1,2)
+
+ +In RE#06 the extra parenthesis pair forces right associativity and results +in the same match of (1,2) for the last subexpression +(b?) +as in RE#04. +These examples show that: +
+
+
PROPERTY
+Subexpression grouping can alter the precedence of concatenation. +
PROPERTY
+Subexpression concatenation is right associative. +
+
+

+The following examples examine replicated subexpressions. +

+
+:RE#07:E	(.?)			x	(0,1)(0,1)
+:RE#08:E	(.?){1}			x	(0,1)(0,1)
+:RE#09:E	(.?)(.?)		x	(0,1)(0,1)(1,1)
+:RE#10:E	(.?){2}			x	(0,1)(1,1)
+:RE#11:E	(.?)*			x	(0,1)(0,1)
+
+ +[D:6227-6234] specifies that RE#07 and RE#08 are equivalent, and that +RE#09 and RE#10 are equivalent, and +[D:6217-6219] specifies that RE#09 and RE#11 are equivalent. +
+
+
[D:6227-6234]
+When an ERE matching a single character or an ERE enclosed in +parentheses is followed by an interval expression of the format "{m}" , +"{m,}" , or "{m,n}" , together with that interval expression it shall +match what repeated consecutive occurrences of the ERE would match. The +values of m and n are decimal integers in the range 0 <= m<= n<= +{RE_DUP_MAX}, where m specifies the exact or minimum number of +occurrences and n specifies the maximum number of occurrences. The +expression "{m}" matches exactly m occurrences of the preceding ERE, +"{m,}" matches at least m occurrences, and "{m,n}" matches any number +of occurrences between m and n, inclusive. +
[D:6217-6219]
+When an ERE matching a single character or an ERE enclosed in +parentheses is followed by the special character asterisk ( '*' ), +together with that asterisk it shall match what zero or more +consecutive occurrences of the ERE would match. +
+
+In RE#09 subexpression-1 matches (0,1), leaving the null string at (1,1) for +subexpression-2. +In RE#10 the first iteration of subexpression-1 matches (0,1), the same +as subexpression-1 in RE#09, and the second iteration of subexpression-1 +matches (1,1), the same as subexpression-2 in RE#09. +RE#07 and RE#08 show that only one iteration is needed to match the subject +string, so the match in RE#11 requires only one iteration, and as such is the +last iteration of [D:6107-6109] [A:37235-37237]. +RE#10 and RE#11 also illustrate [D:6127-6129] [D:6239-6241], which +specify that a repeated RE matches the null string only if it is the only +match (not this case) or if it is necessary to satisfy an interval expression +minimum (2 in this case.) +
+
+
[D:6239-6241]
+An ERE matching a single character repeated by an '*' , '?' , or an +interval expression shall not match a null expression unless this is +the only match for the repetition or it is necessary to satisfy the +exact or minimum number of occurrences for the interval expression. +
+
+

+The following examples dig deeper into replicated subexpressions. +

+
+:RE#12:E	(.?.?)			xxx	(0,2)(0,2)
+:RE#13:E	(.?.?){1}		xxx	(0,2)(0,2)
+:RE#14:E	(.?.?)(.?.?)		xxx	(0,3)(0,2)(2,3)
+:RE#15:E	(.?.?){2}		xxx	(0,3)(2,3)
+:RE#16:E	(.?.?)(.?.?)(.?.?)	xxx	(0,3)(0,2)(2,3)(3,3)
+:RE#17:E	(.?.?){3}		xxx	(0,3)(3,3)
+:RE#18:E	(.?.?)*			xxx	(0,3)(2,3)
+
+ +Here RE#14 shows that only two iterations are needed for a complete match, +making the last iteration match for RE#18 (2,3), since the first +iteration matched (0,2), as in RE#14. + +

+


A subpattern is

+The term +subpattern +appears exactly once: +
+
+
[D:5907-5908]
+Consistent with the whole match being the longest of the leftmost matches, +each subpattern, from left to right, shall match the longest possible string. +
+
+Consider RE#04 and RE#05 again: +
+
+:RE#04:E	(a?)((ab)?)(b?)		ab	(0,2)(0,1)(1,1)(?,?)(1,2)
+:RE#05:E	((a?)((ab)?))(b?)	ab	(0,2)(0,2)(0,0)(0,2)(0,2)(2,2)
+
+ +If a subpattern were an entity that combined adjacent subexpressions, +e.g., +(a?)((ab)?) +in RE#04, then [D:5907-5908] would violate [R:2359-2370]. +Similarly, if a subpattern were an entity that "went inside" subexpressions, +e.g., +(a?) +in RE#05, then again [D:5907-5908] would violate [R:2359-2370]. +In other words, a subpattern can be neither larger than nor smaller than +a subexpression; +a subpattern must be a grammatical entity equivalent to a subexpression. +This corresponds to the nonterminal +nondupl_RE +in the BRE grammar; there is no direct correspondence to a nonterminal +in the ERE grammar. +However, if the optional duplication operator (*,+,?,range) is included then +subpattern corresponds to +simple_RE +in the BRE grammar and +ERE_expression +in the ERE grammar, and both [D:5907-5908] and [R:2359-2370] are satisfied. +
+
+
DEFINITION
+A +subpattern +corresponds to the +simple_RE +nonterminal in the BRE grammar or the +ERE_expression +nonterminal in the ERE grammar. +
+
+This means that subexpressions and subpatterns are of equal importance +in RE matching. +Also note that any other definition for subpattern will put +[D:5907-5908] in direct conflict with [R:2359-2370]. +

+RE#19, RE#20 and RE#21 examine the relationship between subexpression +and subpattern: +

+
+:RE#19:E	a?((ab)?)(b?)		ab	(0,2)(1,1)(?,?)(1,2)
+:RE#20:E	(a?)((ab)?)b?		ab	(0,2)(0,1)(1,1)(?,?)
+:RE#21:E	a?((ab)?)b?		ab	(0,2)(1,1)(?,?)
+
+ +

+These are all variations of RE#04. +Other than subexpression renumbering, the match for the subexpression +((ab)?) +must be the same in RE#04, RE#19, RE#20 and RE#21. +a? +is a subpattern in RE#19 and RE#21, of equal matching importance to +(a?) +in RE#04, and +b? +is a subpattern in RE#20 and RE#21, of equal matching +importance to +(b?) +in RE#04. + +

+


The Dark Corners

+The remaining examples explore dark corners of the standard +and implementations. +Although the differences between some of the examples are subtle, +for some implementations it may mean the difference between an answer and +a core dump. +

+In RE#22 subexpression +(a*) +matches the null string at (0,0), and continues to match at that position +until the minimal range count is satisfied. +

+
+:RE#22:E	(a*){2}			xxxxx	(0,0)(0,0)
+
+ +RE#23 through RE#27 expose implementations that sometimes do +first match +for alternation within subexpressions. +Some implementations erroneously match the first iteration of +subexpression-1 in RE#24 through RE#27 to (0,1). +RE#27 is equivalent to RE#26; the match requires two iterations, the first +matching (0,2) and the last matching (2,3). +
+
+:RE#23:E	(ab?)(b?a)		aba	(0,3)(0,2)(2,3)
+:RE#24:E	(a|ab)(ba|a)		aba	(0,3)(0,2)(2,3)
+:RE#25:E	(a|ab|ba)		aba	(0,2)(0,2)
+:RE#26:E	(a|ab|ba)(a|ab|ba)	aba	(0,3)(0,2)(2,3)
+:RE#27:E	(a|ab|ba)*		aba	(0,3)(2,3)
+
+ +RE#28 through RE#33 expose implementations that report short matches +for some repeated subexpressions. +Some implementations report incorrect matches for +subexpression-1 in RE#30 and RE#33. +
+
+:RE#28:E	(aba|a*b)		ababa	(0,3)(0,3)
+:RE#29:E	(aba|a*b)(aba|a*b)	ababa	(0,5)(0,2)(2,5)
+:RE#30:E	(aba|a*b)*		ababa	(0,5)(2,5)
+:RE#31:E	(aba|ab|a)		ababa	(0,3)(0,3)
+:RE#32:E	(aba|ab|a)(aba|ab|a)	ababa	(0,5)(0,2)(2,5)
+:RE#33:E	(aba|ab|a)*		ababa	(0,5)(2,5)
+
+ +RE#34 through RE#36 expose implementations that report subexpression matches +for earlier iterations of the subexpression. +Some implementations report a match for subexpression-2 in RE#36 +while reporting the (2,3) match for subexpression-1: clearly a bug. +
+
+:RE#34:E	(a(b)?)			aba	(0,2)(0,2)(1,2)
+:RE#35:E	(a(b)?)(a(b)?)		aba	(0,3)(0,2)(1,2)(2,3)(?,?)
+:RE#36:E	(a(b)?)+		aba	(0,3)(2,3)(?,?)
+
+ +RE#37 and RE#38 expose implementations that give priority to subexpression +matching over subpattern matching. +
+
+:RE#37:E	(.*)(.*)		xx	(0,2)(0,2)(2,2)
+:RE#38:E	.*(.*)			xx	(0,2)(2,2)
+
+ +RE#39 through RE#41 expose implementations that treat explicit vs. implicit +subexpression repetition differently. +This is a theme common to many of the previous examples. +Again, the subexpression in RE#41 requires two iterations to match, +and the second iteration matches (5,7), as illustrated by RE#40. +
+
+:RE#39:E	(a.*z|b.*y)		azbazby	(0,5)(0,5)
+:RE#40:E	(a.*z|b.*y)(a.*z|b.*y)	azbazby	(0,7)(0,5)(5,7)
+:RE#41:E	(a.*z|b.*y)*		azbazby	(0,7)(5,7)
+
+ +RE#42 is another +first match +test. +Some implementations erroneously report a match of (0,1) for subexpression-1. +
+
+:RE#42:E	(.|..)(.*)		ab	(0,2)(0,2)(2,2)
+
+ +RE#43 through RE#45 require only one iteration of subexpression-1 to +match the entire subject string. +RE#45 exposes three separate bugs in the implementations that were tested. +The most common was +over iteration, +where subexpression-1 is matched for a second iteration to the null string +at (3,3). +
+
+:RE#43:E	((..)*(...)*)			xxx		(0,3)(0,3)(?,?)(0,3)
+:RE#44:E	((..)*(...)*)((..)*(...)*)	xxx		(0,3)(0,3)(?,?)(0,3)(3,3)(?,?)(?,?)
+:RE#45:E	((..)*(...)*)*			xxx		(0,3)(0,3)(?,?)(0,3)
+
+ +RE#46 through RE#82 are nasty; +backreferences are intuitive neither for the implementor nor the user. +

+RE#49, RE#53, RE#67 and RE#68 illustrate the second part of the +subpattern +rule: +

+
+
[D:5908-5909]
+For this purpose, a null string shall be considered to be longer than +no match at all. +
+
+RE#53 requires close examination to see why the match is (0,2)(1,1)(2,2) +instead of (0,2)(0,1)(?,?). +The match of (0,1) for subexpression-1 is longer than (1,1), but +subexpression-1 can be repeated, and that second iteration allows +subexpression-2 to match (2,2), which is longer than (?,?) by [D:5908-5909]. +
+
+:RE#46:B	\(a\{0,1\}\)*b\1	ab	(0,2)(1,1)
+:RE#47:B	\(a*\)*b\1		ab	(0,2)(1,1)
+:RE#48:B	\(a*\)b\1*		ab	(0,2)(0,1)
+:RE#49:B	\(a*\)*b\1*		ab	(0,2)(1,1)
+:RE#50:B	\(a\{0,1\}\)*b\(\1\)	ab	(0,2)(1,1)(2,2)
+:RE#51:B	\(a*\)*b\(\1\)		ab	(0,2)(1,1)(2,2)
+:RE#52:B	\(a*\)b\(\1\)*		ab	(0,2)(0,1)(?,?)
+:RE#53:B	\(a*\)*b\(\1\)*		ab	(0,2)(1,1)(2,2)
+:RE#54:B	\(a\{0,1\}\)*b\1	aba	(0,3)(0,1)
+:RE#55:B	\(a*\)*b\1		aba	(0,3)(0,1)
+:RE#56:B	\(a*\)b\1*		aba	(0,3)(0,1)
+:RE#57:B	\(a*\)*b\1*		aba	(0,3)(0,1)
+:RE#58:B	\(a*\)*b\(\1\)*		aba	(0,3)(0,1)(2,3)
+:RE#59:B	\(a\{0,1\}\)*b\1	abaa	(0,3)(0,1)
+:RE#60:B	\(a*\)*b\1		abaa	(0,3)(0,1)
+:RE#61:B	\(a*\)b\1*		abaa	(0,4)(0,1)
+:RE#62:B	\(a*\)*b\1*		abaa	(0,4)(0,1)
+:RE#63:B	\(a*\)*b\(\1\)*		abaa	(0,4)(0,1)(3,4)
+:RE#64:B	\(a\{0,1\}\)*b\1	aab	(0,3)(2,2)
+:RE#65:B	\(a*\)*b\1		aab	(0,3)(2,2)
+:RE#66:B	\(a*\)b\1*		aab	(0,3)(0,2)
+:RE#67:B	\(a*\)*b\1*		aab	(0,3)(2,2)
+:RE#68:B	\(a*\)*b\(\1\)*		aab	(0,3)(2,2)(3,3)
+:RE#69:B	\(a\{0,1\}\)*b\1	aaba	(0,4)(1,2)
+:RE#70:B	\(a*\)*b\1		aaba	(0,4)(1,2)
+:RE#71:B	\(a*\)b\1*		aaba	(0,3)(0,2)
+:RE#72:B	\(a*\)*b\1*		aaba	(0,4)(1,2)
+:RE#73:B	\(a*\)*b\(\1\)*		aaba	(0,4)(1,2)(3,4)
+:RE#74:B	\(a\{0,1\}\)*b\1	aabaa	(0,4)(1,2)
+:RE#75:B	\(a*\)*b\1		aabaa	(0,5)(0,2)
+:RE#76:B	\(a*\)b\1*		aabaa	(0,5)(0,2)
+:RE#77:B	\(a*\)*b\1*		aabaa	(0,5)(0,2)
+:RE#78:B	\(a*\)*b\(\1\)*		aabaa	(0,5)(0,2)(3,5)
+:RE#79:B	\(x\)*a\1		a	NOMATCH
+:RE#80:B	\(x\)*a\1*		a	(0,1)(?,?)
+:RE#81:B	\(x\)*a\(\1\)		a	NOMATCH
+:RE#82:B	\(x\)*a\(\1\)*		a	(0,1)(?,?)(?,?)
+:RE#83:E	(aa(b(b))?)+		aabbaa	(0,6)(4,6)(?,?)(?,?)
+:RE#84:E	(a(b)?)+		aba	(0,3)(2,3)(?,?)
+:RE#85:E	([ab]+)([bc]+)([cd]*)		abcd		(0,4)(0,2)(2,3)(3,4)
+:RE#86:B	\([ab]*\)\([bc]*\)\([cd]*\)\1	abcdaa		(0,5)(0,1)(1,3)(3,4)
+:RE#87:B	\([ab]*\)\([bc]*\)\([cd]*\)\1	abcdab		(0,6)(0,2)(2,3)(3,4)
+:RE#88:B	\([ab]*\)\([bc]*\)\([cd]*\)\1*	abcdaa		(0,6)(0,1)(1,3)(3,4)
+:RE#89:B	\([ab]*\)\([bc]*\)\([cd]*\)\1*	abcdab		(0,6)(0,2)(2,3)(3,4)
+:RE#90:E	^(A([^B]*))?(B(.*))?		Aa		(0,2)(0,2)(1,2)
+:RE#91:E	^(A([^B]*))?(B(.*))?		Bb		(0,2)(?,?)(?,?)(0,2)(1,2)
+:RE#92:B	.*\([AB]\).*\1			ABA		(0,3)(0,1)
+:RE#93:B$	[^A]*A				\nA		(0,2)
+
+ + +

+


Conclusion

+It is possible to use the 2001 issue of the POSIX +regex +standard, +with the addition of one sentence, +to resolve the interpretation differences that have surfaced since 1995. +That key sentence is a precise and consistent definition for the term +subpattern. +By noting the relationship between +subpatterns +and +subexpressions, +the proposed definition is shown to be the only one that can be +consistent with all parts of the standard. +

+


+ + + + + + + + + + + + + + + + + + + + + + + + + + +
Glenn Fowler
Information and Software Systems Research
AT&T Labs Research
Florham Park NJ
January 2003
+

+ +

+ + + diff --git a/re-nullsubexpr.html b/re-nullsubexpr.html new file mode 100644 index 0000000..f0d7d1f --- /dev/null +++ b/re-nullsubexpr.html @@ -0,0 +1,62 @@ + + + + + + ../re/re-nullsubexpr.mm mm document + + + + +
+ +

+


+
+

regular expression null subexpression tests

+
Glenn Fowler <gsf@research.att.com> +

AT&T Labs Research - Florham Park NJ +

+


+The +regex +tests in + nullsubexpr.dat +exercise +regex +null subexpression matching. +

+


+ + + + + + + + + + + + + + + + + + + + + + + + + + +
Glenn Fowler
Information and Software Systems Research
AT&T Labs Research
Florham Park NJ
August 04, 2002
+

+ +

+ + + diff --git a/re-repetition.html b/re-repetition.html new file mode 100644 index 0000000..2381811 --- /dev/null +++ b/re-repetition.html @@ -0,0 +1,60 @@ + + + + + + ../re/re-repetition.mm mm document + + + + +
+ +

+


+
+

regular expression repetition tests

+
Glenn Fowler <gsf@research.att.com> +

AT&T Labs Research - Florham Park NJ +

+


+The +regex +tests in + repetition.dat +exercise explicit and implicit repetition. +

+


+ + + + + + + + + + + + + + + + + + + + + + + + + + +
Glenn Fowler
Information and Software Systems Research
AT&T Labs Research
Florham Park NJ
August 04, 2002
+

+ +

+ + + diff --git a/repetition.dat b/repetition.dat new file mode 100644 index 0000000..b54a2c6 --- /dev/null +++ b/repetition.dat @@ -0,0 +1,79 @@ +NOTE implicit vs. explicit repetitions : 2002-08-01 +# +# Glenn Fowler +# conforming matches (column 4) must match one of the following BREs +# NOMATCH +# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)* +# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)* +# i.e., each 3-tuple has two identical elements and one (?,?) +# + +E ((..)|(.)) NULL NOMATCH +E ((..)|(.))((..)|(.)) NULL NOMATCH +E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH + +E ((..)|(.)){1} NULL NOMATCH +E ((..)|(.)){2} NULL NOMATCH +E ((..)|(.)){3} NULL NOMATCH + +E ((..)|(.))* NULL (0,0) + +E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1) +E ((..)|(.))((..)|(.)) a NOMATCH +E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH + +E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1) +E ((..)|(.)){2} a NOMATCH +E ((..)|(.)){3} a NOMATCH + +E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1) + +E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2) +E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH + +E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2) +E ((..)|(.)){3} aa NOMATCH + +E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?) + +E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3) +E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3) + +E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3) +E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3) + +E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3) + +E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4) + +E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?) +E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4) + +E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?) 
+ +E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5) + +E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?) +E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5) + +E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5) + +E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?) + +E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?) +E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?) + +E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?) diff --git a/rightassoc.dat b/rightassoc.dat new file mode 100644 index 0000000..ed7f28e --- /dev/null +++ b/rightassoc.dat @@ -0,0 +1,16 @@ +NOTE left-assoc:pass-none right-assoc:pass-all : 2002-04-29 + +E (a|ab)(c|bcd)(d*) abcd (0,4)(0,2)(2,3)(3,4) +E (a|ab)(bcd|c)(d*) abcd (0,4)(0,2)(2,3)(3,4) +E (ab|a)(c|bcd)(d*) abcd (0,4)(0,2)(2,3)(3,4) +E (ab|a)(bcd|c)(d*) abcd (0,4)(0,2)(2,3)(3,4) + +E (a*)(b|abc)(c*) abc (0,3)(0,1)(1,2)(2,3) +E (a*)(abc|b)(c*) abc (0,3)(0,1)(1,2)(2,3) +E (a*)(b|abc)(c*) abc (0,3)(0,1)(1,2)(2,3) +E (a*)(abc|b)(c*) abc (0,3)(0,1)(1,2)(2,3) + +E (a|ab)(c|bcd)(d|.*) abcd (0,4)(0,2)(2,3)(3,4) +E (a|ab)(bcd|c)(d|.*) abcd (0,4)(0,2)(2,3)(3,4) +E (ab|a)(c|bcd)(d|.*) abcd (0,4)(0,2)(2,3)(3,4) +E (ab|a)(bcd|c)(d|.*) abcd (0,4)(0,2)(2,3)(3,4) diff --git a/testregex.c b/testregex.c new file mode 100644 index 0000000..7b86ab7 --- /dev/null +++ b/testregex.c @@ -0,0 +1,2121 @@ +#pragma prototyped noticed + +/* + * regex(3) test harness + * + * build: cc -o testregex testregex.c + * help: testregex --man + * note: REG_* features are detected by #ifdef; if REG_* are enums + * then supply #define REG_foo REG_foo for each enum REG_foo + * + * Glenn Fowler + * AT&T Labs Research + * + * PLEASE: publish your 
tests so everyone can benefit + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software + * without restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, and/or sell copies of the + * Software, and to permit persons to whom the Software is furnished to do + * so, subject to the following disclaimer: + * + * THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +static const char id[] = "\n@(#)$Id: testregex (AT&T Research) 2005-05-20 $\0\n"; + +#if _PACKAGE_ast +#include +#else +#include +#endif + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __STDC__ +#include +#include +#endif + +#if !_PACKAGE_ast +#undef REG_DISCIPLINE +#endif + +#ifndef REG_DELIMITED +#undef _REG_subcomp +#endif + +#define TEST_ARE 0x00000001 +#define TEST_BRE 0x00000002 +#define TEST_ERE 0x00000004 +#define TEST_KRE 0x00000008 +#define TEST_LRE 0x00000010 +#define TEST_SRE 0x00000020 + +#define TEST_EXPAND 0x00000040 +#define TEST_LENIENT 0x00000080 + +#define TEST_QUERY 0x00000100 +#define TEST_SUB 0x00000200 +#define TEST_UNSPECIFIED 0x00000400 +#define TEST_VERIFY 0x00000800 +#define TEST_AND 0x00001000 +#define TEST_OR 0x00002000 + +#define TEST_DELIMIT 0x00010000 +#define TEST_OK 0x00020000 +#define TEST_SAME 0x00040000 + +#define TEST_ACTUAL 0x00100000 +#define TEST_BASELINE 0x00200000 +#define TEST_FAIL 0x00400000 +#define TEST_PASS 0x00800000 +#define TEST_SUMMARY 0x01000000 + +#define TEST_IGNORE_ERROR 0x02000000 +#define TEST_IGNORE_OVER 0x04000000 +#define TEST_IGNORE_POSITION 0x08000000 + +#define TEST_CATCH 0x10000000 +#define TEST_VERBOSE 0x20000000 + +#define TEST_GLOBAL (TEST_ACTUAL|TEST_AND|TEST_BASELINE|TEST_CATCH|TEST_FAIL|TEST_IGNORE_ERROR|TEST_IGNORE_OVER|TEST_IGNORE_POSITION|TEST_OR|TEST_PASS|TEST_SUMMARY|TEST_VERBOSE) + +#ifdef REG_DISCIPLINE + + +#include + +typedef struct Disc_s +{ + regdisc_t disc; + int ordinal; + Sfio_t* sp; +} Disc_t; + +static void* +compf(const regex_t* re, const char* xstr, size_t xlen, regdisc_t* disc) +{ + Disc_t* dp = (Disc_t*)disc; + + return (void*)++dp->ordinal; +} + +static int +execf(const regex_t* re, void* data, const char* xstr, size_t xlen, const char* sstr, size_t slen, char** snxt, regdisc_t* disc) +{ + Disc_t* dp = (Disc_t*)disc; + + sfprintf(dp->sp, "{%-.*s}(%d:%d)", xlen, xstr, (int)data, slen); + return atoi(xstr); +} + +static void* 
+resizef(void* handle, void* data, size_t size) +{ + if (!size) + return 0; + return stkalloc((Sfio_t*)handle, size); +} + +#endif + +#ifndef NiL +#ifdef __STDC__ +#define NiL 0 +#else +#define NiL (char*)0 +#endif +#endif + +#define H(x) do{if(html)fprintf(stderr,x);}while(0) +#define T(x) fprintf(stderr,x) + +static void +help(int html) +{ +H("\n"); +H("\n"); +H("\n"); +H("testregex man document\n"); +H("\n"); +H("\n"); +H("
\n");
+T("NAME\n");
+T("  testregex - regex(3) test harness\n");
+T("\n");
+T("SYNOPSIS\n");
+T("  testregex [ options ]\n");
+T("\n");
+T("DESCRIPTION\n");
+T("  testregex reads regex(3) test specifications, one per line, from the\n");
+T("  standard input and writes one output line for each failed test. A\n");
+T("  summary line is written after all tests are done. Each successful\n");
+T("  test is run again with REG_NOSUB. Unsupported features are noted\n");
+T("  before the first test, and tests requiring these features are\n");
+T("  silently ignored.\n");
+T("\n");
+T("OPTIONS\n");
+T("  -c	catch signals and non-terminating calls\n");
+T("  -e	ignore error return mismatches\n");
+T("  -h	list help on standard error\n");
+T("  -n	do not repeat successful tests with regnexec()\n");
+T("  -o	ignore match[] overrun errors\n");
+T("  -p	ignore negative position mismatches\n");
+T("  -s	use stack instead of malloc\n");
+T("  -x	do not repeat successful tests with REG_NOSUB\n");
+T("  -v	list each test line\n");
+T("  -A	list failed test lines with actual answers\n");
+T("  -B	list all test lines with actual answers\n");
+T("  -F	list failed test lines\n");
+T("  -P	list passed test lines\n");
+T("  -S	output one summary line\n");
+T("\n");
+T("INPUT FORMAT\n");
+T("  Input lines may be blank, a comment beginning with #, or a test\n");
+T("  specification. A specification is five fields separated by one\n");
+T("  or more tabs. NULL denotes the empty string and NIL denotes the\n");
+T("  0 pointer.\n");
+T("\n");
+T("  Field 1: the regex(3) flags to apply, one character per REG_feature\n");
+T("  flag. The test is skipped if REG_feature is not supported by the\n");
+T("  implementation. If the first character is not [BEASKL] then the\n");
+T("  specification is a global control line. One or more of [BEASKL] may be\n");
+T("  specified; the test will be repeated for each mode.\n");
+T("\n");
+T("    B 	basic			BRE	(grep, ed, sed)\n");
+T("    E 	REG_EXTENDED		ERE	(egrep)\n");
+T("    A	REG_AUGMENTED		ARE	(egrep with negation)\n");
+T("    S	REG_SHELL		SRE	(sh glob)\n");
+T("    K	REG_SHELL|REG_AUGMENTED	KRE	(ksh glob)\n");
+T("    L	REG_LITERAL		LRE	(fgrep)\n");
+T("\n");
+T("    a	REG_LEFT|REG_RIGHT	implicit ^...$\n");
+T("    b	REG_NOTBOL		lhs does not match ^\n");
+T("    c	REG_COMMENT		ignore space and #...\\n\n");
+T("    d	REG_SHELL_DOT		explicit leading . match\n");
+T("    e	REG_NOTEOL		rhs does not match $\n");
+T("    f	REG_MULTIPLE		multiple \\n separated patterns\n");
+T("    g	FNM_LEADING_DIR		testfnmatch only -- match until /\n");
+T("    h	REG_MULTIREF		multiple digit backref\n");
+T("    i	REG_ICASE		ignore case\n");
+T("    j	REG_SPAN		. matches \\n\n");
+T("    k	REG_ESCAPE		\\ to escape [...] delimiter\n");
+T("    l	REG_LEFT		implicit ^...\n");
+T("    m	REG_MINIMAL		minimal match\n");
+T("    n	REG_NEWLINE		explicit \\n match\n");
+T("    o	REG_ENCLOSED		(|&) magic inside [@|&](...)\n");
+T("    p	REG_SHELL_PATH		explicit / match\n");
+T("    q	REG_DELIMITED		delimited pattern\n");
+T("    r	REG_RIGHT		implicit ...$\n");
+T("    s	REG_SHELL_ESCAPED	\\ not special\n");
+T("    t	REG_MUSTDELIM		all delimiters must be specified\n");
+T("    u	standard unspecified behavior -- errors not counted\n");
+T("    w	REG_NOSUB		no subexpression match array\n");
+T("    x	REG_LENIENT		let some errors slide\n");
+T("    y	REG_LEFT		regexec() implicit ^...\n");
+T("    z	REG_NULL		NULL subexpressions ok\n");
+T("    $	                        expand C \\c escapes in fields 2 and 3\n");
+T("    /	                        field 2 is a regsubcomp() expression\n");
+T("\n");
+T("  Field 1 control lines:\n");
+T("\n");
+T("    C		set LC_COLLATE and LC_CTYPE to locale in field 2\n");
+T("\n");
+T("    ?test ...	output field 5 if passed and != EXPECTED, silent otherwise\n");
+T("    &test ...	output field 5 if current and previous passed\n");
+T("    |test ...	output field 5 if current passed and previous failed\n");
+T("    ; ...	output field 2 if previous failed\n");
+T("    {test ...	skip if failed until }\n");
+T("    }		end of skip\n");
+T("\n");
+T("    : comment		comment copied as output NOTE\n");
+T("    :comment:test	:comment: ignored\n");
+T("    N[OTE] comment	comment copied as output NOTE\n");
+T("    T[EST] comment	comment\n");
+T("\n");
+T("    number		use number for nmatch (20 by default)\n");
+T("\n");
+T("  Field 2: the regular expression pattern; SAME uses the pattern from\n");
+T("    the previous specification.\n");
+T("\n");
+T("  Field 3: the string to match.\n");
+T("\n");
+T("  Field 4: the test outcome. This is either one of the posix error\n");
+T("    codes (with REG_ omitted) or the match array, a list of (m,n)\n");
+T("    entries with m and n being first and last+1 positions in the\n");
+T("    field 3 string, or NULL if REG_NOSUB is in effect and success\n");
+T("    is expected. BADPAT is acceptable in place of any regcomp(3)\n");
+T("    error code. The match[] array is initialized to (-2,-2) before\n");
+T("    each test. All array elements from 0 to nmatch-1 must be specified\n");
+T("    in the outcome. Unspecified endpoints (offset -1) are denoted by ?.\n");
+T("    Unset endpoints (offset -2) are denoted by X. {x}(o:n) denotes a\n");
+T("    matched (?{...}) expression, where x is the text enclosed by {...},\n");
+T("    o is the expression ordinal counting from 1, and n is the length of\n");
+T("    the unmatched portion of the subject string. If x starts with a\n");
+T("    number then that is the return value of re_execf(), otherwise 0 is\n");
+T("    returned.\n");
+T("\n");
+T("  Field 5: optional comment appended to the report.\n");
+T("\n");
+T("CAVEAT\n");
+T("    If a regex implementation misbehaves with memory then all bets are off.\n");
+T("\n");
+T("CONTRIBUTORS\n");
+T("  Glenn Fowler    gsf@research.att.com        (ksh strmatch, regex extensions)\n");
+T("  David Korn      dgk@research.att.com        (ksh glob matcher)\n");
+T("  Doug McIlroy    mcilroy@dartmouth.edu       (ast regex/testre in C++)\n");
+T("  Tom Lord        lord@regexps.com            (rx tests)\n");
+T("  Henry Spencer   henry@zoo.toronto.edu       (original public regex)\n");
+T("  Andrew Hume     andrew@research.att.com     (gre tests)\n");
+T("  John Maddock    John_Maddock@compuserve.com (regex++ tests)\n");
+T("  Philip Hazel    ph10@cam.ac.uk              (pcre tests)\n");
+T("  Ville Laurikari vl@iki.fi                   (libtre tests)\n");
+H("
\n"); +H("\n"); +H("\n"); +} + +#ifndef elementsof +#define elementsof(x) (sizeof(x)/sizeof(x[0])) +#endif + +#ifndef streq +#define streq(a,b) (*(a)==*(b)&&!strcmp(a,b)) +#endif + +#define HUNG 2 +#define NOTEST (~0) + +#ifndef REG_TEST_DEFAULT +#define REG_TEST_DEFAULT 0 +#endif + +#ifndef REG_EXEC_DEFAULT +#define REG_EXEC_DEFAULT 0 +#endif + +static const char* unsupported[] = +{ + "BASIC", +#ifndef REG_EXTENDED + "EXTENDED", +#endif +#ifndef REG_AUGMENTED + "AUGMENTED", +#endif +#ifndef REG_SHELL + "SHELL", +#endif + +#ifndef REG_COMMENT + "COMMENT", +#endif +#ifndef REG_DELIMITED + "DELIMITED", +#endif +#ifndef REG_DISCIPLINE + "DISCIPLINE", +#endif +#ifndef REG_ESCAPE + "ESCAPE", +#endif +#ifndef REG_ICASE + "ICASE", +#endif +#ifndef REG_LEFT + "LEFT", +#endif +#ifndef REG_LENIENT + "LENIENT", +#endif +#ifndef REG_LITERAL + "LITERAL", +#endif +#ifndef REG_MINIMAL + "MINIMAL", +#endif +#ifndef REG_MULTIPLE + "MULTIPLE", +#endif +#ifndef REG_MULTIREF + "MULTIREF", +#endif +#ifndef REG_MUSTDELIM + "MUSTDELIM", +#endif +#ifndef REG_NEWLINE + "NEWLINE", +#endif +#ifndef REG_NOTBOL + "NOTBOL", +#endif +#ifndef REG_NOTEOL + "NOTEOL", +#endif +#ifndef REG_NULL + "NULL", +#endif +#ifndef REG_RIGHT + "RIGHT", +#endif +#ifndef REG_SHELL_DOT + "SHELL_DOT", +#endif +#ifndef REG_SHELL_ESCAPED + "SHELL_ESCAPED", +#endif +#ifndef REG_SHELL_GROUP + "SHELL_GROUP", +#endif +#ifndef REG_SHELL_PATH + "SHELL_PATH", +#endif +#ifndef REG_SPAN + "SPAN", +#endif +#if REG_NOSUB & REG_TEST_DEFAULT + "SUBMATCH", +#endif +#if !_REG_nexec + "regnexec", +#endif +#if !_REG_subcomp + "regsubcomp", +#endif + 0 +}; + +#ifndef REG_COMMENT +#define REG_COMMENT NOTEST +#endif +#ifndef REG_DELIMITED +#define REG_DELIMITED NOTEST +#endif +#ifndef REG_ESCAPE +#define REG_ESCAPE NOTEST +#endif +#ifndef REG_ICASE +#define REG_ICASE NOTEST +#endif +#ifndef REG_LEFT +#define REG_LEFT NOTEST +#endif +#ifndef REG_LENIENT +#define REG_LENIENT 0 +#endif +#ifndef REG_MINIMAL +#define REG_MINIMAL NOTEST 
+#endif +#ifndef REG_MULTIPLE +#define REG_MULTIPLE NOTEST +#endif +#ifndef REG_MULTIREF +#define REG_MULTIREF NOTEST +#endif +#ifndef REG_MUSTDELIM +#define REG_MUSTDELIM NOTEST +#endif +#ifndef REG_NEWLINE +#define REG_NEWLINE NOTEST +#endif +#ifndef REG_NOTBOL +#define REG_NOTBOL NOTEST +#endif +#ifndef REG_NOTEOL +#define REG_NOTEOL NOTEST +#endif +#ifndef REG_NULL +#define REG_NULL NOTEST +#endif +#ifndef REG_RIGHT +#define REG_RIGHT NOTEST +#endif +#ifndef REG_SHELL_DOT +#define REG_SHELL_DOT NOTEST +#endif +#ifndef REG_SHELL_ESCAPED +#define REG_SHELL_ESCAPED NOTEST +#endif +#ifndef REG_SHELL_GROUP +#define REG_SHELL_GROUP NOTEST +#endif +#ifndef REG_SHELL_PATH +#define REG_SHELL_PATH NOTEST +#endif +#ifndef REG_SPAN +#define REG_SPAN NOTEST +#endif + +#define REG_UNKNOWN (-1) + +#ifndef REG_ENEWLINE +#define REG_ENEWLINE (REG_UNKNOWN-1) +#endif +#ifndef REG_ENULL +#ifndef REG_EMPTY +#define REG_ENULL (REG_UNKNOWN-2) +#else +#define REG_ENULL REG_EMPTY +#endif +#endif +#ifndef REG_ECOUNT +#define REG_ECOUNT (REG_UNKNOWN-3) +#endif +#ifndef REG_BADESC +#define REG_BADESC (REG_UNKNOWN-4) +#endif +#ifndef REG_EMEM +#define REG_EMEM (REG_UNKNOWN-5) +#endif +#ifndef REG_EHUNG +#define REG_EHUNG (REG_UNKNOWN-6) +#endif +#ifndef REG_EBUS +#define REG_EBUS (REG_UNKNOWN-7) +#endif +#ifndef REG_EFAULT +#define REG_EFAULT (REG_UNKNOWN-8) +#endif +#ifndef REG_EFLAGS +#define REG_EFLAGS (REG_UNKNOWN-9) +#endif +#ifndef REG_EDELIM +#define REG_EDELIM (REG_UNKNOWN-9) +#endif + +static const struct { int code; char* name; } codes[] = +{ + REG_UNKNOWN, "UNKNOWN", + REG_NOMATCH, "NOMATCH", + REG_BADPAT, "BADPAT", + REG_ECOLLATE, "ECOLLATE", + REG_ECTYPE, "ECTYPE", + REG_EESCAPE, "EESCAPE", + REG_ESUBREG, "ESUBREG", + REG_EBRACK, "EBRACK", + REG_EPAREN, "EPAREN", + REG_EBRACE, "EBRACE", + REG_BADBR, "BADBR", + REG_ERANGE, "ERANGE", + REG_ESPACE, "ESPACE", + REG_BADRPT, "BADRPT", + REG_ENEWLINE, "ENEWLINE", + REG_ENULL, "ENULL", + REG_ECOUNT, "ECOUNT", + REG_BADESC, "BADESC", + 
REG_EMEM, "EMEM", + REG_EHUNG, "EHUNG", + REG_EBUS, "EBUS", + REG_EFAULT, "EFAULT", + REG_EFLAGS, "EFLAGS", + REG_EDELIM, "EDELIM", +}; + +static struct +{ + regmatch_t NOMATCH; + int errors; + int extracted; + int ignored; + int lineno; + int passed; + int signals; + int unspecified; + int verify; + int warnings; + char* file; + char* stack; + char* which; + jmp_buf gotcha; +#ifdef REG_DISCIPLINE + Disc_t disc; +#endif +} state; + +static void +quote(char* s, int len, unsigned long test) +{ + unsigned char* u = (unsigned char*)s; + unsigned char* e; + int c; + + if (!u) + printf("NIL"); + else if (!*u && len <= 1) + printf("NULL"); + else if (test & TEST_EXPAND) + { + if (len < 0) + len = strlen((char*)u); + e = u + len; + if (test & TEST_DELIMIT) + printf("\""); + while (u < e) + switch (c = *u++) + { + case '\\': + printf("\\\\"); + break; + case '"': + if (test & TEST_DELIMIT) + printf("\\\""); + else + printf("\""); + break; + case '\a': + printf("\\a"); + break; + case '\b': + printf("\\b"); + break; + case 033: + printf("\\e"); + break; + case '\f': + printf("\\f"); + break; + case '\n': + printf("\\n"); + break; + case '\r': + printf("\\r"); + break; + case '\t': + printf("\\t"); + break; + case '\v': + printf("\\v"); + break; + default: + if (!iscntrl(c) && isprint(c)) + putchar(c); + else + printf("\\x%02x", c); + break; + } + if (test & TEST_DELIMIT) + printf("\""); + } + else + printf("%s", s); +} + +static void +report(char* comment, char* fun, char* re, char* s, int len, char* msg, int flags, unsigned long test) +{ + if (state.file) + printf("%s:", state.file); + printf("%d:", state.lineno); + if (re) + { + printf(" "); + quote(re, -1, test|TEST_DELIMIT); + if (s) + { + printf(" versus "); + quote(s, len, test|TEST_DELIMIT); + } + } + if (test & TEST_UNSPECIFIED) + { + state.unspecified++; + printf(" unspecified behavior"); + } + else + state.errors++; + if (state.which) + printf(" %s", state.which); + if (flags & REG_NOSUB) + printf(" NOSUB"); + if 
(fun) + printf(" %s", fun); + if (comment[strlen(comment)-1] == '\n') + printf(" %s", comment); + else + { + printf(" %s: ", comment); + if (msg) + printf("%s: ", msg); + } +} + +static void +error(regex_t* preg, int code) +{ + char* msg; + char buf[256]; + + switch (code) + { + case REG_EBUS: + msg = "bus error"; + break; + case REG_EFAULT: + msg = "memory fault"; + break; + case REG_EHUNG: + msg = "did not terminate"; + break; + default: + regerror(code, preg, msg = buf, sizeof buf); + break; + } + printf("%s\n", msg); +} + +static void +bad(char* comment, char* re, char* s, int len, unsigned long test) +{ + printf("bad test case "); + report(comment, NiL, re, s, len, NiL, 0, test); + exit(1); +} + +static int +escape(char* s) +{ + char* b; + char* t; + char* q; + char* e; + int c; + + for (b = t = s; *t = *s; s++, t++) + if (*s == '\\') + switch (*++s) + { + case '\\': + break; + case 'a': + *t = '\a'; + break; + case 'b': + *t = '\b'; + break; + case 'c': + if (*t = *++s) + *t &= 037; + else + s--; + break; + case 'e': + case 'E': + *t = 033; + break; + case 'f': + *t = '\f'; + break; + case 'n': + *t = '\n'; + break; + case 'r': + *t = '\r'; + break; + case 's': + *t = ' '; + break; + case 't': + *t = '\t'; + break; + case 'v': + *t = '\v'; + break; + case 'u': + case 'x': + c = 0; + q = c == 'u' ? 
(s + 5) : (char*)0; + e = s + 1; + while (!e || !q || s < q) + { + switch (*++s) + { + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + c = (c << 4) + *s - 'a' + 10; + continue; + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + c = (c << 4) + *s - 'A' + 10; + continue; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + c = (c << 4) + *s - '0'; + continue; + case '{': + case '[': + if (s != e) + { + s--; + break; + } + e = 0; + continue; + case '}': + case ']': + if (e) + s--; + break; + default: + s--; + break; + } + break; + } + *t = c; + break; + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + c = *s - '0'; + q = s + 2; + while (s < q) + { + switch (*++s) + { + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + c = (c << 3) + *s - '0'; + break; + default: + q = --s; + break; + } + } + *t = c; + break; + default: + *(s + 1) = 0; + bad("invalid C \\ escape\n", s - 1, NiL, 0, 0); + } + return t - b; +} + +static void +matchoffprint(int off) +{ + switch (off) + { + case -2: + printf("X"); + break; + case -1: + printf("?"); + break; + default: + printf("%d", off); + break; + } +} + +static void +matchprint(regmatch_t* match, int nmatch, int nsub, char* ans, unsigned long test) +{ + int i; + + for (; nmatch > nsub + 1; nmatch--) + if ((match[nmatch-1].rm_so != -1 || match[nmatch-1].rm_eo != -1) && (!(test & TEST_IGNORE_POSITION) || match[nmatch-1].rm_so >= 0 && match[nmatch-1].rm_eo >= 0)) + break; + for (i = 0; i < nmatch; i++) + { + printf("("); + matchoffprint(match[i].rm_so); + printf(","); + matchoffprint(match[i].rm_eo); + printf(")"); + } + if (!(test & (TEST_ACTUAL|TEST_BASELINE))) + { + if (ans) + printf(" expected: %s", ans); + printf("\n"); + } +} + +static int +matchcheck(regmatch_t* match, int nmatch, int nsub, char* ans, char* re, char* s, int len, int flags, unsigned long test) +{ + char* p; + int i; + int 
m; + int n; + + if (streq(ans, "OK")) + return test & (TEST_BASELINE|TEST_PASS|TEST_VERIFY); + for (i = 0, p = ans; i < nmatch && *p; i++) + { + if (*p == '{') + { +#ifdef REG_DISCIPLINE + char* x; + + x = sfstruse(state.disc.sp); + if (strcmp(p, x)) + { + if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) + return 0; + report("callout failed", NiL, re, s, len, NiL, flags, test); + quote(p, -1, test); + printf(" expected, "); + quote(x, -1, test); + printf(" returned\n"); + } +#endif + break; + } + if (*p++ != '(') + bad("improper answer\n", re, s, -1, test); + if (*p == '?') + { + m = -1; + p++; + } + else + m = strtol(p, &p, 10); + if (*p++ != ',') + bad("improper answer\n", re, s, -1, test); + if (*p == '?') + { + n = -1; + p++; + } + else + n = strtol(p, &p, 10); + if (*p++ != ')') + bad("improper answer\n", re, s, -1, test); + if (m!=match[i].rm_so || n!=match[i].rm_eo) + { + if (!(test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY))) + { + report("failed: match was", NiL, re, s, len, NiL, flags, test); + matchprint(match, nmatch, nsub, ans, test); + } + return 0; + } + } + for (; i < nmatch; i++) + { + if (match[i].rm_so!=-1 || match[i].rm_eo!=-1) + { + if (!(test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_VERIFY))) + { + if ((test & TEST_IGNORE_POSITION) && (match[i].rm_so<0 || match[i].rm_eo<0)) + { + state.ignored++; + return 0; + } + if (!(test & TEST_SUMMARY)) + { + report("failed: match was", NiL, re, s, len, NiL, flags, test); + matchprint(match, nmatch, nsub, ans, test); + } + } + return 0; + } + } + if (!(test & TEST_IGNORE_OVER) && match[nmatch].rm_so != state.NOMATCH.rm_so) + { + if (!(test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY))) + { + report("failed: overran match array", NiL, re, s, len, NiL, flags, test); + matchprint(match, nmatch + 1, nsub, NiL, test); + } + return 0; + } + return 1; +} + 
+static void +sigunblock(int s) +{ +#ifdef SIG_SETMASK + int op; + sigset_t mask; + + sigemptyset(&mask); + if (s) + { + sigaddset(&mask, s); + op = SIG_UNBLOCK; + } + else op = SIG_SETMASK; + sigprocmask(op, &mask, NiL); +#else +#ifdef sigmask + sigsetmask(s ? (sigsetmask(0L) & ~sigmask(s)) : 0L); +#endif +#endif +} + +static void +gotcha(int sig) +{ + int ret; + + signal(sig, gotcha); + alarm(0); + state.signals++; + switch (sig) + { + case SIGALRM: + ret = REG_EHUNG; + break; + case SIGBUS: + ret = REG_EBUS; + break; + default: + ret = REG_EFAULT; + break; + } + sigunblock(sig); + longjmp(state.gotcha, ret); +} + +static char* +getline(FILE* fp) +{ + static char buf[32 * 1024]; + + register char* s = buf; + register char* e = &buf[sizeof(buf)]; + register char* b; + + for (;;) + { + if (!(b = fgets(s, e - s, fp))) + return 0; + state.lineno++; + s += strlen(s); + if (s == b || *--s != '\n' || s == b || *(s - 1) != '\\') + { + *s = 0; + break; + } + s--; + } + return buf; +} + +static unsigned long +note(unsigned long level, char* msg, unsigned long skip, unsigned long test) +{ + if (!(test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_SUMMARY)) && !skip) + { + printf("NOTE\t"); + if (msg) + printf("%s: ", msg); + printf("skipping lines %d", state.lineno); + } + return skip | level; +} + +#define TABS(n) &ts[7-((n)&7)] + +static char ts[] = "\t\t\t\t\t\t\t"; + +static unsigned long +extract(int* tabs, char* spec, char* re, char* s, char* ans, char* msg, char* accept, regmatch_t* match, int nmatch, int nsub, unsigned long skip, unsigned long level, unsigned long test) +{ + if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_OK|TEST_PASS|TEST_SUMMARY)) + { + state.extracted = 1; + if (test & TEST_OK) + { + state.passed++; + if ((test & TEST_VERIFY) && !(test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_SUMMARY))) + { + if (msg && strcmp(msg, "EXPECTED")) + printf("NOTE\t%s\n", msg); + return skip; + } + test &= ~(TEST_PASS|TEST_QUERY); + } + if 
(test & (TEST_QUERY|TEST_VERIFY)) + { + if (test & TEST_BASELINE) + test &= ~(TEST_BASELINE|TEST_PASS); + else + test |= TEST_PASS; + skip |= level; + } + if (!(test & TEST_OK)) + { + if (test & TEST_UNSPECIFIED) + state.unspecified++; + else + state.errors++; + } + if (test & (TEST_PASS|TEST_SUMMARY)) + return skip; + test &= ~TEST_DELIMIT; + printf("%s%s", spec, TABS(*tabs++)); + if ((test & (TEST_BASELINE|TEST_SAME)) == (TEST_BASELINE|TEST_SAME)) + printf("SAME"); + else + quote(re, -1, test); + printf("%s", TABS(*tabs++)); + quote(s, -1, test); + printf("%s", TABS(*tabs++)); + if (!(test & (TEST_ACTUAL|TEST_BASELINE)) || !accept && !match) + printf("%s", ans); + else if (accept) + printf("%s", accept); + else + matchprint(match, nmatch, nsub, NiL, test); + if (msg) + printf("%s%s", TABS(*tabs++), msg); + putchar('\n'); + } + else if (test & TEST_QUERY) + skip = note(level, msg, skip, test); + else if (test & TEST_VERIFY) + state.extracted = 1; + return skip; +} + +static int +catchfree(regex_t* preg, int flags, int* tabs, char* spec, char* re, char* s, char* ans, char* msg, char* accept, regmatch_t* match, int nmatch, int nsub, unsigned long skip, unsigned long level, unsigned long test) +{ + int eret; + + if (!(test & TEST_CATCH)) + { + regfree(preg); + eret = 0; + } + else if (!(eret = setjmp(state.gotcha))) + { + alarm(HUNG); + regfree(preg); + alarm(0); + } + else if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) + extract(tabs, spec, re, s, ans, msg, NiL, NiL, 0, 0, skip, level, test); + else + { + report("failed", "regfree", re, NiL, -1, msg, flags, test); + error(preg, eret); + } + return eret; +} + +int +main(int argc, char** argv) +{ + int flags; + int cflags; + int eflags; + int nmatch; + int nexec; + int nstr; + int cret; + int eret; + int nsub; + int i; + int j; + int expected; + int got; + int locale; + int subunitlen; + int testno; + unsigned long level; + unsigned long skip; + char* p; + char* line; + 
char* spec; + char* re; + char* s; + char* ans; + char* msg; + char* fun; + char* ppat; + char* subunit; + char* version; + char* field[6]; + char* delim[6]; + FILE* fp; + int tabs[6]; + char unit[64]; + regmatch_t match[100]; + regex_t preg; + + static char pat[32 * 1024]; + + int nonosub = REG_NOSUB == 0; + int nonexec = 0; + + unsigned long test = 0; + + static char* filter[] = { "-", 0 }; + + state.NOMATCH.rm_so = state.NOMATCH.rm_eo = -2; + p = unit; + version = (char*)id + 10; + while (p < &unit[sizeof(unit)-1] && (*p = *version++) && !isspace(*p)) + p++; + *p = 0; + while ((p = *++argv) && *p == '-') + for (;;) + { + switch (*++p) + { + case 0: + break; + case 'c': + test |= TEST_CATCH; + continue; + case 'e': + test |= TEST_IGNORE_ERROR; + continue; + case 'h': + case '?': + help(0); + return 2; + case '-': + help(p[1] == 'h'); + return 2; + case 'n': + nonexec = 1; + continue; + case 'o': + test |= TEST_IGNORE_OVER; + continue; + case 'p': + test |= TEST_IGNORE_POSITION; + continue; + case 's': +#ifdef REG_DISCIPLINE + if (!(state.stack = stkalloc(stkstd, 0))) + fprintf(stderr, "%s: out of space [stack]", unit); + state.disc.disc.re_resizef = resizef; + state.disc.disc.re_resizehandle = (void*)stkstd; +#endif + continue; + case 'x': + nonosub = 1; + continue; + case 'v': + test |= TEST_VERBOSE; + continue; + case 'A': + test |= TEST_ACTUAL; + continue; + case 'B': + test |= TEST_BASELINE; + continue; + case 'F': + test |= TEST_FAIL; + continue; + case 'P': + test |= TEST_PASS; + continue; + case 'S': + test |= TEST_SUMMARY; + continue; + default: + fprintf(stderr, "%s: %c: invalid option\n", unit, *p); + return 2; + } + break; + } + if (!*argv) + argv = filter; + locale = 0; + while (state.file = *argv++) + { + if (streq(state.file, "-") || streq(state.file, "/dev/stdin") || streq(state.file, "/dev/fd/0")) + { + state.file = 0; + fp = stdin; + } + else if (!(fp = fopen(state.file, "r"))) + { + fprintf(stderr, "%s: %s: cannot read\n", unit, state.file); + 
return 2; + } + testno = state.errors = state.ignored = state.lineno = state.passed = + state.signals = state.unspecified = state.warnings = 0; + skip = 0; + level = 1; + if (!(test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_SUMMARY))) + { + printf("TEST\t%s ", unit); + if (s = state.file) + { + subunit = p = 0; + for (;;) + { + switch (*s++) + { + case 0: + break; + case '/': + subunit = s; + continue; + case '.': + p = s - 1; + continue; + default: + continue; + } + break; + } + if (!subunit) + subunit = state.file; + if (p < subunit) + p = s - 1; + subunitlen = p - subunit; + printf("%-.*s ", subunitlen, subunit); + } + else + subunit = 0; + for (s = version; *s && (*s != ' ' || *(s + 1) != '$'); s++) + putchar(*s); + if (test & TEST_CATCH) + printf(", catch"); + if (test & TEST_IGNORE_ERROR) + printf(", ignore error code mismatches"); + if (test & TEST_IGNORE_POSITION) + printf(", ignore negative position mismatches"); +#ifdef REG_DISCIPLINE + if (state.stack) + printf(", stack"); +#endif + if (test & TEST_VERBOSE) + printf(", verbose"); + printf("\n"); +#ifdef REG_VERSIONID + if (regerror(REG_VERSIONID, NiL, pat, sizeof(pat)) > 0) + s = pat; + else +#endif +#ifdef REG_TEST_VERSION + s = REG_TEST_VERSION; +#else + s = "regex"; +#endif + printf("NOTE\t%s\n", s); + if (elementsof(unsupported) > 1) + { +#if (REG_TEST_DEFAULT & (REG_AUGMENTED|REG_EXTENDED|REG_SHELL)) || !defined(REG_EXTENDED) + i = 0; +#else + i = REG_EXTENDED != 0; +#endif + for (got = 0; i < elementsof(unsupported) - 1; i++) + { + if (!got) + { + got = 1; + printf("NOTE\tunsupported: %s", unsupported[i]); + } + else + printf(",%s", unsupported[i]); + } + if (got) + printf("\n"); + } + } +#ifdef REG_DISCIPLINE + state.disc.disc.re_version = REG_VERSION; + state.disc.disc.re_compf = compf; + state.disc.disc.re_execf = execf; + if (!(state.disc.sp = sfstropen())) + bad("out of space [discipline string stream]\n", NiL, NiL, 0, 0); + preg.re_disc = &state.disc.disc; +#endif + if (test & 
TEST_CATCH) + { + signal(SIGALRM, gotcha); + signal(SIGBUS, gotcha); + signal(SIGSEGV, gotcha); + } + while (p = getline(fp)) + { + + /* parse: */ + + line = p; + if (*p == ':' && !isspace(*(p + 1))) + { + while (*++p && *p != ':'); + if (!*p++) + { + if (test & TEST_BASELINE) + printf("%s\n", line); + continue; + } + } + while (isspace(*p)) + p++; + if (*p == 0 || *p == '#' || *p == 'T') + { + if (test & TEST_BASELINE) + printf("%s\n", line); + continue; + } + if (*p == ':' || *p == 'N') + { + if (test & TEST_BASELINE) + printf("%s\n", line); + else if (!(test & (TEST_ACTUAL|TEST_FAIL|TEST_PASS|TEST_SUMMARY))) + { + while (*++p && !isspace(*p)); + while (isspace(*p)) + p++; + printf("NOTE %s\n", p); + } + continue; + } + j = 0; + i = 0; + field[i++] = p; + for (;;) + { + switch (*p++) + { + case 0: + p--; + j = 0; + goto checkfield; + case '\t': + *(delim[i] = p - 1) = 0; + j = 1; + checkfield: + s = field[i - 1]; + if (streq(s, "NIL")) + field[i - 1] = 0; + else if (streq(s, "NULL")) + *s = 0; + while (*p == '\t') + { + p++; + j++; + } + tabs[i - 1] = j; + if (!*p) + break; + if (i >= elementsof(field)) + bad("too many fields\n", NiL, NiL, 0, 0); + field[i++] = p; + /*FALLTHROUGH*/ + default: + continue; + } + break; + } + if (!(spec = field[0])) + bad("NIL spec\n", NiL, NiL, 0, 0); + + /* interpret: */ + + cflags = REG_TEST_DEFAULT; + eflags = REG_EXEC_DEFAULT; + test &= TEST_GLOBAL; + state.extracted = 0; + nmatch = 20; + nsub = -1; + for (p = spec; *p; p++) + { + if (isdigit(*p)) + { + nmatch = strtol(p, &p, 10); + if (nmatch >= elementsof(match)) + bad("nmatch must be < 100\n", NiL, NiL, 0, 0); + p--; + continue; + } + switch (*p) + { + case 'A': + test |= TEST_ARE; + continue; + case 'B': + test |= TEST_BRE; + continue; + case 'C': + if (!(test & TEST_QUERY) && !(skip & level)) + bad("locale must be nested\n", NiL, NiL, 0, 0); + test &= ~TEST_QUERY; + if (locale) + bad("locale nesting not supported\n", NiL, NiL, 0, 0); + if (i != 2) + bad("locale field 
expected\n", NiL, NiL, 0, 0); + if (!(skip & level)) + { +#if defined(LC_COLLATE) && defined(LC_CTYPE) + s = field[1]; + if (!s || streq(s, "POSIX")) + s = "C"; + if (!(ans = setlocale(LC_COLLATE, s)) || streq(ans, "C") || streq(ans, "POSIX") || !(ans = setlocale(LC_CTYPE, s)) || streq(ans, "C") || streq(ans, "POSIX")) + skip = note(level, s, skip, test); + else + { + if (!(test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_SUMMARY))) + printf("NOTE \"%s\" locale\n", s); + locale = level; + } +#else + skip = note(level, skip, test, "locales not supported"); +#endif + } + cflags = NOTEST; + continue; + case 'E': + test |= TEST_ERE; + continue; + case 'K': + test |= TEST_KRE; + continue; + case 'L': + test |= TEST_LRE; + continue; + case 'S': + test |= TEST_SRE; + continue; + + case 'a': + cflags |= REG_LEFT|REG_RIGHT; + continue; + case 'b': + eflags |= REG_NOTBOL; + continue; + case 'c': + cflags |= REG_COMMENT; + continue; + case 'd': + cflags |= REG_SHELL_DOT; + continue; + case 'e': + eflags |= REG_NOTEOL; + continue; + case 'f': + cflags |= REG_MULTIPLE; + continue; + case 'g': + cflags |= NOTEST; + continue; + case 'h': + cflags |= REG_MULTIREF; + continue; + case 'i': + cflags |= REG_ICASE; + continue; + case 'j': + cflags |= REG_SPAN; + continue; + case 'k': + cflags |= REG_ESCAPE; + continue; + case 'l': + cflags |= REG_LEFT; + continue; + case 'm': + cflags |= REG_MINIMAL; + continue; + case 'n': + cflags |= REG_NEWLINE; + continue; + case 'o': + cflags |= REG_SHELL_GROUP; + continue; + case 'p': + cflags |= REG_SHELL_PATH; + continue; + case 'q': + cflags |= REG_DELIMITED; + continue; + case 'r': + cflags |= REG_RIGHT; + continue; + case 's': + cflags |= REG_SHELL_ESCAPED; + continue; + case 't': + cflags |= REG_MUSTDELIM; + continue; + case 'u': + test |= TEST_UNSPECIFIED; + continue; + case 'w': + cflags |= REG_NOSUB; + continue; + case 'x': + if (REG_LENIENT) + cflags |= REG_LENIENT; + else + test |= TEST_LENIENT; + continue; + case 'y': + 
eflags |= REG_LEFT; + continue; + case 'z': + cflags |= REG_NULL; + continue; + + case '$': + test |= TEST_EXPAND; + continue; + + case '/': + test |= TEST_SUB; + continue; + + case '?': + test |= TEST_VERIFY; + test &= ~(TEST_AND|TEST_OR); + state.verify = state.passed; + continue; + case '&': + test |= TEST_VERIFY|TEST_AND; + test &= ~TEST_OR; + continue; + case '|': + test |= TEST_VERIFY|TEST_OR; + test &= ~TEST_AND; + continue; + case ';': + test |= TEST_OR; + test &= ~TEST_AND; + continue; + + case '{': + level <<= 1; + if (skip & (level >> 1)) + { + skip |= level; + cflags = NOTEST; + } + else + { + skip &= ~level; + test |= TEST_QUERY; + } + continue; + case '}': + if (level == 1) + bad("invalid {...} nesting\n", NiL, NiL, 0, 0); + if ((skip & level) && !(skip & (level>>1))) + { + if (!(test & (TEST_BASELINE|TEST_SUMMARY))) + { + if (test & (TEST_ACTUAL|TEST_FAIL)) + printf("}\n"); + else if (!(test & TEST_PASS)) + printf("-%d\n", state.lineno); + } + } +#if defined(LC_COLLATE) && defined(LC_CTYPE) + else if (locale & level) + { + locale = 0; + if (!(skip & level)) + { + s = "C"; + setlocale(LC_COLLATE, s); + setlocale(LC_CTYPE, s); + if (!(test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_SUMMARY))) + printf("NOTE \"%s\" locale\n", s); + else if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_PASS)) + printf("}\n"); + } + else if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL)) + printf("}\n"); + } +#endif + level >>= 1; + cflags = NOTEST; + continue; + + default: + bad("bad spec\n", spec, NiL, 0, test); + break; + + } + break; + } + if ((cflags|eflags) == NOTEST || (skip & level) && (test & TEST_BASELINE)) + { + if (test & TEST_BASELINE) + { + while (i > 1) + *delim[--i] = '\t'; + printf("%s\n", line); + } + continue; + } + if (test & TEST_OR) + { + if (!(test & TEST_VERIFY)) + { + test &= ~TEST_OR; + if (state.passed == state.verify && i > 1) + printf("NOTE\t%s\n", field[1]); + continue; + } + else if (state.passed > state.verify) + continue; + } + else if 
(test & TEST_AND) + { + if (state.passed == state.verify) + continue; + state.passed = state.verify; + } + if (i < 4) + bad("too few fields\n", NiL, NiL, 0, test); + while (i < elementsof(field)) + field[i++] = 0; + if (re = field[1]) + { + if (streq(re, "SAME")) + { + re = ppat; + test |= TEST_SAME; + } + else + { + if (test & TEST_EXPAND) + escape(re); + strcpy(ppat = pat, re); + } + } + else + ppat = 0; + nstr = -1; + if ((s = field[2]) && (test & TEST_EXPAND)) + { + nstr = escape(s); +#if _REG_nexec + if (nstr != strlen(s)) + nexec = nstr; +#endif + } + if (!(ans = field[3])) + bad("NIL answer\n", NiL, NiL, 0, test); + msg = field[4]; + fflush(stdout); + if (test & TEST_SUB) +#if _REG_subcomp + cflags |= REG_DELIMITED; +#else + continue; +#endif + + compile: + + if (state.extracted || (skip & level)) + continue; +#if !(REG_TEST_DEFAULT & (REG_AUGMENTED|REG_EXTENDED|REG_SHELL)) +#ifdef REG_EXTENDED + if (REG_EXTENDED != 0 && (test & TEST_BRE)) +#else + if (test & TEST_BRE) +#endif + { + test &= ~TEST_BRE; + flags = cflags; + state.which = "BRE"; + } + else +#endif +#ifdef REG_EXTENDED + if (test & TEST_ERE) + { + test &= ~TEST_ERE; + flags = cflags | REG_EXTENDED; + state.which = "ERE"; + } + else +#endif +#ifdef REG_AUGMENTED + if (test & TEST_ARE) + { + test &= ~TEST_ARE; + flags = cflags | REG_AUGMENTED; + state.which = "ARE"; + } + else +#endif +#ifdef REG_LITERAL + if (test & TEST_LRE) + { + test &= ~TEST_LRE; + flags = cflags | REG_LITERAL; + state.which = "LRE"; + } + else +#endif +#ifdef REG_SHELL + if (test & TEST_SRE) + { + test &= ~TEST_SRE; + flags = cflags | REG_SHELL; + state.which = "SRE"; + } + else +#ifdef REG_AUGMENTED + if (test & TEST_KRE) + { + test &= ~TEST_KRE; + flags = cflags | REG_SHELL | REG_AUGMENTED; + state.which = "KRE"; + } + else +#endif +#endif + { + if (test & (TEST_BASELINE|TEST_PASS|TEST_VERIFY)) + extract(tabs, line, re, s, ans, msg, NiL, NiL, 0, 0, skip, level, test|TEST_OK); + continue; + } + if ((test & 
(TEST_QUERY|TEST_VERBOSE|TEST_VERIFY)) == TEST_VERBOSE) + { + printf("test %-3d %s ", state.lineno, state.which); + quote(re, -1, test|TEST_DELIMIT); + printf(" "); + quote(s, nstr, test|TEST_DELIMIT); + printf("\n"); + } + + nosub: + fun = "regcomp"; +#if _REG_nexec + if (nstr >= 0 && nstr != strlen(s)) + nexec = nstr; + + else +#endif + nexec = -1; + if (state.extracted || (skip & level)) + continue; + if (!(test & TEST_QUERY)) + testno++; +#ifdef REG_DISCIPLINE + if (state.stack) + stkset(stkstd, state.stack, 0); + flags |= REG_DISCIPLINE; + state.disc.ordinal = 0; + sfstrseek(state.disc.sp, 0, SEEK_SET); +#endif + if (!(test & TEST_CATCH)) + cret = regcomp(&preg, re, flags); + else if (!(cret = setjmp(state.gotcha))) + { + alarm(HUNG); + cret = regcomp(&preg, re, flags); + alarm(0); + } +#if _REG_subcomp + if (!cret && (test & TEST_SUB)) + { + fun = "regsubcomp"; + p = re + preg.re_npat; + if (!(test & TEST_CATCH)) + cret = regsubcomp(&preg, p, NiL, 0, 0); + else if (!(cret = setjmp(state.gotcha))) + { + alarm(HUNG); + cret = regsubcomp(&preg, p, NiL, 0, 0); + alarm(0); + } + if (!cret && *(p += preg.re_npat) && !(preg.re_sub->re_flags & REG_SUB_LAST)) + { + if (catchfree(&preg, flags, tabs, line, re, s, ans, msg, NiL, NiL, 0, 0, skip, level, test)) + continue; + cret = REG_EFLAGS; + } + } +#endif + if (!cret) + { + if (!(flags & REG_NOSUB) && nsub < 0 && *ans == '(') + { + for (p = ans; *p; p++) + if (*p == '(') + nsub++; + else if (*p == '{') + nsub--; + if (nsub >= 0) + { + if (test & TEST_IGNORE_OVER) + { + if (nmatch > nsub) + nmatch = nsub + 1; + } + else if (nsub != preg.re_nsub) + { + if (nsub > preg.re_nsub) + { + if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) + skip = extract(tabs, line, re, s, ans, msg, "OK", NiL, 0, 0, skip, level, test|TEST_DELIMIT); + else + { + report("re_nsub incorrect", fun, re, NiL, -1, msg, flags, test); + printf("at least %d expected, %d returned\n", nsub, preg.re_nsub); + 
state.errors++; + } + } + else + nsub = preg.re_nsub; + } + } + } + if (!(test & TEST_SUB) && *ans && *ans != '(' && !streq(ans, "OK") && !streq(ans, "NOMATCH")) + { + if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) + skip = extract(tabs, line, re, s, ans, msg, "OK", NiL, 0, 0, skip, level, test|TEST_DELIMIT); + else if (!(test & TEST_LENIENT)) + { + report("failed", fun, re, NiL, -1, msg, flags, test); + printf("%s expected, OK returned\n", ans); + } + catchfree(&preg, flags, tabs, line, re, s, ans, msg, NiL, NiL, 0, 0, skip, level, test); + continue; + } + } + else + { + if (test & TEST_LENIENT) + /* we'll let it go this time */; + else if (!*ans || ans[0]=='(' || cret == REG_BADPAT && streq(ans, "NOMATCH")) + { + got = 0; + for (i = 1; i < elementsof(codes); i++) + if (cret==codes[i].code) + got = i; + if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) + skip = extract(tabs, line, re, s, ans, msg, codes[got].name, NiL, 0, 0, skip, level, test|TEST_DELIMIT); + else + { + report("failed", fun, re, NiL, -1, msg, flags, test); + printf("%s returned: ", codes[got].name); + error(&preg, cret); + } + } + else + { + expected = got = 0; + for (i = 1; i < elementsof(codes); i++) + { + if (streq(ans, codes[i].name)) + expected = i; + if (cret==codes[i].code) + got = i; + } + if (!expected) + { + if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) + skip = extract(tabs, line, re, s, ans, msg, codes[got].name, NiL, 0, 0, skip, level, test|TEST_DELIMIT); + else + { + report("failed: invalid error code", NiL, re, NiL, -1, msg, flags, test); + printf("%s expected, %s returned\n", ans, codes[got].name); + } + } + else if (cret != codes[expected].code && cret != REG_BADPAT) + { + if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) + skip = extract(tabs, line, re, s, ans, msg, codes[got].name, NiL, 0, 
0, skip, level, test|TEST_DELIMIT); + else if (test & TEST_IGNORE_ERROR) + state.ignored++; + else + { + report("should fail and did", fun, re, NiL, -1, msg, flags, test); + printf("%s expected, %s returned: ", ans, codes[got].name); + state.errors--; + state.warnings++; + error(&preg, cret); + } + } + } + goto compile; + } + +#if _REG_nexec + execute: + if (nexec >= 0) + fun = "regnexec"; + else +#endif + fun = "regexec"; + + for (i = 0; i < elementsof(match); i++) + match[i] = state.NOMATCH; + +#if _REG_nexec + if (nexec >= 0) + { + eret = regnexec(&preg, s, nexec, nmatch, match, eflags); + s[nexec] = 0; + } + else +#endif + { + if (!(test & TEST_CATCH)) + eret = regexec(&preg, s, nmatch, match, eflags); + else if (!(eret = setjmp(state.gotcha))) + { + alarm(HUNG); + eret = regexec(&preg, s, nmatch, match, eflags); + alarm(0); + } + } +#if _REG_subcomp + if ((test & TEST_SUB) && !eret) + { + fun = "regsubexec"; + if (!(test & TEST_CATCH)) + eret = regsubexec(&preg, s, nmatch, match); + else if (!(eret = setjmp(state.gotcha))) + { + alarm(HUNG); + eret = regsubexec(&preg, s, nmatch, match); + alarm(0); + } + } +#endif + if (flags & REG_NOSUB) + { + if (eret) + { + if (eret != REG_NOMATCH || !streq(ans, "NOMATCH")) + { + if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) + skip = extract(tabs, line, re, s, ans, msg, "NOMATCH", NiL, 0, 0, skip, level, test|TEST_DELIMIT); + else + { + report("REG_NOSUB failed", fun, re, s, nstr, msg, flags, test); + error(&preg, eret); + } + } + } + else if (streq(ans, "NOMATCH")) + { + if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) + skip = extract(tabs, line, re, s, ans, msg, NiL, match, nmatch, nsub, skip, level, test|TEST_DELIMIT); + else + { + report("should fail and didn't", fun, re, s, nstr, msg, flags, test); + error(&preg, eret); + } + } + } + else if (eret) + { + if (eret != REG_NOMATCH || !streq(ans, "NOMATCH")) + { + if (test & 
(TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) + skip = extract(tabs, line, re, s, ans, msg, "NOMATCH", NiL, 0, nsub, skip, level, test|TEST_DELIMIT); + else + { + report("failed", fun, re, s, nstr, msg, flags, test); + if (eret != REG_NOMATCH) + error(&preg, eret); + else if (*ans) + printf("expected: %s\n", ans); + else + printf("\n"); + } + } + } + else if (streq(ans, "NOMATCH")) + { + if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) + skip = extract(tabs, line, re, s, ans, msg, NiL, match, nmatch, nsub, skip, level, test|TEST_DELIMIT); + else + { + report("should fail and didn't", fun, re, s, nstr, msg, flags, test); + matchprint(match, nmatch, nsub, NiL, test); + } + } +#if _REG_subcomp + else if (test & TEST_SUB) + { + p = preg.re_sub->re_buf; + if (strcmp(p, ans)) + { + report("failed", fun, re, s, nstr, msg, flags, test); + quote(ans, -1, test|TEST_DELIMIT); + printf(" expected, "); + quote(p, -1, test|TEST_DELIMIT); + printf(" returned\n"); + } + } +#endif + else if (!*ans) + { + if (match[0].rm_so != state.NOMATCH.rm_so) + { + if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) + skip = extract(tabs, line, re, s, ans, msg, NiL, NiL, 0, 0, skip, level, test); + else + { + report("failed: no match but match array assigned", NiL, re, s, nstr, msg, flags, test); + matchprint(match, nmatch, nsub, NiL, test); + } + } + } + else if (matchcheck(match, nmatch, nsub, ans, re, s, nstr, flags, test)) + { +#if _REG_nexec + if (nexec < 0 && !nonexec) + { + nexec = nstr >= 0 ? 
nstr : strlen(s); + s[nexec] = '\n'; + testno++; + goto execute; + } +#endif + if (!(test & (TEST_SUB|TEST_VERIFY)) && !nonosub) + { + if (catchfree(&preg, flags, tabs, line, re, s, ans, msg, NiL, NiL, 0, 0, skip, level, test)) + continue; + flags |= REG_NOSUB; + goto nosub; + } + if (test & (TEST_BASELINE|TEST_PASS|TEST_VERIFY)) + skip = extract(tabs, line, re, s, ans, msg, NiL, match, nmatch, nsub, skip, level, test|TEST_OK); + } + else if (test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS|TEST_QUERY|TEST_SUMMARY|TEST_VERIFY)) + skip = extract(tabs, line, re, s, ans, msg, NiL, match, nmatch, nsub, skip, level, test|TEST_DELIMIT); + if (catchfree(&preg, flags, tabs, line, re, s, ans, msg, NiL, NiL, 0, 0, skip, level, test)) + continue; + goto compile; + } + if (test & TEST_SUMMARY) + printf("tests=%-4d errors=%-4d warnings=%-2d ignored=%-2d unspecified=%-2d signals=%d\n", testno, state.errors, state.warnings, state.ignored, state.unspecified, state.signals); + else if (!(test & (TEST_ACTUAL|TEST_BASELINE|TEST_FAIL|TEST_PASS))) + { + printf("TEST\t%s", unit); + if (subunit) + printf(" %-.*s", subunitlen, subunit); + printf(", %d test%s", testno, testno == 1 ? "" : "s"); + if (state.ignored) + printf(", %d ignored mismatche%s", state.ignored, state.ignored == 1 ? "" : "s"); + if (state.warnings) + printf(", %d warning%s", state.warnings, state.warnings == 1 ? "" : "s"); + if (state.unspecified) + printf(", %d unspecified difference%s", state.unspecified, state.unspecified == 1 ? "" : "s"); + if (state.signals) + printf(", %d signal%s", state.signals, state.signals == 1 ? "" : "s"); + printf(", %d error%s\n", state.errors, state.errors == 1 ? "" : "s"); + } + if (fp != stdin) + fclose(fp); + } + return 0; +} diff --git a/testregex.html b/testregex.html new file mode 100644 index 0000000..da80180 --- /dev/null +++ b/testregex.html @@ -0,0 +1,241 @@ + + + + + + ../re/testregex.mm mm document + + + + +
+ + + + + + + + +
Reference Implementations | Test Data Repository | Usage | Reference Implementation Notes | testregex Notes
+
+

+


+
+

AT&T Research regex(3) regression tests

+
Glenn Fowler <gsf@research.att.com> +

AT&T Research - Florham Park NJ +

+


+ testregex.c 2004-05-31 +is the latest source for the AT&T Research regression test +harness for the + X/Open regex +pattern match interface. +See +testregex(1) +for option and test input details. +The source and test data posted here are license free. +

+testregex +can: +

    +
  • +verify stability for a particular implementation in the face of +source code and/or compilation environment changes +
  • +verify standard compliance for all implementations +
  • +provide a basis for discussions on what +compliance +means +
+

+See + An Interpretation of the POSIX regex Standards +for an analysis of the POSIX-X/Open +regex +standards. +

+


Reference Implementations

+testregex +is currently built against these reference implementations: +

+ + + + + + + + + + + + + + + + +
NAME    LABEL    AUTHORS
+AT&T ast    A    Glenn Fowler and Doug McIlroy
+bsd    B     
+Bell Labs    D    Doug McIlroy
+old gnu    G     
+gnu    H    Isamu Hasegawa
+irix    I     
+boost    J    John Maddock
+regex++    M    John Maddock
+pcre perl compatible    P    Philip Hazel
+rx    R    Tom Lord
+spencer    S    Henry Spencer
+libtre    T    Ville Laurikari
+unix caldera    U     
+

+


Test Data Repository

+

+ + + + + + + + + +
+basic.dat      basic regex(3) -- all implementations should pass these
+categorize.dat      implementation categorization
+nullsubexpr.dat      null (...)* tests
+leftassoc.dat      left associative catenation implementation must pass these
+rightassoc.dat      right associative catenation implementation must pass these
+forcedassoc.dat      subexpression grouping to force associativity
+repetition.dat      explicit vs. implicit repetitions
+

+


Usage

+To run the +basic.dat +tests: +
+
+testregex < basic.dat
+
+ +

+If the local implementation hangs or dumps core on some tests, then run with +the -c option. +The -h option lists the test data format details. +The test data files exercise all features; +the test harness detects and ignores features not +supported by the local implementation. +

+


Reference Implementation Notes

+

+

D: diet libc

+The + diet libc +implementation is currently omitted because it fails all but one +basic.dat +test. +

+

P: PCRE

+The +P +implementation emulates +perl(1) +and is not X/Open compliant by design. +The main differences are: +
    +
  • +P +uses leftmost-first +matching as opposed to the X/Open +leftmost-longest. +
  • +REG_EXTENDED +patterns only. +
+

+However, the +P +package regression tests, and +perl(1) +features creeping into other implementations, +make it reasonable to include here. +

+


testregex Notes

+Extensions to the standard terminology are derived from the AT&T +implementation, unified under +<regex.h> +with these modes: +

+ + + + + + + + + +
MODE    FLAGS    DESCRIPTION
+BRE    0    basic RE
+ERE    REG_EXTENDED    egrep RE with perl (...) extensions
+ARE    REG_AUGMENTED    ERE with ! negation, <> word boundaries
+SRE    REG_SHELL    sh patterns
+KRE    REG_SHELL|REG_AUGMENTED    ksh93 patterns: ! @ ( | & ) { }
+LRE    REG_LITERAL    fgrep patterns
+

+and a few flags to handle +fnmatch(3): +

+ + + + + + +
regex FLAG    fnmatch FLAG
+REG_SHELL_ESCAPED    FNM_NOESCAPE
+REG_SHELL_PATH    FNM_PATHNAME
+REG_SHELL_DOT    FNM_PERIOD
+

+The original +testregex.c +was written by Doug McIlroy at Bell Labs. +The current implementation is maintained by Glenn Fowler <gsf@research.att.com>. +

+


+ + + + + + + + + + + + + + + + + + + + + + + + + + +
Glenn Fowler
Information and Software Systems Research
AT&T Labs Research
Florham Park NJ
March 22, 2011
+

+ +

+ + +