Gregory Burd 2020-08-14 21:45:38 -04:00
commit 786fb2e904
15 changed files with 4388 additions and 0 deletions

216
basic.dat Normal file
View file

@ -0,0 +1,216 @@
NOTE all standard compliant implementations should pass these : 2002-05-31
BE abracadabra$ abracadabracadabra (7,18)
BE a...b abababbb (2,7)
BE XXXXXX ..XXXXXX (2,8)
E \) () (1,2)
BE a] a]a (0,2)
B } } (0,1)
E \} } (0,1)
BE \] ] (0,1)
B ] ] (0,1)
E ] ] (0,1)
B { { (0,1)
B } } (0,1)
BE ^a ax (0,1)
BE \^a a^a (1,3)
BE a\^ a^ (0,2)
BE a$ aa (1,2)
BE a\$ a$ (0,2)
BE ^$ NULL (0,0)
E $^ NULL (0,0)
E a($) aa (1,2)(2,2)
E a*(^a) aa (0,1)(0,1)
E (..)*(...)* a (0,0)
E (..)*(...)* abcd (0,4)(2,4)
E (ab|a)(bc|c) abc (0,3)(0,2)(2,3)
E (ab)c|abc abc (0,3)(0,2)
E a{0}b ab (1,2)
E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
E a{9876543210} NULL BADBR
E ((a|a)|a) a (0,1)(0,1)(0,1)
E (a*)(a|aa) aaaa (0,4)(0,3)(3,4)
E a*(a.|aa) aaaa (0,4)(2,4)
E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2)
E (a|b)?.* b (0,1)(0,1)
E (a|b)c|a(b|c) ac (0,2)(0,1)
E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2)
E (a|b)*c|(a|ab)*c abc (0,3)(1,2)
E (a|b)*c|(a|ab)*c xc (1,2)
E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2)
E a?(ab|ba)ab abab (0,4)(0,2)
E a?(ac{0}b|ba)ab abab (0,4)(0,2)
E ab|abab abbabab (0,2)
E aba|bab|bba baaabbbaba (5,8)
E aba|bab baaabbbaba (6,9)
E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2)
E (a.|.a.)*|(a|.a...) aa (0,2)(0,2)
E ab|a xabc (1,3)
E ab|a xxabc (2,4)
Ei (Ab|cD)* aBcD (0,4)(2,4)
BE [^-] --a (2,3)
BE [a-]* --a (0,3)
BE [a-m-]* --amoma-- (0,4)
E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17)
E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17)
{E [[:upper:]] A (0,1) [[<element>]] not supported
E [[:lower:]]+ `az{ (1,3)
E [[:upper:]]+ @AZ[ (1,3)
BE [[-]] [[-]] (2,4)
BE [[.NIL.]] NULL ECOLLATE
BE [[=aleph=]] NULL ECOLLATE
}
BE$ \n \n (0,1)
BEn$ \n \n (0,1)
BE$ [^a] \n (0,1)
BE$ \na \na (0,2)
E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3)
BE xxx xxx (0,3)
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6)
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3)
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11)
E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1)
E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2)
E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81)
E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25)
E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22)
E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11)
BE$ .* \x01\xff (0,2)
E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57)
L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH
E a*a*a*a*a*b aaaaaaaaab (0,10)
BE ^ NULL (0,0)
BE $ NULL (0,0)
BE ^$ NULL (0,0)
BE ^a$ a (0,1)
BE abc abc (0,3)
BE abc xabcy (1,4)
BE abc ababc (2,5)
BE ab*c abc (0,3)
BE ab*bc abc (0,3)
BE ab*bc abbc (0,4)
BE ab*bc abbbbc (0,6)
E ab+bc abbc (0,4)
E ab+bc abbbbc (0,6)
E ab?bc abbc (0,4)
E ab?bc abc (0,3)
E ab?c abc (0,3)
BE ^abc$ abc (0,3)
BE ^abc abcc (0,3)
BE abc$ aabc (1,4)
BE ^ abc (0,0)
BE $ abc (3,3)
BE a.c abc (0,3)
BE a.c axc (0,3)
BE a.*c axyzc (0,5)
BE a[bc]d abd (0,3)
BE a[b-d]e ace (0,3)
BE a[b-d] aac (1,3)
BE a[-b] a- (0,2)
BE a[b-] a- (0,2)
BE a] a] (0,2)
BE a[]]b a]b (0,3)
BE a[^bc]d aed (0,3)
BE a[^-b]c adc (0,3)
BE a[^]b]c adc (0,3)
E ab|cd abc (0,2)
E ab|cd abcd (0,2)
E a\(b a(b (0,3)
E a\(*b ab (0,2)
E a\(*b a((b (0,4)
E ((a)) abc (0,1)(0,1)(0,1)
E (a)b(c) abc (0,3)(0,1)(2,3)
E a+b+c aabbabc (4,7)
E a* aaa (0,3)
E (a*)* - (0,0)(0,0)
E (a*)+ - (0,0)(0,0)
E (a*|b)* - (0,0)(0,0)
E (a+|b)* ab (0,2)(1,2)
E (a+|b)+ ab (0,2)(1,2)
E (a+|b)? ab (0,1)(0,1)
BE [^ab]* cde (0,3)
E (^)* - (0,0)(0,0)
BE a* NULL (0,0)
E ([abc])*d abbbcd (0,6)(4,5)
E ([abc])*bcd abcd (0,4)(0,1)
E a|b|c|d|e e (0,1)
E (a|b|c|d|e)f ef (0,2)(0,1)
E ((a*|b))* - (0,0)(0,0)(0,0)
BE abcd*efg abcdefg (0,7)
BE ab* xabyabbbz (1,3)
BE ab* xayabbbz (1,2)
E (ab|cd)e abcde (2,5)(2,4)
BE [abhgefdc]ij hij (0,3)
E (a|b)c*d abcd (1,4)(1,2)
E (ab|ab*)bc abc (0,3)(0,1)
E a([bc]*)c* abc (0,3)(1,3)
E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4)
E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4)
E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4)
E a[bcd]*dcdcde adcdcde (0,7)
E (ab|a)b*c abc (0,3)(0,2)
E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4)
BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5)
E ^a(bc+|b[eh])g|.h$ abh (1,3)
E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5)
E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2)
E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6)
E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)
BE multiple words multiple words yeah (0,14)
E (.*)c(.*) abcde (0,5)(0,2)(3,5)
BE abcd abcd (0,4)
E a(bc)d abcd (0,4)(1,3)
E a[-]?c ac (0,3)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12)
E a+(b|c)*d+ aabcdd (0,6)(3,4)
E ^.+$ vivi (0,4)
E ^(.+)$ vivi (0,4)(0,4)
E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19)
E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3)
E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7)
E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7)
E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11)
E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3)
E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7)
E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3)
E ((foo)|bar)!bas bar!bas (0,7)(0,3)
E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7)
E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3)
E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3)
E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7)
E (foo|(bar))!bas foo!bas (0,7)(0,3)
E (foo|bar)!bas bar!bas (0,7)(0,3)
E (foo|bar)!bas foo!bar!bas (4,11)(4,7)
E (foo|bar)!bas foo!bas (0,7)(0,3)
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3)
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7)
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11)
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7)
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3)
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7)
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7)
E .*(/XXX).* /XXX (0,4)(0,4)
E .*(\\XXX).* \XXX (0,4)(0,4)
E \\XXX \XXX (0,4)
E .*(/000).* /000 (0,4)(0,4)
E .*(\\000).* \000 (0,4)(0,4)
E \\000 \000 (0,4)

62
categorize.dat Normal file
View file

@ -0,0 +1,62 @@
NOTE regex implementation categorization 2004-05-31
?E aa* xaxaax (1,2) POSITION=leftmost
; POSITION=bug
?E (a*)(ab)*(b*) abc (0,2)(0,1)(?,?)(1,2) ASSOCIATIVITY=right
|E (a*)(ab)*(b*) abc (0,2)(0,0)(0,2)(2,2) ASSOCIATIVITY=left
; ASSOCIATIVITY=bug
?E ((a*)(ab)*)((b*)(a*)) aba (0,3)(0,2)(0,0)(0,2)(2,3)(2,2)(2,3) SUBEXPRESSION=precedence
|E ((a*)(ab)*)((b*)(a*)) aba (0,3)(0,1)(0,1)(?,?)(1,3)(1,2)(2,3) SUBEXPRESSION=grouping
; SUBEXPRESSION=bug
?E (...?.?)* xxxxxx (0,6)(4,6) REPEAT_LONGEST=first
|E (...?.?)* xxxxxx (0,6)(2,6) REPEAT_LONGEST=last
|E (...?.?)* xxxxxx OK REPEAT_LONGEST=unknown
; REPEAT_LONGEST=bug
?E (a|ab)(bc|c) abcabc (0,3)(0,2)(2,3) EXPECTED
|E (a|ab)(bc|c) abcabc (0,3)(0,1)(1,3) BUG=alternation-order
; BUG=alternation-order-UNKNOWN
?E (aba|a*b)(aba|a*b) ababa (0,5)(0,2)(2,5) EXPECTED
|E (aba|a*b)(aba|a*b) ababa (0,4)(0,3)(3,4) BUG=first-match
; BUG=unknown-match
?B a\(b\)*\1 a NOMATCH EXPECTED
|B a\(b\)*\1 a (0,1) BUG=nomatch-match
|B a\(b\)*\1 abab (0,2)(1,2) # BUG=repeat-any
; BUG=nomatch-match-UNKNOWN
?E (a*){2} xxxxx (0,0)(0,0) EXPECTED
|E (a*){2} xxxxx (5,5)(5,5) BUG=range-null
; BUG=range-null-UNKNOWN
?B a\(b\)*\1 abab NOMATCH EXPECTED
|B a\(b\)*\1 abab (0,1) # BUG=nomatch-match
|B a\(b\)*\1 abab (0,2)(1,2) BUG=repeat-any
; BUG=repeat-any-UNKNOWN
?E (a*)* a (0,1)(0,1) EXPECTED
|E (a*)* ax (0,1)(0,1) BUG=repeat-null-unknown
|E (a*)* a (0,1)(1,1) BUG=repeat-null
; BUG=repeat-null-UNKNOWN
?E (aba|a*b)* ababa (0,5)(2,5) EXPECTED
|E (aba|a*b)* ababa (0,5)(3,4) BUG=repeat-short
|E (aba|a*b)* ababa (0,4)(3,4) # LENGTH=first
; BUG=repeat-short-UNKNOWN
?E (a(b)?)+ aba (0,3)(2,3) EXPECTED
|E (a(b)?)+ aba (0,3)(2,3)(1,2) BUG=repeat-artifact
; BUG=repeat-artifact-UNKNOWN
?B \(a\(b\)*\)*\2 abab NOMATCH EXPECTED
|B \(a\(b\)*\)*\2 abab (0,4)(2,3)(1,2) BUG=repeat-artifact-nomatch
; BUG=repeat-artifact-nomatch-UNKNOWN
?E (a?)((ab)?)(b?)a?(ab)?b? abab (0,4)(0,1)(1,1)(?,?)(1,2)(2,4) BUG=subexpression-first
|E .*(.*) ab (0,2)(2,2) EXPECTED
|E .*(.*) ab (0,2)(0,2) BUG=subexpression-first
; BUG=subexpression-first-UNKNOWN

30
forcedassoc.dat Normal file
View file

@ -0,0 +1,30 @@
NOTE left-assoc:pass-all right-assoc:pass-all : 2002-04-29
E (a|ab)(c|bcd) abcd (0,4)(0,1)(1,4)
E (a|ab)(bcd|c) abcd (0,4)(0,1)(1,4)
E (ab|a)(c|bcd) abcd (0,4)(0,1)(1,4)
E (ab|a)(bcd|c) abcd (0,4)(0,1)(1,4)
E ((a|ab)(c|bcd))(d*) abcd (0,4)(0,4)(0,1)(1,4)(4,4)
E ((a|ab)(bcd|c))(d*) abcd (0,4)(0,4)(0,1)(1,4)(4,4)
E ((ab|a)(c|bcd))(d*) abcd (0,4)(0,4)(0,1)(1,4)(4,4)
E ((ab|a)(bcd|c))(d*) abcd (0,4)(0,4)(0,1)(1,4)(4,4)
E (a|ab)((c|bcd)(d*)) abcd (0,4)(0,2)(2,4)(2,3)(3,4)
E (a|ab)((bcd|c)(d*)) abcd (0,4)(0,2)(2,4)(2,3)(3,4)
E (ab|a)((c|bcd)(d*)) abcd (0,4)(0,2)(2,4)(2,3)(3,4)
E (ab|a)((bcd|c)(d*)) abcd (0,4)(0,2)(2,4)(2,3)(3,4)
E (a*)(b|abc) abc (0,3)(0,0)(0,3)
E (a*)(abc|b) abc (0,3)(0,0)(0,3)
E ((a*)(b|abc))(c*) abc (0,3)(0,3)(0,0)(0,3)(3,3)
E ((a*)(abc|b))(c*) abc (0,3)(0,3)(0,0)(0,3)(3,3)
E (a*)((b|abc)(c*)) abc (0,3)(0,1)(1,3)(1,2)(2,3)
E (a*)((abc|b)(c*)) abc (0,3)(0,1)(1,3)(1,2)(2,3)
E (a*)(b|abc) abc (0,3)(0,0)(0,3)
E (a*)(abc|b) abc (0,3)(0,0)(0,3)
E ((a*)(b|abc))(c*) abc (0,3)(0,3)(0,0)(0,3)(3,3)
E ((a*)(abc|b))(c*) abc (0,3)(0,3)(0,0)(0,3)(3,3)
E (a*)((b|abc)(c*)) abc (0,3)(0,1)(1,3)(1,2)(2,3)
E (a*)((abc|b)(c*)) abc (0,3)(0,1)(1,3)(1,2)(2,3)
E (a|ab) ab (0,2)(0,2)
E (ab|a) ab (0,2)(0,2)
E (a|ab)(b*) ab (0,2)(0,2)(2,2)
E (ab|a)(b*) ab (0,2)(0,2)(2,2)

16
leftassoc.dat Normal file
View file

@ -0,0 +1,16 @@
NOTE left-assoc:pass-all right-assoc:pass-none : 2002-04-29
E (a|ab)(c|bcd)(d*) abcd (0,4)(0,1)(1,4)(4,4)
E (a|ab)(bcd|c)(d*) abcd (0,4)(0,1)(1,4)(4,4)
E (ab|a)(c|bcd)(d*) abcd (0,4)(0,1)(1,4)(4,4)
E (ab|a)(bcd|c)(d*) abcd (0,4)(0,1)(1,4)(4,4)
E (a*)(b|abc)(c*) abc (0,3)(0,0)(0,3)(3,3)
E (a*)(abc|b)(c*) abc (0,3)(0,0)(0,3)(3,3)
E (a*)(b|abc)(c*) abc (0,3)(0,0)(0,3)(3,3)
E (a*)(abc|b)(c*) abc (0,3)(0,0)(0,3)(3,3)
E (a|ab)(c|bcd)(d|.*) abcd (0,4)(0,1)(1,4)(4,4)
E (a|ab)(bcd|c)(d|.*) abcd (0,4)(0,1)(1,4)(4,4)
E (ab|a)(c|bcd)(d|.*) abcd (0,4)(0,1)(1,4)(4,4)
E (ab|a)(bcd|c)(d|.*) abcd (0,4)(0,1)(1,4)(4,4)

142
man/man1/testregex.html Normal file
View file

@ -0,0 +1,142 @@
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
<HTML>
<HEAD>
<TITLE>testregex man document</TITLE>
</HEAD>
<BODY bgcolor=white>
<PRE>
NAME
testregex - regex(3) test harness
SYNOPSIS
testregex [ options ]
DESCRIPTION
testregex reads regex(3) test specifications, one per line, from the
standard input and writes one output line for each failed test. A
summary line is written after all tests are done. Each successful
test is run again with REG_NOSUB. Unsupported features are noted
before the first test, and tests requiring these features are
silently ignored.
OPTIONS
-c catch signals and non-terminating calls
-e ignore error return mismatches
-h list help on standard error
-n do not repeat successful tests with regnexec()
-o ignore match[] overrun errors
-p ignore negative position mismatches
-s use stack instead of malloc
-x do not repeat successful tests with REG_NOSUB
-v list each test line
-A list failed test lines with actual answers
-B list all test lines with actual answers
-F list failed test lines
-P list passed test lines
-S output one summary line
INPUT FORMAT
Input lines may be blank, a comment beginning with #, or a test
specification. A specification is five fields separated by one
or more tabs. NULL denotes the empty string and NIL denotes the
0 pointer.
Field 1: the regex(3) flags to apply, one character per REG_feature
flag. The test is skipped if REG_feature is not supported by the
implementation. If the first character is not [BEASKLP] then the
specification is a global control line. One or more of [BEASKLP] may be
specified; the test will be repeated for each mode.
B basic BRE (grep, ed, sed)
E REG_EXTENDED ERE (egrep)
A REG_AUGMENTED ARE (egrep with negation)
S REG_SHELL SRE (sh glob)
K REG_SHELL|REG_AUGMENTED KRE (ksh glob)
L REG_LITERAL LRE (fgrep)
a REG_LEFT|REG_RIGHT implicit ^...$
b REG_NOTBOL lhs does not match ^
c REG_COMMENT ignore space and #...\n
d REG_SHELL_DOT explicit leading . match
e REG_NOTEOL rhs does not match $
f REG_MULTIPLE multiple \n separated patterns
g FNM_LEADING_DIR testfnmatch only -- match until /
h REG_MULTIREF multiple digit backref
i REG_ICASE ignore case
j REG_SPAN . matches \n
k REG_ESCAPE \ to escape [...] delimiter
l REG_LEFT implicit ^...
m REG_MINIMAL minimal match
n REG_NEWLINE explicit \n match
o REG_ENCLOSED (|&) magic inside [@|&](...)
p REG_SHELL_PATH explicit / match
q REG_DELIMITED delimited pattern
r REG_RIGHT implicit ...$
s REG_SHELL_ESCAPED \ not special
t REG_MUSTDELIM all delimiters must be specified
u standard unspecified behavior -- errors not counted
v REG_CLASS_ESCAPE \ special inside [...]
w REG_NOSUB no subexpression match array
x REG_LENIENT let some errors slide
y REG_LEFT regexec() implicit ^...
z REG_NULL NULL subexpressions ok
$ expand C \c escapes in fields 2 and 3
/ field 2 is a regsubcomp() expression
= field 3 is a regdecomp() expression
Field 1 control lines:
C set LC_COLLATE and LC_CTYPE to locale in field 2
?test ... output field 5 if passed and != EXPECTED, silent otherwise
&test ... output field 5 if current and previous passed
|test ... output field 5 if current passed and previous failed
; ... output field 2 if previous failed
{test ... skip if failed until }
} end of skip
: comment comment copied as output NOTE
:comment:test :comment: ignored
N[OTE] comment comment copied as output NOTE
T[EST] comment comment
number use number for nmatch (20 by default)
Field 2: the regular expression pattern; SAME uses the pattern from
the previous specification.
Field 3: the string to match.
Field 4: the test outcome. This is either one of the posix error
codes (with REG_ omitted) or the match array, a list of (m,n)
entries with m and n being first and last+1 positions in the
field 3 string, or NULL if REG_NOSUB is in effect and success
is expected. BADPAT is acceptable in place of any regcomp(3)
error code. The match[] array is initialized to (-2,-2) before
each test. All array elements from 0 to nmatch-1 must be specified
in the outcome. Unspecified endpoints (offset -1) are denoted by ?.
Unset endpoints (offset -2) are denoted by X. {x}(o:n) denotes a
matched (?{...}) expression, where x is the text enclosed by {...},
o is the expression ordinal counting from 1, and n is the length of
the unmatched portion of the subject string. If x starts with a
number then that is the return value of re_execf(), otherwise 0 is
returned.
Field 5: optional comment appended to the report.
CAVEAT
If a regex implementation misbehaves with memory then all bets are off.
CONTRIBUTORS
Glenn Fowler gsf@research.att.com (ksh strmatch, regex extensions)
David Korn dgk@research.att.com (ksh glob matcher)
Doug McIlroy mcilroy@dartmouth.edu (ast regex/testre in C++)
Tom Lord lord@regexps.com (rx tests)
Henry Spencer henry@zoo.toronto.edu (original public regex)
Andrew Hume andrew@research.att.com (gre tests)
John Maddock John_Maddock@compuserve.com (regex++ tests)
Philip Hazel ph10@cam.ac.uk (pcre tests)
Ville Laurikari vl@iki.fi (libtre tests)
</PRE>
</BODY>
</HTML>

73
nullsubexpr.dat Normal file
View file

@ -0,0 +1,73 @@
NOTE null subexpression matches : 2002-06-06
E (a*)* a (0,1)(0,1)
E SAME x (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E (a*)+ a (0,1)(0,1)
E SAME x (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E (a+)* a (0,1)(0,1)
E SAME x (0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E (a+)+ a (0,1)(0,1)
E SAME x NOMATCH
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E ([a]*)* a (0,1)(0,1)
E SAME x (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E ([a]*)+ a (0,1)(0,1)
E SAME x (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E ([^b]*)* a (0,1)(0,1)
E SAME b (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaab (0,6)(0,6)
E ([ab]*)* a (0,1)(0,1)
E SAME aaaaaa (0,6)(0,6)
E SAME ababab (0,6)(0,6)
E SAME bababa (0,6)(0,6)
E SAME b (0,1)(0,1)
E SAME bbbbbb (0,6)(0,6)
E SAME aaaabcde (0,5)(0,5)
E ([^a]*)* b (0,1)(0,1)
E SAME bbbbbb (0,6)(0,6)
E SAME aaaaaa (0,0)(0,0)
E ([^ab]*)* ccccxx (0,6)(0,6)
E SAME ababab (0,0)(0,0)
E ((z)+|a)* zabcde (0,2)(1,2)
{E a+? aaaaaa (0,1) no *? +? mimimal match ops
E (a) aaa (0,1)(0,1)
E (a*?) aaa (0,0)(0,0)
E (a)*? aaa (0,0)
E (a*?)*? aaa (0,0)
}
B \(a*\)*\(x\) x (0,1)(0,0)(0,1)
B \(a*\)*\(x\) ax (0,2)(0,1)(1,2)
B \(a*\)*\(x\) axa (0,2)(0,1)(1,2)
B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1)
B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2)
B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3)
B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4)
B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3)
E (a*)*(x) x (0,1)(0,0)(0,1)
E (a*)*(x) ax (0,2)(0,1)(1,2)
E (a*)*(x) axa (0,2)(0,1)(1,2)
E (a*)+(x) x (0,1)(0,0)(0,1)
E (a*)+(x) ax (0,2)(0,1)(1,2)
E (a*)+(x) axa (0,2)(0,1)(1,2)
E (a*){2}(x) x (0,1)(0,0)(0,1)
E (a*){2}(x) ax (0,2)(1,1)(1,2)
E (a*){2}(x) axa (0,2)(1,1)(1,2)

64
re-assoc.html Normal file
View file

@ -0,0 +1,64 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN" "http://www.w3.org/TR/REC-html40/frameset.dtd">
<HTML>
<HEAD>
<META name="generator" content="mm2html (AT&amp;T Labs Research) 2005-10-15">
<META name="keywords" content="regex catenation associativity tests">
<TITLE> ../re/re-assoc.mm mm document </TITLE>
<META name="author" content="gsf">
</HEAD>
<BODY bgcolor=white link=slateblue vlink=teal >
<TABLE border=0 align=center width=96%>
<TBODY><TR><TD valign=top align=left>
<!--INDEX--><!--/INDEX-->
<P>
<HR>
<CENTER>
<H3><CENTER><FONT color=red><FONT face=courier>regex catenation associativity tests</FONT></FONT></CENTER></H3>
<BR>Glenn Fowler <SMALL>&lt;<A href=mailto:gsf@research.att.com>gsf@research.att.com</A>&gt;</SMALL>
<P><I>AT&amp;T Labs Research - Florham Park NJ</I>
</CENTER>
<P><HR><P>
The
<STRONG>regex</STRONG>
tests in
{
<A href="http://web.archive.org/web/20080724204655id_/http://www.research.att.com/~gsf/testregex/leftassoc.dat">leftassoc.dat</A>
<A href="http://web.archive.org/web/20080724204655id_/http://www.research.att.com/~gsf/testregex/rightassoc.dat">rightassoc.dat</A>
<A href="http://web.archive.org/web/20080724204655id_/http://www.research.att.com/~gsf/testregex/forcedassoc.dat">forcedassoc.dat</A>
}
exercise the associativity of catenation.
<P>
<HR>
<TABLE border=0 align=center width=96%>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right><A href="mailto:gsf@research.att.com?subject= ../re/re-assoc.mm mm document">Glenn Fowler</A></TD>
</TR>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right>Information and Software Systems Research</TD>
</TR>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right>AT&amp;T Labs Research</TD>
</TR>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right>Florham Park NJ</TD>
</TR>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right>August 04, 2002</TD>
</TR>
</TABLE>
<P>
</TD></TR></TBODY></TABLE>
</BODY>
</HTML>

209
re-categorize.html Normal file
View file

@ -0,0 +1,209 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN" "http://www.w3.org/TR/REC-html40/frameset.dtd">
<HTML>
<HEAD>
<META name="generator" content="mm2html (AT&amp;T Labs Research) 2005-10-15">
<META name="keywords" content="regex implementation categorization">
<TITLE> ../re/re-categorize.mm mm document </TITLE>
<META name="author" content="gsf">
</HEAD>
<BODY bgcolor=white link=slateblue vlink=teal >
<TABLE border=0 align=center width=96%>
<TBODY><TR><TD valign=top align=left>
<!--INDEX--><!--/INDEX-->
<P>
<HR>
<CENTER>
<H3><CENTER><FONT color=red><FONT face=courier>regex implementation categorization</FONT></FONT></CENTER></H3>
<BR>Glenn Fowler <SMALL>&lt;<A href=mailto:gsf@research.att.com>gsf@research.att.com</A>&gt;</SMALL>
<P><I>AT&amp;T Labs Research - Florham Park NJ</I>
</CENTER>
<P><HR><P>
The
<STRONG>regex</STRONG>
tests in
<A href="http://web.archive.org/web/20080726034626id_/http://www.research.att.com/~gsf/testregex/categorize.dat">categorize.dat</A>
attempt to categorize
<STRONG>regex</STRONG>
implementations.
The tests do not address internationalization.
All implementations report the leftmost match; this is omitted from the table.
<P></P><TABLE border=0 frame=void rules=none width=100%><TBODY><TR><TD>
<TABLE align=center bgcolor=papayawhip border=0 bordercolor=white cellpadding=2 cellspacing=2 frame=void rules=none >
<TBODY>
<TR><TD align=center>LABEL&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;ASSOC&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;SUBEXPR&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;REP_LONGEST&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;BUGS</TD></TR>
<TR><TD align=center>
A&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;right&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;precedence&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;first&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;-</TD></TR>
<TR><TD align=center>
B&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;right&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;grouping&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;first&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;repeat-null&nbsp;&nbsp;repeat-short&nbsp;&nbsp;repeat-artifact-nomatch</TD></TR>
<TR><TD align=center>
D&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;right&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;grouping&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;first&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;-</TD></TR>
<TR><TD align=center>
G&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;right&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;grouping&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;first&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;alternation-order&nbsp;&nbsp;repeat-null&nbsp;&nbsp;repeat-artifact&nbsp;&nbsp;repeat-artifact-nomatch</TD></TR>
<TR><TD align=center>
H&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;right&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;grouping&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;first&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;alternation-order&nbsp;&nbsp;nomatch-match&nbsp;&nbsp;repeat-null&nbsp;&nbsp;repeat-artifact&nbsp;&nbsp;repeat-artifact-nomatch</TD></TR>
<TR><TD align=center>
I&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;right&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;grouping&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;first&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;repeat-any&nbsp;&nbsp;repeat-short&nbsp;&nbsp;repeat-artifact-nomatch</TD></TR>
<TR><TD align=center>
J&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;right&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;precedence&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;last&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;nomatch-match&nbsp;&nbsp;repeat-artifact&nbsp;&nbsp;repeat-artifact-nomatch&nbsp;&nbsp;subexpression-first</TD></TR>
<TR><TD align=center>
M&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;right&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;precedence&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;last&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;range-null&nbsp;&nbsp;repeat-artifact&nbsp;&nbsp;repeat-artifact-nomatch&nbsp;&nbsp;subexpression-first</TD></TR>
<TR><TD align=center>
O&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;right&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;grouping&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;first&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;repeat-null&nbsp;&nbsp;repeat-short&nbsp;&nbsp;repeat-artifact-nomatch</TD></TR>
<TR><TD align=center>
P&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;right&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;grouping&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;first&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;alternation-order&nbsp;&nbsp;first-match&nbsp;&nbsp;repeat-null&nbsp;&nbsp;repeat-artifact</TD></TR>
<TR><TD align=center>
R&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;left&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;precedence&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;last&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;-</TD></TR>
<TR><TD align=center>
S&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;right&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;grouping&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;first&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;repeat-null&nbsp;&nbsp;repeat-short&nbsp;&nbsp;repeat-artifact-nomatch</TD></TR>
<TR><TD align=center>
T&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;left&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;precedence&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;last&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;-</TD></TR>
<TR><TD align=center>
U&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;right&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;precedence&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;first&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;repeat-null&nbsp;&nbsp;subexpression-first</TD></TR>
<TR><TD align=center>
darwin.ppc&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;right&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;grouping&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;first&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;repeat-null&nbsp;&nbsp;repeat-short</TD></TR>
<TR><TD align=center>
freebsd.i386&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;right&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;grouping&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;first&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;repeat-null&nbsp;&nbsp;repeat-short</TD></TR>
<TR><TD align=center>
hp.pa&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;right&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;grouping&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;first&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;repeat-artifact</TD></TR>
<TR><TD align=center>
ibm.risc&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;right&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;grouping&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;first&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;alternation-order&nbsp;&nbsp;nomatch-match&nbsp;&nbsp;repeat-artifact&nbsp;&nbsp;repeat-artifact-nomatch</TD></TR>
<TR><TD align=center>
linux.i386&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;right&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;grouping&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;first&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;alternation-order&nbsp;&nbsp;repeat-artifact&nbsp;&nbsp;repeat-null</TD></TR>
<TR><TD align=center>
sgi.mips3&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;right&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;grouping&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;first&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;repeat-short</TD></TR>
<TR><TD align=center>
sol8.sun4&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;right&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;grouping&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;first&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;alternation-order&nbsp;&nbsp;nomatch-match&nbsp;&nbsp;repeat-artifact</TD></TR>
<TR><TD align=center>
unixware.i386&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;right&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;precedence&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;first&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;repeat-null&nbsp;&nbsp;subexpression-first</TD></TR>
</TBODY></TABLE></TD></TR></TBODY></TABLE>
<P>
The categories are:
<DL COMPACT>
<DL COMPACT>
<DT><STRONG>LABEL</STRONG><DD>
The implementation label from
<A href="http://web.archive.org/web/20080726034626id_/http://www.research.att.com/~gsf/testregex/">testregex.</A>
<DT><STRONG>ASSOC</STRONG><DD>
Subpattern (or atom) associativity: either
<STRONG>left</STRONG>
or
<STRONG>right</STRONG>.
The subexpression match rule in the rationale requires
<STRONG>right</STRONG>
for expressions where each concatenated part is a subexpression.
There is no definition for
<EM>subpattern</EM>,
but it would be inconsistent for any definition to require different
associativity than that for subexpressions.
Some claim that the BRE and ERE grammars specify
<STRONG>left</STRONG>
associativity, but this interpretation disregards
the subexpression match rule in the rationale.
The grammar can also be interpreted to support
<STRONG>right</STRONG>
associativity, and this interpretation is in accord with the rationale.
<DT><STRONG>SUBEXPR</STRONG><DD>
Subexpression semantics:
<STRONG>precedence</STRONG>
if subexpressions can override the default associativity;
<STRONG>grouping</STRONG>
if subexpressions are for repetition and
<STRONG>regmatch_t</STRONG>
grouping only.
The subexpression match rule in the rationale requires
<STRONG>precedence</STRONG>.
<DT><STRONG>REP_LONGEST</STRONG><DD>
How repeated subexpressions that match more than once are handled:
<STRONG>first</STRONG>
if the longest possible matches occur first;
<STRONG>last</STRONG>
if the longest possible matches occur last;
<STRONG>unknown</STRONG>
otherwise.
The subexpression match rule in the rationale requires
<STRONG>first</STRONG>.
<DT><STRONG>BUGS</STRONG><DD>
Miscellaneous bugs (see
<A href="http://web.archive.org/web/20080726034626id_/http://www.research.att.com/~gsf/testregex/categorize.dat">categorize.dat</A>
for specific examples):
<DL COMPACT>
<DL COMPACT>
<DT><STRONG>alternation-order</STRONG><DD>
A change in the order of subexpression alternation operands,
<EM>not involved in a tie</EM>,
changes
<STRONG>regmatch_t</STRONG>
values.
Some implementations with this bug can be coaxed into missing the
overall longest match.
<DT><STRONG>first-match</STRONG><DD>
The first of the leftmost matches, instead of the longest of the
leftmost matches, is returned.
<DT><STRONG>nomatch-match</STRONG><DD>
A back-reference to a
<STRONG>regmatch_t</STRONG>
(-1,-1) value is treated as matching.
<DT><STRONG>range-null</STRONG><DD>
A range-repeated subexpression that matches null does not report the match
at offset (0,0).
<DT><STRONG>repeat-artifact</STRONG><DD>
A
<STRONG>regmatch_t</STRONG>
value is reported for a repeated match that is not the last match.
<DT><STRONG>repeat-artifact-nomatch</STRONG><DD>
To prevent not matching,
a
<STRONG>regmatch_t</STRONG>
value is reported for a repeated match that is not the last match.
<DT><STRONG>repeat-null</STRONG><DD>
A repeated subexpression matches the null string even though it is not
the only match and is not necessary to satisfy the exact or minimum
number of occurrences for an interval expression.
<DT><STRONG>repeat-short</STRONG><DD>
Incorrect
<STRONG>regmatch_t</STRONG>
values for a repeated subexpression.
This may be a variant of
<STRONG>repeat-artifact</STRONG>.
<DT><STRONG>subexpression-first</STRONG><DD>
A subexpression match takes precedence over a subpattern
to its left.
</DL>
</DL>
</DL>
</DL>
<P>
<HR>
<TABLE border=0 align=center width=96%>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right><A href="mailto:gsf@research.att.com?subject= ../re/re-categorize.mm mm document">Glenn Fowler</A></TD>
</TR>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right>Information and Software Systems Research</TD>
</TR>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right>AT&amp;T Labs Research</TD>
</TR>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right>Florham Park NJ</TD>
</TR>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right>June 01, 2004</TD>
</TR>
</TABLE>
<P>
</TD></TR></TBODY></TABLE>
</BODY>
</HTML>

997
re-interpretation.html Normal file
View file

@ -0,0 +1,997 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN" "http://www.w3.org/TR/REC-html40/frameset.dtd">
<HTML>
<HEAD>
<META name="generator" content="mm2html (AT&amp;T Research) 2010-09-10">
<META name="keywords" content="regex regular expression standard interpretation">
<TITLE> ../re/re-interpretation.mm mm document </TITLE>
<META name="author" content="gsf">
</HEAD>
<BODY bgcolor=white link=slateblue vlink=teal >
<TABLE border=0 align=center width=96%>
<TBODY><TR><TD valign=top align=left>
<!--INDEX--><!--/INDEX-->
<B><FONT size=-1 face="verdana,arial,helvetica,geneva,sans-serif">
<TABLE align=center cellpadding=2 border=4 bgcolor=lightgrey><TR>
<TD><A href="re-interpretation.html#Abstract">Abstract</A></TD>
<TD><A href="re-interpretation.html#Background">Background</A></TD>
<TD><A href="re-interpretation.html#Notation">Notation</A></TD>
<TD><A href="re-interpretation.html#regex Glossary">regex Glossary</A></TD>
<TD><A href="re-interpretation.html#A subexpression is ">A subexpression is </A></TD>
<TD><A href="re-interpretation.html#A subpattern is ">A subpattern is </A></TD>
<TD><A href="re-interpretation.html#The Dark Corners ">The Dark Corners </A></TD>
<TD><A href="re-interpretation.html#Conclusion">Conclusion</A></TD>
</TR></TABLE>
</FONT></B>
<P>
<HR>
<CENTER>
<H3><CENTER><FONT color=red><FONT face=courier>An Interpretation of the POSIX regex Standard</FONT></FONT></CENTER></H3>
<BR>Glenn Fowler <SMALL>&lt;<A href=mailto:gsf@research.att.com>gsf@research.att.com</A>&gt;</SMALL>
<P><I>AT&amp;T Research - Florham Park NJ</I>
</CENTER>
<P>
<CENTER><FONT color=red><FONT face=courier><H3 align=center><A name="Abstract">Abstract</A></H3></FONT></FONT></CENTER>
Many passages in the POSIX
<STRONG>regex</STRONG>
standard seem to be open for interpretation.
Differences between several published
<A href="http://www.research.att.com/~gsf/testregex/" target=_top>implementations</A>
of the
<STRONG>regex</STRONG>
API bear this out.
Instead of relegating these differences to the
<EM>undefined behavior</EM>
bucket, this paper proposes a resolution to each
by direct application of the standard text.
<P>
<P><HR><CENTER><FONT color=red><FONT face=courier><H3><A name="Background">Background</A></H3></FONT></FONT></CENTER>
The POSIX
<STRONG>regex</STRONG>
standard is spread across four documents:
<P></P><TABLE border=0 frame=void rules=none width=100%><TBODY><TR><TD>
<TABLE align=center bgcolor=papayawhip border=0 bordercolor=white cellpadding=2 cellspacing=2 >
<TBODY>
<TR><TD align=right>
glossary&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;G&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;<A href="http://www.opengroup.org/onlinepubs/007904975/basedefs/xbd_chap03.html" target=_top>http://www.opengroup.org/onlinepubs/007904975/basedefs/xbd_chap03.html</A></TD></TR>
<TR><TD align=right>
api&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;A&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;<A href="http://www.opengroup.org/onlinepubs/007904975/functions/regcomp.html" target=_top>http://www.opengroup.org/onlinepubs/007904975/functions/regcomp.html</A></TD></TR>
<TR><TD align=right>
definition&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;D&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;<A href="http://www.opengroup.org/onlinepubs/007904975/basedefs/xbd_chap09.html" target=_top>http://www.opengroup.org/onlinepubs/007904975/basedefs/xbd_chap09.html</A></TD></TR>
<TR><TD align=right>
rationale&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;R&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;<A href="http://www.opengroup.org/onlinepubs/007904975/xrat/xbd_chap09.html" target=_top>http://www.opengroup.org/onlinepubs/007904975/xrat/xbd_chap09.html</A></TD></TR>
</TBODY></TABLE></TD></TR></TBODY></TABLE>
<P>
It describes
<STRONG>BRE</STRONG>s
(basic regular expressions, a.k.a.,
<NOBR><A href="http://web.archive.org/~gsf/man/man1/grep.html"><STRONG>grep</STRONG></A>(1)</NOBR>
style) and
<STRONG>ERE</STRONG>s
(extended regular expressions, a.k.a.,
<NOBR><A href="http://web.archive.org/~gsf/man/man1/egrep.html"><STRONG>egrep</STRONG></A>(1)</NOBR>
style)
and how an RE of each type matches subject strings.
The standard also provides an API:
<NOBR><A href="http://web.archive.org/~gsf/man/man3/regcomp.html"><STRONG>regcomp</STRONG></A>(3)</NOBR>
for compiling an RE, and
<NOBR><A href="http://web.archive.org/~gsf/man/man3/regexec.html"><STRONG>regexec</STRONG></A>(3)</NOBR>
for matching a compiled RE against a subject string.
The
<STRONG>regexec</STRONG>
API
<DIV style="padding-left:16px;text-indent:0px">
<PRE>
int regexec(const regex_t* restrict preg, const char* restrict string,
size_t nmatch, regmatch_t pmatch&#0091;restrict&#0093;, int eflags);
</PRE>
</DIV>
is at the center of multiple, conflicting interpretations of the standard.
These interpretations differ on the setting of the
<TT>pmatch&#0091;&#0093;</TT>
array for index values &gt; 0.
This note presents examples that demonstrate interpretation conflicts,
and then provides standard references that,
<EM>when taken as a whole</EM>,
resolve the conflicts.
<P>
<P><HR><CENTER><FONT color=red><FONT face=courier><H3><A name="Notation">Notation</A></H3></FONT></FONT></CENTER>
Standard references use the notation
&#0091;<EM>document</EM>:<EM>begin</EM>&#0091;-<EM>end</EM>&#0093;&#0093;
where
<EM>document</EM>
is the document letter, { A D G R }, from the table above,
<EM>begin</EM>
is the beginning line number, and
<EM>end</EM>
is the ending line number.
Line numbers are taken from the 2001 X/Open printing.
Unfortunately the online links do not display line numbers.
For example, &#0091;A:37179-37180&#0093; is the reference for the
<STRONG>regexec</STRONG>
API prototype above.
<P>
Example patterns, subject strings, and
<TT>pmatch&#0091;&#0093;</TT>
array values use the regression test notation of
<A href="http://www.research.att.com/~gsf/testregex/" target=_top>testregex.</A>
You can download the source and compile it against your favorite regex
implementation.
All of the examples in this note have been placed in the file
<A href="http://www.research.att.com/~gsf/testregex/interpretation.dat" target=_top>interpretation.dat;</A>
you can download this file and use it as input to
<STRONG>testregex</STRONG>.
For example, the
<STRONG>testregex</STRONG>
input
<DIV style="padding-left:16px;text-indent:0px">
<PRE>
:RE#01:E a+ xaax (1,3)
</PRE>
</DIV>
specifies that the ERE pattern "a+" matched against the
subject string "xaax" yields
<TT>pmatch&#0091;0&#0093;.rm_so==1</TT>
and
<TT>pmatch&#0091;0&#0093;.rm_eo==3</TT>.
The example is labeled RE#01 for indexing and referencing.
<DIV style="padding-left:16px;text-indent:0px">
<PRE>
:RE#02:B .&#0092;(a*&#0092;). xaax (0,4)(1,3)
</PRE>
</DIV>
specifies that the BRE pattern ".&#0092;(a*&#0092;)." matched against the subject
string "xaax" yields
<TT>pmatch&#0091;0&#0093;.rm_so==0</TT>,
<TT>pmatch&#0091;0&#0093;.rm_eo==4</TT>,
<TT>pmatch&#0091;1&#0093;.rm_so==1</TT>,
<TT>pmatch&#0091;1&#0093;.rm_eo==3</TT>.
(?,?) denotes
<TT>rm_so</TT>
and
<TT>rm_eo</TT>
values of -1, i.e., a non-match.
The first field allows additional flags that exercise all of the
<STRONG>REG_*</STRONG>
<STRONG>regcomp</STRONG>
and
<STRONG>regexec</STRONG>
flags; see
<NOBR><A href="http://web.archive.org/~gsf/man/man1/testregex.html"><STRONG>testregex</STRONG></A>(1)</NOBR>
or
<STRONG>testregex --man</STRONG>
for details.
Note that
<STRONG>tab</STRONG>
is the field separator in the
<STRONG>testregex</STRONG>
syntax; if you mouse snarf then make sure that
<STRONG>tabs</STRONG>
are preserved.
<P>
<P><HR><CENTER><FONT color=red><FONT face=courier><H3><A name="regex Glossary">regex Glossary</A></H3></FONT></FONT></CENTER>
<DIV style="padding-left:16px;text-indent:0px">
<DL COMPACT>
<DT>&#0091;G:41&#0093;<STRONG>Basic Regular Expression (BRE)</STRONG><DD>
A regular expression used by the majority of utilities that select strings
from a set of character strings.
<DT>&#0091;G:148&#0093;<STRONG>Entire Regular Expression</STRONG><DD>
The concatenated set of one or more basic regular expressions or extended
regular expressions that make up the pattern specified for string selection.
<DT>&#0091;G:158&#0093;<STRONG>Extended Regular Expression (ERE)</STRONG><DD>
A regular expression that is an alternative to the Basic Regular
Expression using a more extensive syntax, occasionally used by some utilities.
<DT>&#0091;G:269&#0093;<STRONG>Pattern</STRONG><DD>
A sequence of characters used either with regular expression notation or for
pathname expansion, as a means of selecting various character strings or
pathnames, respectively.
<DT>&#0091;G:316&#0093;<STRONG>Regular Expression</STRONG><DD>
A pattern that selects specific strings from a set of character strings.
</DL>
</DIV>
<P>
<P><HR><CENTER><FONT color=red><FONT face=courier><H3><A name="A subexpression is ">A subexpression is </A></H3></FONT></FONT></CENTER>
The
<STRONG>regex</STRONG>
standard is surprisingly cavalier with terminology:
some terms are used interchangeably, some are used in a general context
in one section and a specific context in another, and some are
used without any definition whatsoever.
Acutely subject to this abuse are:
<EM>RE</EM>,
<EM>pattern</EM>,
<EM>subpattern</EM>,
<EM>expression</EM>,
and
<EM>subexpression</EM>.
In particular,
<EM>subpattern</EM>
and
<EM>subexpression</EM>
are central to the description of the matching algorithm and how
<TT>pmatch&#0091;&#0093;</TT>
is assigned.
Any interpretation of the
<STRONG>regex</STRONG>
standard involving these terms, absent a precise and accurate definition
for each, is useless.
<P>
<EM>subexpression</EM>
appears 70 times, and each reference is in the context of parenthesis grouping:
<DIV style="padding-left:16px;text-indent:0px">
<DL COMPACT>
<DT>&#0091;D:5909-5911&#0093;<DD>
For example, matching the BRE "&#0092;(.*&#0092;).*" against "abcdef" , the
subexpression "(&#0092;1)" is "abcdef" , and matching the BRE
"&#0092;(a*&#0092;)*" against "bc" , the subexpression "(&#0092;1)" is the null
string.
<DT>&#0091;D:5984-5988&#0093;<DD>
The asterisk shall be special except when used: As the first
character of a subexpression (after an initial '^' , if any);
<DT>&#0091;D:6094-6097&#0093;<DD>
A subexpression can be defined within a BRE by enclosing it
between the character pairs "&#0092;(" and "&#0092;)" . Subexpressions can
be arbitrarily nested.
<DT>&#0091;D:6100-6109&#0093;<DD>
The character 'n' shall be a digit from 1 through 9, specifying
the nth subexpression (the one that begins with the nth "&#0092;("
from the beginning of the pattern and ends with the
corresponding paired "&#0092;)" ). The expression is invalid if less
than n subexpressions precede the '&#0092;n' . For example, the
expression "&#0092;(.*&#0092;)&#0092;1$" matches a line consisting of two
adjacent appearances of the same string, and the expression
"&#0092;(a&#0092;)*&#0092;1" fails to match 'a' . When the referenced
subexpression matched more than one string, the back-referenced
expression shall refer to the last matched string. If the
subexpression referenced by the back-reference matches more
than one string because of an asterisk ( '*' ) or an interval
expression (see item (5)), the back-reference shall match the
last (rightmost) of these strings.
<DT>&#0091;D:6110-6112&#0093;<DD>
When a BRE matching a single character, a subexpression, or a
back-reference is followed by the special character asterisk ('*' ),
together with that asterisk it shall match what zero or
more consecutive occurrences of the BRE would match.
<DT>&#0091;D:6114-6117&#0093;<DD>
When a BRE matching a single character, a subexpression, or a
back-reference is followed by an interval expression of the
format "&#0092;{m&#0092;}" , "&#0092;{m,&#0092;}" , or "&#0092;{m,n&#0092;}" , together with that
interval expression it shall match what repeated consecutive
occurrences of the BRE would match.
<DT>&#0091;D:6127-6129&#0093;<DD>
A subexpression repeated by an asterisk ('*') or an interval expression
shall not match a null expression unless this is the only match for the
repetition or it is necessary to satisfy the exact or minimum number of
occurrences for the interval expression.
<DT>&#0091;D:6136&#0093;<DD>
Subexpressions/back-references &#0092;(&#0092;) &#0092;n
<DT>&#0091;D:6145-6151&#0093;<DD>
The implementation may treat the circumflex as an anchor when
used as the first character of a subexpression. The circumflex
shall anchor the
expression (or optionally subexpression) to the beginning of a
string; only sequences starting at the first character of a
string shall be matched by the BRE. For example, the BRE "^ab"
matches "ab" in the string "abcdef" , but fails to match in the
string "cdefab" . The BRE "&#0092;(^ab&#0092;)" may match the former
string. A portable BRE shall escape a leading circumflex in a
subexpression to match a literal circumflex.
<DT>&#0091;D:6152-6156&#0093;<DD>
A dollar sign ( '$' ) shall be an anchor when used as the last
character of an entire BRE. The implementation may treat a
dollar sign as an anchor when used as the last character of a
subexpression. The dollar sign shall anchor the expression (or
optionally subexpression) to the end of the string being matched;
the dollar sign can be said to match the end-of-string following
the last character.
<DT>&#0091;D:6265-6270&#0093;<DD>
A circumflex ( '^' ) outside a bracket expression shall anchor
the expression or subexpression it begins to the beginning of a
string; such an expression or subexpression can match only a
sequence starting at the first character of a string. For
example, the EREs "^ab" and "(^ab)" match "ab" in the string
"abcdef" , but fail to match in the string "cdefab" , and the
ERE "a^b" is valid, but can never match because the 'a'
prevents the expression "^b" from matching starting at the
first character.
<DT>&#0091;D:6271-6276&#0093;<DD>
A dollar sign ( '$' ) outside a bracket expression shall anchor
the expression or subexpression it ends to the end of a string;
such an expression or subexpression can match only a sequence
ending at the last character of a string. For example, the EREs
"ef$" and "(ef$)" match "ef" in the string "abcdef" , but fail
to match in the string "cdefab" , and the ERE "e$f" is valid,
but can never match because the 'f' prevents the expression
"e$" from matching ending at the last character.
<DT>&#0091;R:2359-2370&#0093;<DD>
It is possible to determine what strings correspond to
subexpressions by recursively applying the leftmost longest
rule to each subexpression, but only with the proviso that the
overall match is leftmost longest. For example, matching
"&#0092;(ac*&#0092;)c*d&#0091;ac&#0093;*&#0092;1" against acdacaaa matches acdacaaa (with
&#0092;1=a); simply matching the longest match for "&#0092;(ac*&#0092;)" would
yield &#0092;1=ac, but the overall match would be smaller (acdac).
Conceptually, the implementation must examine every possible
match and among those that yield the leftmost longest total
matches, pick the one that does the longest match for the
leftmost subexpression, and so on. Note that this means that
matching by subexpressions is context-dependent: a
subexpression within a larger RE may match a different string
from the one it would match as an independent RE, and two
instances of the same subexpression within the same larger RE
may match different lengths even in similar sequences of
characters. For example, in the ERE "(a.*b)(a.*b)" , the two
identical subexpressions would match four and six characters,
respectively, of accbaccccb.
<DT>&#0091;R:2512-2520&#0093;<DD>
The limit of nine back-references to subexpressions in the RE
is based on the use of a single-digit identifier; increasing
this to multiple digits would break historical applications.
This does not imply that only nine subexpressions are allowed
in REs. The following is a valid BRE with ten subexpressions:
<DIV style="padding-left:16px;text-indent:0px">
<PRE>
&#0092;(&#0092;(&#0092;(ab&#0092;)*c&#0092;)*d&#0092;)&#0092;(ef&#0092;)*&#0092;(gh&#0092;)&#0092;{2&#0092;}&#0092;(ij&#0092;)*&#0092;(kl&#0092;)*&#0092;(mn&#0092;)*&#0092;(op&#0092;)*&#0092;(qr&#0092;)*
</PRE>
</DIV>
The standard developers regarded the common historical
behavior, which supported "&#0092;n*" , but not "&#0092;n&#0092;{min,max&#0092;}" ,
"&#0092;(...&#0092;)*" , or "&#0092;(...&#0092;)&#0092;{min,max&#0092;}" , as a non-intentional
result of a specific implementation, and they supported both
duplication and interval expressions following subexpressions
and back-references.
<DT>&#0091;R:2537-2544&#0093;<DD>
However, one relatively uncommon case was changed to allow an
extension used on some implementations. Historically, the BREs
"^foo" and "&#0092;(^foo&#0092;)" did not match the same string, despite
the general rule that subexpressions and entire BREs match the
same strings. To increase consensus, IEEE Std 1003.1-2001 has
allowed an extension on some implementations to treat these two
cases in the same way by declaring that anchoring may occur at
the beginning or end of a subexpression. Therefore, portable
BREs that require a literal circumflex at the beginning or a
dollar sign at the end of a subexpression must escape them.
Note that a BRE such as "a&#0092;(^bc&#0092;)" will either match "a^bc" or
nothing on different systems under the rules.
<DT>&#0091;R:2549-2554&#0093;<DD>
Some implementations have extended the BRE syntax to add
alternation. For example, the subexpression "&#0092;(foo$&#0092;|bar&#0092;)"
would match either "foo" at the end of the string or "bar"
anywhere. The extension is triggered by the use of the
undefined "&#0092;|" sequence. Because the BRE is undefined for
portable scripts, the extending system is free to make other
assumptions, such that the '$' represents the end-of-line
anchor in the middle of a subexpression. If it were not for the
extension, the '$' would match a literal dollar sign under the
rules.
<DT>&#0091;R:2617-2620&#0093;<DD>
The removal of the Back_open_paren Back_close_paren option from
the nondupl_RE specification is the result of PASC
Interpretation 1003.2-92 #43 submitted for the ISO POSIX-2:1993
standard. Although the grammar required support for null
subexpressions, this section does not describe the meaning of,
and historical practice did not support, this construct.
<DT>&#0091;A:37188&#0093;<DD>
size_t re_nsub Number of parenthesized subexpressions
<DT>&#0091;A:37206-37208&#0093;<DD>
If the REG_NOSUB flag was not set in cflags, then regcomp()
shall set re_nsub to the number of parenthesized subexpressions
(delimited by "&#0092;(&#0092;)" in basic regular expressions or "()" in
extended regular expressions) found in pattern.
<DT>&#0091;A:37220-37257&#0093;<DD>
If nmatch is 0 or REG_NOSUB was set in the cflags argument to
regcomp(), then regexec() shall ignore the pmatch argument.
Otherwise, the application shall ensure that the pmatch
argument points to an array with at least nmatch elements, and
regexec() shall fill in the elements of that array with offsets
of the substrings of string that correspond to the
parenthesized subexpressions of pattern: pmatch&#0091;i&#0093;.rm_so
shall be the byte offset of the beginning and pmatch&#0091;i&#0093;.rm_eo
shall be one greater than the byte offset of the end of
substring i. (Subexpression i begins at the ith matched open
parenthesis, counting from 1.) Offsets in pmatch&#0091;0&#0093; identify
the substring that corresponds to the entire regular
expression. Unused elements of pmatch up to pmatch&#0091;nmatch-1&#0093;
shall be filled with -1. If there are more than nmatch
subexpressions in pattern ( pattern itself counts as a
subexpression), then regexec() shall still do the match, but
shall record only the first nmatch substrings.
<P>
When matching a basic or extended regular expression, any given
parenthesized subexpression of pattern might participate in the
match of several different substrings of string, or it might
not match any substring even though the pattern as a whole did
match. The following rules shall be used to determine which
substrings to report in pmatch when matching regular
expressions:
<DIV style="padding-left:16px;text-indent:0px">
<OL>
<LI>
If subexpression i in a regular expression is not contained
within another subexpression, and it participated in the match
several times, then the byte offsets in pmatch&#0091;i&#0093; shall
delimit the last such match.
<LI>
If subexpression i is not contained within another
subexpression, and it did not participate in an otherwise
successful match, the byte offsets in pmatch&#0091;i&#0093; shall be -1. A
subexpression does not participate in the match when:
<PRE>
&nbsp;'*' or "&#0092;{&#0092;}" appears immediately after the
subexpression in a basic regular expression, or '*' ,
&nbsp;'?' , or "{}" appears immediately after the
subexpression in an extended regular expression, and
the subexpression did not match (matched 0 times)
<P>
or:
<P>
&nbsp;'|' is used in an extended regular expression to select
this subexpression or another, and the other
subexpression matched.
</PRE>
<LI>
If subexpression i is contained within another subexpression
j, and i is not contained within any other subexpression that
is contained within j, and a match of subexpression j is
reported in pmatch&#0091;j&#0093;, then the match or non-match of
subexpression i reported in pmatch&#0091;i&#0093; shall be as described in
1. and 2. above, but within the substring reported in pmatch&#0091;
j&#0093; rather than the whole string. The offsets in pmatch&#0091;i&#0093; are
still relative to the start of string.
<LI>
If subexpression i is contained in subexpression j, and the
byte offsets in pmatch&#0091;j&#0093; are -1, then the pointers in pmatch&#0091;
i&#0093; shall also be -1.
<LI>
If subexpression i matched a zero-length string, then both
byte offsets in pmatch&#0091;i&#0093; shall be the byte offset of the
character or null terminator immediately following the
zero-length string.
</OL>
</DIV>
<DT>&#0091;A:37363-37366&#0093;<DD>
The regexec() function must fill in all nmatch elements of
pmatch, where nmatch and pmatch are supplied by the
application, even if some elements of pmatch do not correspond
to subexpressions in pattern. The application writer should
note that there is probably no reason for using a value of
nmatch that is larger than preg-&gt; re_nsub+1.
<DT>&#0091;A:37407-37413&#0093;<DD>
The number of subexpressions in the RE is reported in re_nsub
in preg. With this change to regexec(), consideration was given
to dropping the REG_NOSUB flag since the user can now specify
this with a zero nmatch argument to regexec(). However, keeping
REG_NOSUB allows an implementation to use a different (perhaps
more efficient) algorithm if it knows in regcomp() that no
subexpressions need be reported. The implementation is only
required to fill in pmatch if nmatch is not zero and if
REG_NOSUB is not specified.
</DL>
</DIV>
<P>
This sentence is as close as the standard gets to a definition:
<DIV style="padding-left:16px;text-indent:0px">
<DL COMPACT>
<DT>&#0091;A:37225-37226&#0093;<DD>
Subexpression i begins at the ith matched open parenthesis, counting from 1.
</DL>
</DIV>
<P>
Using nonterminals from the BRE &#0091;D:6371-6731&#0093; and ERE &#0091;D:6452-6452&#0093; grammar
productions (text not listed in this document) yields the following:
<DIV style="padding-left:16px;text-indent:0px">
<DL COMPACT>
<DT><STRONG>DEFINITION</STRONG><DD>
A
<EM>subexpression</EM>
corresponds to the
<TT>Back_open_paren RE_expression Back_close_paren</TT>
form of the
<TT>nondupl_RE</TT>
BRE grammar production or
the
<TT>'(' extended_reg_exp ')'</TT>
form of the
<TT>ERE_expression</TT>
ERE grammar production.
Subexpression i begins at the ith matched open parenthesis
(<TT>Back_open_paren</TT>
for BREs and '(' for EREs),
starting from the left and counting from 1.
Subexpression 0 is the entire RE.
</DL>
</DIV>
<P>
This definition and the subexpression match rule &#0091;R:2359-2370&#0093; can be used
to examine a class of EREs where the top level catenation operands are
subexpressions.
(A top level subexpression is not contained in any other subexpression
except subexpression 0.)
The subexpression match rule in pseudo code is:
<UL type=square>
<LI>
determine the longest of the leftmost matches for subexpression-0
&#0091;R:2359-2361&#0093;
<LI>
for 1&lt;=<EM>i</EM>&lt;=<STRONG>re_nsub</STRONG>
determine the longest match for
subexpression-<EM>i</EM>
consistent with the matches already determined for
subexpression-<EM>j,</EM>
0&lt;=<EM>j</EM>&lt;<EM>i</EM>.
&#0091;R:2359-2370&#0093; &#0091;A:37235-37257&#0093;
</UL>
For example, given
<DIV style="padding-left:16px;text-indent:0px">
<PRE>
:RE#03:E (a?)((ab)?) ab (0,2)(0,0)(0,2)(0,2)
</PRE>
</DIV>
the subexpressions are:
<DIV style="padding-left:16px;text-indent:0px">
<PRE>
subexpression-0 (a?)((ab)?)
subexpression-1 (a?)
subexpression-2 ((ab)?)
subexpression-3 (ab)
</PRE>
</DIV>
The longest of the leftmost matches for subexpression-0 is (0,2).
The longest match for subexpression-1, consistent with the match
for subexpression-0, is (0,0); otherwise if it had matched (0,1) then
subexpression-2 would not match and the subexpression-0 match would be
limited to (0,1).
The longest match for subexpression-2, consistent with the matches
for subexpression-0 and subexpression-1, is (0,2).
The longest match for subexpression-3, consistent with the matches
for subexpression-0, subexpression-1 and subexpression-2, is (0,2).
This table illustrates the matching:
<DIV style="padding-left:16px;text-indent:0px">
<PRE>
subexpr pattern match
0 (a?)((ab)?) (0,2)
1 (a?) (0,0)
2 ((ab)?) (0,2)
3 (ab) (0,2)
</PRE>
</DIV>
RE#04 is a similar example that exposes the associativity of subexpression
concatenation:
<DIV style="padding-left:16px;text-indent:0px">
<PRE>
:RE#04:E (a?)((ab)?)(b?) ab (0,2)(0,1)(1,1)(?,?)(1,2)
subexpr pattern match
0 (a?)((ab)?)(b?) (0,2)
1 (a?) (0,1)
2 ((ab)?) (1,1)
3 (ab) (?,?)
4 (b?) (1,2)
</PRE>
</DIV>
&#0091;R:2363-2365&#0093; also shows that parentheses can be used to alter the
order of matching:
<DIV style="padding-left:16px;text-indent:0px">
<PRE>
:RE#05:E ((a?)((ab)?))(b?) ab (0,2)(0,2)(0,0)(0,2)(0,2)(2,2)
subexpr pattern match
0 ((a?)((ab)?))(b?) (0,2)
1 ((a?)((ab)?)) (0,2)
2 (a?) (0,0)
3 ((ab)?) (0,2)
4 (ab) (0,2)
5 (b?) (2,2)
</PRE>
</DIV>
In RE#05 the extra parentheses (around subexpression-1 and subexpression-2 in
RE#04) form a new subexpression-1, and change the
match for the last subexpression
<TT>(b?)</TT>
to (2,2) (from (1,2) in RE#04.)
<DIV style="padding-left:16px;text-indent:0px">
<PRE>
:RE#06:E (a?)(((ab)?)(b?)) ab (0,2)(0,1)(1,2)(1,1)(?,?)(1,2)
subexpr pattern match
0 (a?)(((ab)?)(b?)) (0,2)
1 (a?) (0,1)
2 (((ab)?)(b?)) (1,2)
3 ((ab)?) (1,1)
4 (ab) (?,?)
5 (b?) (1,2)
</PRE>
</DIV>
In RE#06 the extra parenthesis pair forces right associativity and results
in the same match of (1,2) for the last subexpression
<TT>(b?)</TT>
as in RE#04.
These examples show that:
<DIV style="padding-left:16px;text-indent:0px">
<DL COMPACT>
<DT><STRONG>PROPERTY</STRONG><DD>
Subexpression grouping can alter the precedence of concatenation.
<DT><STRONG>PROPERTY</STRONG><DD>
Subexpression concatenation is right associative.
</DL>
</DIV>
<P>
The following examples examine replicated subexpressions.
<DIV style="padding-left:16px;text-indent:0px">
<PRE>
:RE#07:E (.?) x (0,1)(0,1)
:RE#08:E (.?){1} x (0,1)(0,1)
:RE#09:E (.?)(.?) x (0,1)(0,1)(1,1)
:RE#10:E (.?){2} x (0,1)(1,1)
:RE#11:E (.?)* x (0,1)(0,1)
</PRE>
</DIV>
&#0091;D:6227-6234&#0093; specifies that RE#07 and RE#08 are equivalent, and that
RE#09 and RE#10 are equivalent, and
&#0091;D:6217-6219&#0093; specifies that RE#09 and RE#11 are equivalent.
<DIV style="padding-left:16px;text-indent:0px">
<DL COMPACT>
<DT>&#0091;D:6227-6234&#0093;<DD>
When an ERE matching a single character or an ERE enclosed in
parentheses is followed by an interval expression of the format "{m}" ,
"{m,}" , or "{m,n}" , together with that interval expression it shall
match what repeated consecutive occurrences of the ERE would match. The
values of m and n are decimal integers in the range 0 &lt;= m&lt;= n&lt;=
{RE_DUP_MAX}, where m specifies the exact or minimum number of
occurrences and n specifies the maximum number of occurrences. The
expression "{m}" matches exactly m occurrences of the preceding ERE,
"{m,}" matches at least m occurrences, and "{m,n}" matches any number
of occurrences between m and n, inclusive.
<DT>&#0091;D:6217-6219&#0093;<DD>
When an ERE matching a single character or an ERE enclosed in
parentheses is followed by the special character asterisk ( '*' ),
together with that asterisk it shall match what zero or more
consecutive occurrences of the ERE would match.
</DL>
</DIV>
In RE#09 subexpression-1 matches (0,1), leaving the null string at (1,1) for
subexpression-2.
In RE#10 the first iteration of subexpression-1 matches (0,1), the same
as subexpression-1 in RE#09, and the second iteration of subexpression-1
matches (1,1), the same as subexpression-2 in RE#09.
RE#07 and RE#08 show that only one iteration is needed to match the subject
string, so the match in RE#11 requires only one iteration, and as such is the
last iteration of &#0091;D:6107-6109&#0093; &#0091;A:37235-37237&#0093;.
RE#10 and RE#11 also illustrate &#0091;D:6127-6129&#0093; &#0091;D:6239-6241&#0093;, which
specify that a repeated RE matches the null string only if it is the only
match (not this case) or if it is necessary to satisfy an interval expression
minimum (2 in this case.)
<DIV style="padding-left:16px;text-indent:0px">
<DL COMPACT>
<DT>&#0091;D:6239-6241&#0093;<DD>
An ERE matching a single character repeated by an '*' , '?' , or an
interval expression shall not match a null expression unless this is
the only match for the repetition or it is necessary to satisfy the
exact or minimum number of occurrences for the interval expression.
</DL>
</DIV>
<P>
The following examples dig deeper into replicated subexpressions.
<DIV style="padding-left:16px;text-indent:0px">
<PRE>
:RE#12:E (.?.?) xxx (0,2)(0,2)
:RE#13:E (.?.?){1} xxx (0,2)(0,2)
:RE#14:E (.?.?)(.?.?) xxx (0,3)(0,2)(2,3)
:RE#15:E (.?.?){2} xxx (0,3)(2,3)
:RE#16:E (.?.?)(.?.?)(.?.?) xxx (0,3)(0,2)(2,3)(3,3)
:RE#17:E (.?.?){3} xxx (0,3)(3,3)
:RE#18:E (.?.?)* xxx (0,3)(2,3)
</PRE>
</DIV>
Here RE#14 shows that only two iterations are needed for a complete match,
making the last iteration match for RE#18 (2,3), since the first
iteration matched (0,2), as in RE#14.
<P>
<P><HR><CENTER><FONT color=red><FONT face=courier><H3><A name="A subpattern is ">A subpattern is </A></H3></FONT></FONT></CENTER>
The term
<EM>subpattern</EM>
appears exactly once:
<DIV style="padding-left:16px;text-indent:0px">
<DL COMPACT>
<DT>&#0091;D:5907-5908&#0093;<DD>
Consistent with the whole match being the longest of the leftmost matches,
each subpattern, from left to right, shall match the longest possible string.
</DL>
</DIV>
Consider RE#04 and RE#05 again:
<DIV style="padding-left:16px;text-indent:0px">
<PRE>
:RE#04:E (a?)((ab)?)(b?) ab (0,2)(0,1)(1,1)(?,?)(1,2)
:RE#05:E ((a?)((ab)?))(b?) ab (0,2)(0,2)(0,0)(0,2)(0,2)(2,2)
</PRE>
</DIV>
If a subpattern were an entity that combined adjacent subexpressions,
e.g.,
<TT>(a?)((ab)?)</TT>
in RE#04, then &#0091;D:5907-5908&#0093; would violate &#0091;R:2359-2370&#0093;.
Similarly, if a subpattern were an entity that "went inside" subexpressions,
e.g.,
<TT>(a?)</TT>
in RE#05, then again &#0091;D:5907-5908&#0093; would violate &#0091;R:2359-2370&#0093;.
In other words, a subpattern can be neither larger than nor smaller than
a subexpression;
a subpattern must be a grammatical entity equivalent to a subexpression.
This corresponds to the nonterminal
<TT>nondupl_RE</TT>
in the BRE grammar; there is no direct correspondence to a nonterminal
in the ERE grammar.
However, if the optional duplication operator (*,+,?,range) is included then
subpattern corresponds to
<TT>simple_RE</TT>
in the BRE grammar and
<TT>ERE_expression</TT>
in the ERE grammar, and both &#0091;D:5907-5908&#0093; and &#0091;R:2359-2370&#0093; are satisfied.
<DIV style="padding-left:16px;text-indent:0px">
<DL COMPACT>
<DT><STRONG>DEFINITION</STRONG><DD>
A
<EM>subpattern</EM>
corresponds to the
<TT>simple_RE</TT>
nonterminal in the BRE grammar or the
<TT>ERE_expression</TT>
nonterminal in the ERE grammar.
</DL>
</DIV>
This means that subexpressions and subpatterns are of equal importance
in RE matching.
Also note that any other definition for subpattern will put
&#0091;D:5907-5908&#0093; in direct conflict with &#0091;R:2359-2370&#0093;.
<P>
RE#19, RE#20 and RE#21 examine the relationship between subexpression
and subpattern:
<DIV style="padding-left:16px;text-indent:0px">
<PRE>
:RE#19:E a?((ab)?)(b?) ab (0,2)(1,1)(?,?)(1,2)
:RE#20:E (a?)((ab)?)b? ab (0,2)(0,1)(1,1)(?,?)
:RE#21:E a?((ab)?)b? ab (0,2)(1,1)(?,?)
</PRE>
</DIV>
<P>
These are all variations of RE#04.
Other than subexpression renumbering, the match for the subexpression
<TT>((ab)?)</TT>
must be the same in RE#04, RE#19, RE#20 and RE#21.
<TT>a?</TT>
is a subpattern in RE#19 and RE#21, of equal matching importance to
<TT>(a?)</TT>
in RE#04, and
<TT>b?</TT>
is a subpattern in RE#20 and RE#21, of equal matching
importance to
<TT>(b?)</TT>
in RE#04.
<P>
<P><HR><CENTER><FONT color=red><FONT face=courier><H3><A name="The Dark Corners ">The Dark Corners </A></H3></FONT></FONT></CENTER>
The remaining examples explore dark corners of the standard
and implementations.
Although the differences between some of the examples are subtle,
for some implementations they may mean the difference between an answer and
a core dump.
<P>
In RE#22 subexpression
<TT>(a*)</TT>
matches the null string at (0,0), and continues to match at that position
until the minimal range count is satisfied.
<DIV style="padding-left:16px;text-indent:0px">
<PRE>
:RE#22:E (a*){2} xxxxx (0,0)(0,0)
</PRE>
</DIV>
RE#23 through RE#27 expose implementations that sometimes do
<EM>first match</EM>
for alternation within subexpressions.
Some implementations erroneously match the first iteration of
subexpression-1 in RE#24 through RE#27 to (0,1).
RE#27 is equivalent to RE#26; the match requires two iterations, the first
matching (0,2) and the last matching (2,3).
<DIV style="padding-left:16px;text-indent:0px">
<PRE>
:RE#23:E (ab?)(b?a) aba (0,3)(0,2)(2,3)
:RE#24:E (a|ab)(ba|a) aba (0,3)(0,2)(2,3)
:RE#25:E (a|ab|ba) aba (0,2)(0,2)
:RE#26:E (a|ab|ba)(a|ab|ba) aba (0,3)(0,2)(2,3)
:RE#27:E (a|ab|ba)* aba (0,3)(2,3)
</PRE>
</DIV>
RE#28 through RE#33 expose implementations that report short matches
for some repeated subexpressions.
Some implementations report incorrect matches for
subexpression-1 in RE#30 and RE#33.
<DIV style="padding-left:16px;text-indent:0px">
<PRE>
:RE#28:E (aba|a*b) ababa (0,3)(0,3)
:RE#29:E (aba|a*b)(aba|a*b) ababa (0,5)(0,2)(2,5)
:RE#30:E (aba|a*b)* ababa (0,5)(2,5)
:RE#31:E (aba|ab|a) ababa (0,3)(0,3)
:RE#32:E (aba|ab|a)(aba|ab|a) ababa (0,5)(0,2)(2,5)
:RE#33:E (aba|ab|a)* ababa (0,5)(2,5)
</PRE>
</DIV>
RE#34 through RE#36 expose implementations that report subexpression matches
for earlier iterations of the subexpression.
Some implementations report a match for subexpression-2 in RE#36
while reporting the (2,3) match for subexpression-1: clearly a bug.
<DIV style="padding-left:16px;text-indent:0px">
<PRE>
:RE#34:E (a(b)?) aba (0,2)(0,2)(1,2)
:RE#35:E (a(b)?)(a(b)?) aba (0,3)(0,2)(1,2)(2,3)(?,?)
:RE#36:E (a(b)?)+ aba (0,3)(2,3)(?,?)
</PRE>
</DIV>
RE#37 and RE#38 expose implementations that give priority to subexpression
matching over subpattern matching.
<DIV style="padding-left:16px;text-indent:0px">
<PRE>
:RE#37:E (.*)(.*) xx (0,2)(0,2)(2,2)
:RE#38:E .*(.*) xx (0,2)(2,2)
</PRE>
</DIV>
RE#39 through RE#41 expose implementations that treat explicit vs. implicit
subexpression repetition differently.
This is a theme common to many of the previous examples.
Again, the subexpression in RE#41 requires two iterations to match,
and the second iteration matches (5,7), as illustrated by RE#40.
<DIV style="padding-left:16px;text-indent:0px">
<PRE>
:RE#39:E (a.*z|b.*y) azbazby (0,5)(0,5)
:RE#40:E (a.*z|b.*y)(a.*z|b.*y) azbazby (0,7)(0,5)(5,7)
:RE#41:E (a.*z|b.*y)* azbazby (0,7)(5,7)
</PRE>
</DIV>
RE#42 is another
<EM>first match</EM>
test.
Some implementations erroneously report a match of (0,1) for subexpression-1.
<DIV style="padding-left:16px;text-indent:0px">
<PRE>
:RE#42:E (.|..)(.*) ab (0,2)(0,2)(2,2)
</PRE>
</DIV>
RE#43 through RE#45 require only one iteration of subexpression-1 to
match the entire subject string.
RE#45 exposes three separate bugs in the implementations that were tested.
The most common was
<EM>over iteration</EM>,
where subexpression-1 is matched for a second iteration to the null string
at (3,3).
<DIV style="padding-left:16px;text-indent:0px">
<PRE>
:RE#43:E ((..)*(...)*) xxx (0,3)(0,3)(?,?)(0,3)
:RE#44:E ((..)*(...)*)((..)*(...)*) xxx (0,3)(0,3)(?,?)(0,3)(3,3)(?,?)(?,?)
:RE#45:E ((..)*(...)*)* xxx (0,3)(0,3)(?,?)(0,3)
</PRE>
</DIV>
RE#46 through RE#82 are nasty;
backreferences are intuitive neither for the implementor nor the user.
<P>
RE#49, RE#53, RE#67 and RE#68 illustrate the second part of the
<EM>subpattern</EM>
rule:
<DIV style="padding-left:16px;text-indent:0px">
<DL COMPACT>
<DT>&#0091;D:5908-5909&#0093;<DD>
For this purpose, a null string shall be considered to be longer than
no match at all.
</DL>
</DIV>
RE#53 requires close examination to see why the match is (0,2)(1,1)(2,2)
instead of (0,2)(0,1)(?,?).
The match of (0,1) for subexpression-1 is longer than (1,1), but
subexpression-1 can be repeated, and that second iteration allows
subexpression-2 to match (2,2), which is longer than (?,?) by &#0091;D:5908-5909&#0093;.
<DIV style="padding-left:16px;text-indent:0px">
<PRE>
:RE#46:B &#0092;(a&#0092;{0,1&#0092;}&#0092;)*b&#0092;1 ab (0,2)(1,1)
:RE#47:B &#0092;(a*&#0092;)*b&#0092;1 ab (0,2)(1,1)
:RE#48:B &#0092;(a*&#0092;)b&#0092;1* ab (0,2)(0,1)
:RE#49:B &#0092;(a*&#0092;)*b&#0092;1* ab (0,2)(1,1)
:RE#50:B &#0092;(a&#0092;{0,1&#0092;}&#0092;)*b&#0092;(&#0092;1&#0092;) ab (0,2)(1,1)(2,2)
:RE#51:B &#0092;(a*&#0092;)*b&#0092;(&#0092;1&#0092;) ab (0,2)(1,1)(2,2)
:RE#52:B &#0092;(a*&#0092;)b&#0092;(&#0092;1&#0092;)* ab (0,2)(0,1)(?,?)
:RE#53:B &#0092;(a*&#0092;)*b&#0092;(&#0092;1&#0092;)* ab (0,2)(1,1)(2,2)
:RE#54:B &#0092;(a&#0092;{0,1&#0092;}&#0092;)*b&#0092;1 aba (0,3)(0,1)
:RE#55:B &#0092;(a*&#0092;)*b&#0092;1 aba (0,3)(0,1)
:RE#56:B &#0092;(a*&#0092;)b&#0092;1* aba (0,3)(0,1)
:RE#57:B &#0092;(a*&#0092;)*b&#0092;1* aba (0,3)(0,1)
:RE#58:B &#0092;(a*&#0092;)*b&#0092;(&#0092;1&#0092;)* aba (0,3)(0,1)(2,3)
:RE#59:B &#0092;(a&#0092;{0,1&#0092;}&#0092;)*b&#0092;1 abaa (0,3)(0,1)
:RE#60:B &#0092;(a*&#0092;)*b&#0092;1 abaa (0,3)(0,1)
:RE#61:B &#0092;(a*&#0092;)b&#0092;1* abaa (0,4)(0,1)
:RE#62:B &#0092;(a*&#0092;)*b&#0092;1* abaa (0,4)(0,1)
:RE#63:B &#0092;(a*&#0092;)*b&#0092;(&#0092;1&#0092;)* abaa (0,4)(0,1)(3,4)
:RE#64:B &#0092;(a&#0092;{0,1&#0092;}&#0092;)*b&#0092;1 aab (0,3)(2,2)
:RE#65:B &#0092;(a*&#0092;)*b&#0092;1 aab (0,3)(2,2)
:RE#66:B &#0092;(a*&#0092;)b&#0092;1* aab (0,3)(0,2)
:RE#67:B &#0092;(a*&#0092;)*b&#0092;1* aab (0,3)(2,2)
:RE#68:B &#0092;(a*&#0092;)*b&#0092;(&#0092;1&#0092;)* aab (0,3)(2,2)(3,3)
:RE#69:B &#0092;(a&#0092;{0,1&#0092;}&#0092;)*b&#0092;1 aaba (0,4)(1,2)
:RE#70:B &#0092;(a*&#0092;)*b&#0092;1 aaba (0,4)(1,2)
:RE#71:B &#0092;(a*&#0092;)b&#0092;1* aaba (0,3)(0,2)
:RE#72:B &#0092;(a*&#0092;)*b&#0092;1* aaba (0,4)(1,2)
:RE#73:B &#0092;(a*&#0092;)*b&#0092;(&#0092;1&#0092;)* aaba (0,4)(1,2)(3,4)
:RE#74:B &#0092;(a&#0092;{0,1&#0092;}&#0092;)*b&#0092;1 aabaa (0,4)(1,2)
:RE#75:B &#0092;(a*&#0092;)*b&#0092;1 aabaa (0,5)(0,2)
:RE#76:B &#0092;(a*&#0092;)b&#0092;1* aabaa (0,5)(0,2)
:RE#77:B &#0092;(a*&#0092;)*b&#0092;1* aabaa (0,5)(0,2)
:RE#78:B &#0092;(a*&#0092;)*b&#0092;(&#0092;1&#0092;)* aabaa (0,5)(0,2)(3,5)
:RE#79:B &#0092;(x&#0092;)*a&#0092;1 a NOMATCH
:RE#80:B &#0092;(x&#0092;)*a&#0092;1* a (0,1)(?,?)
:RE#81:B &#0092;(x&#0092;)*a&#0092;(&#0092;1&#0092;) a NOMATCH
:RE#82:B &#0092;(x&#0092;)*a&#0092;(&#0092;1&#0092;)* a (0,1)(?,?)(?,?)
:RE#83:E (aa(b(b))?)+ aabbaa (0,6)(4,6)(?,?)(?,?)
:RE#84:E (a(b)?)+ aba (0,3)(2,3)(?,?)
:RE#85:E (&#0091;ab&#0093;+)(&#0091;bc&#0093;+)(&#0091;cd&#0093;*) abcd (0,4)(0,2)(2,3)(3,4)
:RE#86:B &#0092;(&#0091;ab&#0093;*&#0092;)&#0092;(&#0091;bc&#0093;*&#0092;)&#0092;(&#0091;cd&#0093;*&#0092;)&#0092;1 abcdaa (0,5)(0,1)(1,3)(3,4)
:RE#87:B &#0092;(&#0091;ab&#0093;*&#0092;)&#0092;(&#0091;bc&#0093;*&#0092;)&#0092;(&#0091;cd&#0093;*&#0092;)&#0092;1 abcdab (0,6)(0,2)(2,3)(3,4)
:RE#88:B &#0092;(&#0091;ab&#0093;*&#0092;)&#0092;(&#0091;bc&#0093;*&#0092;)&#0092;(&#0091;cd&#0093;*&#0092;)&#0092;1* abcdaa (0,6)(0,1)(1,3)(3,4)
:RE#89:B &#0092;(&#0091;ab&#0093;*&#0092;)&#0092;(&#0091;bc&#0093;*&#0092;)&#0092;(&#0091;cd&#0093;*&#0092;)&#0092;1* abcdab (0,6)(0,2)(2,3)(3,4)
:RE#90:E ^(A(&#0091;^B&#0093;*))?(B(.*))? Aa (0,2)(0,2)(1,2)
:RE#91:E ^(A(&#0091;^B&#0093;*))?(B(.*))? Bb (0,2)(?,?)(?,?)(0,2)(1,2)
:RE#92:B .*&#0092;(&#0091;AB&#0093;&#0092;).*&#0092;1 ABA (0,3)(0,1)
:RE#93:B$ &#0091;^A&#0093;*A &#0092;nA (0,2)
</PRE>
</DIV>
<P>
<P><HR><CENTER><FONT color=red><FONT face=courier><H3><A name="Conclusion">Conclusion</A></H3></FONT></FONT></CENTER>
It is possible to use the 2001 issue of the POSIX
<STRONG>regex</STRONG>
standard,
<EM>with the addition of one sentence</EM>,
to resolve the interpretation differences that have surfaced since 1995.
That key sentence is a precise and consistent definition for the term
<EM>subpattern</EM>.
By noting the relationship between
<EM>subpatterns</EM>
and
<EM>subexpressions</EM>,
the proposed definition is shown to be the only one that can be
consistent with all parts of the standard.
<P>
<HR>
<TABLE border=0 align=center width=96%>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right><A href="mailto:gsf@research.att.com?subject= ../re/re-interpretation.mm mm document">Glenn Fowler</A></TD>
</TR>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right>Information and Software Systems Research</TD>
</TR>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right>AT&amp;T Labs Research</TD>
</TR>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right>Florham Park NJ</TD>
</TR>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right>January 2003</TD>
</TR>
</TABLE>
<P>
</TD></TR></TBODY></TABLE>
</BODY>
</HTML>

62
re-nullsubexpr.html Normal file
View file

@ -0,0 +1,62 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN" "http://www.w3.org/TR/REC-html40/frameset.dtd">
<HTML>
<HEAD>
<META name="generator" content="mm2html (AT&amp;T Labs Research) 2005-10-15">
<META name="keywords" content="regular expression null subexpression tests">
<TITLE> ../re/re-nullsubexpr.mm mm document </TITLE>
<META name="author" content="gsf">
</HEAD>
<BODY bgcolor=white link=slateblue vlink=teal >
<TABLE border=0 align=center width=96%>
<TBODY><TR><TD valign=top align=left>
<!--INDEX--><!--/INDEX-->
<P>
<HR>
<CENTER>
<H3><CENTER><FONT color=red><FONT face=courier>regular expression null subexpression tests</FONT></FONT></CENTER></H3>
<BR>Glenn Fowler <SMALL>&lt;<A href=mailto:gsf@research.att.com>gsf@research.att.com</A>&gt;</SMALL>
<P><I>AT&amp;T Labs Research - Florham Park NJ</I>
</CENTER>
<P><HR><P>
The
<STRONG>regex</STRONG>
tests in
<A href="http://web.archive.org/web/20080709091423id_/http://www.research.att.com/~gsf/testregex/nullsubexpr.dat">nullsubexpr.dat</A>
exercise
<STRONG>regex</STRONG>
null subexpression matching.
<P>
<HR>
<TABLE border=0 align=center width=96%>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right><A href="mailto:gsf@research.att.com?subject= ../re/re-nullsubexpr.mm mm document">Glenn Fowler</A></TD>
</TR>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right>Information and Software Systems Research</TD>
</TR>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right>AT&amp;T Labs Research</TD>
</TR>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right>Florham Park NJ</TD>
</TR>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right>August 04, 2002</TD>
</TR>
</TABLE>
<P>
</TD></TR></TBODY></TABLE>
</BODY>
</HTML>

60
re-repetition.html Normal file
View file

@ -0,0 +1,60 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN" "http://www.w3.org/TR/REC-html40/frameset.dtd">
<HTML>
<HEAD>
<META name="generator" content="mm2html (AT&amp;T Labs Research) 2005-10-15">
<META name="keywords" content="regular expression repetition tests">
<TITLE> ../re/re-repetition.mm mm document </TITLE>
<META name="author" content="gsf">
</HEAD>
<BODY bgcolor=white link=slateblue vlink=teal >
<TABLE border=0 align=center width=96%>
<TBODY><TR><TD valign=top align=left>
<!--INDEX--><!--/INDEX-->
<P>
<HR>
<CENTER>
<H3><CENTER><FONT color=red><FONT face=courier>regular expression repetition tests</FONT></FONT></CENTER></H3>
<BR>Glenn Fowler <SMALL>&lt;<A href=mailto:gsf@research.att.com>gsf@research.att.com</A>&gt;</SMALL>
<P><I>AT&amp;T Labs Research - Florham Park NJ</I>
</CENTER>
<P><HR><P>
The
<STRONG>regex</STRONG>
tests in
<A href="http://web.archive.org/web/20080726033833id_/http://www.research.att.com/~gsf/testregex/repetition.dat">repetition.dat</A>
exercise explicit and implicit repetition.
<P>
<HR>
<TABLE border=0 align=center width=96%>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right><A href="mailto:gsf@research.att.com?subject= ../re/re-repetition.mm mm document">Glenn Fowler</A></TD>
</TR>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right>Information and Software Systems Research</TD>
</TR>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right>AT&amp;T Labs Research</TD>
</TR>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right>Florham Park NJ</TD>
</TR>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right>August 04, 2002</TD>
</TR>
</TABLE>
<P>
</TD></TR></TBODY></TABLE>
</BODY>
</HTML>

79
repetition.dat Normal file
View file

@ -0,0 +1,79 @@
NOTE implicit vs. explicit repetitions : 2002-08-01
#
# Glenn Fowler <gsf@research.att.com>
# conforming matches (column 4) must match one of the following BREs
# NOMATCH
# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)*
# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)*
# i.e., each 3-tuple has two identical elements and one (?,?)
#
E ((..)|(.)) NULL NOMATCH
E ((..)|(.))((..)|(.)) NULL NOMATCH
E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH
E ((..)|(.)){1} NULL NOMATCH
E ((..)|(.)){2} NULL NOMATCH
E ((..)|(.)){3} NULL NOMATCH
E ((..)|(.))* NULL (0,0)
E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1)
E ((..)|(.))((..)|(.)) a NOMATCH
E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH
E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1)
E ((..)|(.)){2} a NOMATCH
E ((..)|(.)){3} a NOMATCH
E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1)
E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)
E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH
E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2)
E ((..)|(.)){3} aa NOMATCH
E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)
E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3)
E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3)
E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3)
E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3)
E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4)
E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?)
E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4)
E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?)
E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5)
E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?)
E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5)
E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5)
E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?)
E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?)
E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?)
E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?)

16
rightassoc.dat Normal file
View file

@ -0,0 +1,16 @@
NOTE left-assoc:pass-none right-assoc:pass-all : 2002-04-29
E (a|ab)(c|bcd)(d*) abcd (0,4)(0,2)(2,3)(3,4)
E (a|ab)(bcd|c)(d*) abcd (0,4)(0,2)(2,3)(3,4)
E (ab|a)(c|bcd)(d*) abcd (0,4)(0,2)(2,3)(3,4)
E (ab|a)(bcd|c)(d*) abcd (0,4)(0,2)(2,3)(3,4)
E (a*)(b|abc)(c*) abc (0,3)(0,1)(1,2)(2,3)
E (a*)(abc|b)(c*) abc (0,3)(0,1)(1,2)(2,3)
E (a*)(b|abc)(c*) abc (0,3)(0,1)(1,2)(2,3)
E (a*)(abc|b)(c*) abc (0,3)(0,1)(1,2)(2,3)
E (a|ab)(c|bcd)(d|.*) abcd (0,4)(0,2)(2,3)(3,4)
E (a|ab)(bcd|c)(d|.*) abcd (0,4)(0,2)(2,3)(3,4)
E (ab|a)(c|bcd)(d|.*) abcd (0,4)(0,2)(2,3)(3,4)
E (ab|a)(bcd|c)(d|.*) abcd (0,4)(0,2)(2,3)(3,4)

2121
testregex.c Normal file

File diff suppressed because it is too large Load diff

241
testregex.html Normal file
View file

@ -0,0 +1,241 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN" "http://www.w3.org/TR/REC-html40/frameset.dtd">
<HTML>
<HEAD>
<META name="generator" content="mm2html (AT&amp;T Research) 2010-09-10">
<META name="keywords" content="regular expression pattern match regression test">
<TITLE> ../re/testregex.mm mm document </TITLE>
<META name="author" content="gsf">
</HEAD>
<BODY bgcolor=white link=slateblue vlink=teal >
<TABLE border=0 align=center width=96%>
<TBODY><TR><TD valign=top align=left>
<!--INDEX--><!--/INDEX-->
<B><FONT size=-1 face="verdana,arial,helvetica,geneva,sans-serif">
<TABLE align=center cellpadding=2 border=4 bgcolor=lightgrey><TR>
<TD><A href="testregex.html#Reference Implementations">Reference Implementations</A></TD>
<TD><A href="testregex.html#Test Data Repository">Test Data Repository</A></TD>
<TD><A href="testregex.html#Usage">Usage</A></TD>
<TD><A href="testregex.html#Reference Implementation Notes">Reference Implementation Notes</A></TD>
<TD><A href="testregex.html#testregex Notes">testregex Notes</A></TD>
</TR></TABLE>
</FONT></B>
<P>
<HR>
<CENTER>
<H3><CENTER><FONT color=red><FONT face=courier>AT&amp;T Research regex(3) regression tests</FONT></FONT></CENTER></H3>
<BR>Glenn Fowler <SMALL>&lt;<A href=mailto:gsf@research.att.com>gsf@research.att.com</A>&gt;</SMALL>
<P><I>AT&amp;T Research - Florham Park NJ</I>
</CENTER>
<P><HR><P>
<A href="testregex.c">testregex.c 2004-05-31</A>
is the latest source for the AT&amp;T Research regression test
harness for the
<A href="http://www.opengroup.org/onlinepubs/007904975/functions/regcomp.html" target=_top>X/Open regex</A>
pattern match interface.
See
<NOBR><A href="http://web.archive.org/~gsf/man/man1/testregex.html"><STRONG>testregex</STRONG></A>(1)</NOBR>
for option and test input details.
The source and test data posted here are license free.
<P>
<STRONG>testregex</STRONG>
can:
<UL type=square>
<LI>
verify stability for a particular implementation in the face of
source code and/or compilation environment changes
<LI>
verify standard compliance for all implementations
<LI>
provide a basis for discussions on what
<EM>compliance</EM>
means
</UL>
<P>
See
<A href="re-interpretation.html">An Interpretation of the POSIX regex Standards</A>
for an analysis of the POSIX-X/Open
<STRONG>regex</STRONG>
standards.
<P>
<P><HR><CENTER><FONT color=red><FONT face=courier><H3><A name="Reference Implementations">Reference Implementations</A></H3></FONT></FONT></CENTER>
<STRONG>testregex</STRONG>
is currently built against these reference implementations:
<P></P><TABLE border=0 frame=void rules=none width=100%><TBODY><TR><TD>
<TABLE align=center bgcolor=papayawhip border=0 bordercolor=white cellpadding=2 cellspacing=2 frame=void rules=none >
<TBODY>
<TR><TD align=right>NAME&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;LABEL&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;AUTHORS</TD></TR>
<TR><TD align=right>
AT&amp;T ast&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;<A href="http://www.research.att.com/sw/download/" target=_top>A</A>&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;Glenn Fowler and Doug McIlroy</TD></TR>
<TR><TD align=right>
bsd&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;<A href="ftp://ftp.netbsd.org/pub/NetBSD/NetBSD-1.5.2/source/sets/src.tgz" target=_top>B</A>&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;&nbsp;</TD></TR>
<TR><TD align=right>
Bell Labs&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;<A href="http://www.bell-labs.com/" target=_top>D</A>&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;Doug McIlroy</TD></TR>
<TR><TD align=right>
old gnu&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;<A href="http://www.gnu.org" target=_top>G</A>&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;&nbsp;</TD></TR>
<TR><TD align=right>
gnu&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;<A href="http://www.gnu.org" target=_top>H</A>&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;Isamu Hasegawa</TD></TR>
<TR><TD align=right>
irix&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;<A href="http://www.sgi.com" target=_top>I</A>&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;&nbsp;</TD></TR>
<TR><TD align=right>
boost&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;<A href="http://www.boost.org/libs/regex/" target=_top>J</A>&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;John Maddock</TD></TR>
<TR><TD align=right>
regex++&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;<A href="http://ourworld.compuserve.com/homepages/John_Maddock/regexpp.htm" target=_top>M</A>&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;John Maddock</TD></TR>
<TR><TD align=right>
pcre perl compatible&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;<A href="http://www.pcre.org/" target=_top>P</A>&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;Philip Hazel</TD></TR>
<TR><TD align=right>
rx&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;<A href="ftp://regexps.com/pub/src/hackerlab/" target=_top>R</A>&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;Tom Lord</TD></TR>
<TR><TD align=right>
spencer&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;<A href="http://arglist.com/regex/rxspencer-alpha3.8.g2.tar.gz" target=_top>S</A>&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;Henry Spencer</TD></TR>
<TR><TD align=right>
libtre&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;<A href="http://kouli.iki.fi/~vlaurika/libtre/" target=_top>T</A>&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;Ville Laurikari</TD></TR>
<TR><TD align=right>
unix caldera&nbsp;&nbsp;</TD><TD align=center>&nbsp;&nbsp;<A href="http://unixtools.sourceforge.net/" target=_top>U</A>&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;&nbsp;</TD></TR>
</TBODY></TABLE></TD></TR></TBODY></TABLE>
<P>
<P><HR><CENTER><FONT color=red><FONT face=courier><H3><A name="Test Data Repository">Test Data Repository</A></H3></FONT></FONT></CENTER>
<P></P><TABLE border=0 frame=void rules=none width=100%><TBODY><TR><TD>
<TABLE align=center bgcolor=papayawhip border=0 bordercolor=white cellpadding=2 cellspacing=2 frame=void rules=none >
<TBODY>
<TR><TD align=right>
<A href="basic.dat">basic.dat</A>&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;&nbsp;&nbsp;basic regex(3) -- all implementations should pass these</TD></TR>
<TR><TD align=right>
<A href="categorize.dat">categorize.dat</A>&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;&nbsp;&nbsp;<A href="re-categorize.html">implementation categorization</A></TD></TR>
<TR><TD align=right>
<A href="nullsubexpr.dat">nullsubexpr.dat</A>&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;&nbsp;&nbsp;<A href="re-nullsubexpr.html">null (...)* tests</A></TD></TR>
<TR><TD align=right>
<A href="leftassoc.dat">leftassoc.dat</A>&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;&nbsp;&nbsp;<A href="re-assoc.html">left associative catenation implementation must pass these</A></TD></TR>
<TR><TD align=right>
<A href="rightassoc.dat">rightassoc.dat</A>&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;&nbsp;&nbsp;<A href="re-assoc.html">right associative catenation implementation must pass these</A></TD></TR>
<TR><TD align=right>
<A href="forcedassoc.dat">forcedassoc.dat</A>&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;&nbsp;&nbsp;<A href="re-assoc.html">subexpression grouping to force associativity</A></TD></TR>
<TR><TD align=right>
<A href="repetition.dat">repetition.dat</A>&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;&nbsp;&nbsp;<A href="re-repetition.html">explicit vs. implicit repetitions</A></TD></TR>
</TBODY></TABLE></TD></TR></TBODY></TABLE>
<P>
<P><HR><CENTER><FONT color=red><FONT face=courier><H3><A name="Usage">Usage</A></H3></FONT></FONT></CENTER>
To run the
<STRONG>basic.dat</STRONG>
tests:
<DIV style="padding-left:16px;text-indent:0px">
<PRE>
testregex &lt; basic.dat
</PRE>
</DIV>
<P>
If the local implementation hangs or dumps on some tests then run with
the <STRONG>-c</STRONG> option.
The <STRONG>-h</STRONG> option lists the test data format details.
The test data files exercise all features;
the test harness detects and ignores features not
supported by the local implementation.
<P>
<P><HR><CENTER><FONT color=red><FONT face=courier><H3><A name="Reference Implementation Notes">Reference Implementation Notes</A></H3></FONT></FONT></CENTER>
<P>
<H4><A name="D: diet libc">D: diet libc</A></H4>
The
<A href="http://www.fefe.de/dietlibc/" target=_top>diet libc</A>
implementation is currently omitted because it fails all but one
<STRONG>basic.dat</STRONG>
test.
<P>
<H4><A name="P: PCRE">P: PCRE</A></H4>
The
<STRONG>P</STRONG>
implementation emulates
<NOBR><A href="http://web.archive.org/~gsf/man/man1/perl.html"><STRONG>perl</STRONG></A>(1)</NOBR>
and is not X/Open compliant by design.
The main differences are:
<UL type=square>
<LI>
<STRONG>P</STRONG>
uses
<EM>leftmost-first</EM>
matching as opposed to the X/Open
<EM>leftmost-longest</EM>.
<LI>
<STRONG>REG_EXTENDED</STRONG>
patterns only.
</UL>
<P>
However, the
<STRONG>P</STRONG>
package regression tests, and
<NOBR><A href="http://web.archive.org/~gsf/man/man1/perl.html"><STRONG>perl</STRONG></A>(1)</NOBR>
features creeping into other implementations,
make it reasonable to include here.
<P>
<P><HR><CENTER><FONT color=red><FONT face=courier><H3><A name="testregex Notes">testregex Notes</A></H3></FONT></FONT></CENTER>
Extensions to the standard terminology are derived from the AT&amp;T
implementation, unified under
<STRONG>&lt;regex.h&gt;</STRONG>
with these modes:
<P></P><TABLE border=0 frame=void rules=none width=100%><TBODY><TR><TD>
<TABLE align=center bgcolor=papayawhip border=1 bordercolor=white cellpadding=2 cellspacing=2 frame=box rules=all >
<TBODY>
<TR><TD align=center>MODE&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;FLAGS&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;DESCRIPTION</TD></TR>
<TR><TD align=right>
BRE&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;0&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;basic RE</TD></TR>
<TR><TD align=right>
ERE&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;REG_EXTENDED&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;egrep RE with perl (...) extensions</TD></TR>
<TR><TD align=right>
ARE&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;REG_AUGMENTED&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;ERE with ! negation, &lt;&gt; word boundaries</TD></TR>
<TR><TD align=right>
SRE&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;REG_SHELL&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;sh patterns</TD></TR>
<TR><TD align=right>
KRE&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;REG_SHELL|REG_AUGMENTED&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;ksh93 patterns: ! @ ( | &amp; ) { }</TD></TR>
<TR><TD align=right>
LRE&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;REG_LITERAL&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;fgrep patterns</TD></TR>
</TBODY></TABLE></TD></TR></TBODY></TABLE>
<P>
and a few flags to handle
<NOBR><A href="http://web.archive.org/~gsf/man/man3/fnmatch.html"><STRONG>fnmatch</STRONG></A>(3):</NOBR>
<P></P><TABLE border=0 frame=void rules=none width=100%><TBODY><TR><TD>
<TABLE align=center bgcolor=papayawhip border=1 bordercolor=white cellpadding=2 cellspacing=2 frame=box rules=all >
<TBODY>
<TR><TD align=left>regex FLAG&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;fnmatch FLAG</TD></TR>
<TR><TD align=left>
REG_SHELL_ESCAPED&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;FNM_NOESCAPE</TD></TR>
<TR><TD align=left>
REG_SHELL_PATH&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;FNM_PATHNAME</TD></TR>
<TR><TD align=left>
REG_SHELL_DOT&nbsp;&nbsp;</TD><TD align=left>&nbsp;&nbsp;FNM_PERIOD</TD></TR>
</TBODY></TABLE></TD></TR></TBODY></TABLE>
<P>
The original
<TT>testregex.c</TT>
was done by Doug McIlroy at Bell Labs.
The current implementation is maintained by Glenn Fowler <SMALL>&lt;<A href=mailto:gsf@research.att.com>gsf@research.att.com</A>&gt;</SMALL>.
<P>
<HR>
<TABLE border=0 align=center width=96%>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right><A href="mailto:gsf@research.att.com?subject= ../re/testregex.mm mm document">Glenn Fowler</A></TD>
</TR>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right>Information and Software Systems Research</TD>
</TR>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right>AT&amp;T Labs Research</TD>
</TR>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right>Florham Park NJ</TD>
</TR>
<TR>
<TD align=left></TD>
<TD align=center></TD>
<TD align=right>March 22, 2011</TD>
</TR>
</TABLE>
<P>
</TD></TR></TBODY></TABLE>
</BODY>
</HTML>