Initial import from Archive.org Wayback Machine. (wget -mkEpnp http://web.archive.org/web/20130420020035id_/http://www2.research.att.com/~gsf/testregex/ and wget -mkEpnp http://web.archive.org/web/20130420020018id_/http://www2.research.att.com/~gsf/man/man1/testregex.html)
This commit is contained in:
commit
786fb2e904
15 changed files with 4388 additions and 0 deletions
216
basic.dat
Normal file
216
basic.dat
Normal file
|
@ -0,0 +1,216 @@
|
|||
NOTE all standard compliant implementations should pass these : 2002-05-31
|
||||
|
||||
BE abracadabra$ abracadabracadabra (7,18)
|
||||
BE a...b abababbb (2,7)
|
||||
BE XXXXXX ..XXXXXX (2,8)
|
||||
E \) () (1,2)
|
||||
BE a] a]a (0,2)
|
||||
B } } (0,1)
|
||||
E \} } (0,1)
|
||||
BE \] ] (0,1)
|
||||
B ] ] (0,1)
|
||||
E ] ] (0,1)
|
||||
B { { (0,1)
|
||||
B } } (0,1)
|
||||
BE ^a ax (0,1)
|
||||
BE \^a a^a (1,3)
|
||||
BE a\^ a^ (0,2)
|
||||
BE a$ aa (1,2)
|
||||
BE a\$ a$ (0,2)
|
||||
BE ^$ NULL (0,0)
|
||||
E $^ NULL (0,0)
|
||||
E a($) aa (1,2)(2,2)
|
||||
E a*(^a) aa (0,1)(0,1)
|
||||
E (..)*(...)* a (0,0)
|
||||
E (..)*(...)* abcd (0,4)(2,4)
|
||||
E (ab|a)(bc|c) abc (0,3)(0,2)(2,3)
|
||||
E (ab)c|abc abc (0,3)(0,2)
|
||||
E a{0}b ab (1,2)
|
||||
E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
|
||||
E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
|
||||
E a{9876543210} NULL BADBR
|
||||
E ((a|a)|a) a (0,1)(0,1)(0,1)
|
||||
E (a*)(a|aa) aaaa (0,4)(0,3)(3,4)
|
||||
E a*(a.|aa) aaaa (0,4)(2,4)
|
||||
E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2)
|
||||
E (a|b)?.* b (0,1)(0,1)
|
||||
E (a|b)c|a(b|c) ac (0,2)(0,1)
|
||||
E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2)
|
||||
E (a|b)*c|(a|ab)*c abc (0,3)(1,2)
|
||||
E (a|b)*c|(a|ab)*c xc (1,2)
|
||||
E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2)
|
||||
E a?(ab|ba)ab abab (0,4)(0,2)
|
||||
E a?(ac{0}b|ba)ab abab (0,4)(0,2)
|
||||
E ab|abab abbabab (0,2)
|
||||
E aba|bab|bba baaabbbaba (5,8)
|
||||
E aba|bab baaabbbaba (6,9)
|
||||
E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2)
|
||||
E (a.|.a.)*|(a|.a...) aa (0,2)(0,2)
|
||||
E ab|a xabc (1,3)
|
||||
E ab|a xxabc (2,4)
|
||||
Ei (Ab|cD)* aBcD (0,4)(2,4)
|
||||
BE [^-] --a (2,3)
|
||||
BE [a-]* --a (0,3)
|
||||
BE [a-m-]* --amoma-- (0,4)
|
||||
E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17)
|
||||
E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17)
|
||||
{E [[:upper:]] A (0,1) [[<element>]] not supported
|
||||
E [[:lower:]]+ `az{ (1,3)
|
||||
E [[:upper:]]+ @AZ[ (1,3)
|
||||
BE [[-]] [[-]] (2,4)
|
||||
BE [[.NIL.]] NULL ECOLLATE
|
||||
BE [[=aleph=]] NULL ECOLLATE
|
||||
}
|
||||
BE$ \n \n (0,1)
|
||||
BEn$ \n \n (0,1)
|
||||
BE$ [^a] \n (0,1)
|
||||
BE$ \na \na (0,2)
|
||||
E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3)
|
||||
BE xxx xxx (0,3)
|
||||
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6)
|
||||
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3)
|
||||
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11)
|
||||
E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1)
|
||||
E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2)
|
||||
E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81)
|
||||
E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25)
|
||||
E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22)
|
||||
E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11)
|
||||
BE$ .* \x01\xff (0,2)
|
||||
E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57)
|
||||
L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH
|
||||
E a*a*a*a*a*b aaaaaaaaab (0,10)
|
||||
BE ^ NULL (0,0)
|
||||
BE $ NULL (0,0)
|
||||
BE ^$ NULL (0,0)
|
||||
BE ^a$ a (0,1)
|
||||
BE abc abc (0,3)
|
||||
BE abc xabcy (1,4)
|
||||
BE abc ababc (2,5)
|
||||
BE ab*c abc (0,3)
|
||||
BE ab*bc abc (0,3)
|
||||
BE ab*bc abbc (0,4)
|
||||
BE ab*bc abbbbc (0,6)
|
||||
E ab+bc abbc (0,4)
|
||||
E ab+bc abbbbc (0,6)
|
||||
E ab?bc abbc (0,4)
|
||||
E ab?bc abc (0,3)
|
||||
E ab?c abc (0,3)
|
||||
BE ^abc$ abc (0,3)
|
||||
BE ^abc abcc (0,3)
|
||||
BE abc$ aabc (1,4)
|
||||
BE ^ abc (0,0)
|
||||
BE $ abc (3,3)
|
||||
BE a.c abc (0,3)
|
||||
BE a.c axc (0,3)
|
||||
BE a.*c axyzc (0,5)
|
||||
BE a[bc]d abd (0,3)
|
||||
BE a[b-d]e ace (0,3)
|
||||
BE a[b-d] aac (1,3)
|
||||
BE a[-b] a- (0,2)
|
||||
BE a[b-] a- (0,2)
|
||||
BE a] a] (0,2)
|
||||
BE a[]]b a]b (0,3)
|
||||
BE a[^bc]d aed (0,3)
|
||||
BE a[^-b]c adc (0,3)
|
||||
BE a[^]b]c adc (0,3)
|
||||
E ab|cd abc (0,2)
|
||||
E ab|cd abcd (0,2)
|
||||
E a\(b a(b (0,3)
|
||||
E a\(*b ab (0,2)
|
||||
E a\(*b a((b (0,4)
|
||||
E ((a)) abc (0,1)(0,1)(0,1)
|
||||
E (a)b(c) abc (0,3)(0,1)(2,3)
|
||||
E a+b+c aabbabc (4,7)
|
||||
E a* aaa (0,3)
|
||||
E (a*)* - (0,0)(0,0)
|
||||
E (a*)+ - (0,0)(0,0)
|
||||
E (a*|b)* - (0,0)(0,0)
|
||||
E (a+|b)* ab (0,2)(1,2)
|
||||
E (a+|b)+ ab (0,2)(1,2)
|
||||
E (a+|b)? ab (0,1)(0,1)
|
||||
BE [^ab]* cde (0,3)
|
||||
E (^)* - (0,0)(0,0)
|
||||
BE a* NULL (0,0)
|
||||
E ([abc])*d abbbcd (0,6)(4,5)
|
||||
E ([abc])*bcd abcd (0,4)(0,1)
|
||||
E a|b|c|d|e e (0,1)
|
||||
E (a|b|c|d|e)f ef (0,2)(0,1)
|
||||
E ((a*|b))* - (0,0)(0,0)(0,0)
|
||||
BE abcd*efg abcdefg (0,7)
|
||||
BE ab* xabyabbbz (1,3)
|
||||
BE ab* xayabbbz (1,2)
|
||||
E (ab|cd)e abcde (2,5)(2,4)
|
||||
BE [abhgefdc]ij hij (0,3)
|
||||
E (a|b)c*d abcd (1,4)(1,2)
|
||||
E (ab|ab*)bc abc (0,3)(0,1)
|
||||
E a([bc]*)c* abc (0,3)(1,3)
|
||||
E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4)
|
||||
E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4)
|
||||
E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4)
|
||||
E a[bcd]*dcdcde adcdcde (0,7)
|
||||
E (ab|a)b*c abc (0,3)(0,2)
|
||||
E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4)
|
||||
BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5)
|
||||
E ^a(bc+|b[eh])g|.h$ abh (1,3)
|
||||
E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5)
|
||||
E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2)
|
||||
E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6)
|
||||
E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)
|
||||
BE multiple words multiple words yeah (0,14)
|
||||
E (.*)c(.*) abcde (0,5)(0,2)(3,5)
|
||||
BE abcd abcd (0,4)
|
||||
E a(bc)d abcd (0,4)(1,3)
|
||||
E a[-]?c ac (0,3)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12)
|
||||
E a+(b|c)*d+ aabcdd (0,6)(3,4)
|
||||
E ^.+$ vivi (0,4)
|
||||
E ^(.+)$ vivi (0,4)(0,4)
|
||||
E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19)
|
||||
E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3)
|
||||
E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7)
|
||||
E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7)
|
||||
E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11)
|
||||
E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3)
|
||||
E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7)
|
||||
E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3)
|
||||
E ((foo)|bar)!bas bar!bas (0,7)(0,3)
|
||||
E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7)
|
||||
E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3)
|
||||
E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3)
|
||||
E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7)
|
||||
E (foo|(bar))!bas foo!bas (0,7)(0,3)
|
||||
E (foo|bar)!bas bar!bas (0,7)(0,3)
|
||||
E (foo|bar)!bas foo!bar!bas (4,11)(4,7)
|
||||
E (foo|bar)!bas foo!bas (0,7)(0,3)
|
||||
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
|
||||
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3)
|
||||
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7)
|
||||
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11)
|
||||
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7)
|
||||
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3)
|
||||
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7)
|
||||
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
|
||||
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7)
|
||||
E .*(/XXX).* /XXX (0,4)(0,4)
|
||||
E .*(\\XXX).* \XXX (0,4)(0,4)
|
||||
E \\XXX \XXX (0,4)
|
||||
E .*(/000).* /000 (0,4)(0,4)
|
||||
E .*(\\000).* \000 (0,4)(0,4)
|
||||
E \\000 \000 (0,4)
|
62
categorize.dat
Normal file
62
categorize.dat
Normal file
|
@ -0,0 +1,62 @@
|
|||
NOTE regex implementation categorization 2004-05-31
|
||||
|
||||
?E aa* xaxaax (1,2) POSITION=leftmost
|
||||
; POSITION=bug
|
||||
|
||||
?E (a*)(ab)*(b*) abc (0,2)(0,1)(?,?)(1,2) ASSOCIATIVITY=right
|
||||
|E (a*)(ab)*(b*) abc (0,2)(0,0)(0,2)(2,2) ASSOCIATIVITY=left
|
||||
; ASSOCIATIVITY=bug
|
||||
|
||||
?E ((a*)(ab)*)((b*)(a*)) aba (0,3)(0,2)(0,0)(0,2)(2,3)(2,2)(2,3) SUBEXPRESSION=precedence
|
||||
|E ((a*)(ab)*)((b*)(a*)) aba (0,3)(0,1)(0,1)(?,?)(1,3)(1,2)(2,3) SUBEXPRESSION=grouping
|
||||
; SUBEXPRESSION=bug
|
||||
|
||||
?E (...?.?)* xxxxxx (0,6)(4,6) REPEAT_LONGEST=first
|
||||
|E (...?.?)* xxxxxx (0,6)(2,6) REPEAT_LONGEST=last
|
||||
|E (...?.?)* xxxxxx OK REPEAT_LONGEST=unknown
|
||||
; REPEAT_LONGEST=bug
|
||||
|
||||
?E (a|ab)(bc|c) abcabc (0,3)(0,2)(2,3) EXPECTED
|
||||
|E (a|ab)(bc|c) abcabc (0,3)(0,1)(1,3) BUG=alternation-order
|
||||
; BUG=alternation-order-UNKNOWN
|
||||
|
||||
?E (aba|a*b)(aba|a*b) ababa (0,5)(0,2)(2,5) EXPECTED
|
||||
|E (aba|a*b)(aba|a*b) ababa (0,4)(0,3)(3,4) BUG=first-match
|
||||
; BUG=unknown-match
|
||||
|
||||
?B a\(b\)*\1 a NOMATCH EXPECTED
|
||||
|B a\(b\)*\1 a (0,1) BUG=nomatch-match
|
||||
|B a\(b\)*\1 abab (0,2)(1,2) # BUG=repeat-any
|
||||
; BUG=nomatch-match-UNKNOWN
|
||||
|
||||
?E (a*){2} xxxxx (0,0)(0,0) EXPECTED
|
||||
|E (a*){2} xxxxx (5,5)(5,5) BUG=range-null
|
||||
; BUG=range-null-UNKNOWN
|
||||
|
||||
?B a\(b\)*\1 abab NOMATCH EXPECTED
|
||||
|B a\(b\)*\1 abab (0,1) # BUG=nomatch-match
|
||||
|B a\(b\)*\1 abab (0,2)(1,2) BUG=repeat-any
|
||||
; BUG=repeat-any-UNKNOWN
|
||||
|
||||
?E (a*)* a (0,1)(0,1) EXPECTED
|
||||
|E (a*)* ax (0,1)(0,1) BUG=repeat-null-unknown
|
||||
|E (a*)* a (0,1)(1,1) BUG=repeat-null
|
||||
; BUG=repeat-null-UNKNOWN
|
||||
|
||||
?E (aba|a*b)* ababa (0,5)(2,5) EXPECTED
|
||||
|E (aba|a*b)* ababa (0,5)(3,4) BUG=repeat-short
|
||||
|E (aba|a*b)* ababa (0,4)(3,4) # LENGTH=first
|
||||
; BUG=repeat-short-UNKNOWN
|
||||
|
||||
?E (a(b)?)+ aba (0,3)(2,3) EXPECTED
|
||||
|E (a(b)?)+ aba (0,3)(2,3)(1,2) BUG=repeat-artifact
|
||||
; BUG=repeat-artifact-UNKNOWN
|
||||
|
||||
?B \(a\(b\)*\)*\2 abab NOMATCH EXPECTED
|
||||
|B \(a\(b\)*\)*\2 abab (0,4)(2,3)(1,2) BUG=repeat-artifact-nomatch
|
||||
; BUG=repeat-artifact-nomatch-UNKNOWN
|
||||
|
||||
?E (a?)((ab)?)(b?)a?(ab)?b? abab (0,4)(0,1)(1,1)(?,?)(1,2)(2,4) BUG=subexpression-first
|
||||
|E .*(.*) ab (0,2)(2,2) EXPECTED
|
||||
|E .*(.*) ab (0,2)(0,2) BUG=subexpression-first
|
||||
; BUG=subexpression-first-UNKNOWN
|
30
forcedassoc.dat
Normal file
30
forcedassoc.dat
Normal file
|
@ -0,0 +1,30 @@
|
|||
NOTE left-assoc:pass-all right-assoc:pass-all : 2002-04-29
|
||||
|
||||
E (a|ab)(c|bcd) abcd (0,4)(0,1)(1,4)
|
||||
E (a|ab)(bcd|c) abcd (0,4)(0,1)(1,4)
|
||||
E (ab|a)(c|bcd) abcd (0,4)(0,1)(1,4)
|
||||
E (ab|a)(bcd|c) abcd (0,4)(0,1)(1,4)
|
||||
E ((a|ab)(c|bcd))(d*) abcd (0,4)(0,4)(0,1)(1,4)(4,4)
|
||||
E ((a|ab)(bcd|c))(d*) abcd (0,4)(0,4)(0,1)(1,4)(4,4)
|
||||
E ((ab|a)(c|bcd))(d*) abcd (0,4)(0,4)(0,1)(1,4)(4,4)
|
||||
E ((ab|a)(bcd|c))(d*) abcd (0,4)(0,4)(0,1)(1,4)(4,4)
|
||||
E (a|ab)((c|bcd)(d*)) abcd (0,4)(0,2)(2,4)(2,3)(3,4)
|
||||
E (a|ab)((bcd|c)(d*)) abcd (0,4)(0,2)(2,4)(2,3)(3,4)
|
||||
E (ab|a)((c|bcd)(d*)) abcd (0,4)(0,2)(2,4)(2,3)(3,4)
|
||||
E (ab|a)((bcd|c)(d*)) abcd (0,4)(0,2)(2,4)(2,3)(3,4)
|
||||
E (a*)(b|abc) abc (0,3)(0,0)(0,3)
|
||||
E (a*)(abc|b) abc (0,3)(0,0)(0,3)
|
||||
E ((a*)(b|abc))(c*) abc (0,3)(0,3)(0,0)(0,3)(3,3)
|
||||
E ((a*)(abc|b))(c*) abc (0,3)(0,3)(0,0)(0,3)(3,3)
|
||||
E (a*)((b|abc)(c*)) abc (0,3)(0,1)(1,3)(1,2)(2,3)
|
||||
E (a*)((abc|b)(c*)) abc (0,3)(0,1)(1,3)(1,2)(2,3)
|
||||
E (a*)(b|abc) abc (0,3)(0,0)(0,3)
|
||||
E (a*)(abc|b) abc (0,3)(0,0)(0,3)
|
||||
E ((a*)(b|abc))(c*) abc (0,3)(0,3)(0,0)(0,3)(3,3)
|
||||
E ((a*)(abc|b))(c*) abc (0,3)(0,3)(0,0)(0,3)(3,3)
|
||||
E (a*)((b|abc)(c*)) abc (0,3)(0,1)(1,3)(1,2)(2,3)
|
||||
E (a*)((abc|b)(c*)) abc (0,3)(0,1)(1,3)(1,2)(2,3)
|
||||
E (a|ab) ab (0,2)(0,2)
|
||||
E (ab|a) ab (0,2)(0,2)
|
||||
E (a|ab)(b*) ab (0,2)(0,2)(2,2)
|
||||
E (ab|a)(b*) ab (0,2)(0,2)(2,2)
|
16
leftassoc.dat
Normal file
16
leftassoc.dat
Normal file
|
@ -0,0 +1,16 @@
|
|||
NOTE left-assoc:pass-all right-assoc:pass-none : 2002-04-29
|
||||
|
||||
E (a|ab)(c|bcd)(d*) abcd (0,4)(0,1)(1,4)(4,4)
|
||||
E (a|ab)(bcd|c)(d*) abcd (0,4)(0,1)(1,4)(4,4)
|
||||
E (ab|a)(c|bcd)(d*) abcd (0,4)(0,1)(1,4)(4,4)
|
||||
E (ab|a)(bcd|c)(d*) abcd (0,4)(0,1)(1,4)(4,4)
|
||||
|
||||
E (a*)(b|abc)(c*) abc (0,3)(0,0)(0,3)(3,3)
|
||||
E (a*)(abc|b)(c*) abc (0,3)(0,0)(0,3)(3,3)
|
||||
E (a*)(b|abc)(c*) abc (0,3)(0,0)(0,3)(3,3)
|
||||
E (a*)(abc|b)(c*) abc (0,3)(0,0)(0,3)(3,3)
|
||||
|
||||
E (a|ab)(c|bcd)(d|.*) abcd (0,4)(0,1)(1,4)(4,4)
|
||||
E (a|ab)(bcd|c)(d|.*) abcd (0,4)(0,1)(1,4)(4,4)
|
||||
E (ab|a)(c|bcd)(d|.*) abcd (0,4)(0,1)(1,4)(4,4)
|
||||
E (ab|a)(bcd|c)(d|.*) abcd (0,4)(0,1)(1,4)(4,4)
|
142
man/man1/testregex.html
Normal file
142
man/man1/testregex.html
Normal file
|
@ -0,0 +1,142 @@
|
|||
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
|
||||
<HTML>
|
||||
<HEAD>
|
||||
<TITLE>testregex man document</TITLE>
|
||||
</HEAD>
|
||||
<BODY bgcolor=white>
|
||||
<PRE>
|
||||
NAME
|
||||
testregex - regex(3) test harness
|
||||
|
||||
SYNOPSIS
|
||||
testregex [ options ]
|
||||
|
||||
DESCRIPTION
|
||||
testregex reads regex(3) test specifications, one per line, from the
|
||||
standard input and writes one output line for each failed test. A
|
||||
summary line is written after all tests are done. Each successful
|
||||
test is run again with REG_NOSUB. Unsupported features are noted
|
||||
before the first test, and tests requiring these features are
|
||||
silently ignored.
|
||||
|
||||
OPTIONS
|
||||
-c catch signals and non-terminating calls
|
||||
-e ignore error return mismatches
|
||||
-h list help on standard error
|
||||
-n do not repeat successful tests with regnexec()
|
||||
-o ignore match[] overrun errors
|
||||
-p ignore negative position mismatches
|
||||
-s use stack instead of malloc
|
||||
-x do not repeat successful tests with REG_NOSUB
|
||||
-v list each test line
|
||||
-A list failed test lines with actual answers
|
||||
-B list all test lines with actual answers
|
||||
-F list failed test lines
|
||||
-P list passed test lines
|
||||
-S output one summary line
|
||||
|
||||
INPUT FORMAT
|
||||
Input lines may be blank, a comment beginning with #, or a test
|
||||
specification. A specification is five fields separated by one
|
||||
or more tabs. NULL denotes the empty string and NIL denotes the
|
||||
0 pointer.
|
||||
|
||||
Field 1: the regex(3) flags to apply, one character per REG_feature
|
||||
flag. The test is skipped if REG_feature is not supported by the
|
||||
implementation. If the first character is not [BEASKLP] then the
|
||||
specification is a global control line. One or more of [BEASKLP] may be
|
||||
specified; the test will be repeated for each mode.
|
||||
|
||||
B basic BRE (grep, ed, sed)
|
||||
E REG_EXTENDED ERE (egrep)
|
||||
A REG_AUGMENTED ARE (egrep with negation)
|
||||
S REG_SHELL SRE (sh glob)
|
||||
K REG_SHELL|REG_AUGMENTED KRE (ksh glob)
|
||||
L REG_LITERAL LRE (fgrep)
|
||||
|
||||
a REG_LEFT|REG_RIGHT implicit ^...$
|
||||
b REG_NOTBOL lhs does not match ^
|
||||
c REG_COMMENT ignore space and #...\n
|
||||
d REG_SHELL_DOT explicit leading . match
|
||||
e REG_NOTEOL rhs does not match $
|
||||
f REG_MULTIPLE multiple \n separated patterns
|
||||
g FNM_LEADING_DIR testfnmatch only -- match until /
|
||||
h REG_MULTIREF multiple digit backref
|
||||
i REG_ICASE ignore case
|
||||
j REG_SPAN . matches \n
|
||||
k REG_ESCAPE \ to escape [...] delimiter
|
||||
l REG_LEFT implicit ^...
|
||||
m REG_MINIMAL minimal match
|
||||
n REG_NEWLINE explicit \n match
|
||||
o REG_ENCLOSED (|&) magic inside [@|&](...)
|
||||
p REG_SHELL_PATH explicit / match
|
||||
q REG_DELIMITED delimited pattern
|
||||
r REG_RIGHT implicit ...$
|
||||
s REG_SHELL_ESCAPED \ not special
|
||||
t REG_MUSTDELIM all delimiters must be specified
|
||||
u standard unspecified behavior -- errors not counted
|
||||
v REG_CLASS_ESCAPE \ special inside [...]
|
||||
w REG_NOSUB no subexpression match array
|
||||
x REG_LENIENT let some errors slide
|
||||
y REG_LEFT regexec() implicit ^...
|
||||
z REG_NULL NULL subexpressions ok
|
||||
$ expand C \c escapes in fields 2 and 3
|
||||
/ field 2 is a regsubcomp() expression
|
||||
= field 3 is a regdecomp() expression
|
||||
|
||||
Field 1 control lines:
|
||||
|
||||
C set LC_COLLATE and LC_CTYPE to locale in field 2
|
||||
|
||||
?test ... output field 5 if passed and != EXPECTED, silent otherwise
|
||||
&test ... output field 5 if current and previous passed
|
||||
|test ... output field 5 if current passed and previous failed
|
||||
; ... output field 2 if previous failed
|
||||
{test ... skip if failed until }
|
||||
} end of skip
|
||||
|
||||
: comment comment copied as output NOTE
|
||||
:comment:test :comment: ignored
|
||||
N[OTE] comment comment copied as output NOTE
|
||||
T[EST] comment comment
|
||||
|
||||
number use number for nmatch (20 by default)
|
||||
|
||||
Field 2: the regular expression pattern; SAME uses the pattern from
|
||||
the previous specification.
|
||||
|
||||
Field 3: the string to match.
|
||||
|
||||
Field 4: the test outcome. This is either one of the posix error
|
||||
codes (with REG_ omitted) or the match array, a list of (m,n)
|
||||
entries with m and n being first and last+1 positions in the
|
||||
field 3 string, or NULL if REG_NOSUB is in effect and success
|
||||
is expected. BADPAT is acceptable in place of any regcomp(3)
|
||||
error code. The match[] array is initialized to (-2,-2) before
|
||||
each test. All array elements from 0 to nmatch-1 must be specified
|
||||
in the outcome. Unspecified endpoints (offset -1) are denoted by ?.
|
||||
Unset endpoints (offset -2) are denoted by X. {x}(o:n) denotes a
|
||||
matched (?{...}) expression, where x is the text enclosed by {...},
|
||||
o is the expression ordinal counting from 1, and n is the length of
|
||||
the unmatched portion of the subject string. If x starts with a
|
||||
number then that is the return value of re_execf(), otherwise 0 is
|
||||
returned.
|
||||
|
||||
Field 5: optional comment appended to the report.
|
||||
|
||||
CAVEAT
|
||||
If a regex implementation misbehaves with memory then all bets are off.
|
||||
|
||||
CONTRIBUTORS
|
||||
Glenn Fowler gsf@research.att.com (ksh strmatch, regex extensions)
|
||||
David Korn dgk@research.att.com (ksh glob matcher)
|
||||
Doug McIlroy mcilroy@dartmouth.edu (ast regex/testre in C++)
|
||||
Tom Lord lord@regexps.com (rx tests)
|
||||
Henry Spencer henry@zoo.toronto.edu (original public regex)
|
||||
Andrew Hume andrew@research.att.com (gre tests)
|
||||
John Maddock John_Maddock@compuserve.com (regex++ tests)
|
||||
Philip Hazel ph10@cam.ac.uk (pcre tests)
|
||||
Ville Laurikari vl@iki.fi (libtre tests)
|
||||
</PRE>
|
||||
</BODY>
|
||||
</HTML>
|
73
nullsubexpr.dat
Normal file
73
nullsubexpr.dat
Normal file
|
@ -0,0 +1,73 @@
|
|||
NOTE null subexpression matches : 2002-06-06
|
||||
|
||||
E (a*)* a (0,1)(0,1)
|
||||
E SAME x (0,0)(0,0)
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaax (0,6)(0,6)
|
||||
E (a*)+ a (0,1)(0,1)
|
||||
E SAME x (0,0)(0,0)
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaax (0,6)(0,6)
|
||||
E (a+)* a (0,1)(0,1)
|
||||
E SAME x (0,0)
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaax (0,6)(0,6)
|
||||
E (a+)+ a (0,1)(0,1)
|
||||
E SAME x NOMATCH
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaax (0,6)(0,6)
|
||||
|
||||
E ([a]*)* a (0,1)(0,1)
|
||||
E SAME x (0,0)(0,0)
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaax (0,6)(0,6)
|
||||
E ([a]*)+ a (0,1)(0,1)
|
||||
E SAME x (0,0)(0,0)
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaax (0,6)(0,6)
|
||||
E ([^b]*)* a (0,1)(0,1)
|
||||
E SAME b (0,0)(0,0)
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaab (0,6)(0,6)
|
||||
E ([ab]*)* a (0,1)(0,1)
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME ababab (0,6)(0,6)
|
||||
E SAME bababa (0,6)(0,6)
|
||||
E SAME b (0,1)(0,1)
|
||||
E SAME bbbbbb (0,6)(0,6)
|
||||
E SAME aaaabcde (0,5)(0,5)
|
||||
E ([^a]*)* b (0,1)(0,1)
|
||||
E SAME bbbbbb (0,6)(0,6)
|
||||
E SAME aaaaaa (0,0)(0,0)
|
||||
E ([^ab]*)* ccccxx (0,6)(0,6)
|
||||
E SAME ababab (0,0)(0,0)
|
||||
|
||||
E ((z)+|a)* zabcde (0,2)(1,2)
|
||||
|
||||
{E a+? aaaaaa (0,1) no *? +? minimal match ops
|
||||
E (a) aaa (0,1)(0,1)
|
||||
E (a*?) aaa (0,0)(0,0)
|
||||
E (a)*? aaa (0,0)
|
||||
E (a*?)*? aaa (0,0)
|
||||
}
|
||||
|
||||
B \(a*\)*\(x\) x (0,1)(0,0)(0,1)
|
||||
B \(a*\)*\(x\) ax (0,2)(0,1)(1,2)
|
||||
B \(a*\)*\(x\) axa (0,2)(0,1)(1,2)
|
||||
B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1)
|
||||
B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2)
|
||||
B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3)
|
||||
B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4)
|
||||
B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3)
|
||||
|
||||
E (a*)*(x) x (0,1)(0,0)(0,1)
|
||||
E (a*)*(x) ax (0,2)(0,1)(1,2)
|
||||
E (a*)*(x) axa (0,2)(0,1)(1,2)
|
||||
|
||||
E (a*)+(x) x (0,1)(0,0)(0,1)
|
||||
E (a*)+(x) ax (0,2)(0,1)(1,2)
|
||||
E (a*)+(x) axa (0,2)(0,1)(1,2)
|
||||
|
||||
E (a*){2}(x) x (0,1)(0,0)(0,1)
|
||||
E (a*){2}(x) ax (0,2)(1,1)(1,2)
|
||||
E (a*){2}(x) axa (0,2)(1,1)(1,2)
|
64
re-assoc.html
Normal file
64
re-assoc.html
Normal file
|
@ -0,0 +1,64 @@
|
|||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN" "http://www.w3.org/TR/REC-html40/frameset.dtd">
|
||||
<HTML>
|
||||
<HEAD>
|
||||
<META name="generator" content="mm2html (AT&T Labs Research) 2005-10-15">
|
||||
<META name="keywords" content="regex catenation associativity tests">
|
||||
<TITLE> ../re/re-assoc.mm mm document </TITLE>
|
||||
<META name="author" content="gsf">
|
||||
</HEAD>
|
||||
<BODY bgcolor=white link=slateblue vlink=teal >
|
||||
<TABLE border=0 align=center width=96%>
|
||||
<TBODY><TR><TD valign=top align=left>
|
||||
<!--INDEX--><!--/INDEX-->
|
||||
<P>
|
||||
<HR>
|
||||
<CENTER>
|
||||
<H3><CENTER><FONT color=red><FONT face=courier>regex catenation associativity tests</FONT></FONT></CENTER></H3>
|
||||
<BR>Glenn Fowler <SMALL>&lt;<A href=mailto:gsf@research.att.com>gsf@research.att.com</A>&gt;</SMALL>
|
||||
<P><I>AT&T Labs Research - Florham Park NJ</I>
|
||||
</CENTER>
|
||||
<P><HR><P>
|
||||
The
|
||||
<STRONG>regex</STRONG>
|
||||
tests in
|
||||
{
|
||||
<A href="http://web.archive.org/web/20080724204655id_/http://www.research.att.com/~gsf/testregex/leftassoc.dat">leftassoc.dat</A>
|
||||
<A href="http://web.archive.org/web/20080724204655id_/http://www.research.att.com/~gsf/testregex/rightassoc.dat">rightassoc.dat</A>
|
||||
<A href="http://web.archive.org/web/20080724204655id_/http://www.research.att.com/~gsf/testregex/forcedassoc.dat">forcedassoc.dat</A>
|
||||
}
|
||||
exercise the associativity of catenation.
|
||||
<P>
|
||||
<HR>
|
||||
<TABLE border=0 align=center width=96%>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right><A href="mailto:gsf@research.att.com?subject= ../re/re-assoc.mm mm document">Glenn Fowler</A></TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right>Information and Software Systems Research</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right>AT&T Labs Research</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right>Florham Park NJ</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right>August 04, 2002</TD>
|
||||
</TR>
|
||||
</TABLE>
|
||||
<P>
|
||||
|
||||
</TD></TR></TBODY></TABLE>
|
||||
|
||||
</BODY>
|
||||
</HTML>
|
209
re-categorize.html
Normal file
209
re-categorize.html
Normal file
|
@ -0,0 +1,209 @@
|
|||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN" "http://www.w3.org/TR/REC-html40/frameset.dtd">
|
||||
<HTML>
|
||||
<HEAD>
|
||||
<META name="generator" content="mm2html (AT&amp;T Labs Research) 2005-10-15">
|
||||
<META name="keywords" content="regex implementation categorization">
|
||||
<TITLE> ../re/re-categorize.mm mm document </TITLE>
|
||||
<META name="author" content="gsf">
|
||||
</HEAD>
|
||||
<BODY bgcolor=white link=slateblue vlink=teal >
|
||||
<TABLE border=0 align=center width=96%>
|
||||
<TBODY><TR><TD valign=top align=left>
|
||||
<!--INDEX--><!--/INDEX-->
|
||||
<P>
|
||||
<HR>
|
||||
<CENTER>
|
||||
<H3><CENTER><FONT color=red><FONT face=courier>regex implementation categorization</FONT></FONT></CENTER></H3>
|
||||
<BR>Glenn Fowler <SMALL>&lt;<A href=mailto:gsf@research.att.com>gsf@research.att.com</A>&gt;</SMALL>
|
||||
<P><I>AT&amp;T Labs Research - Florham Park NJ</I>
|
||||
</CENTER>
|
||||
<P><HR><P>
|
||||
The
|
||||
<STRONG>regex</STRONG>
|
||||
tests in
|
||||
<A href="http://web.archive.org/web/20080726034626id_/http://www.research.att.com/~gsf/testregex/categorize.dat">categorize.dat</A>
|
||||
attempt to categorize
|
||||
<STRONG>regex</STRONG>
|
||||
implementations.
|
||||
The tests do not address internationalization.
|
||||
All implementations report the leftmost match; this is omitted from the table.
|
||||
<P></P><TABLE border=0 frame=void rules=none width=100%><TBODY><TR><TD>
|
||||
<TABLE align=center bgcolor=papayawhip border=0 bordercolor=white cellpadding=2 cellspacing=2 frame=void rules=none >
|
||||
<TBODY>
|
||||
<TR><TD align=center>LABEL </TD><TD align=center> ASSOC </TD><TD align=center> SUBEXPR </TD><TD align=center> REP_LONGEST </TD><TD align=center> BUGS</TD></TR>
|
||||
<TR><TD align=center>
|
||||
A </TD><TD align=center> right </TD><TD align=center> precedence </TD><TD align=center> first </TD><TD align=center> -</TD></TR>
|
||||
<TR><TD align=center>
|
||||
B </TD><TD align=center> right </TD><TD align=center> grouping </TD><TD align=center> first </TD><TD align=center> repeat-null repeat-short repeat-artifact-nomatch</TD></TR>
|
||||
<TR><TD align=center>
|
||||
D </TD><TD align=center> right </TD><TD align=center> grouping </TD><TD align=center> first </TD><TD align=center> -</TD></TR>
|
||||
<TR><TD align=center>
|
||||
G </TD><TD align=center> right </TD><TD align=center> grouping </TD><TD align=center> first </TD><TD align=center> alternation-order repeat-null repeat-artifact repeat-artifact-nomatch</TD></TR>
|
||||
<TR><TD align=center>
|
||||
H </TD><TD align=center> right </TD><TD align=center> grouping </TD><TD align=center> first </TD><TD align=center> alternation-order nomatch-match repeat-null repeat-artifact repeat-artifact-nomatch</TD></TR>
|
||||
<TR><TD align=center>
|
||||
I </TD><TD align=center> right </TD><TD align=center> grouping </TD><TD align=center> first </TD><TD align=center> repeat-any repeat-short repeat-artifact-nomatch</TD></TR>
|
||||
<TR><TD align=center>
|
||||
J </TD><TD align=center> right </TD><TD align=center> precedence </TD><TD align=center> last </TD><TD align=center> nomatch-match repeat-artifact repeat-artifact-nomatch subexpression-first</TD></TR>
|
||||
<TR><TD align=center>
|
||||
M </TD><TD align=center> right </TD><TD align=center> precedence </TD><TD align=center> last </TD><TD align=center> range-null repeat-artifact repeat-artifact-nomatch subexpression-first</TD></TR>
|
||||
<TR><TD align=center>
|
||||
O </TD><TD align=center> right </TD><TD align=center> grouping </TD><TD align=center> first </TD><TD align=center> repeat-null repeat-short repeat-artifact-nomatch</TD></TR>
|
||||
<TR><TD align=center>
|
||||
P </TD><TD align=center> right </TD><TD align=center> grouping </TD><TD align=center> first </TD><TD align=center> alternation-order first-match repeat-null repeat-artifact</TD></TR>
|
||||
<TR><TD align=center>
|
||||
R </TD><TD align=center> left </TD><TD align=center> precedence </TD><TD align=center> last </TD><TD align=center> -</TD></TR>
|
||||
<TR><TD align=center>
|
||||
S </TD><TD align=center> right </TD><TD align=center> grouping </TD><TD align=center> first </TD><TD align=center> repeat-null repeat-short repeat-artifact-nomatch</TD></TR>
|
||||
<TR><TD align=center>
|
||||
T </TD><TD align=center> left </TD><TD align=center> precedence </TD><TD align=center> last </TD><TD align=center> -</TD></TR>
|
||||
<TR><TD align=center>
|
||||
U </TD><TD align=center> right </TD><TD align=center> precedence </TD><TD align=center> first </TD><TD align=center> repeat-null subexpression-first</TD></TR>
|
||||
<TR><TD align=center>
|
||||
darwin.ppc </TD><TD align=center> right </TD><TD align=center> grouping </TD><TD align=center> first </TD><TD align=center> repeat-null repeat-short</TD></TR>
|
||||
<TR><TD align=center>
|
||||
freebsd.i386 </TD><TD align=center> right </TD><TD align=center> grouping </TD><TD align=center> first </TD><TD align=center> repeat-null repeat-short</TD></TR>
|
||||
<TR><TD align=center>
|
||||
hp.pa </TD><TD align=center> right </TD><TD align=center> grouping </TD><TD align=center> first </TD><TD align=center> repeat-artifact</TD></TR>
|
||||
<TR><TD align=center>
|
||||
ibm.risc </TD><TD align=center> right </TD><TD align=center> grouping </TD><TD align=center> first </TD><TD align=center> alternation-order nomatch-match repeat-artifact repeat-artifact-nomatch</TD></TR>
|
||||
<TR><TD align=center>
|
||||
linux.i386 </TD><TD align=center> right </TD><TD align=center> grouping </TD><TD align=center> first </TD><TD align=center> alternation-order repeat-artifact repeat-null</TD></TR>
|
||||
<TR><TD align=center>
|
||||
sgi.mips3 </TD><TD align=center> right </TD><TD align=center> grouping </TD><TD align=center> first </TD><TD align=center> repeat-short</TD></TR>
|
||||
<TR><TD align=center>
|
||||
sol8.sun4 </TD><TD align=center> right </TD><TD align=center> grouping </TD><TD align=center> first </TD><TD align=center> alternation-order nomatch-match repeat-artifact</TD></TR>
|
||||
<TR><TD align=center>
|
||||
unixware.i386 </TD><TD align=center> right </TD><TD align=center> precedence </TD><TD align=center> first </TD><TD align=center> repeat-null subexpression-first</TD></TR>
|
||||
</TBODY></TABLE></TD></TR></TBODY></TABLE>
|
||||
<P>
|
||||
The categories are:
|
||||
<DL COMPACT>
|
||||
<DL COMPACT>
|
||||
<DT><STRONG>LABEL</STRONG><DD>
|
||||
The implementation label from
|
||||
<A href="http://web.archive.org/web/20080726034626id_/http://www.research.att.com/~gsf/testregex/">testregex.</A>
|
||||
<DT><STRONG>ASSOC</STRONG><DD>
|
||||
Subpattern (or atom) associativity: either
|
||||
<STRONG>left</STRONG>
|
||||
or
|
||||
<STRONG>right</STRONG>.
|
||||
The subexpression match rule in the rationale requires
|
||||
<STRONG>right</STRONG>
|
||||
for expressions where each concatenated part is a subexpression.
|
||||
There is no definition for
|
||||
<EM>subpattern</EM>,
|
||||
but it would be inconsistent for any definition to require different
|
||||
associativity than that for subexpressions.
|
||||
Some claim that the BRE and ERE grammars specify
|
||||
<STRONG>left</STRONG>
|
||||
associativity, but this interpretation disregards
|
||||
the subexpression match rule in the rationale.
|
||||
The grammar can also be interpreted to support
|
||||
<STRONG>right</STRONG>
|
||||
associativity, and this interpretation is in accord with the rationale.
|
||||
<DT><STRONG>SUBEXPR</STRONG><DD>
|
||||
Subexpression semantics:
|
||||
<STRONG>precedence</STRONG>
|
||||
if subexpressions can override the default associativity;
|
||||
<STRONG>grouping</STRONG>
|
||||
if subexpressions are for repetition and
|
||||
<STRONG>regmatch_t</STRONG>
|
||||
grouping only.
|
||||
The subexpression match rule in the rationale requires
|
||||
<STRONG>precedence</STRONG>.
|
||||
<DT><STRONG>REP_LONGEST</STRONG><DD>
|
||||
How repeated subexpressions that match more than once are handled:
|
||||
<STRONG>first</STRONG>
|
||||
if the longest possible matches occur first;
|
||||
<STRONG>last</STRONG>
|
||||
if the longest possible matches occur last;
|
||||
<STRONG>unknown</STRONG>
|
||||
otherwise.
|
||||
The subexpression match rule in the rationale requires
|
||||
<STRONG>first</STRONG>.
|
||||
<DT><STRONG>BUGS</STRONG><DD>
|
||||
Miscellaneous bugs (see
|
||||
<A href="http://web.archive.org/web/20080726034626id_/http://www.research.att.com/~gsf/testregex/categorize.dat">categorize.dat</A>
|
||||
for specific examples):
|
||||
<DL COMPACT>
|
||||
<DL COMPACT>
|
||||
<DT><STRONG>alternation-order</STRONG><DD>
|
||||
A change in the order of subexpression alternation operands,
|
||||
<EM>not involved in a tie</EM>,
|
||||
changes
|
||||
<STRONG>regmatch_t</STRONG>
|
||||
values.
|
||||
Some implementations with this bug can be coaxed into missing the
|
||||
overall longest match.
|
||||
<DT><STRONG>first-match</STRONG><DD>
|
||||
The first of the leftmost matches, instead of the longest of the
|
||||
leftmost matches, is returned.
|
||||
<DT><STRONG>nomatch-match</STRONG><DD>
|
||||
A back-reference to a
|
||||
<STRONG>regmatch_t</STRONG>
|
||||
(-1,-1) value is treated as matching.
|
||||
<DT><STRONG>range-null</STRONG><DD>
|
||||
A range-repeated subexpression that matches null does not report the match
|
||||
at offset (0,0).
|
||||
<DT><STRONG>repeat-artifact</STRONG><DD>
|
||||
A
|
||||
<STRONG>regmatch_t</STRONG>
|
||||
value is reported for a repeated match that is not the last match.
|
||||
<DT><STRONG>repeat-artifact-nomatch</STRONG><DD>
|
||||
To prevent not matching,
|
||||
a
|
||||
<STRONG>regmatch_t</STRONG>
|
||||
value is reported for a repeated match that is not the last match.
|
||||
<DT><STRONG>repeat-null</STRONG><DD>
|
||||
A repeated subexpression matches the null string even though it is not
|
||||
the only match and is not necessary to satisfy the exact or minimum
|
||||
number of occurrences for an interval expression.
|
||||
<DT><STRONG>repeat-short</STRONG><DD>
|
||||
Incorrect
|
||||
<STRONG>regmatch_t</STRONG>
|
||||
values for a repeated subexpression.
|
||||
This may be a variant of
|
||||
<STRONG>repeat-artifact</STRONG>.
|
||||
<DT><STRONG>subexpression-first</STRONG><DD>
|
||||
A subexpression match takes precedence over a subpattern
|
||||
to its left.
|
||||
</DL>
|
||||
</DL>
|
||||
</DL>
|
||||
</DL>
|
||||
<P>
|
||||
<HR>
|
||||
<TABLE border=0 align=center width=96%>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right><A href="mailto:gsf@research.att.com?subject= ../re/re-categorize.mm mm document">Glenn Fowler</A></TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right>Information and Software Systems Research</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right>AT&amp;T Labs Research</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right>Florham Park NJ</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right>June 01, 2004</TD>
|
||||
</TR>
|
||||
</TABLE>
|
||||
<P>
|
||||
|
||||
</TD></TR></TBODY></TABLE>
|
||||
|
||||
</BODY>
|
||||
</HTML>
|
997
re-interpretation.html
Normal file
997
re-interpretation.html
Normal file
|
@ -0,0 +1,997 @@
|
|||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN" "http://www.w3.org/TR/REC-html40/frameset.dtd">
|
||||
<HTML>
|
||||
<HEAD>
|
||||
<META name="generator" content="mm2html (AT&amp;T Research) 2010-09-10">
|
||||
<META name="keywords" content="regex regular expression standard interpretation">
|
||||
<TITLE> ../re/re-interpretation.mm mm document </TITLE>
|
||||
<META name="author" content="gsf">
|
||||
</HEAD>
|
||||
<BODY bgcolor=white link=slateblue vlink=teal >
|
||||
<TABLE border=0 align=center width=96%>
|
||||
<TBODY><TR><TD valign=top align=left>
|
||||
<!--INDEX--><!--/INDEX-->
|
||||
<B><FONT size=-1 face="verdana,arial,helvetica,geneva,sans-serif">
|
||||
<TABLE align=center cellpadding=2 border=4 bgcolor=lightgrey><TR>
|
||||
<TD><A href="re-interpretation.html#Abstract">Abstract</A></TD>
|
||||
<TD><A href="re-interpretation.html#Background">Background</A></TD>
|
||||
<TD><A href="re-interpretation.html#Notation">Notation</A></TD>
|
||||
<TD><A href="re-interpretation.html#regex Glossary">regex Glossary</A></TD>
|
||||
<TD><A href="re-interpretation.html#A subexpression is ">A subexpression is </A></TD>
|
||||
<TD><A href="re-interpretation.html#A subpattern is ">A subpattern is </A></TD>
|
||||
<TD><A href="re-interpretation.html#The Dark Corners ">The Dark Corners </A></TD>
|
||||
<TD><A href="re-interpretation.html#Conclusion">Conclusion</A></TD>
|
||||
</TR></TABLE>
|
||||
</FONT></B>
|
||||
<P>
|
||||
<HR>
|
||||
<CENTER>
|
||||
<H3><CENTER><FONT color=red><FONT face=courier>An Interpretation of the POSIX regex Standard</FONT></FONT></CENTER></H3>
|
||||
<BR>Glenn Fowler <SMALL>&lt;<A href=mailto:gsf@research.att.com>gsf@research.att.com</A>&gt;</SMALL>
|
||||
<P><I>AT&amp;T Research - Florham Park NJ</I>
|
||||
</CENTER>
|
||||
<P>
|
||||
<CENTER><FONT color=red><FONT face=courier><H3 align=center><A name="Abstract">Abstract</A></H3></FONT></FONT></CENTER>
|
||||
Many passages in the POSIX
|
||||
<STRONG>regex</STRONG>
|
||||
standard seem to be open for interpretation.
|
||||
Differences between several published
|
||||
<A href="http://www.research.att.com/~gsf/testregex/" target=_top>implementations</A>
|
||||
of the
|
||||
<STRONG>regex</STRONG>
|
||||
API bear this out.
|
||||
Instead of relegating these differences to the
|
||||
<EM>undefined behavior</EM>
|
||||
bucket, this paper proposes a resolution to each
|
||||
by direct application of the standard text.
|
||||
|
||||
<P>
|
||||
<P><HR><CENTER><FONT color=red><FONT face=courier><H3><A name="Background">Background</A></H3></FONT></FONT></CENTER>
|
||||
The POSIX
|
||||
<STRONG>regex</STRONG>
|
||||
standard is spread across four documents:
|
||||
<P></P><TABLE border=0 frame=void rules=none width=100%><TBODY><TR><TD>
|
||||
<TABLE align=center bgcolor=papayawhip border=0 bordercolor=white cellpadding=2 cellspacing=2 >
|
||||
<TBODY>
|
||||
<TR><TD align=right>
|
||||
glossary </TD><TD align=center> G </TD><TD align=left> <A href="http://www.opengroup.org/onlinepubs/007904975/basedefs/xbd_chap03.html" target=_top>http://www.opengroup.org/onlinepubs/007904975/basedefs/xbd_chap03.html</A></TD></TR>
|
||||
<TR><TD align=right>
|
||||
api </TD><TD align=center> A </TD><TD align=left> <A href="http://www.opengroup.org/onlinepubs/007904975/functions/regcomp.html" target=_top>http://www.opengroup.org/onlinepubs/007904975/functions/regcomp.html</A></TD></TR>
|
||||
<TR><TD align=right>
|
||||
definition </TD><TD align=center> D </TD><TD align=left> <A href="http://www.opengroup.org/onlinepubs/007904975/basedefs/xbd_chap09.html" target=_top>http://www.opengroup.org/onlinepubs/007904975/basedefs/xbd_chap09.html</A></TD></TR>
|
||||
<TR><TD align=right>
|
||||
rationale </TD><TD align=center> R </TD><TD align=left> <A href="http://www.opengroup.org/onlinepubs/007904975/xrat/xbd_chap09.html" target=_top>http://www.opengroup.org/onlinepubs/007904975/xrat/xbd_chap09.html</A></TD></TR>
|
||||
</TBODY></TABLE></TD></TR></TBODY></TABLE>
|
||||
<P>
|
||||
It describes
|
||||
<STRONG>BRE</STRONG>s
|
||||
(basic regular expressions, a.k.a.,
|
||||
<NOBR><A href="http://web.archive.org/~gsf/man/man1/grep.html"><STRONG>grep</STRONG></A>(1)</NOBR>
|
||||
style) and
|
||||
<STRONG>ERE</STRONG>s
|
||||
(extended regular expressions, a.k.a.,
|
||||
<NOBR><A href="http://web.archive.org/~gsf/man/man1/egrep.html"><STRONG>egrep</STRONG></A>(1)</NOBR>
|
||||
style)
|
||||
and how an RE of each type matches subject strings.
|
||||
The standard also provides an API:
|
||||
<NOBR><A href="http://web.archive.org/~gsf/man/man3/regcomp.html"><STRONG>regcomp</STRONG></A>(3)</NOBR>
|
||||
for compiling an RE, and
|
||||
<NOBR><A href="http://web.archive.org/~gsf/man/man3/regexec.html"><STRONG>regexec</STRONG></A>(3)</NOBR>
|
||||
for matching a compiled RE against a subject string.
|
||||
The
|
||||
<STRONG>regexec</STRONG>
|
||||
API
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<PRE>
|
||||
int regexec(const regex_t* restrict preg, const char* restrict string,
|
||||
size_t nmatch, regmatch_t pmatch[restrict], int eflags);
|
||||
</PRE>
|
||||
</DIV>
|
||||
is at the center of multiple, conflicting interpretations of the standard.
|
||||
These interpretations differ on the setting of the
|
||||
<TT>pmatch[]</TT>
|
||||
array for index values > 0.
|
||||
This note presents examples that demonstrate interpretation conflicts,
|
||||
and then provides standard references that,
|
||||
<EM>when taken as a whole</EM>,
|
||||
resolve the conflicts.
|
||||
|
||||
<P>
|
||||
<P><HR><CENTER><FONT color=red><FONT face=courier><H3><A name="Notation">Notation</A></H3></FONT></FONT></CENTER>
|
||||
Standard references use the notation
|
||||
[<EM>document</EM>:<EM>begin</EM>[-<EM>end</EM>]]
|
||||
where
|
||||
<EM>document</EM>
|
||||
is the document letter, { A D G R }, from the table above,
|
||||
<EM>begin</EM>
|
||||
is the beginning line number, and
|
||||
<EM>end</EM>
|
||||
is the ending line number.
|
||||
Line numbers are taken from the 2001 X/Open printing.
|
||||
Unfortunately the online links do not display line numbers.
|
||||
For example, [A:37179-37180] is the reference for the
|
||||
<STRONG>regexec</STRONG>
|
||||
API prototype above.
|
||||
<P>
|
||||
Example patterns, subject strings, and
|
||||
<TT>pmatch[]</TT>
|
||||
array values use the regression test notation of
|
||||
<A href="http://www.research.att.com/~gsf/testregex/" target=_top>testregex.</A>
|
||||
You can download the source and compile it against your favorite regex
|
||||
implementation.
|
||||
All of the examples in this note have been placed in the file
|
||||
<A href="http://www.research.att.com/~gsf/testregex/interpretation.dat" target=_top>interpretation.dat;</A>
|
||||
you can download this file and use it as input to
|
||||
<STRONG>testregex</STRONG>.
|
||||
For example, the
|
||||
<STRONG>testregex</STRONG>
|
||||
input
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<PRE>
|
||||
:RE#01:E a+ xaax (1,3)
|
||||
</PRE>
|
||||
</DIV>
|
||||
specifies that the ERE pattern "a+" matched against the
|
||||
subject string "xaax" yields
|
||||
<TT>pmatch[0].rm_so==1</TT>
|
||||
and
|
||||
<TT>pmatch[0].rm_eo==3</TT>.
|
||||
The example is labeled RE#01 for indexing and referencing.
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<PRE>
|
||||
:RE#02:B .\(a*\). xaax (0,4)(1,3)
|
||||
</PRE>
|
||||
</DIV>
|
||||
specifies that the BRE pattern ".\(a*\)." matched against the subject
|
||||
string "xaax" yields
|
||||
<TT>pmatch[0].rm_so==0</TT>,
|
||||
<TT>pmatch[0].rm_eo==4</TT>,
|
||||
<TT>pmatch[1].rm_so==1</TT>,
|
||||
<TT>pmatch[1].rm_eo==3</TT>.
|
||||
(?,?) denotes
|
||||
<TT>rm_so</TT>
|
||||
and
|
||||
<TT>rm_eo</TT>
|
||||
values of -1, i.e., a non-match.
|
||||
The first field allows additional flags that exercise all of the
|
||||
<STRONG>REG_*</STRONG>
|
||||
<STRONG>regcomp</STRONG>
|
||||
and
|
||||
<STRONG>regexec</STRONG>
|
||||
flags; see
|
||||
<NOBR><A href="http://web.archive.org/~gsf/man/man1/testregex.html"><STRONG>testregex</STRONG></A>(1)</NOBR>
|
||||
or
|
||||
<STRONG>testregex --man</STRONG>
|
||||
for details.
|
||||
Note that
|
||||
<STRONG>tab</STRONG>
|
||||
is the field separator in the
|
||||
<STRONG>testregex</STRONG>
|
||||
syntax; if you mouse snarf then make sure that
|
||||
<STRONG>tabs</STRONG>
|
||||
are preserved.
|
||||
|
||||
<P>
|
||||
<P><HR><CENTER><FONT color=red><FONT face=courier><H3><A name="regex Glossary">regex Glossary</A></H3></FONT></FONT></CENTER>
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<DL COMPACT>
|
||||
<DT>[G:41]<STRONG>Basic Regular Expression (BRE)</STRONG><DD>
|
||||
A regular expression used by the majority of utilities that select strings
|
||||
from a set of character strings.
|
||||
<DT>[G:148]<STRONG>Entire Regular Expression</STRONG><DD>
|
||||
The concatenated set of one or more basic regular expressions or extended
|
||||
regular expressions that make up the pattern specified for string selection.
|
||||
<DT>[G:158]<STRONG>Extended Regular Expression (ERE)</STRONG><DD>
|
||||
A regular expression that is an alternative to the Basic Regular
|
||||
Expression using a more extensive syntax, occasionally used by some utilities.
|
||||
<DT>[G:269]<STRONG>Pattern</STRONG><DD>
|
||||
A sequence of characters used either with regular expression notation or for
|
||||
pathname expansion, as a means of selecting various character strings or
|
||||
pathnames, respectively.
|
||||
<DT>[G:316]<STRONG>Regular Expression</STRONG><DD>
|
||||
A pattern that selects specific strings from a set of character strings.
|
||||
</DL>
|
||||
</DIV>
|
||||
|
||||
<P>
|
||||
<P><HR><CENTER><FONT color=red><FONT face=courier><H3><A name="A subexpression is ">A subexpression is </A></H3></FONT></FONT></CENTER>
|
||||
The
|
||||
<STRONG>regex</STRONG>
|
||||
standard is surprisingly cavalier with terminology:
|
||||
some terms are used interchangeably, some are used in a general context
|
||||
in one section and a specific context in another, and some are
|
||||
used without any definition whatsoever.
|
||||
Acutely subject to this abuse are:
|
||||
<EM>RE</EM>,
|
||||
<EM>pattern</EM>,
|
||||
<EM>subpattern</EM>,
|
||||
<EM>expression</EM>,
|
||||
and
|
||||
<EM>subexpression</EM>.
|
||||
In particular,
|
||||
<EM>subpattern</EM>
|
||||
and
|
||||
<EM>subexpression</EM>
|
||||
are central to the description of the matching algorithm and how
|
||||
<TT>pmatch[]</TT>
|
||||
is assigned.
|
||||
Any interpretation of the
|
||||
<STRONG>regex</STRONG>
|
||||
standard involving these terms, absent a precise and accurate definition
|
||||
for each, is useless.
|
||||
<P>
|
||||
<EM>subexpression</EM>
|
||||
appears 70 times, and each reference is in the context of parenthesis grouping:
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<DL COMPACT>
|
||||
<DT>[D:5909-5911]<DD>
|
||||
For example, matching the BRE "\(.*\).*" against "abcdef" , the
|
||||
subexpression "(\1)" is "abcdef" , and matching the BRE
|
||||
"\(a*\)*" against "bc" , the subexpression "(\1)" is the null
|
||||
string.
|
||||
<DT>[D:5984-5988]<DD>
|
||||
The asterisk shall be special except when used: As the first
|
||||
character of a subexpression (after an initial '^' , if any);
|
||||
<DT>[D:6094-6097]<DD>
|
||||
A subexpression can be defined within a BRE by enclosing it
|
||||
between the character pairs "\(" and "\)" . Subexpressions can
|
||||
be arbitrarily nested.
|
||||
<DT>[D:6100-6109]<DD>
|
||||
The character 'n' shall be a digit from 1 through 9, specifying
|
||||
the nth subexpression (the one that begins with the nth "\("
|
||||
from the beginning of the pattern and ends with the
|
||||
corresponding paired "\)" ). The expression is invalid if less
|
||||
than n subexpressions precede the '\n' . For example, the
|
||||
expression "\(.*\)\1$" matches a line consisting of two
|
||||
adjacent appearances of the same string, and the expression
|
||||
"\(a\)*\1" fails to match 'a' . When the referenced
|
||||
subexpression matched more than one string, the back-referenced
|
||||
expression shall refer to the last matched string. If the
|
||||
subexpression referenced by the back-reference matches more
|
||||
than one string because of an asterisk ( '*' ) or an interval
|
||||
expression (see item (5)), the back-reference shall match the
|
||||
last (rightmost) of these strings.
|
||||
<DT>[D:6110-6112]<DD>
|
||||
When a BRE matching a single character, a subexpression, or a
|
||||
back-reference is followed by the special character asterisk ('*' ),
|
||||
together with that asterisk it shall match what zero or
|
||||
more consecutive occurrences of the BRE would match.
|
||||
<DT>[D:6114-6117]<DD>
|
||||
When a BRE matching a single character, a subexpression, or a
|
||||
back-reference is followed by an interval expression of the
|
||||
format "\{m\}" , "\{m,\}" , or "\{m,n\}" , together with that
|
||||
interval expression it shall match what repeated consecutive
|
||||
occurrences of the BRE would match.
|
||||
<DT>[D:6127-6129]<DD>
|
||||
A subexpression repeated by an asterisk ('*') or an interval expression
|
||||
shall not match a null expression unless this is the only match for the
|
||||
repetition or it is necessary to satisfy the exact or minimum number of
|
||||
occurrences for the interval expression.
|
||||
<DT>[D:6136]<DD>
|
||||
Subexpressions/back-references \(\) \n
|
||||
<DT>[D:6145-6151]<DD>
|
||||
The implementation may treat the circumflex as an anchor when
|
||||
used as the first character of a subexpression. The circumflex
|
||||
shall anchor the
|
||||
expression (or optionally subexpression) to the beginning of a
|
||||
string; only sequences starting at the first character of a
|
||||
string shall be matched by the BRE. For example, the BRE "^ab"
|
||||
matches "ab" in the string "abcdef" , but fails to match in the
|
||||
string "cdefab" . The BRE "\(^ab\)" may match the former
|
||||
string. A portable BRE shall escape a leading circumflex in a
|
||||
subexpression to match a literal circumflex.
|
||||
<DT>[D:6152-6156]<DD>
|
||||
A dollar sign ( '$' ) shall be an anchor when used as the last
|
||||
character of an entire BRE. The implementation may treat a
|
||||
dollar sign as an anchor when used as the last character of a
|
||||
subexpression. The dollar sign shall anchor the expression (or
|
||||
optionally subexpression) to the end of the string being matched;
|
||||
the dollar sign can be said to match the end-of-string following
|
||||
the last character.
|
||||
<DT>[D:6265-6270]<DD>
|
||||
A circumflex ( '^' ) outside a bracket expression shall anchor
|
||||
the expression or subexpression it begins to the beginning of a
|
||||
string; such an expression or subexpression can match only a
|
||||
sequence starting at the first character of a string. For
|
||||
example, the EREs "^ab" and "(^ab)" match "ab" in the string
|
||||
"abcdef" , but fail to match in the string "cdefab" , and the
|
||||
ERE "a^b" is valid, but can never match because the 'a'
|
||||
prevents the expression "^b" from matching starting at the
|
||||
first character.
|
||||
<DT>[D:6271-6276]<DD>
|
||||
A dollar sign ( '$' ) outside a bracket expression shall anchor
|
||||
the expression or subexpression it ends to the end of a string;
|
||||
such an expression or subexpression can match only a sequence
|
||||
ending at the last character of a string. For example, the EREs
|
||||
"ef$" and "(ef$)" match "ef" in the string "abcdef" , but fail
|
||||
to match in the string "cdefab" , and the ERE "e$f" is valid,
|
||||
but can never match because the 'f' prevents the expression
|
||||
"e$" from matching ending at the last character.
|
||||
<DT>[R:2359-2370]<DD>
|
||||
It is possible to determine what strings correspond to
|
||||
subexpressions by recursively applying the leftmost longest
|
||||
rule to each subexpression, but only with the proviso that the
|
||||
overall match is leftmost longest. For example, matching
|
||||
"\(ac*\)c*d[ac]*\1" against acdacaaa matches acdacaaa (with
|
||||
\1=a); simply matching the longest match for "\(ac*\)" would
|
||||
yield \1=ac, but the overall match would be smaller (acdac).
|
||||
Conceptually, the implementation must examine every possible
|
||||
match and among those that yield the leftmost longest total
|
||||
matches, pick the one that does the longest match for the
|
||||
leftmost subexpression, and so on. Note that this means that
|
||||
matching by subexpressions is context-dependent: a
|
||||
subexpression within a larger RE may match a different string
|
||||
from the one it would match as an independent RE, and two
|
||||
instances of the same subexpression within the same larger RE
|
||||
may match different lengths even in similar sequences of
|
||||
characters. For example, in the ERE "(a.*b)(a.*b)" , the two
|
||||
identical subexpressions would match four and six characters,
|
||||
respectively, of accbaccccb.
|
||||
<DT>[R:2512-2520]<DD>
|
||||
The limit of nine back-references to subexpressions in the RE
|
||||
is based on the use of a single-digit identifier; increasing
|
||||
this to multiple digits would break historical applications.
|
||||
This does not imply that only nine subexpressions are allowed
|
||||
in REs. The following is a valid BRE with ten subexpressions:
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<PRE>
|
||||
\(\(\(ab\)*c\)*d\)\(ef\)*\(gh\)\{2\}\(ij\)*\(kl\)*\(mn\)*\(op\)*\(qr\)*
|
||||
</PRE>
|
||||
</DIV>
|
||||
The standard developers regarded the common historical
|
||||
behavior, which supported "\n*" , but not "\n\{min,max\}" ,
|
||||
"\(...\)*" , or "\(...\)\{min,max\}" , as a non-intentional
|
||||
result of a specific implementation, and they supported both
|
||||
duplication and interval expressions following subexpressions
|
||||
and back-references.
|
||||
<DT>[R:2537-2544]<DD>
|
||||
However, one relatively uncommon case was changed to allow an
|
||||
extension used on some implementations. Historically, the BREs
|
||||
"^foo" and "\(^foo\)" did not match the same string, despite
|
||||
the general rule that subexpressions and entire BREs match the
|
||||
same strings. To increase consensus, IEEE Std 1003.1-2001 has
|
||||
allowed an extension on some implementations to treat these two
|
||||
cases in the same way by declaring that anchoring may occur at
|
||||
the beginning or end of a subexpression. Therefore, portable
|
||||
BREs that require a literal circumflex at the beginning or a
|
||||
dollar sign at the end of a subexpression must escape them.
|
||||
Note that a BRE such as "a\(^bc\)" will either match "a^bc" or
|
||||
nothing on different systems under the rules.
|
||||
<DT>[R:2549-2554]<DD>
|
||||
Some implementations have extended the BRE syntax to add
|
||||
alternation. For example, the subexpression "\(foo$\|bar\)"
|
||||
would match either "foo" at the end of the string or "bar"
|
||||
anywhere. The extension is triggered by the use of the
|
||||
undefined "\|" sequence. Because the BRE is undefined for
|
||||
portable scripts, the extending system is free to make other
|
||||
assumptions, such that the '$' represents the end-of-line
|
||||
anchor in the middle of a subexpression. If it were not for the
|
||||
extension, the '$' would match a literal dollar sign under the
|
||||
rules.
|
||||
<DT>[R:2617-2620]<DD>
|
||||
The removal of the Back_open_paren Back_close_paren option from
|
||||
the nondupl_RE specification is the result of PASC
|
||||
Interpretation 1003.2-92 #43 submitted for the ISO POSIX-2:1993
|
||||
standard. Although the grammar required support for null
|
||||
subexpressions, this section does not describe the meaning of,
|
||||
and historical practice did not support, this construct.
|
||||
<DT>[A:37188]<DD>
|
||||
size_t re_nsub Number of parenthesized subexpressions
|
||||
<DT>[A:37206-37208]<DD>
|
||||
If the REG_NOSUB flag was not set in cflags, then regcomp()
|
||||
shall set re_nsub to the number of parenthesized subexpressions
|
||||
(delimited by "\(\)" in basic regular expressions or "()" in
|
||||
extended regular expressions) found in pattern.
|
||||
<DT>[A:37220-37257]<DD>
|
||||
If nmatch is 0 or REG_NOSUB was set in the cflags argument to
|
||||
regcomp(), then regexec() shall ignore the pmatch argument.
|
||||
Otherwise, the application shall ensure that the pmatch
|
||||
argument points to an array with at least nmatch elements, and
|
||||
regexec() shall fill in the elements of that array with offsets
|
||||
of the substrings of string that correspond to the
|
||||
parenthesized subexpressions of pattern: pmatch[i].rm_so
|
||||
shall be the byte offset of the beginning and pmatch[i].rm_eo
|
||||
shall be one greater than the byte offset of the end of
|
||||
substring i. (Subexpression i begins at the ith matched open
|
||||
parenthesis, counting from 1.) Offsets in pmatch[0] identify
|
||||
the substring that corresponds to the entire regular
|
||||
expression. Unused elements of pmatch up to pmatch[nmatch-1]
|
||||
shall be filled with -1. If there are more than nmatch
|
||||
subexpressions in pattern ( pattern itself counts as a
|
||||
subexpression), then regexec() shall still do the match, but
|
||||
shall record only the first nmatch substrings.
|
||||
<P>
|
||||
When matching a basic or extended regular expression, any given
|
||||
parenthesized subexpression of pattern might participate in the
|
||||
match of several different substrings of string, or it might
|
||||
not match any substring even though the pattern as a whole did
|
||||
match. The following rules shall be used to determine which
|
||||
substrings to report in pmatch when matching regular
|
||||
expressions:
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<OL>
|
||||
<LI>
|
||||
If subexpression i in a regular expression is not contained
|
||||
within another subexpression, and it participated in the match
|
||||
several times, then the byte offsets in pmatch[i] shall
|
||||
delimit the last such match.
|
||||
<LI>
|
||||
If subexpression i is not contained within another
|
||||
subexpression, and it did not participate in an otherwise
|
||||
successful match, the byte offsets in pmatch[i] shall be -1. A
|
||||
subexpression does not participate in the match when:
|
||||
<PRE>
|
||||
'*' or "\{\}" appears immediately after the
|
||||
subexpression in a basic regular expression, or '*' ,
|
||||
'?' , or "{}" appears immediately after the
|
||||
subexpression in an extended regular expression, and
|
||||
the subexpression did not match (matched 0 times)
|
||||
<P>
|
||||
or:
|
||||
<P>
|
||||
'|' is used in an extended regular expression to select
|
||||
this subexpression or another, and the other
|
||||
subexpression matched.
|
||||
</PRE>
|
||||
<LI>
|
||||
If subexpression i is contained within another subexpression
|
||||
j, and i is not contained within any other subexpression that
|
||||
is contained within j, and a match of subexpression j is
|
||||
reported in pmatch[j], then the match or non-match of
|
||||
subexpression i reported in pmatch[i] shall be as described in
|
||||
1. and 2. above, but within the substring reported in
|
||||
pmatch[j] rather than the whole string. The offsets in pmatch[i] are
|
||||
still relative to the start of string.
|
||||
<LI>
|
||||
If subexpression i is contained in subexpression j, and the
|
||||
byte offsets in pmatch[j] are -1, then the pointers in
|
||||
pmatch[i] shall also be -1.
|
||||
<LI>
|
||||
If subexpression i matched a zero-length string, then both
|
||||
byte offsets in pmatch[i] shall be the byte offset of the
|
||||
character or null terminator immediately following the
|
||||
zero-length string.
|
||||
</OL>
|
||||
</DIV>
|
||||
<DT>[A:37363-37366]<DD>
|
||||
The regexec() function must fill in all nmatch elements of
|
||||
pmatch, where nmatch and pmatch are supplied by the
|
||||
application, even if some elements of pmatch do not correspond
|
||||
to subexpressions in pattern. The application writer should
|
||||
note that there is probably no reason for using a value of
|
||||
nmatch that is larger than preg->re_nsub+1.
|
||||
<DT>[A:37407-37413]<DD>
|
||||
The number of subexpressions in the RE is reported in re_nsub
|
||||
in preg. With this change to regexec(), consideration was given
|
||||
to dropping the REG_NOSUB flag since the user can now specify
|
||||
this with a zero nmatch argument to regexec(). However, keeping
|
||||
REG_NOSUB allows an implementation to use a different (perhaps
|
||||
more efficient) algorithm if it knows in regcomp() that no
|
||||
subexpressions need be reported. The implementation is only
|
||||
required to fill in pmatch if nmatch is not zero and if
|
||||
REG_NOSUB is not specified.
|
||||
</DL>
|
||||
</DIV>
|
||||
<P>
|
||||
This sentence is as close as the standard gets to a definition:
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<DL COMPACT>
|
||||
<DT>[A:37225-37226]<DD>
|
||||
Subexpression i begins at the ith matched open parenthesis, counting from 1.
|
||||
</DL>
|
||||
</DIV>
|
||||
<P>
|
||||
Using nonterminals from the BRE [D:6371-6731] and ERE [D:6452-6452] grammar
|
||||
productions (text not listed in this document) yields the following:
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<DL COMPACT>
|
||||
<DT><STRONG>DEFINITION</STRONG><DD>
|
||||
A
|
||||
<EM>subexpression</EM>
|
||||
corresponds to the
|
||||
<TT>Back_open_paren RE_expression Back_close_paren</TT>
|
||||
form of the
|
||||
<TT>nondupl_RE</TT>
|
||||
BRE grammar production or
|
||||
the
|
||||
<TT>'(' extended_reg_exp ')'</TT>
|
||||
form of the
|
||||
<TT>ERE_expression</TT>
|
||||
ERE grammar production.
|
||||
Subexpression i begins at the ith matched open parenthesis
|
||||
(<TT>Back_open_paren</TT>
|
||||
for BREs and '(' for EREs),
|
||||
starting from the left and counting from 1.
|
||||
Subexpression 0 is the entire RE.
|
||||
</DL>
|
||||
</DIV>
|
||||
<P>
|
||||
This definition and the subexpression match rule [R:2359-2370] can be used to
|
||||
examine a class of EREs where the top level catenation operands are
|
||||
subexpressions.
|
||||
(A top level subexpression is not contained in any other subexpression
|
||||
except subexpression 0.)
|
||||
The subexpression match rule in pseudo code is:
|
||||
<UL type=square>
|
||||
<LI>
|
||||
determine the longest of the leftmost matches for subexpression-0
|
||||
[R:2359-2361]
|
||||
<LI>
|
||||
for 1&lt;=<EM>i</EM>&lt;=<STRONG>re_nsub</STRONG>
|
||||
determine the longest match for
|
||||
subexpression-<EM>i</EM>
|
||||
consistent with the matches already determined for
|
||||
subexpression-<EM>j,</EM>
|
||||
0&lt;=<EM>j</EM>&lt;<EM>i</EM>.
|
||||
[R:2359-2370] [A:37235-37257]
|
||||
</UL>
|
||||
For example, given
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<PRE>
|
||||
:RE#03:E (a?)((ab)?) ab (0,2)(0,0)(0,2)(0,2)
|
||||
</PRE>
|
||||
</DIV>
|
||||
the subexpressions are:
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<PRE>
|
||||
subexpression-0 (a?)((ab)?)
|
||||
subexpression-1 (a?)
|
||||
subexpression-2 ((ab)?)
|
||||
subexpression-3 (ab)
|
||||
</PRE>
|
||||
</DIV>
|
||||
The longest of the leftmost matches for subexpression-0 is (0,2).
|
||||
The longest match for subexpression-1, consistent with the match
|
||||
for subexpression-0, is (0,0); otherwise if it had matched (0,1) then
|
||||
subexpression-2 would not match and the subexpression-0 match would be
|
||||
limited to (0,1).
|
||||
The longest match for subexpression-2, consistent with the matches
|
||||
for subexpression-0 and subexpression-1, is (0,2).
|
||||
The longest match for subexpression-3, consistent with the matches
|
||||
for subexpression-0, subexpression-1 and subexpression-2, is (0,2).
|
||||
This table illustrates the matching:
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<PRE>
|
||||
subexpr pattern match
|
||||
0 (a?)((ab)?) (0,2)
|
||||
1 (a?) (0,0)
|
||||
2 ((ab)?) (0,2)
|
||||
3 (ab) (0,2)
|
||||
</PRE>
|
||||
</DIV>
|
||||
RE#04 is a similar example that exposes the associativity of subexpression
|
||||
concatenation:
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<PRE>
|
||||
:RE#04:E (a?)((ab)?)(b?) ab (0,2)(0,1)(1,1)(?,?)(1,2)
|
||||
|
||||
subexpr pattern match
|
||||
0 (a?)((ab)?)(b?) (0,2)
|
||||
1 (a?) (0,1)
|
||||
2 ((ab)?) (1,1)
|
||||
3 (ab) (?,?)
|
||||
4 (b?) (1,2)
|
||||
</PRE>
|
||||
</DIV>
|
||||
[R:2363-2365] also shows that parentheses can be used to alter the
|
||||
order of matching:
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<PRE>
|
||||
:RE#05:E ((a?)((ab)?))(b?) ab (0,2)(0,2)(0,0)(0,2)(0,2)(2,2)
|
||||
|
||||
subexpr pattern match
|
||||
0 ((a?)((ab)?))(b?) (0,2)
|
||||
1 ((a?)((ab)?)) (0,2)
|
||||
2 (a?) (0,0)
|
||||
3 ((ab)?) (0,2)
|
||||
4 (ab) (0,2)
|
||||
5 (b?) (2,2)
|
||||
</PRE>
|
||||
</DIV>
|
||||
In RE#05 the extra parentheses (around subexpression-1 and subexpression-2 in
|
||||
RE#04) form a new subexpression-1, and change the
|
||||
match for the last subexpression
|
||||
<TT>(b?)</TT>
|
||||
to (2,2) (from (1,2) in RE#04.)
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<PRE>
|
||||
:RE#06:E (a?)(((ab)?)(b?)) ab (0,2)(0,1)(1,2)(1,1)(?,?)(1,2)
|
||||
|
||||
subexpr pattern match
|
||||
0 (a?)(((ab)?)(b?)) (0,2)
|
||||
1 (a?) (0,1)
|
||||
2 (((ab)?)(b?)) (1,2)
|
||||
3 ((ab)?) (1,1)
|
||||
4 (ab) (?,?)
|
||||
5 (b?) (1,2)
|
||||
</PRE>
|
||||
</DIV>
|
||||
In RE#06 the extra parenthesis pair forces right associativity and results
|
||||
in the same match of (1,2) for the last subexpression
|
||||
<TT>(b?)</TT>
|
||||
as in RE#04.
|
||||
These examples show that:
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<DL COMPACT>
|
||||
<DT><STRONG>PROPERTY</STRONG><DD>
|
||||
Subexpression grouping can alter the precedence of concatenation.
|
||||
<DT><STRONG>PROPERTY</STRONG><DD>
|
||||
Subexpression concatenation is right associative.
|
||||
</DL>
|
||||
</DIV>
|
||||
<P>
|
||||
The following examples examine replicated subexpressions.
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<PRE>
|
||||
:RE#07:E (.?) x (0,1)(0,1)
|
||||
:RE#08:E (.?){1} x (0,1)(0,1)
|
||||
:RE#09:E (.?)(.?) x (0,1)(0,1)(1,1)
|
||||
:RE#10:E (.?){2} x (0,1)(1,1)
|
||||
:RE#11:E (.?)* x (0,1)(0,1)
|
||||
</PRE>
|
||||
</DIV>
|
||||
[D:6227-6234] specifies that RE#07 and RE#08 are equivalent, and that
|
||||
RE#09 and RE#10 are equivalent, and
|
||||
[D:6217-6219] specifies that RE#09 and RE#11 are equivalent.
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<DL COMPACT>
|
||||
<DT>[D:6227-6234]<DD>
|
||||
When an ERE matching a single character or an ERE enclosed in
|
||||
parentheses is followed by an interval expression of the format "{m}" ,
|
||||
"{m,}" , or "{m,n}" , together with that interval expression it shall
|
||||
match what repeated consecutive occurrences of the ERE would match. The
|
||||
values of m and n are decimal integers in the range 0 &lt;= m &lt;= n &lt;=
|
||||
{RE_DUP_MAX}, where m specifies the exact or minimum number of
|
||||
occurrences and n specifies the maximum number of occurrences. The
|
||||
expression "{m}" matches exactly m occurrences of the preceding ERE,
|
||||
"{m,}" matches at least m occurrences, and "{m,n}" matches any number
|
||||
of occurrences between m and n, inclusive.
|
||||
<DT>[D:6217-6219]<DD>
|
||||
When an ERE matching a single character or an ERE enclosed in
|
||||
parentheses is followed by the special character asterisk ( '*' ),
|
||||
together with that asterisk it shall match what zero or more
|
||||
consecutive occurrences of the ERE would match.
|
||||
</DL>
|
||||
</DIV>
|
||||
In RE#09 subexpression-1 matches (0,1), leaving the null string at (1,1) for
|
||||
subexpression-2.
|
||||
In RE#10 the first iteration of subexpression-1 matches (0,1), the same
|
||||
as subexpression-1 in RE#09, and the second iteration of subexpression-1
|
||||
matches (1,1), the same as subexpression-2 in RE#09.
|
||||
RE#07 and RE#08 show that only one iteration is needed to match the subject
|
||||
string, so the match in RE#11 requires only one iteration, and as such is the
|
||||
last iteration of [D:6107-6109] [A:37235-37237].
|
||||
RE#10 and RE#11 also illustrate [D:6127-6129] [D:6239-6241], which
|
||||
specify that a repeated RE matches the null string only if it is the only
|
||||
match (not this case) or if it is necessary to satisfy an interval expression
|
||||
minimum (2 in this case.)
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<DL COMPACT>
|
||||
<DT>[D:6239-6241]<DD>
|
||||
An ERE matching a single character repeated by an '*' , '?' , or an
|
||||
interval expression shall not match a null expression unless this is
|
||||
the only match for the repetition or it is necessary to satisfy the
|
||||
exact or minimum number of occurrences for the interval expression.
|
||||
</DL>
|
||||
</DIV>
|
||||
<P>
|
||||
The following examples dig deeper into replicated subexpressions.
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<PRE>
|
||||
:RE#12:E (.?.?) xxx (0,2)(0,2)
|
||||
:RE#13:E (.?.?){1} xxx (0,2)(0,2)
|
||||
:RE#14:E (.?.?)(.?.?) xxx (0,3)(0,2)(2,3)
|
||||
:RE#15:E (.?.?){2} xxx (0,3)(2,3)
|
||||
:RE#16:E (.?.?)(.?.?)(.?.?) xxx (0,3)(0,2)(2,3)(3,3)
|
||||
:RE#17:E (.?.?){3} xxx (0,3)(3,3)
|
||||
:RE#18:E (.?.?)* xxx (0,3)(2,3)
|
||||
</PRE>
|
||||
</DIV>
|
||||
Here RE#14 shows that only two iterations are needed for a complete match,
|
||||
making the last iteration match for RE#18 (2,3), since the first
|
||||
iteration matched (0,2), as in RE#14.
|
||||
|
||||
<P>
|
||||
<P><HR><CENTER><FONT color=red><FONT face=courier><H3><A name="A subpattern is ">A subpattern is </A></H3></FONT></FONT></CENTER>
|
||||
The term
|
||||
<EM>subpattern</EM>
|
||||
appears exactly once:
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<DL COMPACT>
|
||||
<DT>[D:5907-5908]<DD>
|
||||
Consistent with the whole match being the longest of the leftmost matches,
|
||||
each subpattern, from left to right, shall match the longest possible string.
|
||||
</DL>
|
||||
</DIV>
|
||||
Consider RE#04 and RE#05 again:
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<PRE>
|
||||
:RE#04:E (a?)((ab)?)(b?) ab (0,2)(0,1)(1,1)(?,?)(1,2)
|
||||
:RE#05:E ((a?)((ab)?))(b?) ab (0,2)(0,2)(0,0)(0,2)(0,2)(2,2)
|
||||
</PRE>
|
||||
</DIV>
|
||||
If a subpattern were an entity that combined adjacent subexpressions,
|
||||
e.g.,
|
||||
<TT>(a?)((ab)?)</TT>
|
||||
in RE#04, then [D:5907-5908] would violate [R:2359-2370].
|
||||
Similarly, if a subpattern were an entity that "went inside" subexpressions,
|
||||
e.g.,
|
||||
<TT>(a?)</TT>
|
||||
in RE#05, then again [D:5907-5908] would violate [R:2359-2370].
|
||||
In other words, a subpattern can be neither larger than nor smaller than
|
||||
a subexpression;
|
||||
a subpattern must be a grammatical entity equivalent to a subexpression.
|
||||
This corresponds to the nonterminal
|
||||
<TT>nondupl_RE</TT>
|
||||
in the BRE grammar; there is no direct correspondence to a nonterminal
|
||||
in the ERE grammar.
|
||||
However, if the optional duplication operator (*,+,?,range) is included then
|
||||
subpattern corresponds to
|
||||
<TT>simple_RE</TT>
|
||||
in the BRE grammar and
|
||||
<TT>ERE_expression</TT>
|
||||
in the ERE grammar, and both [D:5907-5908] and [R:2359-2370] are satisfied.
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<DL COMPACT>
|
||||
<DT><STRONG>DEFINITION</STRONG><DD>
|
||||
A
|
||||
<EM>subpattern</EM>
|
||||
corresponds to the
|
||||
<TT>simple_RE</TT>
|
||||
nonterminal in the BRE grammar or the
|
||||
<TT>ERE_expression</TT>
|
||||
nonterminal in the ERE grammar.
|
||||
</DL>
|
||||
</DIV>
|
||||
This means that subexpressions and subpatterns are of equal importance
|
||||
in RE matching.
|
||||
Also note that any other definition for subpattern will put
|
||||
[D:5907-5908] in direct conflict with [R:2359-2370].
|
||||
<P>
|
||||
RE#19, RE#20 and RE#21 examine the relationship between subexpression
|
||||
and subpattern:
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<PRE>
|
||||
:RE#19:E a?((ab)?)(b?) ab (0,2)(1,1)(?,?)(1,2)
|
||||
:RE#20:E (a?)((ab)?)b? ab (0,2)(0,1)(1,1)(?,?)
|
||||
:RE#21:E a?((ab)?)b? ab (0,2)(1,1)(?,?)
|
||||
</PRE>
|
||||
</DIV>
|
||||
<P>
|
||||
These are all variations of RE#04.
|
||||
Other than subexpression renumbering, the match for the subexpression
|
||||
<TT>((ab)?)</TT>
|
||||
must be the same in RE#04, RE#19, RE#20 and RE#21.
|
||||
<TT>a?</TT>
|
||||
is a subpattern in RE#19 and RE#21, of equal matching importance to
|
||||
<TT>(a?)</TT>
|
||||
in RE#04, and
|
||||
<TT>b?</TT>
|
||||
is a subpattern in RE#20 and RE#21, of equal matching
|
||||
importance to
|
||||
<TT>(b?)</TT>
|
||||
in RE#04.
|
||||
|
||||
<P>
|
||||
<P><HR><CENTER><FONT color=red><FONT face=courier><H3><A name="The Dark Corners ">The Dark Corners </A></H3></FONT></FONT></CENTER>
|
||||
The remaining examples explore dark corners of the standard
|
||||
and implementations.
|
||||
Although the differences between some of the examples are subtle,
|
||||
for some implementations it may mean the difference between an answer and
|
||||
a core dump.
|
||||
<P>
|
||||
In RE#22 subexpression
|
||||
<TT>(a*)</TT>
|
||||
matches the null string at (0,0), and continues to match at that position
|
||||
until the minimal range count is satisfied.
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<PRE>
|
||||
:RE#22:E (a*){2} xxxxx (0,0)(0,0)
|
||||
</PRE>
|
||||
</DIV>
|
||||
RE#23 through RE#27 expose implementations that sometimes do
|
||||
<EM>first match</EM>
|
||||
for alternation within subexpressions.
|
||||
Some implementations erroneously match the first iteration of
|
||||
subexpression-1 in RE#24 through RE#27 to (0,1).
|
||||
RE#27 is equivalent to RE#26; the match requires two iterations, the first
|
||||
matching (0,2) and the last matching (2,3).
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<PRE>
|
||||
:RE#23:E (ab?)(b?a) aba (0,3)(0,2)(2,3)
|
||||
:RE#24:E (a|ab)(ba|a) aba (0,3)(0,2)(2,3)
|
||||
:RE#25:E (a|ab|ba) aba (0,2)(0,2)
|
||||
:RE#26:E (a|ab|ba)(a|ab|ba) aba (0,3)(0,2)(2,3)
|
||||
:RE#27:E (a|ab|ba)* aba (0,3)(2,3)
|
||||
</PRE>
|
||||
</DIV>
|
||||
RE#28 through RE#33 expose implementations that report short matches
|
||||
for some repeated subexpressions.
|
||||
Some implementations report incorrect matches for
|
||||
subexpression-1 in RE#30 and RE#33.
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<PRE>
|
||||
:RE#28:E (aba|a*b) ababa (0,3)(0,3)
|
||||
:RE#29:E (aba|a*b)(aba|a*b) ababa (0,5)(0,2)(2,5)
|
||||
:RE#30:E (aba|a*b)* ababa (0,5)(2,5)
|
||||
:RE#31:E (aba|ab|a) ababa (0,3)(0,3)
|
||||
:RE#32:E (aba|ab|a)(aba|ab|a) ababa (0,5)(0,2)(2,5)
|
||||
:RE#33:E (aba|ab|a)* ababa (0,5)(2,5)
|
||||
</PRE>
|
||||
</DIV>
|
||||
RE#34 through RE#36 expose implementations that report subexpression matches
|
||||
for earlier iterations of the subexpression.
|
||||
Some implementations report a match for subexpression-2 in RE#36
|
||||
while reporting the (2,3) match for subexpression-1: clearly a bug.
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<PRE>
|
||||
:RE#34:E (a(b)?) aba (0,2)(0,2)(1,2)
|
||||
:RE#35:E (a(b)?)(a(b)?) aba (0,3)(0,2)(1,2)(2,3)(?,?)
|
||||
:RE#36:E (a(b)?)+ aba (0,3)(2,3)(?,?)
|
||||
</PRE>
|
||||
</DIV>
|
||||
RE#37 and RE#38 expose implementations that give priority to subexpression
|
||||
matching over subpattern matching.
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<PRE>
|
||||
:RE#37:E (.*)(.*) xx (0,2)(0,2)(2,2)
|
||||
:RE#38:E .*(.*) xx (0,2)(2,2)
|
||||
</PRE>
|
||||
</DIV>
|
||||
RE#39 through RE#41 expose implementations that treat explicit vs. implicit
|
||||
subexpression repetition differently.
|
||||
This is a theme common to many of the previous examples.
|
||||
Again, the subexpression in RE#41 requires two iterations to match,
|
||||
and the second iteration matches (5,7), as illustrated by RE#40.
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<PRE>
|
||||
:RE#39:E (a.*z|b.*y) azbazby (0,5)(0,5)
|
||||
:RE#40:E (a.*z|b.*y)(a.*z|b.*y) azbazby (0,7)(0,5)(5,7)
|
||||
:RE#41:E (a.*z|b.*y)* azbazby (0,7)(5,7)
|
||||
</PRE>
|
||||
</DIV>
|
||||
RE#42 is another
|
||||
<EM>first match</EM>
|
||||
test.
|
||||
Some implementations erroneously report a match of (0,1) for subexpression-1.
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<PRE>
|
||||
:RE#42:E (.|..)(.*) ab (0,2)(0,2)(2,2)
|
||||
</PRE>
|
||||
</DIV>
|
||||
RE#43 through RE#45 require only one iteration of subexpression-1 to
|
||||
match the entire subject string.
|
||||
RE#45 exposes three separate bugs in the implementations that were tested.
|
||||
The most common was
|
||||
<EM>over iteration</EM>,
|
||||
where subexpression-1 is matched for a second iteration to the null string
|
||||
at (3,3).
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<PRE>
|
||||
:RE#43:E ((..)*(...)*) xxx (0,3)(0,3)(?,?)(0,3)
|
||||
:RE#44:E ((..)*(...)*)((..)*(...)*) xxx (0,3)(0,3)(?,?)(0,3)(3,3)(?,?)(?,?)
|
||||
:RE#45:E ((..)*(...)*)* xxx (0,3)(0,3)(?,?)(0,3)
|
||||
</PRE>
|
||||
</DIV>
|
||||
RE#46 through RE#82 are nasty;
|
||||
backreferences are intuitive neither for the implementor nor the user.
|
||||
<P>
|
||||
RE#49, RE#53, RE#67 and RE#68 illustrate the second part of the
|
||||
<EM>subpattern</EM>
|
||||
rule:
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<DL COMPACT>
|
||||
<DT>[D:5908-5909]<DD>
|
||||
For this purpose, a null string shall be considered to be longer than
|
||||
no match at all.
|
||||
</DL>
|
||||
</DIV>
|
||||
RE#53 requires close examination to see why the match is (0,2)(1,1)(2,2)
|
||||
instead of (0,2)(0,1)(?,?).
|
||||
The match of (0,1) for subexpression-1 is longer than (1,1), but
|
||||
subexpression-1 can be repeated, and that second iteration allows
|
||||
subexpression-2 to match (2,2), which is longer than (?,?) by [D:5908-5909].
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<PRE>
|
||||
:RE#46:B \(a\{0,1\}\)*b\1 ab (0,2)(1,1)
|
||||
:RE#47:B \(a*\)*b\1 ab (0,2)(1,1)
|
||||
:RE#48:B \(a*\)b\1* ab (0,2)(0,1)
|
||||
:RE#49:B \(a*\)*b\1* ab (0,2)(1,1)
|
||||
:RE#50:B \(a\{0,1\}\)*b\(\1\) ab (0,2)(1,1)(2,2)
|
||||
:RE#51:B \(a*\)*b\(\1\) ab (0,2)(1,1)(2,2)
|
||||
:RE#52:B \(a*\)b\(\1\)* ab (0,2)(0,1)(?,?)
|
||||
:RE#53:B \(a*\)*b\(\1\)* ab (0,2)(1,1)(2,2)
|
||||
:RE#54:B \(a\{0,1\}\)*b\1 aba (0,3)(0,1)
|
||||
:RE#55:B \(a*\)*b\1 aba (0,3)(0,1)
|
||||
:RE#56:B \(a*\)b\1* aba (0,3)(0,1)
|
||||
:RE#57:B \(a*\)*b\1* aba (0,3)(0,1)
|
||||
:RE#58:B \(a*\)*b\(\1\)* aba (0,3)(0,1)(2,3)
|
||||
:RE#59:B \(a\{0,1\}\)*b\1 abaa (0,3)(0,1)
|
||||
:RE#60:B \(a*\)*b\1 abaa (0,3)(0,1)
|
||||
:RE#61:B \(a*\)b\1* abaa (0,4)(0,1)
|
||||
:RE#62:B \(a*\)*b\1* abaa (0,4)(0,1)
|
||||
:RE#63:B \(a*\)*b\(\1\)* abaa (0,4)(0,1)(3,4)
|
||||
:RE#64:B \(a\{0,1\}\)*b\1 aab (0,3)(2,2)
|
||||
:RE#65:B \(a*\)*b\1 aab (0,3)(2,2)
|
||||
:RE#66:B \(a*\)b\1* aab (0,3)(0,2)
|
||||
:RE#67:B \(a*\)*b\1* aab (0,3)(2,2)
|
||||
:RE#68:B \(a*\)*b\(\1\)* aab (0,3)(2,2)(3,3)
|
||||
:RE#69:B \(a\{0,1\}\)*b\1 aaba (0,4)(1,2)
|
||||
:RE#70:B \(a*\)*b\1 aaba (0,4)(1,2)
|
||||
:RE#71:B \(a*\)b\1* aaba (0,3)(0,2)
|
||||
:RE#72:B \(a*\)*b\1* aaba (0,4)(1,2)
|
||||
:RE#73:B \(a*\)*b\(\1\)* aaba (0,4)(1,2)(3,4)
|
||||
:RE#74:B \(a\{0,1\}\)*b\1 aabaa (0,4)(1,2)
|
||||
:RE#75:B \(a*\)*b\1 aabaa (0,5)(0,2)
|
||||
:RE#76:B \(a*\)b\1* aabaa (0,5)(0,2)
|
||||
:RE#77:B \(a*\)*b\1* aabaa (0,5)(0,2)
|
||||
:RE#78:B \(a*\)*b\(\1\)* aabaa (0,5)(0,2)(3,5)
|
||||
:RE#79:B \(x\)*a\1 a NOMATCH
|
||||
:RE#80:B \(x\)*a\1* a (0,1)(?,?)
|
||||
:RE#81:B \(x\)*a\(\1\) a NOMATCH
|
||||
:RE#82:B \(x\)*a\(\1\)* a (0,1)(?,?)(?,?)
|
||||
:RE#83:E (aa(b(b))?)+ aabbaa (0,6)(4,6)(?,?)(?,?)
|
||||
:RE#84:E (a(b)?)+ aba (0,3)(2,3)(?,?)
|
||||
:RE#85:E ([ab]+)([bc]+)([cd]*) abcd (0,4)(0,2)(2,3)(3,4)
|
||||
:RE#86:B \([ab]*\)\([bc]*\)\([cd]*\)\1 abcdaa (0,5)(0,1)(1,3)(3,4)
|
||||
:RE#87:B \([ab]*\)\([bc]*\)\([cd]*\)\1 abcdab (0,6)(0,2)(2,3)(3,4)
|
||||
:RE#88:B \([ab]*\)\([bc]*\)\([cd]*\)\1* abcdaa (0,6)(0,1)(1,3)(3,4)
|
||||
:RE#89:B \([ab]*\)\([bc]*\)\([cd]*\)\1* abcdab (0,6)(0,2)(2,3)(3,4)
|
||||
:RE#90:E ^(A([^B]*))?(B(.*))? Aa (0,2)(0,2)(1,2)
|
||||
:RE#91:E ^(A([^B]*))?(B(.*))? Bb (0,2)(?,?)(?,?)(0,2)(1,2)
|
||||
:RE#92:B .*\([AB]\).*\1 ABA (0,3)(0,1)
|
||||
:RE#93:B$ [^A]*A \nA (0,2)
|
||||
</PRE>
|
||||
</DIV>
|
||||
|
||||
<P>
|
||||
<P><HR><CENTER><FONT color=red><FONT face=courier><H3><A name="Conclusion">Conclusion</A></H3></FONT></FONT></CENTER>
|
||||
It is possible to use the 2001 issue of the POSIX
|
||||
<STRONG>regex</STRONG>
|
||||
standard,
|
||||
<EM>with the addition of one sentence</EM>,
|
||||
to resolve the interpretation differences that have surfaced since 1995.
|
||||
That key sentence is a precise and consistent definition for the term
|
||||
<EM>subpattern</EM>.
|
||||
By noting the relationship between
|
||||
<EM>subpatterns</EM>
|
||||
and
|
||||
<EM>subexpressions</EM>,
|
||||
the proposed definition is shown to be the only one that can be
|
||||
consistent with all parts of the standard.
|
||||
<P>
|
||||
<HR>
|
||||
<TABLE border=0 align=center width=96%>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right><A href="mailto:gsf@research.att.com?subject= ../re/re-interpretation.mm mm document">Glenn Fowler</A></TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right>Information and Software Systems Research</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right>AT&amp;T Labs Research</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right>Florham Park NJ</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right>January 2003</TD>
|
||||
</TR>
|
||||
</TABLE>
|
||||
<P>
|
||||
|
||||
</TD></TR></TBODY></TABLE>
|
||||
|
||||
</BODY>
|
||||
</HTML>
|
62
re-nullsubexpr.html
Normal file
62
re-nullsubexpr.html
Normal file
|
@ -0,0 +1,62 @@
|
|||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN" "http://www.w3.org/TR/REC-html40/frameset.dtd">
|
||||
<HTML>
|
||||
<HEAD>
|
||||
<META name="generator" content="mm2html (AT&T Labs Research) 2005-10-15">
|
||||
<META name="keywords" content="regular expression null subexpression tests">
|
||||
<TITLE> ../re/re-nullsubexpr.mm mm document </TITLE>
|
||||
<META name="author" content="gsf">
|
||||
</HEAD>
|
||||
<BODY bgcolor=white link=slateblue vlink=teal >
|
||||
<TABLE border=0 align=center width=96%>
|
||||
<TBODY><TR><TD valign=top align=left>
|
||||
<!--INDEX--><!--/INDEX-->
|
||||
<P>
|
||||
<HR>
|
||||
<CENTER>
|
||||
<H3><CENTER><FONT color=red><FONT face=courier>regular expression null subexpression tests</FONT></FONT></CENTER></H3>
|
||||
<BR>Glenn Fowler <SMALL>&lt;<A href=mailto:gsf@research.att.com>gsf@research.att.com</A>&gt;</SMALL>
|
||||
<P><I>AT&amp;T Labs Research - Florham Park NJ</I>
|
||||
</CENTER>
|
||||
<P><HR><P>
|
||||
The
|
||||
<STRONG>regex</STRONG>
|
||||
tests in
|
||||
<A href="http://web.archive.org/web/20080709091423id_/http://www.research.att.com/~gsf/testregex/nullsubexpr.dat">nullsubexpr.dat</A>
|
||||
exercise
|
||||
<STRONG>regex</STRONG>
|
||||
null subexpression matching.
|
||||
<P>
|
||||
<HR>
|
||||
<TABLE border=0 align=center width=96%>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right><A href="mailto:gsf@research.att.com?subject= ../re/re-nullsubexpr.mm mm document">Glenn Fowler</A></TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right>Information and Software Systems Research</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right>AT&amp;T Labs Research</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right>Florham Park NJ</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right>August 04, 2002</TD>
|
||||
</TR>
|
||||
</TABLE>
|
||||
<P>
|
||||
|
||||
</TD></TR></TBODY></TABLE>
|
||||
|
||||
</BODY>
|
||||
</HTML>
|
60
re-repetition.html
Normal file
60
re-repetition.html
Normal file
|
@ -0,0 +1,60 @@
|
|||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN" "http://www.w3.org/TR/REC-html40/frameset.dtd">
|
||||
<HTML>
|
||||
<HEAD>
|
||||
<META name="generator" content="mm2html (AT&T Labs Research) 2005-10-15">
|
||||
<META name="keywords" content="regular expression repetition tests">
|
||||
<TITLE> ../re/re-repetition.mm mm document </TITLE>
|
||||
<META name="author" content="gsf">
|
||||
</HEAD>
|
||||
<BODY bgcolor=white link=slateblue vlink=teal >
|
||||
<TABLE border=0 align=center width=96%>
|
||||
<TBODY><TR><TD valign=top align=left>
|
||||
<!--INDEX--><!--/INDEX-->
|
||||
<P>
|
||||
<HR>
|
||||
<CENTER>
|
||||
<H3><CENTER><FONT color=red><FONT face=courier>regular expression repetition tests</FONT></FONT></CENTER></H3>
|
||||
<BR>Glenn Fowler <SMALL>&lt;<A href=mailto:gsf@research.att.com>gsf@research.att.com</A>&gt;</SMALL>
|
||||
<P><I>AT&T Labs Research - Florham Park NJ</I>
|
||||
</CENTER>
|
||||
<P><HR><P>
|
||||
The
|
||||
<STRONG>regex</STRONG>
|
||||
tests in
|
||||
<A href="http://web.archive.org/web/20080726033833id_/http://www.research.att.com/~gsf/testregex/repetition.dat">repetition.dat</A>
|
||||
exercise explicit and implicit repetition.
|
||||
<P>
|
||||
<HR>
|
||||
<TABLE border=0 align=center width=96%>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right><A href="mailto:gsf@research.att.com?subject= ../re/re-repetition.mm mm document">Glenn Fowler</A></TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right>Information and Software Systems Research</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right>AT&T Labs Research</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right>Florham Park NJ</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right>August 04, 2002</TD>
|
||||
</TR>
|
||||
</TABLE>
|
||||
<P>
|
||||
|
||||
</TD></TR></TBODY></TABLE>
|
||||
|
||||
</BODY>
|
||||
</HTML>
|
79
repetition.dat
Normal file
79
repetition.dat
Normal file
|
@ -0,0 +1,79 @@
|
|||
NOTE implicit vs. explicit repetitions : 2002-08-01
|
||||
#
|
||||
# Glenn Fowler <gsf@research.att.com>
|
||||
# conforming matches (column 4) must match one of the following BREs
|
||||
# NOMATCH
|
||||
# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)*
|
||||
# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)*
|
||||
# i.e., each 3-tuple has two identical elements and one (?,?)
|
||||
#
|
||||
|
||||
E ((..)|(.)) NULL NOMATCH
|
||||
E ((..)|(.))((..)|(.)) NULL NOMATCH
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH
|
||||
|
||||
E ((..)|(.)){1} NULL NOMATCH
|
||||
E ((..)|(.)){2} NULL NOMATCH
|
||||
E ((..)|(.)){3} NULL NOMATCH
|
||||
|
||||
E ((..)|(.))* NULL (0,0)
|
||||
|
||||
E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1)
|
||||
E ((..)|(.))((..)|(.)) a NOMATCH
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH
|
||||
|
||||
E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1)
|
||||
E ((..)|(.)){2} a NOMATCH
|
||||
E ((..)|(.)){3} a NOMATCH
|
||||
|
||||
E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1)
|
||||
|
||||
E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH
|
||||
|
||||
E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2)
|
||||
E ((..)|(.)){3} aa NOMATCH
|
||||
|
||||
E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?)
|
||||
|
||||
E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3)
|
||||
|
||||
E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3)
|
||||
E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3)
|
||||
|
||||
E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3)
|
||||
|
||||
E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4)
|
||||
|
||||
E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?)
|
||||
E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4)
|
||||
|
||||
E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?)
|
||||
|
||||
E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5)
|
||||
|
||||
E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?)
|
||||
E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5)
|
||||
|
||||
E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5)
|
||||
|
||||
E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?)
|
||||
|
||||
E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?)
|
||||
E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?)
|
||||
|
||||
E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?)
|
16
rightassoc.dat
Normal file
16
rightassoc.dat
Normal file
|
@ -0,0 +1,16 @@
|
|||
NOTE left-assoc:pass-none right-assoc:pass-all : 2002-04-29
|
||||
|
||||
E (a|ab)(c|bcd)(d*) abcd (0,4)(0,2)(2,3)(3,4)
|
||||
E (a|ab)(bcd|c)(d*) abcd (0,4)(0,2)(2,3)(3,4)
|
||||
E (ab|a)(c|bcd)(d*) abcd (0,4)(0,2)(2,3)(3,4)
|
||||
E (ab|a)(bcd|c)(d*) abcd (0,4)(0,2)(2,3)(3,4)
|
||||
|
||||
E (a*)(b|abc)(c*) abc (0,3)(0,1)(1,2)(2,3)
|
||||
E (a*)(abc|b)(c*) abc (0,3)(0,1)(1,2)(2,3)
|
||||
E (a*)(b|abc)(c*) abc (0,3)(0,1)(1,2)(2,3)
|
||||
E (a*)(abc|b)(c*) abc (0,3)(0,1)(1,2)(2,3)
|
||||
|
||||
E (a|ab)(c|bcd)(d|.*) abcd (0,4)(0,2)(2,3)(3,4)
|
||||
E (a|ab)(bcd|c)(d|.*) abcd (0,4)(0,2)(2,3)(3,4)
|
||||
E (ab|a)(c|bcd)(d|.*) abcd (0,4)(0,2)(2,3)(3,4)
|
||||
E (ab|a)(bcd|c)(d|.*) abcd (0,4)(0,2)(2,3)(3,4)
|
2121
testregex.c
Normal file
2121
testregex.c
Normal file
File diff suppressed because it is too large
Load diff
241
testregex.html
Normal file
241
testregex.html
Normal file
|
@ -0,0 +1,241 @@
|
|||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN" "http://www.w3.org/TR/REC-html40/frameset.dtd">
|
||||
<HTML>
|
||||
<HEAD>
|
||||
<META name="generator" content="mm2html (AT&T Research) 2010-09-10">
|
||||
<META name="keywords" content="regular expression pattern match regression test">
|
||||
<TITLE> ../re/testregex.mm mm document </TITLE>
|
||||
<META name="author" content="gsf">
|
||||
</HEAD>
|
||||
<BODY bgcolor=white link=slateblue vlink=teal >
|
||||
<TABLE border=0 align=center width=96%>
|
||||
<TBODY><TR><TD valign=top align=left>
|
||||
<!--INDEX--><!--/INDEX-->
|
||||
<B><FONT size=-1 face="verdana,arial,helvetica,geneva,sans-serif">
|
||||
<TABLE align=center cellpadding=2 border=4 bgcolor=lightgrey><TR>
|
||||
<TD><A href="testregex.html#Reference Implementations">Reference Implementations</A></TD>
|
||||
<TD><A href="testregex.html#Test Data Repository">Test Data Repository</A></TD>
|
||||
<TD><A href="testregex.html#Usage">Usage</A></TD>
|
||||
<TD><A href="testregex.html#Reference Implementation Notes">Reference Implementation Notes</A></TD>
|
||||
<TD><A href="testregex.html#testregex Notes">testregex Notes</A></TD>
|
||||
</TR></TABLE>
|
||||
</FONT></B>
|
||||
<P>
|
||||
<HR>
|
||||
<CENTER>
|
||||
<H3><CENTER><FONT color=red><FONT face=courier>AT&T Research regex(3) regression tests</FONT></FONT></CENTER></H3>
|
||||
<BR>Glenn Fowler <SMALL>&lt;<A href=mailto:gsf@research.att.com>gsf@research.att.com</A>&gt;</SMALL>
|
||||
<P><I>AT&T Research - Florham Park NJ</I>
|
||||
</CENTER>
|
||||
<P><HR><P>
|
||||
<A href="testregex.c">testregex.c 2004-05-31</A>
|
||||
is the latest source for the AT&T Research regression test
|
||||
harness for the
|
||||
<A href="http://www.opengroup.org/onlinepubs/007904975/functions/regcomp.html" target=_top>X/Open regex</A>
|
||||
pattern match interface.
|
||||
See
|
||||
<NOBR><A href="http://web.archive.org/web/20130420020018id_/http://www2.research.att.com/~gsf/man/man1/testregex.html"><STRONG>testregex</STRONG></A>(1)</NOBR>
|
||||
for option and test input details.
|
||||
The source and test data posted here are license free.
|
||||
<P>
|
||||
<STRONG>testregex</STRONG>
|
||||
can:
|
||||
<UL type=square>
|
||||
<LI>
|
||||
verify stability for a particular implementation in the face of
|
||||
source code and/or compilation environment changes
|
||||
<LI>
|
||||
verify standard compliance for all implementations
|
||||
<LI>
|
||||
provide a basis for discussions on what
|
||||
<EM>compliance</EM>
|
||||
means
|
||||
</UL>
|
||||
<P>
|
||||
See
|
||||
<A href="re-interpretation.html">An Interpretation of the POSIX regex Standards</A>
|
||||
for an analysis of the POSIX-X/Open
|
||||
<STRONG>regex</STRONG>
|
||||
standards.
|
||||
<P>
|
||||
<P><HR><CENTER><FONT color=red><FONT face=courier><H3><A name="Reference Implementations">Reference Implementations</A></H3></FONT></FONT></CENTER>
|
||||
<STRONG>testregex</STRONG>
|
||||
is currently built against these reference implementations:
|
||||
<P></P><TABLE border=0 frame=void rules=none width=100%><TBODY><TR><TD>
|
||||
<TABLE align=center bgcolor=papayawhip border=0 bordercolor=white cellpadding=2 cellspacing=2 frame=void rules=none >
|
||||
<TBODY>
|
||||
<TR><TD align=right>NAME </TD><TD align=center> LABEL </TD><TD align=left> AUTHORS</TD></TR>
|
||||
<TR><TD align=right>
|
||||
AT&T ast </TD><TD align=center> <A href="http://www.research.att.com/sw/download/" target=_top>A</A> </TD><TD align=left> Glenn Fowler and Doug McIlroy</TD></TR>
|
||||
<TR><TD align=right>
|
||||
bsd </TD><TD align=center> <A href="ftp://ftp.netbsd.org/pub/NetBSD/NetBSD-1.5.2/source/sets/src.tgz" target=_top>B</A> </TD><TD align=left> </TD></TR>
|
||||
<TR><TD align=right>
|
||||
Bell Labs </TD><TD align=center> <A href="http://www.bell-labs.com/" target=_top>D</A> </TD><TD align=left> Doug McIlroy</TD></TR>
|
||||
<TR><TD align=right>
|
||||
old gnu </TD><TD align=center> <A href="http://www.gnu.org" target=_top>G</A> </TD><TD align=left> </TD></TR>
|
||||
<TR><TD align=right>
|
||||
gnu </TD><TD align=center> <A href="http://www.gnu.org" target=_top>H</A> </TD><TD align=left> Isamu Hasegawa</TD></TR>
|
||||
<TR><TD align=right>
|
||||
irix </TD><TD align=center> <A href="http://www.sgi.com" target=_top>I</A> </TD><TD align=left> </TD></TR>
|
||||
<TR><TD align=right>
|
||||
boost </TD><TD align=center> <A href="http://www.boost.org/libs/regex/" target=_top>J</A> </TD><TD align=left> John Maddock</TD></TR>
|
||||
<TR><TD align=right>
|
||||
regex++ </TD><TD align=center> <A href="http://ourworld.compuserve.com/homepages/John_Maddock/regexpp.htm" target=_top>M</A> </TD><TD align=left> John Maddock</TD></TR>
|
||||
<TR><TD align=right>
|
||||
pcre perl compatible </TD><TD align=center> <A href="http://www.pcre.org/" target=_top>P</A> </TD><TD align=left> Philip Hazel</TD></TR>
|
||||
<TR><TD align=right>
|
||||
rx </TD><TD align=center> <A href="ftp://regexps.com/pub/src/hackerlab/" target=_top>R</A> </TD><TD align=left> Tom Lord</TD></TR>
|
||||
<TR><TD align=right>
|
||||
spencer </TD><TD align=center> <A href="http://arglist.com/regex/rxspencer-alpha3.8.g2.tar.gz" target=_top>S</A> </TD><TD align=left> Henry Spencer</TD></TR>
|
||||
<TR><TD align=right>
|
||||
libtre </TD><TD align=center> <A href="http://kouli.iki.fi/~vlaurika/libtre/" target=_top>T</A> </TD><TD align=left> Ville Laurikari</TD></TR>
|
||||
<TR><TD align=right>
|
||||
unix caldera </TD><TD align=center> <A href="http://unixtools.sourceforge.net/" target=_top>U</A> </TD><TD align=left> </TD></TR>
|
||||
</TBODY></TABLE></TD></TR></TBODY></TABLE>
|
||||
<P>
|
||||
<P><HR><CENTER><FONT color=red><FONT face=courier><H3><A name="Test Data Repository">Test Data Repository</A></H3></FONT></FONT></CENTER>
|
||||
<P></P><TABLE border=0 frame=void rules=none width=100%><TBODY><TR><TD>
|
||||
<TABLE align=center bgcolor=papayawhip border=0 bordercolor=white cellpadding=2 cellspacing=2 frame=void rules=none >
|
||||
<TBODY>
|
||||
<TR><TD align=right>
|
||||
<A href="basic.dat">basic.dat</A> </TD><TD align=left> basic regex(3) -- all implementations should pass these</TD></TR>
|
||||
<TR><TD align=right>
|
||||
<A href="categorize.dat">categorize.dat</A> </TD><TD align=left> <A href="re-categorize.html">implementation categorization</A></TD></TR>
|
||||
<TR><TD align=right>
|
||||
<A href="nullsubexpr.dat">nullsubexpr.dat</A> </TD><TD align=left> <A href="re-nullsubexpr.html">null (...)* tests</A></TD></TR>
|
||||
<TR><TD align=right>
|
||||
<A href="leftassoc.dat">leftassoc.dat</A> </TD><TD align=left> <A href="re-assoc.html">left associative catenation implementation must pass these</A></TD></TR>
|
||||
<TR><TD align=right>
|
||||
<A href="rightassoc.dat">rightassoc.dat</A> </TD><TD align=left> <A href="re-assoc.html">right associative catenation implementation must pass these</A></TD></TR>
|
||||
<TR><TD align=right>
|
||||
<A href="forcedassoc.dat">forcedassoc.dat</A> </TD><TD align=left> <A href="re-assoc.html">subexpression grouping to force associativity</A></TD></TR>
|
||||
<TR><TD align=right>
|
||||
<A href="repetition.dat">repetition.dat</A> </TD><TD align=left> <A href="re-repetition.html">explicit vs. implicit repetitions</A></TD></TR>
|
||||
</TBODY></TABLE></TD></TR></TBODY></TABLE>
|
||||
<P>
|
||||
<P><HR><CENTER><FONT color=red><FONT face=courier><H3><A name="Usage">Usage</A></H3></FONT></FONT></CENTER>
|
||||
To run the
|
||||
<STRONG>basic.dat</STRONG>
|
||||
tests:
|
||||
<DIV style="padding-left:16px;text-indent:0px">
|
||||
<PRE>
|
||||
testregex &lt; basic.dat
|
||||
</PRE>
|
||||
</DIV>
|
||||
<P>
|
||||
If the local implementation hangs or dumps core on some tests then run with
|
||||
the <STRONG>-c</STRONG> option.
|
||||
The <STRONG>-h</STRONG> option lists the test data format details.
|
||||
The test data files exercise all features;
|
||||
the test harness detects and ignores features not
|
||||
supported by the local implementation.
|
||||
<P>
|
||||
<P><HR><CENTER><FONT color=red><FONT face=courier><H3><A name="Reference Implementation Notes">Reference Implementation Notes</A></H3></FONT></FONT></CENTER>
|
||||
<P>
|
||||
<H4><A name="D: diet libc">D: diet libc</A></H4>
|
||||
The
|
||||
<A href="http://www.fefe.de/dietlibc/" target=_top>diet libc</A>
|
||||
implementation is currently omitted because it fails all but one
|
||||
<STRONG>basic.dat</STRONG>
|
||||
test.
|
||||
<P>
|
||||
<H4><A name="P: PCRE">P: PCRE</A></H4>
|
||||
The
|
||||
<STRONG>P</STRONG>
|
||||
implementation emulates
|
||||
<NOBR><A href="http://web.archive.org/~gsf/man/man1/perl.html"><STRONG>perl</STRONG></A>(1)</NOBR>
|
||||
and is not X/Open compliant by design.
|
||||
The main differences are:
|
||||
<UL type=square>
|
||||
<LI>
|
||||
<STRONG>P</STRONG>
|
||||
<EM>leftmost-first</EM>
|
||||
matching as opposed to the X/Open
|
||||
<EM>leftmost-longest</EM>.
|
||||
<LI>
|
||||
<STRONG>REG_EXTENDED</STRONG>
|
||||
patterns only.
|
||||
</UL>
|
||||
<P>
|
||||
However, the
|
||||
<STRONG>P</STRONG>
|
||||
package regression tests, and
|
||||
<NOBR><A href="http://web.archive.org/~gsf/man/man1/perl.html"><STRONG>perl</STRONG></A>(1)</NOBR>
|
||||
features creeping into other implementations,
|
||||
make it reasonable to include here.
|
||||
<P>
|
||||
<P><HR><CENTER><FONT color=red><FONT face=courier><H3><A name="testregex Notes">testregex Notes</A></H3></FONT></FONT></CENTER>
|
||||
Extensions to the standard terminology are derived from the AT&T
|
||||
implementation, unified under
|
||||
<STRONG>&lt;regex.h&gt;</STRONG>
|
||||
with these modes:
|
||||
<P></P><TABLE border=0 frame=void rules=none width=100%><TBODY><TR><TD>
|
||||
<TABLE align=center bgcolor=papayawhip border=1 bordercolor=white cellpadding=2 cellspacing=2 frame=box rules=all >
|
||||
<TBODY>
|
||||
<TR><TD align=center>MODE </TD><TD align=left> FLAGS </TD><TD align=left> DESCRIPTION</TD></TR>
|
||||
<TR><TD align=right>
|
||||
BRE </TD><TD align=left> 0 </TD><TD align=left> basic RE</TD></TR>
|
||||
<TR><TD align=right>
|
||||
ERE </TD><TD align=left> REG_EXTENDED </TD><TD align=left> egrep RE with perl (...) extensions</TD></TR>
|
||||
<TR><TD align=right>
|
||||
ARE </TD><TD align=left> REG_AUGMENTED </TD><TD align=left> ERE with ! negation, &lt;&gt; word boundaries</TD></TR>
|
||||
<TR><TD align=right>
|
||||
SRE </TD><TD align=left> REG_SHELL </TD><TD align=left> sh patterns</TD></TR>
|
||||
<TR><TD align=right>
|
||||
KRE </TD><TD align=left> REG_SHELL|REG_AUGMENTED </TD><TD align=left> ksh93 patterns: ! @ ( | & ) { }</TD></TR>
|
||||
<TR><TD align=right>
|
||||
LRE </TD><TD align=left> REG_LITERAL </TD><TD align=left> fgrep patterns</TD></TR>
|
||||
</TBODY></TABLE></TD></TR></TBODY></TABLE>
|
||||
<P>
|
||||
and a few flags to handle
|
||||
<NOBR><A href="http://web.archive.org/~gsf/man/man3/fnmatch.html"><STRONG>fnmatch</STRONG></A>(3):</NOBR>
|
||||
<P></P><TABLE border=0 frame=void rules=none width=100%><TBODY><TR><TD>
|
||||
<TABLE align=center bgcolor=papayawhip border=1 bordercolor=white cellpadding=2 cellspacing=2 frame=box rules=all >
|
||||
<TBODY>
|
||||
<TR><TD align=left>regex FLAG </TD><TD align=left> fnmatch FLAG</TD></TR>
|
||||
<TR><TD align=left>
|
||||
REG_SHELL_ESCAPED </TD><TD align=left> FNM_NOESCAPE</TD></TR>
|
||||
<TR><TD align=left>
|
||||
REG_SHELL_PATH </TD><TD align=left> FNM_PATHNAME</TD></TR>
|
||||
<TR><TD align=left>
|
||||
REG_SHELL_DOT </TD><TD align=left> FNM_PERIOD</TD></TR>
|
||||
</TBODY></TABLE></TD></TR></TBODY></TABLE>
|
||||
<P>
|
||||
The original
|
||||
<TT>testregex.c</TT>
|
||||
was done by Doug McIlroy at Bell Labs.
|
||||
The current implementation is maintained by Glenn Fowler <SMALL>&lt;<A href=mailto:gsf@research.att.com>gsf@research.att.com</A>&gt;</SMALL>.
|
||||
<P>
|
||||
<HR>
|
||||
<TABLE border=0 align=center width=96%>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right><A href="mailto:gsf@research.att.com?subject= ../re/testregex.mm mm document">Glenn Fowler</A></TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right>Information and Software Systems Research</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right>AT&T Labs Research</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right>Florham Park NJ</TD>
|
||||
</TR>
|
||||
<TR>
|
||||
<TD align=left></TD>
|
||||
<TD align=center></TD>
|
||||
<TD align=right>March 22, 2011</TD>
|
||||
</TR>
|
||||
</TABLE>
|
||||
<P>
|
||||
|
||||
</TD></TR></TBODY></TABLE>
|
||||
|
||||
</BODY>
|
||||
</HTML>
|
Loading…
Reference in a new issue