@@ -114,6 +114,7 @@ def __init__(self, pattern, data=None):
114
114
data = []
115
115
self .data = data
116
116
self .width = None
117
+
117
118
def dump (self , level = 0 ):
118
119
nl = True
119
120
seqtypes = (tuple , list )
@@ -404,6 +405,15 @@ def _escape(source, escape, state):
404
405
pass
405
406
raise source .error ("bad escape %s" % escape , len (escape ))
406
407
408
def _uniq(items):
    """Return *items* with duplicates removed, keeping first-seen order.

    Args:
        items: a sequence of hashable entries (the callers visible in
            this file pass lists of ``(op, av)`` tuples collected for a
            character set).

    Returns:
        *items* itself (the same object) when it contains no duplicates;
        otherwise a new list holding the first occurrence of every
        distinct entry, in the original order.
    """
    # dict.fromkeys keeps insertion order, so a single O(n) pass replaces
    # the quadratic "if item not in newitems" scan over a growing list.
    unique = list(dict.fromkeys(items))
    if len(unique) == len(items):
        # Nothing was dropped: hand back the caller's own object.
        return items
    return unique
416
+
407
417
def _parse_sub (source , state , verbose , nested = True ):
408
418
# parse an alternation: a|b|c
409
419
@@ -420,7 +430,6 @@ def _parse_sub(source, state, verbose, nested=True):
420
430
return items [0 ]
421
431
422
432
subpattern = SubPattern (state )
423
- subpatternappend = subpattern .append
424
433
425
434
# check if all items share a common prefix
426
435
while True :
@@ -437,35 +446,31 @@ def _parse_sub(source, state, verbose, nested=True):
437
446
# move it out of the branch
438
447
for item in items :
439
448
del item [0 ]
440
- subpatternappend (prefix )
449
+ subpattern . append (prefix )
441
450
continue # check next one
442
451
break
443
452
444
453
# check if the branch can be replaced by a character set
454
+ set = []
445
455
for item in items :
446
- if len (item ) != 1 or item [0 ][0 ] is not LITERAL :
456
+ if len (item ) != 1 :
457
+ break
458
+ op , av = item [0 ]
459
+ if op is LITERAL :
460
+ set .append ((op , av ))
461
+ elif op is IN and av [0 ][0 ] is not NEGATE :
462
+ set .extend (av )
463
+ else :
447
464
break
448
465
else :
449
466
# we can store this as a character set instead of a
450
467
# branch (the compiler may optimize this even more)
451
- subpatternappend ((IN , [ item [ 0 ] for item in items ] ))
468
+ subpattern . append ((IN , _uniq ( set ) ))
452
469
return subpattern
453
470
454
471
subpattern .append ((BRANCH , (None , items )))
455
472
return subpattern
456
473
457
def _parse_sub_cond(source, state, condgroup, verbose):
    """Parse the body of a conditional backreference group.

    Parses one item with ``_parse``; if a ``|`` follows, parses a second
    item as the alternative.  A further ``|`` after the second item is
    rejected.  The result is a fresh ``SubPattern`` containing a single
    ``GROUPREF_EXISTS`` entry of ``(condgroup, yes_item, no_item)``,
    where ``no_item`` is ``None`` when no alternative was given.

    Raises:
        source.error: if more than two ``|``-separated branches appear.
    """
    yes_branch = _parse(source, state, verbose)
    no_branch = None
    if source.match("|"):
        no_branch = _parse(source, state, verbose)
        # Only "yes|no" is allowed; a third branch is an error.
        if source.next == "|":
            raise source.error("conditional backref with more than two branches")
    result = SubPattern(state)
    result.append((GROUPREF_EXISTS, (condgroup, yes_branch, no_branch)))
    return result
468
-
469
474
def _parse (source , state , verbose , first = False ):
470
475
# parse a simple pattern
471
476
subpattern = SubPattern (state )
@@ -511,16 +516,14 @@ def _parse(source, state, verbose, first=False):
511
516
setappend = set .append
512
517
## if sourcematch(":"):
513
518
## pass # handle character classes
514
- if sourcematch ("^" ):
515
- setappend ((NEGATE , None ))
519
+ negate = sourcematch ("^" )
516
520
# check remaining characters
517
- start = set [:]
518
521
while True :
519
522
this = sourceget ()
520
523
if this is None :
521
524
raise source .error ("unterminated character set" ,
522
525
source .tell () - here )
523
- if this == "]" and set != start :
526
+ if this == "]" and set :
524
527
break
525
528
elif this [0 ] == "\\ " :
526
529
code1 = _class_escape (source , this )
@@ -556,13 +559,19 @@ def _parse(source, state, verbose, first=False):
556
559
code1 = code1 [1 ][0 ]
557
560
setappend (code1 )
558
561
562
+ set = _uniq (set )
559
563
# XXX: <fl> should move set optimization to compiler!
560
- if _len (set )== 1 and set [0 ][0 ] is LITERAL :
561
- subpatternappend (set [0 ]) # optimization
562
- elif _len (set )== 2 and set [0 ][0 ] is NEGATE and set [1 ][0 ] is LITERAL :
563
- subpatternappend ((NOT_LITERAL , set [1 ][1 ])) # optimization
564
+ if _len (set ) == 1 and set [0 ][0 ] is LITERAL :
565
+ # optimization
566
+ if negate :
567
+ subpatternappend ((NOT_LITERAL , set [0 ][1 ]))
568
+ else :
569
+ subpatternappend (set [0 ])
564
570
else :
565
- # XXX: <fl> should add charmap optimization here
571
+ if negate :
572
+ set .insert (0 , (NEGATE , None ))
573
+ # charmap optimization can't be added here because
574
+ # global flags still are not known
566
575
subpatternappend ((IN , set ))
567
576
568
577
elif this in REPEAT_CHARS :
@@ -579,6 +588,7 @@ def _parse(source, state, verbose, first=False):
579
588
if source .next == "}" :
580
589
subpatternappend ((LITERAL , _ord (this )))
581
590
continue
591
+
582
592
min , max = 0 , MAXREPEAT
583
593
lo = hi = ""
584
594
while source .next in DIGITS :
@@ -592,6 +602,7 @@ def _parse(source, state, verbose, first=False):
592
602
subpatternappend ((LITERAL , _ord (this )))
593
603
source .seek (here )
594
604
continue
605
+
595
606
if lo :
596
607
min = int (lo )
597
608
if min >= MAXREPEAT :
@@ -610,12 +621,16 @@ def _parse(source, state, verbose, first=False):
610
621
item = subpattern [- 1 :]
611
622
else :
612
623
item = None
613
- if not item or ( _len ( item ) == 1 and item [0 ][0 ] is AT ) :
624
+ if not item or item [0 ][0 ] is AT :
614
625
raise source .error ("nothing to repeat" ,
615
626
source .tell () - here + len (this ))
616
627
if item [0 ][0 ] in _REPEATCODES :
617
628
raise source .error ("multiple repeat" ,
618
629
source .tell () - here + len (this ))
630
+ if item [0 ][0 ] is SUBPATTERN :
631
+ group , add_flags , del_flags , p = item [0 ][1 ]
632
+ if group is None and not add_flags and not del_flags :
633
+ item = p
619
634
if sourcematch ("?" ):
620
635
subpattern [- 1 ] = (MIN_REPEAT , (min , max , item ))
621
636
else :
@@ -628,7 +643,6 @@ def _parse(source, state, verbose, first=False):
628
643
start = source .tell () - 1
629
644
group = True
630
645
name = None
631
- condgroup = None
632
646
add_flags = 0
633
647
del_flags = 0
634
648
if sourcematch ("?" ):
@@ -660,6 +674,7 @@ def _parse(source, state, verbose, first=False):
660
674
state .checklookbehindgroup (gid , source )
661
675
subpatternappend ((GROUPREF , gid ))
662
676
continue
677
+
663
678
else :
664
679
char = sourceget ()
665
680
if char is None :
@@ -678,6 +693,7 @@ def _parse(source, state, verbose, first=False):
678
693
if sourceget () == ")" :
679
694
break
680
695
continue
696
+
681
697
elif char in "=!<" :
682
698
# lookahead assertions
683
699
dir = 1
@@ -704,10 +720,10 @@ def _parse(source, state, verbose, first=False):
704
720
else :
705
721
subpatternappend ((ASSERT_NOT , (dir , p )))
706
722
continue
723
+
707
724
elif char == "(" :
708
725
# conditional backreference group
709
726
condname = source .getuntil (")" )
710
- group = None
711
727
if condname .isidentifier ():
712
728
condgroup = state .groupdict .get (condname )
713
729
if condgroup is None :
@@ -728,6 +744,19 @@ def _parse(source, state, verbose, first=False):
728
744
msg = "invalid group reference %d" % condgroup
729
745
raise source .error (msg , len (condname ) + 1 )
730
746
state .checklookbehindgroup (condgroup , source )
747
+ item_yes = _parse (source , state , verbose )
748
+ if source .match ("|" ):
749
+ item_no = _parse (source , state , verbose )
750
+ if source .next == "|" :
751
+ raise source .error ("conditional backref with more than two branches" )
752
+ else :
753
+ item_no = None
754
+ if not source .match (")" ):
755
+ raise source .error ("missing ), unterminated subpattern" ,
756
+ source .tell () - start )
757
+ subpatternappend ((GROUPREF_EXISTS , (condgroup , item_yes , item_no )))
758
+ continue
759
+
731
760
elif char in FLAGS or char == "-" :
732
761
# flags
733
762
flags = _parse_flags (source , state , char )
@@ -744,6 +773,7 @@ def _parse(source, state, verbose, first=False):
744
773
if (state .flags & SRE_FLAG_VERBOSE ) and not verbose :
745
774
raise Verbose
746
775
continue
776
+
747
777
add_flags , del_flags = flags
748
778
group = None
749
779
else :
@@ -756,12 +786,9 @@ def _parse(source, state, verbose, first=False):
756
786
group = state .opengroup (name )
757
787
except error as err :
758
788
raise source .error (err .msg , len (name ) + 1 ) from None
759
- if condgroup :
760
- p = _parse_sub_cond (source , state , condgroup , verbose )
761
- else :
762
- sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE )) and
763
- not (del_flags & SRE_FLAG_VERBOSE ))
764
- p = _parse_sub (source , state , sub_verbose )
789
+ sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE )) and
790
+ not (del_flags & SRE_FLAG_VERBOSE ))
791
+ p = _parse_sub (source , state , sub_verbose )
765
792
if not source .match (")" ):
766
793
raise source .error ("missing ), unterminated subpattern" ,
767
794
source .tell () - start )
@@ -773,11 +800,19 @@ def _parse(source, state, verbose, first=False):
773
800
subpatternappend ((AT , AT_BEGINNING ))
774
801
775
802
elif this == "$" :
776
- subpattern . append ((AT , AT_END ))
803
+ subpatternappend ((AT , AT_END ))
777
804
778
805
else :
779
806
raise AssertionError ("unsupported special character %r" % (char ,))
780
807
808
+ # unpack non-capturing groups
809
+ for i in range (len (subpattern ))[::- 1 ]:
810
+ op , av = subpattern [i ]
811
+ if op is SUBPATTERN :
812
+ group , add_flags , del_flags , p = av
813
+ if group is None and not add_flags and not del_flags :
814
+ subpattern [i : i + 1 ] = p
815
+
781
816
return subpattern
782
817
783
818
def _parse_flags (source , state , char ):
0 commit comments