@@ -65,10 +65,14 @@ fileprivate extension Compiler.ByteCodeGen {
65
65
emitDot ( )
66
66
67
67
case let . char( c) :
68
- try emitCharacter ( c)
68
+ emitCharacter ( c)
69
69
70
70
case let . scalar( s) :
71
- try emitScalar ( s)
71
+ if options. semanticLevel == . graphemeCluster {
72
+ emitCharacter ( Character ( s) )
73
+ } else {
74
+ emitMatchScalar ( s)
75
+ }
72
76
73
77
case let . assertion( kind) :
74
78
try emitAssertion ( kind)
@@ -94,6 +98,34 @@ fileprivate extension Compiler.ByteCodeGen {
94
98
}
95
99
}
96
100
101
/// Emits the instructions that match the quoted literal `s`.
///
/// - In Unicode scalar mode, every scalar of the literal is emitted through
///   `emitMatchScalar`.
/// - In grapheme cluster mode, a non-empty all-ASCII literal takes a fast
///   path (when optimizations are enabled) that matches scalar by scalar and
///   passes `boundaryCheck: true` only for the final scalar.
/// - Otherwise each character is emitted through `emitCharacter`.
///
/// An empty literal emits no instructions.
///
/// - Parameter s: The literal text to match.
mutating func emitQuotedLiteral(_ s: String) {
  guard options.semanticLevel == .graphemeCluster else {
    // Walking the scalar view visits the same scalars, in the same order, as
    // walking each character's scalars in turn.
    for scalar in s.unicodeScalars {
      emitMatchScalar(scalar)
    }
    return
  }

  // Fast path for eliding boundary checks for an all ascii quoted literal.
  //
  // `allSatisfy` is vacuously true for an empty string, so the last index is
  // bound optionally: an empty literal has no last scalar and must not take
  // this path (force-unwrapping it would trap).
  if optimizationsEnabled,
     s.allSatisfy(\.isASCII),
     let lastIdx = s.unicodeScalars.indices.last {
    for idx in s.unicodeScalars.indices {
      let boundaryCheck = idx == lastIdx
      let scalar = s.unicodeScalars[idx]
      if options.isCaseInsensitive && scalar.properties.isCased {
        builder.buildMatchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck)
      } else {
        builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck)
      }
    }
    return
  }

  for c in s { emitCharacter(c) }
}
128
+
97
129
mutating func emitBackreference(
98
130
_ ref: AST . Reference
99
131
) throws {
@@ -257,41 +289,47 @@ fileprivate extension Compiler.ByteCodeGen {
257
289
}
258
290
}
259
291
260
/// Emits a single-scalar match instruction for `s`.
///
/// Only valid in Unicode scalar mode (checked by a debug assertion). The
/// emitted instruction never requests a boundary check.
///
/// - Parameter s: The scalar to match.
mutating func emitMatchScalar(_ s: UnicodeScalar) {
  assert(options.semanticLevel == .unicodeScalar)

  // Case folding only matters for scalars that actually have case.
  let needsCaseFolding = options.isCaseInsensitive && s.properties.isCased
  guard needsCaseFolding else {
    builder.buildMatchScalar(s, boundaryCheck: false)
    return
  }
  builder.buildMatchScalarCaseInsensitive(s, boundaryCheck: false)
}
273
300
274
/// Emits the instructions that match the character `c`.
///
/// In Unicode scalar mode the character is matched as the exact sequence of
/// scalars it is made of. In grapheme cluster mode one of four strategies is
/// chosen from two flags: whether case must be folded for this character,
/// and whether the ASCII scalar fast path applies.
///
/// - Parameter c: The character to match.
mutating func emitCharacter(_ c: Character) {
  let scalars = c.unicodeScalars

  // Unicode scalar mode matches the specific scalars that comprise a character
  guard options.semanticLevel != .unicodeScalar else {
    for scalar in scalars {
      emitMatchScalar(scalar)
    }
    return
  }

  let foldsCase = options.isCaseInsensitive && c.isCased
  let usesASCIIFastPath = optimizationsEnabled && c.isASCII

  // A character always has at least one scalar, so the index before
  // `endIndex` is the index of its last scalar.
  let finalIdx = scalars.index(before: scalars.endIndex)

  switch (foldsCase, usesASCIIFastPath) {
  case (true, true):
    // c.isCased ensures that c is not CR-LF,
    // so we know that c is a single scalar
    assert(scalars.count == 1)
    builder.buildMatchScalarCaseInsensitive(
      scalars[finalIdx],
      boundaryCheck: true)

  case (true, false):
    builder.buildMatch(c, isCaseInsensitive: true)

  case (false, true):
    // Match scalar by scalar, requesting the boundary check only on the
    // final scalar of the character.
    for idx in scalars.indices {
      builder.buildMatchScalar(scalars[idx], boundaryCheck: idx == finalIdx)
    }

  case (false, false):
    builder.buildMatch(c, isCaseInsensitive: false)
  }
}
296
334
297
335
mutating func emitAny( ) {
@@ -567,7 +605,12 @@ fileprivate extension Compiler.ByteCodeGen {
567
605
decrement %minTrips and fallthrough
568
606
569
607
loop-body:
608
+ <if can't guarantee forward progress && extraTrips = nil>:
609
+ mov currentPosition %pos
570
610
evaluate the subexpression
611
+ <if can't guarantee forward progress && extraTrips = nil>:
612
+ if %pos is currentPosition:
613
+ goto exit
571
614
goto min-trip-count control block
572
615
573
616
exit-policy control block:
@@ -670,7 +713,28 @@ fileprivate extension Compiler.ByteCodeGen {
670
713
// <subexpression>
671
714
// branch min-trip-count
672
715
builder. label ( loopBody)
716
+
717
+ // if we aren't sure if the child node will have forward progress and
718
+ // we have an unbounded quantification
719
+ let startPosition : PositionRegister ?
720
+ let emitPositionChecking =
721
+ ( !optimizationsEnabled || !child. guaranteesForwardProgress) &&
722
+ extraTrips == nil
723
+
724
+ if emitPositionChecking {
725
+ startPosition = builder. makePositionRegister ( )
726
+ builder. buildMoveCurrentPosition ( into: startPosition!)
727
+ } else {
728
+ startPosition = nil
729
+ }
673
730
try emitNode ( child)
731
+ if emitPositionChecking {
732
+ // in all quantifier cases, no matter what minTrips or extraTrips is,
733
+ // if we have a successful non-advancing match, branch to exit because it
734
+ // can match an arbitrary number of times
735
+ builder. buildCondBranch ( to: exit, ifSamePositionAs: startPosition!)
736
+ }
737
+
674
738
if minTrips <= 1 {
675
739
// fallthrough
676
740
} else {
@@ -715,11 +779,12 @@ fileprivate extension Compiler.ByteCodeGen {
715
779
_ ccc: DSLTree . CustomCharacterClass
716
780
) throws {
717
781
if let asciiBitset = ccc. asAsciiBitset ( options) ,
718
- options. semanticLevel == . graphemeCluster,
719
782
optimizationsEnabled {
720
- // future work: add a bit to .matchBitset to consume either a character
721
- // or a scalar so we can have this optimization in scalar mode
722
- builder. buildMatchAsciiBitset ( asciiBitset)
783
+ if options. semanticLevel == . unicodeScalar {
784
+ builder. buildScalarMatchAsciiBitset ( asciiBitset)
785
+ } else {
786
+ builder. buildMatchAsciiBitset ( asciiBitset)
787
+ }
723
788
} else {
724
789
let consumer = try ccc. generateConsumer ( options)
725
790
builder. buildConsume ( by: consumer)
@@ -796,45 +861,7 @@ fileprivate extension Compiler.ByteCodeGen {
796
861
try emitAtom ( a)
797
862
798
863
case let . quotedLiteral( s) :
799
- if options. semanticLevel == . graphemeCluster {
800
- if options. isCaseInsensitive {
801
- // TODO: buildCaseInsensitiveMatchSequence(c) or alternative
802
- builder. buildConsume { input, bounds in
803
- var iterator = s. makeIterator ( )
804
- var currentIndex = bounds. lowerBound
805
- while let ch = iterator. next ( ) {
806
- guard currentIndex < bounds. upperBound,
807
- ch. lowercased ( ) == input [ currentIndex] . lowercased ( )
808
- else { return nil }
809
- input. formIndex ( after: & currentIndex)
810
- }
811
- return currentIndex
812
- }
813
- } else {
814
- builder. buildMatchSequence ( s)
815
- }
816
- } else {
817
- builder. buildConsume {
818
- [ caseInsensitive = options. isCaseInsensitive] input, bounds in
819
- // TODO: Case folding
820
- var iterator = s. unicodeScalars. makeIterator ( )
821
- var currentIndex = bounds. lowerBound
822
- while let scalar = iterator. next ( ) {
823
- guard currentIndex < bounds. upperBound else { return nil }
824
- if caseInsensitive {
825
- if scalar. properties. lowercaseMapping != input. unicodeScalars [ currentIndex] . properties. lowercaseMapping {
826
- return nil
827
- }
828
- } else {
829
- if scalar != input. unicodeScalars [ currentIndex] {
830
- return nil
831
- }
832
- }
833
- input. unicodeScalars. formIndex ( after: & currentIndex)
834
- }
835
- return currentIndex
836
- }
837
- }
864
+ emitQuotedLiteral ( s)
838
865
839
866
case let . convertedRegexLiteral( n, _) :
840
867
return try emitNode ( n)
@@ -856,3 +883,42 @@ fileprivate extension Compiler.ByteCodeGen {
856
883
return nil
857
884
}
858
885
}
886
+
887
extension DSLTree.Node {
  /// A Boolean value indicating whether a successful match of this node is
  /// guaranteed to advance the match position.
  ///
  /// The answer is conservative: `false` means "cannot promise progress",
  /// not "never makes progress". Any node kind not listed explicitly is
  /// treated as not guaranteeing progress.
  var guaranteesForwardProgress: Bool {
    switch self {
    case .orderedChoice(let children):
      // Every alternative must advance, since any one of them may be taken.
      return children.allSatisfy { $0.guaranteesForwardProgress }
    case .concatenation(let children):
      // One advancing component is enough for the whole sequence to advance.
      return children.contains(where: { $0.guaranteesForwardProgress })
    case .capture(_, _, let node, _):
      return node.guaranteesForwardProgress
    case .nonCapturingGroup(let kind, let child):
      switch kind.ast {
      case .lookahead, .negativeLookahead, .lookbehind, .negativeLookbehind:
        // Lookarounds are treated as never guaranteeing progress.
        return false
      default: return child.guaranteesForwardProgress
      }
    case .atom(let atom):
      switch atom {
      case .changeMatchingOptions, .assertion: return false
      // A backreference can match the empty string (for example when the
      // referenced capture is empty), so it cannot promise progress.
      case .backreference: return false
      default: return true
      }
    case .trivia, .empty:
      return false
    case .quotedLiteral(let string):
      return !string.isEmpty
    case .convertedRegexLiteral(let node, _):
      return node.guaranteesForwardProgress
    case .consumer, .matcher:
      // Allow zero width consumers and matchers
      return false
    case .customCharacterClass:
      return true
    case .quantification(let amount, _, let child):
      // Only a quantifier that requires at least one trip through an
      // advancing child is guaranteed to advance.
      let (atLeast, _) = amount.ast.bounds
      return atLeast ?? 0 > 0 && child.guaranteesForwardProgress
    default: return false
    }
  }
}
0 commit comments