Skip to content

Commit 7aedd69

Browse files
authored
Merge pull request #569 from rctcwyvrn/uhoh-infinite-loop-5.7
2 parents e94ecea + 5f5848e commit 7aedd69

16 files changed

+981
-297
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 133 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -65,10 +65,14 @@ fileprivate extension Compiler.ByteCodeGen {
6565
emitDot()
6666

6767
case let .char(c):
68-
try emitCharacter(c)
68+
emitCharacter(c)
6969

7070
case let .scalar(s):
71-
try emitScalar(s)
71+
if options.semanticLevel == .graphemeCluster {
72+
emitCharacter(Character(s))
73+
} else {
74+
emitMatchScalar(s)
75+
}
7276

7377
case let .assertion(kind):
7478
try emitAssertion(kind)
@@ -94,6 +98,34 @@ fileprivate extension Compiler.ByteCodeGen {
9498
}
9599
}
96100

101+
/// Emits the instructions that match the quoted literal `s` verbatim.
///
/// - When the semantic level is not grapheme-cluster, every Unicode scalar of
///   `s` is emitted through `emitMatchScalar`.
/// - In grapheme-cluster mode with optimizations enabled and an all-ASCII
///   literal, one scalar-match instruction is emitted per scalar, and only the
///   final one requests `boundaryCheck`.
/// - Otherwise each `Character` of `s` is emitted through `emitCharacter`.
///
/// An empty literal emits no instructions.
///
/// - Parameter s: The literal text to match.
mutating func emitQuotedLiteral(_ s: String) {
  // Bail out for the empty literal. `allSatisfy` is vacuously true for "",
  // so without this guard the ASCII fast path below would be taken and the
  // lookup of the last scalar index would trap. Every other path already
  // emits nothing for an empty string, so this preserves their behavior.
  guard !s.isEmpty else { return }

  guard options.semanticLevel == .graphemeCluster else {
    for char in s {
      for scalar in char.unicodeScalars {
        emitMatchScalar(scalar)
      }
    }
    return
  }

  // Fast path for eliding boundary checks for an all ascii quoted literal:
  // only the last scalar of the literal asks for a boundary check.
  if optimizationsEnabled && s.allSatisfy(\.isASCII) {
    let scalars = s.unicodeScalars
    // Safe: `s` is known to be non-empty here.
    let lastIdx = scalars.index(before: scalars.endIndex)
    for idx in scalars.indices {
      let boundaryCheck = idx == lastIdx
      let scalar = scalars[idx]
      if options.isCaseInsensitive && scalar.properties.isCased {
        builder.buildMatchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck)
      } else {
        builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck)
      }
    }
    return
  }

  for c in s { emitCharacter(c) }
}
128+
97129
mutating func emitBackreference(
98130
_ ref: AST.Reference
99131
) throws {
@@ -257,41 +289,47 @@ fileprivate extension Compiler.ByteCodeGen {
257289
}
258290
}
259291

260-
mutating func emitScalar(_ s: UnicodeScalar) throws {
261-
// TODO: Native instruction buildMatchScalar(s)
262-
if options.isCaseInsensitive {
263-
// TODO: e.g. buildCaseInsensitiveMatchScalar(s)
264-
builder.buildConsume(by: consumeScalar {
265-
$0.properties.lowercaseMapping == s.properties.lowercaseMapping
266-
})
292+
mutating func emitMatchScalar(_ s: UnicodeScalar) {
293+
assert(options.semanticLevel == .unicodeScalar)
294+
if options.isCaseInsensitive && s.properties.isCased {
295+
builder.buildMatchScalarCaseInsensitive(s, boundaryCheck: false)
267296
} else {
268-
builder.buildConsume(by: consumeScalar {
269-
$0 == s
270-
})
297+
builder.buildMatchScalar(s, boundaryCheck: false)
271298
}
272299
}
273300

274-
mutating func emitCharacter(_ c: Character) throws {
275-
// Unicode scalar matches the specific scalars that comprise a character
301+
mutating func emitCharacter(_ c: Character) {
302+
// Unicode scalar mode matches the specific scalars that comprise a character
276303
if options.semanticLevel == .unicodeScalar {
277304
for scalar in c.unicodeScalars {
278-
try emitScalar(scalar)
305+
emitMatchScalar(scalar)
279306
}
280307
return
281308
}
282309

283310
if options.isCaseInsensitive && c.isCased {
284-
// TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true)
285-
builder.buildConsume { input, bounds in
286-
let inputChar = input[bounds.lowerBound].lowercased()
287-
let matchChar = c.lowercased()
288-
return inputChar == matchChar
289-
? input.index(after: bounds.lowerBound)
290-
: nil
311+
if optimizationsEnabled && c.isASCII {
312+
// c.isCased ensures that c is not CR-LF,
313+
// so we know that c is a single scalar
314+
assert(c.unicodeScalars.count == 1)
315+
builder.buildMatchScalarCaseInsensitive(
316+
c.unicodeScalars.last!,
317+
boundaryCheck: true)
318+
} else {
319+
builder.buildMatch(c, isCaseInsensitive: true)
291320
}
292-
} else {
293-
builder.buildMatch(c)
321+
return
322+
}
323+
324+
if optimizationsEnabled && c.isASCII {
325+
let lastIdx = c.unicodeScalars.indices.last!
326+
for idx in c.unicodeScalars.indices {
327+
builder.buildMatchScalar(c.unicodeScalars[idx], boundaryCheck: idx == lastIdx)
328+
}
329+
return
294330
}
331+
332+
builder.buildMatch(c, isCaseInsensitive: false)
295333
}
296334

297335
mutating func emitAny() {
@@ -567,7 +605,12 @@ fileprivate extension Compiler.ByteCodeGen {
567605
decrement %minTrips and fallthrough
568606
569607
loop-body:
608+
<if can't guarantee forward progress && extraTrips = nil>:
609+
mov currentPosition %pos
570610
evaluate the subexpression
611+
<if can't guarantee forward progress && extraTrips = nil>:
612+
if %pos is currentPosition:
613+
goto exit
571614
goto min-trip-count control block
572615
573616
exit-policy control block:
@@ -670,7 +713,28 @@ fileprivate extension Compiler.ByteCodeGen {
670713
// <subexpression>
671714
// branch min-trip-count
672715
builder.label(loopBody)
716+
717+
// if we aren't sure if the child node will have forward progress and
718+
// we have an unbounded quantification
719+
let startPosition: PositionRegister?
720+
let emitPositionChecking =
721+
(!optimizationsEnabled || !child.guaranteesForwardProgress) &&
722+
extraTrips == nil
723+
724+
if emitPositionChecking {
725+
startPosition = builder.makePositionRegister()
726+
builder.buildMoveCurrentPosition(into: startPosition!)
727+
} else {
728+
startPosition = nil
729+
}
673730
try emitNode(child)
731+
if emitPositionChecking {
732+
// in all quantifier cases, no matter what minTrips or extraTrips is,
733+
// if we have a successful non-advancing match, branch to exit because it
734+
// can match an arbitrary number of times
735+
builder.buildCondBranch(to: exit, ifSamePositionAs: startPosition!)
736+
}
737+
674738
if minTrips <= 1 {
675739
// fallthrough
676740
} else {
@@ -715,11 +779,12 @@ fileprivate extension Compiler.ByteCodeGen {
715779
_ ccc: DSLTree.CustomCharacterClass
716780
) throws {
717781
if let asciiBitset = ccc.asAsciiBitset(options),
718-
options.semanticLevel == .graphemeCluster,
719782
optimizationsEnabled {
720-
// future work: add a bit to .matchBitset to consume either a character
721-
// or a scalar so we can have this optimization in scalar mode
722-
builder.buildMatchAsciiBitset(asciiBitset)
783+
if options.semanticLevel == .unicodeScalar {
784+
builder.buildScalarMatchAsciiBitset(asciiBitset)
785+
} else {
786+
builder.buildMatchAsciiBitset(asciiBitset)
787+
}
723788
} else {
724789
let consumer = try ccc.generateConsumer(options)
725790
builder.buildConsume(by: consumer)
@@ -796,45 +861,7 @@ fileprivate extension Compiler.ByteCodeGen {
796861
try emitAtom(a)
797862

798863
case let .quotedLiteral(s):
799-
if options.semanticLevel == .graphemeCluster {
800-
if options.isCaseInsensitive {
801-
// TODO: buildCaseInsensitiveMatchSequence(c) or alternative
802-
builder.buildConsume { input, bounds in
803-
var iterator = s.makeIterator()
804-
var currentIndex = bounds.lowerBound
805-
while let ch = iterator.next() {
806-
guard currentIndex < bounds.upperBound,
807-
ch.lowercased() == input[currentIndex].lowercased()
808-
else { return nil }
809-
input.formIndex(after: &currentIndex)
810-
}
811-
return currentIndex
812-
}
813-
} else {
814-
builder.buildMatchSequence(s)
815-
}
816-
} else {
817-
builder.buildConsume {
818-
[caseInsensitive = options.isCaseInsensitive] input, bounds in
819-
// TODO: Case folding
820-
var iterator = s.unicodeScalars.makeIterator()
821-
var currentIndex = bounds.lowerBound
822-
while let scalar = iterator.next() {
823-
guard currentIndex < bounds.upperBound else { return nil }
824-
if caseInsensitive {
825-
if scalar.properties.lowercaseMapping != input.unicodeScalars[currentIndex].properties.lowercaseMapping {
826-
return nil
827-
}
828-
} else {
829-
if scalar != input.unicodeScalars[currentIndex] {
830-
return nil
831-
}
832-
}
833-
input.unicodeScalars.formIndex(after: &currentIndex)
834-
}
835-
return currentIndex
836-
}
837-
}
864+
emitQuotedLiteral(s)
838865

839866
case let .convertedRegexLiteral(n, _):
840867
return try emitNode(n)
@@ -856,3 +883,42 @@ fileprivate extension Compiler.ByteCodeGen {
856883
return nil
857884
}
858885
}
886+
887+
extension DSLTree.Node {
  /// Whether this node is treated as always consuming input when it matches.
  ///
  /// The quantification code generator consults this (together with
  /// `optimizationsEnabled`) to decide whether an unbounded loop needs an
  /// explicit "did the position move?" check. Returning `false` is therefore
  /// the conservative answer: it only costs an extra position check, whereas
  /// a wrong `true` can let a zero-width body loop forever.
  var guaranteesForwardProgress: Bool {
    switch self {
    case .orderedChoice(let children):
      // Any alternative may be the one taken, so all of them must advance.
      return children.allSatisfy(\.guaranteesForwardProgress)
    case .concatenation(let children):
      // One advancing component is enough for the whole sequence to advance.
      return children.contains(where: \.guaranteesForwardProgress)
    case .capture(_, _, let node, _):
      return node.guaranteesForwardProgress
    case .nonCapturingGroup(let kind, let child):
      switch kind.ast {
      case .lookahead, .negativeLookahead, .lookbehind, .negativeLookbehind:
        // Lookarounds are zero-width regardless of their contents.
        return false
      default: return child.guaranteesForwardProgress
      }
    case .atom(let atom):
      switch atom {
      case .changeMatchingOptions, .assertion: return false
      // A reference to a capture may match the empty string (the referenced
      // capture can be empty), so it cannot promise forward progress.
      case .backreference, .symbolicReference: return false
      default: return true
      }
    case .trivia, .empty:
      return false
    case .quotedLiteral(let string):
      return !string.isEmpty
    case .convertedRegexLiteral(let node, _):
      return node.guaranteesForwardProgress
    case .consumer, .matcher:
      // Allow zero width consumers and matchers
      return false
    case .customCharacterClass:
      return true
    case .quantification(let amount, _, let child):
      // Only a quantifier that requires at least one trip of an advancing
      // child is itself guaranteed to advance.
      let (atLeast, _) = amount.ast.bounds
      return (atLeast ?? 0) > 0 && child.guaranteesForwardProgress
    default: return false
    }
  }
}

0 commit comments

Comments
 (0)