Cache word indices

Azoy · Azoy · commit 8f5756338373 · 2022-07-05T13:43:18.000-07:00
diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
@@ -118,12 +118,13 @@ fileprivate extension Compiler.ByteCodeGen {
     // need to supply both a slice bounds and a per-search bounds.
     switch kind {
     case .startOfSubject:
-      builder.buildAssert { (input, pos, subjectBounds) in
+      builder.buildAssert { (_, _, input, pos, subjectBounds) in
         pos == subjectBounds.lowerBound
       }
 
     case .endOfSubjectBeforeNewline:
-      builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in
+      builder.buildAssert { [semanticLevel = options.semanticLevel]
+          (_, _, input, pos, subjectBounds) in
         if pos == subjectBounds.upperBound { return true }
         switch semanticLevel {
         case .graphemeCluster:
@@ -136,7 +137,7 @@ fileprivate extension Compiler.ByteCodeGen {
       }
 
     case .endOfSubject:
-      builder.buildAssert { (input, pos, subjectBounds) in
+      builder.buildAssert { (_, _, input, pos, subjectBounds) in
         pos == subjectBounds.upperBound
       }
 
@@ -149,16 +150,16 @@ fileprivate extension Compiler.ByteCodeGen {
       
       // FIXME: This needs to be based on `searchBounds`,
       // not the `subjectBounds` given as an argument here
-      builder.buildAssert { (input, pos, subjectBounds) in false }
+      builder.buildAssert { (_, _, input, pos, subjectBounds) in false }
 
     case .textSegment:
-      builder.buildAssert { (input, pos, _) in
+      builder.buildAssert { (_, _, input, pos, _) in
         // FIXME: Grapheme or word based on options
         input.isOnGraphemeClusterBoundary(pos)
       }
 
     case .notTextSegment:
-      builder.buildAssert { (input, pos, _) in
+      builder.buildAssert { (_, _, input, pos, _) in
         // FIXME: Grapheme or word based on options
         !input.isOnGraphemeClusterBoundary(pos)
       }
@@ -169,7 +170,8 @@ fileprivate extension Compiler.ByteCodeGen {
       // the DSL-based `.startOfLine` anchor should always match the start
       // of a line. Right now we don't distinguish between those anchors.
       if options.anchorsMatchNewlines {
-        builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in
+        builder.buildAssert { [semanticLevel = options.semanticLevel]
+            (_, _, input, pos, subjectBounds) in
           if pos == subjectBounds.lowerBound { return true }
           switch semanticLevel {
           case .graphemeCluster:
@@ -179,7 +181,7 @@ fileprivate extension Compiler.ByteCodeGen {
           }
         }
       } else {
-        builder.buildAssert { (input, pos, subjectBounds) in
+        builder.buildAssert { (_, _, input, pos, subjectBounds) in
           pos == subjectBounds.lowerBound
         }
       }
@@ -190,7 +192,8 @@ fileprivate extension Compiler.ByteCodeGen {
       // the DSL-based `.endOfLine` anchor should always match the end
       // of a line. Right now we don't distinguish between those anchors.
       if options.anchorsMatchNewlines {
-        builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in
+        builder.buildAssert { [semanticLevel = options.semanticLevel]
+            (_, _, input, pos, subjectBounds) in
           if pos == subjectBounds.upperBound { return true }
           switch semanticLevel {
           case .graphemeCluster:
@@ -200,13 +203,14 @@ fileprivate extension Compiler.ByteCodeGen {
           }
         }
       } else {
-        builder.buildAssert { (input, pos, subjectBounds) in
+        builder.buildAssert { (_, _, input, pos, subjectBounds) in
           pos == subjectBounds.upperBound
         }
       }
 
     case .wordBoundary:
-      builder.buildAssert { [options] (input, pos, subjectBounds) in
+      builder.buildAssert { [options]
+          (cache, maxIndex, input, pos, subjectBounds) in
         if options.usesSimpleUnicodeBoundaries {
           // TODO: How should we handle bounds?
           return _CharacterClassModel.word.isBoundary(
@@ -216,12 +220,13 @@ fileprivate extension Compiler.ByteCodeGen {
             with: options
           )
         } else {
-          return input.isOnWordBoundary(at: pos)
+          return input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)
         }
       }
 
     case .notWordBoundary:
-      builder.buildAssert { [options] (input, pos, subjectBounds) in
+      builder.buildAssert { [options]
+          (cache, maxIndex, input, pos, subjectBounds) in
         if options.usesSimpleUnicodeBoundaries {
           // TODO: How should we handle bounds?
           return !_CharacterClassModel.word.isBoundary(
@@ -231,7 +236,7 @@ fileprivate extension Compiler.ByteCodeGen {
             with: options
           )
         } else {
-          return !input.isOnWordBoundary(at: pos)
+          return !input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)
         }
       }
     }
diff --git a/Sources/_StringProcessing/Engine/MEProgram.swift b/Sources/_StringProcessing/Engine/MEProgram.swift
@@ -16,7 +16,13 @@ struct MEProgram {
 
   typealias ConsumeFunction = (Input, Range<Input.Index>) -> Input.Index?
   typealias AssertionFunction =
-    (Input, Input.Index, Range<Input.Index>) throws -> Bool
+    (
+      inout Set<String.Index>?,
+      inout String.Index?,
+      Input,
+      Input.Index,
+      Range<Input.Index>
+    ) throws -> Bool
   typealias TransformFunction =
     (Input, Processor._StoredCapture) throws -> Any?
   typealias MatcherFunction =
diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift
@@ -78,6 +78,9 @@ struct Processor {
 
   var storedCaptures: Array<_StoredCapture>
 
+  var wordIndexCache: Set<String.Index>? = nil
+  var wordIndexMaxIndex: String.Index? = nil
+  
   var state: State = .inProgress
 
   var failureReason: Error? = nil
@@ -401,7 +404,13 @@ extension Processor {
       let reg = payload.assertion
       let assertion = registers[reg]
       do {
-        guard try assertion(input, currentPosition, subjectBounds) else {
+        guard try assertion(
+          &wordIndexCache,
+          &wordIndexMaxIndex,
+          input,
+          currentPosition,
+          subjectBounds
+        ) else {
           signalFailure()
           return
         }
diff --git a/Sources/_StringProcessing/Unicode/WordBreaking.swift b/Sources/_StringProcessing/Unicode/WordBreaking.swift
@@ -13,20 +13,42 @@
 import Swift
 
 extension String {
-  func isOnWordBoundary(at i: String.Index) -> Bool {
+  func isOnWordBoundary(
+    at i: String.Index,
+    using cache: inout Set<String.Index>?,
+    _ maxIndex: inout String.Index?
+  ) -> Bool {
     guard i != startIndex, i != endIndex else {
       return true
     }
     
+    // If our index is already in our cache, then this is obviously on a
+    // boundary.
+    if let cache = cache, cache.contains(i) {
+      return true
+    }
+    
+    // If it's not in the cache AND our max index is larger than our index, it
+    // means this index is never on a word boundary in our string. If our index
+    // is larger than max index, we may need to still do work to determine if
+    // i is on a boundary. If it's equal to max index, then it should've
+    // taken the cache path.
+    if let maxIndex = maxIndex, i < maxIndex {
+      return false
+    }
+    
     if #available(SwiftStdlib 5.7, *) {
       var indices: Set<String.Index> = []
-      var j = startIndex
+      var j = maxIndex ?? startIndex
       
       while j < endIndex, j <= i {
         indices.insert(j)
         j = _wordIndex(after: j)
       }
       
+      cache = indices
+      maxIndex = j
+      
       return indices.contains(i)
     } else {
       return false

Original file line number	Diff line number	Diff line change
`@@ -118,12 +118,13 @@ fileprivate extension Compiler.ByteCodeGen {`
`118`	`118`	`// need to supply both a slice bounds and a per-search bounds.`
`119`	`119`	`switch kind {`
`120`	`120`	`case .startOfSubject:`
`121`		`- builder.buildAssert { (input, pos, subjectBounds) in`
	`121`	`+ builder.buildAssert { (_, _, input, pos, subjectBounds) in`
`122`	`122`	`pos == subjectBounds.lowerBound`
`123`	`123`	`}`
`124`	`124`
`125`	`125`	`case .endOfSubjectBeforeNewline:`
`126`		`- builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in`
	`126`	`+ builder.buildAssert { [semanticLevel = options.semanticLevel]`
	`127`	`+ (_, _, input, pos, subjectBounds) in`
`127`	`128`	`if pos == subjectBounds.upperBound { return true }`
`128`	`129`	`switch semanticLevel {`
`129`	`130`	`case .graphemeCluster:`
`@@ -136,7 +137,7 @@ fileprivate extension Compiler.ByteCodeGen {`
`136`	`137`	`}`
`137`	`138`
`138`	`139`	`case .endOfSubject:`
`139`		`- builder.buildAssert { (input, pos, subjectBounds) in`
	`140`	`+ builder.buildAssert { (_, _, input, pos, subjectBounds) in`
`140`	`141`	`pos == subjectBounds.upperBound`
`141`	`142`	`}`
`142`	`143`
`@@ -149,16 +150,16 @@ fileprivate extension Compiler.ByteCodeGen {`
`149`	`150`
`150`	`151`	// FIXME: This needs to be based on `searchBounds`,
`151`	`152`	// not the `subjectBounds` given as an argument here
`152`		`- builder.buildAssert { (input, pos, subjectBounds) in false }`
	`153`	`+ builder.buildAssert { (_, _, input, pos, subjectBounds) in false }`
`153`	`154`
`154`	`155`	`case .textSegment:`
`155`		`- builder.buildAssert { (input, pos, _) in`
	`156`	`+ builder.buildAssert { (_, _, input, pos, _) in`
`156`	`157`	`// FIXME: Grapheme or word based on options`
`157`	`158`	`input.isOnGraphemeClusterBoundary(pos)`
`158`	`159`	`}`
`159`	`160`
`160`	`161`	`case .notTextSegment:`
`161`		`- builder.buildAssert { (input, pos, _) in`
	`162`	`+ builder.buildAssert { (_, _, input, pos, _) in`
`162`	`163`	`// FIXME: Grapheme or word based on options`
`163`	`164`	`!input.isOnGraphemeClusterBoundary(pos)`
`164`	`165`	`}`
`@@ -169,7 +170,8 @@ fileprivate extension Compiler.ByteCodeGen {`
`169`	`170`	// the DSL-based `.startOfLine` anchor should always match the start
`170`	`171`	`// of a line. Right now we don't distinguish between those anchors.`
`171`	`172`	`if options.anchorsMatchNewlines {`
`172`		`- builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in`
	`173`	`+ builder.buildAssert { [semanticLevel = options.semanticLevel]`
	`174`	`+ (_, _, input, pos, subjectBounds) in`
`173`	`175`	`if pos == subjectBounds.lowerBound { return true }`
`174`	`176`	`switch semanticLevel {`
`175`	`177`	`case .graphemeCluster:`
`@@ -179,7 +181,7 @@ fileprivate extension Compiler.ByteCodeGen {`
`179`	`181`	`}`
`180`	`182`	`}`
`181`	`183`	`} else {`
`182`		`- builder.buildAssert { (input, pos, subjectBounds) in`
	`184`	`+ builder.buildAssert { (_, _, input, pos, subjectBounds) in`
`183`	`185`	`pos == subjectBounds.lowerBound`
`184`	`186`	`}`
`185`	`187`	`}`
`@@ -190,7 +192,8 @@ fileprivate extension Compiler.ByteCodeGen {`
`190`	`192`	// the DSL-based `.endOfLine` anchor should always match the end
`191`	`193`	`// of a line. Right now we don't distinguish between those anchors.`
`192`	`194`	`if options.anchorsMatchNewlines {`
`193`		`- builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in`
	`195`	`+ builder.buildAssert { [semanticLevel = options.semanticLevel]`
	`196`	`+ (_, _, input, pos, subjectBounds) in`
`194`	`197`	`if pos == subjectBounds.upperBound { return true }`
`195`	`198`	`switch semanticLevel {`
`196`	`199`	`case .graphemeCluster:`
`@@ -200,13 +203,14 @@ fileprivate extension Compiler.ByteCodeGen {`
`200`	`203`	`}`
`201`	`204`	`}`
`202`	`205`	`} else {`
`203`		`- builder.buildAssert { (input, pos, subjectBounds) in`
	`206`	`+ builder.buildAssert { (_, _, input, pos, subjectBounds) in`
`204`	`207`	`pos == subjectBounds.upperBound`
`205`	`208`	`}`
`206`	`209`	`}`
`207`	`210`
`208`	`211`	`case .wordBoundary:`
`209`		`- builder.buildAssert { [options] (input, pos, subjectBounds) in`
	`212`	`+ builder.buildAssert { [options]`
	`213`	`+ (cache, maxIndex, input, pos, subjectBounds) in`
`210`	`214`	`if options.usesSimpleUnicodeBoundaries {`
`211`	`215`	`// TODO: How should we handle bounds?`
`212`	`216`	`return _CharacterClassModel.word.isBoundary(`
`@@ -216,12 +220,13 @@ fileprivate extension Compiler.ByteCodeGen {`
`216`	`220`	`with: options`
`217`	`221`	`)`
`218`	`222`	`} else {`
`219`		`- return input.isOnWordBoundary(at: pos)`
	`223`	`+ return input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)`
`220`	`224`	`}`
`221`	`225`	`}`
`222`	`226`
`223`	`227`	`case .notWordBoundary:`
`224`		`- builder.buildAssert { [options] (input, pos, subjectBounds) in`
	`228`	`+ builder.buildAssert { [options]`
	`229`	`+ (cache, maxIndex, input, pos, subjectBounds) in`
`225`	`230`	`if options.usesSimpleUnicodeBoundaries {`
`226`	`231`	`// TODO: How should we handle bounds?`
`227`	`232`	`return !_CharacterClassModel.word.isBoundary(`
`@@ -231,7 +236,7 @@ fileprivate extension Compiler.ByteCodeGen {`
`231`	`236`	`with: options`
`232`	`237`	`)`
`233`	`238`	`} else {`
`234`		`- return !input.isOnWordBoundary(at: pos)`
	`239`	`+ return !input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)`
`235`	`240`	`}`
`236`	`241`	`}`
`237`	`242`	`}`