Merge pull request swiftlang#530 from Azoy/words

Azoy · Azoy · commit 8ff944139611 · 2022-07-05T22:22:15.000-07:00
Use word breaking SPI for \b
diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
@@ -1,3 +1,17 @@
+//===----------------------------------------------------------------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+//
+//===----------------------------------------------------------------------===//
+
+@_spi(_Unicode)
+import Swift
+
 @_implementationOnly import _RegexParser
 
 extension Compiler {
@@ -107,12 +121,13 @@ fileprivate extension Compiler.ByteCodeGen {
     // need to supply both a slice bounds and a per-search bounds.
     switch kind {
     case .startOfSubject:
-      builder.buildAssert { (input, pos, subjectBounds) in
+      builder.buildAssert { (_, _, input, pos, subjectBounds) in
         pos == subjectBounds.lowerBound
       }
 
     case .endOfSubjectBeforeNewline:
-      builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in
+      builder.buildAssert { [semanticLevel = options.semanticLevel]
+          (_, _, input, pos, subjectBounds) in
         if pos == subjectBounds.upperBound { return true }
         switch semanticLevel {
         case .graphemeCluster:
@@ -125,7 +140,7 @@ fileprivate extension Compiler.ByteCodeGen {
       }
 
     case .endOfSubject:
-      builder.buildAssert { (input, pos, subjectBounds) in
+      builder.buildAssert { (_, _, input, pos, subjectBounds) in
         pos == subjectBounds.upperBound
       }
 
@@ -138,16 +153,16 @@ fileprivate extension Compiler.ByteCodeGen {
       
       // FIXME: This needs to be based on `searchBounds`,
       // not the `subjectBounds` given as an argument here
-      builder.buildAssert { (input, pos, subjectBounds) in false }
+      builder.buildAssert { (_, _, input, pos, subjectBounds) in false }
 
     case .textSegment:
-      builder.buildAssert { (input, pos, _) in
+      builder.buildAssert { (_, _, input, pos, _) in
         // FIXME: Grapheme or word based on options
         input.isOnGraphemeClusterBoundary(pos)
       }
 
     case .notTextSegment:
-      builder.buildAssert { (input, pos, _) in
+      builder.buildAssert { (_, _, input, pos, _) in
         // FIXME: Grapheme or word based on options
         !input.isOnGraphemeClusterBoundary(pos)
       }
@@ -158,7 +173,8 @@ fileprivate extension Compiler.ByteCodeGen {
       // the DSL-based `.startOfLine` anchor should always match the start
       // of a line. Right now we don't distinguish between those anchors.
       if options.anchorsMatchNewlines {
-        builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in
+        builder.buildAssert { [semanticLevel = options.semanticLevel]
+            (_, _, input, pos, subjectBounds) in
           if pos == subjectBounds.lowerBound { return true }
           switch semanticLevel {
           case .graphemeCluster:
@@ -168,7 +184,7 @@ fileprivate extension Compiler.ByteCodeGen {
           }
         }
       } else {
-        builder.buildAssert { (input, pos, subjectBounds) in
+        builder.buildAssert { (_, _, input, pos, subjectBounds) in
           pos == subjectBounds.lowerBound
         }
       }
@@ -179,7 +195,8 @@ fileprivate extension Compiler.ByteCodeGen {
       // the DSL-based `.endOfLine` anchor should always match the end
       // of a line. Right now we don't distinguish between those anchors.
       if options.anchorsMatchNewlines {
-        builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in
+        builder.buildAssert { [semanticLevel = options.semanticLevel]
+            (_, _, input, pos, subjectBounds) in
           if pos == subjectBounds.upperBound { return true }
           switch semanticLevel {
           case .graphemeCluster:
@@ -189,25 +206,41 @@ fileprivate extension Compiler.ByteCodeGen {
           }
         }
       } else {
-        builder.buildAssert { (input, pos, subjectBounds) in
+        builder.buildAssert { (_, _, input, pos, subjectBounds) in
           pos == subjectBounds.upperBound
         }
       }
 
     case .wordBoundary:
-      // TODO: May want to consider Unicode level
-      builder.buildAssert { [options] (input, pos, subjectBounds) in
-        // TODO: How should we handle bounds?
-        _CharacterClassModel.word.isBoundary(
-          input, at: pos, bounds: subjectBounds, with: options)
+      builder.buildAssert { [options]
+          (cache, maxIndex, input, pos, subjectBounds) in
+        if options.usesSimpleUnicodeBoundaries {
+          // TODO: How should we handle bounds?
+          return _CharacterClassModel.word.isBoundary(
+            input,
+            at: pos,
+            bounds: subjectBounds,
+            with: options
+          )
+        } else {
+          return input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)
+        }
       }
 
     case .notWordBoundary:
-      // TODO: May want to consider Unicode level
-      builder.buildAssert { [options] (input, pos, subjectBounds) in
-        // TODO: How should we handle bounds?
-        !_CharacterClassModel.word.isBoundary(
-          input, at: pos, bounds: subjectBounds, with: options)
+      builder.buildAssert { [options]
+          (cache, maxIndex, input, pos, subjectBounds) in
+        if options.usesSimpleUnicodeBoundaries {
+          // TODO: How should we handle bounds?
+          return !_CharacterClassModel.word.isBoundary(
+            input,
+            at: pos,
+            bounds: subjectBounds,
+            with: options
+          )
+        } else {
+          return !input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)
+        }
       }
     }
   }
diff --git a/Sources/_StringProcessing/Engine/MEProgram.swift b/Sources/_StringProcessing/Engine/MEProgram.swift
@@ -16,7 +16,13 @@ struct MEProgram {
 
   typealias ConsumeFunction = (Input, Range<Input.Index>) -> Input.Index?
   typealias AssertionFunction =
-    (Input, Input.Index, Range<Input.Index>) throws -> Bool
+    (
+      inout Set<String.Index>?,
+      inout String.Index?,
+      Input,
+      Input.Index,
+      Range<Input.Index>
+    ) throws -> Bool
   typealias TransformFunction =
     (Input, Processor._StoredCapture) throws -> Any?
   typealias MatcherFunction =
diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift
@@ -78,6 +78,9 @@ struct Processor {
 
   var storedCaptures: Array<_StoredCapture>
 
+  var wordIndexCache: Set<String.Index>? = nil
+  var wordIndexMaxIndex: String.Index? = nil
+  
   var state: State = .inProgress
 
   var failureReason: Error? = nil
@@ -401,7 +404,13 @@ extension Processor {
       let reg = payload.assertion
       let assertion = registers[reg]
       do {
-        guard try assertion(input, currentPosition, subjectBounds) else {
+        guard try assertion(
+          &wordIndexCache,
+          &wordIndexMaxIndex,
+          input,
+          currentPosition,
+          subjectBounds
+        ) else {
           signalFailure()
           return
         }
diff --git a/Sources/_StringProcessing/MatchingOptions.swift b/Sources/_StringProcessing/MatchingOptions.swift
@@ -311,9 +311,12 @@ extension MatchingOptions.Representation {
     [.reluctantByDefault, .possessiveByDefault]
   }
   
+  /// Uses level 2 Unicode word boundaries.
+  static var unicodeWordBoundaries: Self { .init(.unicodeWordBoundaries) }
+  
   /// The default set of options.
   static var `default`: Self {
-    [.graphemeClusterSemantics, .textSegmentGraphemeMode]
+    [.graphemeClusterSemantics, .textSegmentGraphemeMode, .unicodeWordBoundaries]
   }
 }
 
diff --git a/Sources/_StringProcessing/Unicode/WordBreaking.swift b/Sources/_StringProcessing/Unicode/WordBreaking.swift
@@ -0,0 +1,57 @@
+//===----------------------------------------------------------------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+//
+//===----------------------------------------------------------------------===//
+
+@_spi(_Unicode)
+import Swift
+
+extension String {
+  /// Returns whether `i` lies on a Unicode word boundary of this string.
+  ///
+  /// `cache` accumulates the boundaries found so far and `maxIndex` is the
+  /// position where scanning stopped; both are updated in place so later
+  /// calls resume the scan instead of restarting from `startIndex`.
+  ///
+  /// - Parameter i: The index to test.
+  /// - Returns: `true` at `startIndex`, `endIndex`, and every word boundary.
+  ///   Before SwiftStdlib 5.7 any other index is reported as `false`.
+  func isOnWordBoundary(
+    at i: String.Index,
+    using cache: inout Set<String.Index>?,
+    _ maxIndex: inout String.Index?
+  ) -> Bool {
+    guard i != startIndex, i != endIndex else {
+      return true
+    }
+
+    // An index already in the cache is known to be a boundary.
+    if let cache = cache, cache.contains(i) { return true }
+
+    // Everything below `maxIndex` has been scanned, so an index in that
+    // range that is missing from the cache is not a boundary.
+    if let maxIndex = maxIndex, i < maxIndex { return false }
+
+    guard #available(SwiftStdlib 5.7, *) else { return false }
+
+    // Resume from where the last scan stopped and add the new boundaries to
+    // the existing cache. Replacing the cache would forget boundaries found
+    // earlier, so re-checking one of them (e.g. after backtracking) would
+    // wrongly report `false` because it is below `maxIndex`.
+    if cache == nil { cache = [] }
+    var j = maxIndex ?? startIndex
+    while j < endIndex, j <= i {
+      cache?.insert(j)
+      j = _wordIndex(after: j)
+    }
+    maxIndex = j
+
+    return cache?.contains(i) ?? false
+  }
+}
diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift
@@ -271,29 +271,44 @@ class RegexDSLTests: XCTestCase {
         .ignoresCase(false)
       }
     
-#if os(macOS)
-    try XCTExpectFailure("Implement level 2 word boundaries") {
-      try _testDSLCaptures(
-        ("can't stop won't stop", ("can't stop won't stop", "can't", "won")),
-        matchType: (Substring, Substring, Substring).self, ==) {
-          Capture {
-            OneOrMore(.word)
-            Anchor.wordBoundary
-          }
-          OneOrMore(.any, .reluctant)
-          "stop"
-          " "
-          
-          Capture {
-            OneOrMore(.word)
-            Anchor.wordBoundary
-          }
-          .wordBoundaryKind(.unicodeLevel1)
-          OneOrMore(.any, .reluctant)
-          "stop"
+    try _testDSLCaptures(
+      ("can't stop won't stop", ("can't stop won't stop", "can't", "won't")),
+      matchType: (Substring, Substring, Substring).self, ==) {
+        Capture {
+          OneOrMore(.word)
+          Anchor.wordBoundary
         }
-    }
-#endif
+        OneOrMore(.any, .reluctant)
+        "stop"
+        " "
+        
+        Capture {
+          OneOrMore(.word)
+          Anchor.wordBoundary
+        }
+        OneOrMore(.any, .reluctant)
+        "stop"
+      }
+    
+    try _testDSLCaptures(
+      ("can't stop won't stop", ("can't stop won't stop", "can", "won")),
+      matchType: (Substring, Substring, Substring).self, ==) {
+        Capture {
+          OneOrMore(.word)
+          Anchor.wordBoundary
+        }
+        OneOrMore(.any, .reluctant)
+        "stop"
+        " "
+        
+        Capture {
+          OneOrMore(.word)
+          Anchor.wordBoundary
+        }
+        .wordBoundaryKind(.unicodeLevel1)
+        OneOrMore(.any, .reluctant)
+        "stop"
+      }
     
     try _testDSLCaptures(
       ("abcdef123", ("abcdef123", "a", "123")),
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift

Original file line number	Diff line number	Diff line change
`@@ -311,9 +311,12 @@ extension MatchingOptions.Representation {`
`311`	`311`	`[.reluctantByDefault, .possessiveByDefault]`
`312`	`312`	`}`
`313`	`313`
	`314`	`+ // Uses level 2 Unicode word boundaries`
	`315`	`+ static var unicodeWordBoundaries: Self { .init(.unicodeWordBoundaries) }`
	`316`	`+`
`314`	`317`	`/// The default set of options.`
`315`	`318`	`` static var `default`: Self { ``
`316`		`- [.graphemeClusterSemantics, .textSegmentGraphemeMode]`
	`319`	`+ [.graphemeClusterSemantics, .textSegmentGraphemeMode, .unicodeWordBoundaries]`
`317`	`320`	`}`
`318`	`321`	`}`
`319`	`322`