Use word breaking SPI for \b

Azoy · Azoy · commit 9cb337f2b556 · 2022-06-30T10:29:06.000-07:00
diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
@@ -1,3 +1,17 @@
+//===----------------------------------------------------------------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+//
+//===----------------------------------------------------------------------===//
+
+@_spi(_Unicode)
+import Swift
+
 @_implementationOnly import _RegexParser
 
 extension Compiler {
@@ -192,19 +206,33 @@ fileprivate extension Compiler.ByteCodeGen {
       }
 
     case .wordBoundary:
-      // TODO: May want to consider Unicode level
       builder.buildAssert { [options] (input, pos, subjectBounds) in
-        // TODO: How should we handle bounds?
-        _CharacterClassModel.word.isBoundary(
-          input, at: pos, bounds: subjectBounds, with: options)
+        if options.usesSimpleUnicodeBoundaries {
+          // TODO: How should we handle bounds?
+          return _CharacterClassModel.word.isBoundary(
+            input,
+            at: pos,
+            bounds: subjectBounds,
+            with: options
+          )
+        } else {
+          return input.isOnWordBoundary(at: pos)
+        }
       }
 
     case .notWordBoundary:
-      // TODO: May want to consider Unicode level
       builder.buildAssert { [options] (input, pos, subjectBounds) in
-        // TODO: How should we handle bounds?
-        !_CharacterClassModel.word.isBoundary(
-          input, at: pos, bounds: subjectBounds, with: options)
+        if options.usesSimpleUnicodeBoundaries {
+          // TODO: How should we handle bounds?
+          return !_CharacterClassModel.word.isBoundary(
+            input,
+            at: pos,
+            bounds: subjectBounds,
+            with: options
+          )
+        } else {
+          return !input.isOnWordBoundary(at: pos)
+        }
       }
     }
   }
diff --git a/Sources/_StringProcessing/MatchingOptions.swift b/Sources/_StringProcessing/MatchingOptions.swift
@@ -311,9 +311,12 @@ extension MatchingOptions.Representation {
     [.reluctantByDefault, .possessiveByDefault]
   }
   
+  // Uses level 2 Unicode word boundaries
+  static var unicodeWordBoundaries: Self { .init(.unicodeWordBoundaries) }
+  
   /// The default set of options.
   static var `default`: Self {
-    [.graphemeClusterSemantics, .textSegmentGraphemeMode]
+    [.graphemeClusterSemantics, .textSegmentGraphemeMode, .unicodeWordBoundaries]
   }
 }
 
diff --git a/Sources/_StringProcessing/Unicode/WordBreaking.swift b/Sources/_StringProcessing/Unicode/WordBreaking.swift
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+//
+//===----------------------------------------------------------------------===//
+
+@_spi(_Unicode)
+import Swift
+
+extension String {
+  /// Returns whether `i` falls on a word boundary of this string.
+  ///
+  /// `startIndex` and `endIndex` always count as boundaries. Other positions
+  /// are checked by stepping with the `_wordIndex(after:)` SPI from the start
+  /// of the string, so the cost grows with the distance of `i` from the start.
+  /// Returns `false` for interior positions when the SPI is unavailable.
+  func isOnWordBoundary(at i: String.Index) -> Bool {
+    guard i != startIndex, i != endIndex else { return true }
+
+    guard #available(SwiftStdlib 5.7, *) else { return false }
+
+    // Step break-to-break until reaching or passing `i`; `i` is a boundary
+    // exactly when a step lands on it, so no set of visited indices is kept.
+    var j = startIndex
+    while j < i, j < endIndex {
+      j = _wordIndex(after: j)
+    }
+    return j == i
+  }
+}
diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift
@@ -271,29 +271,44 @@ class RegexDSLTests: XCTestCase {
         .ignoresCase(false)
       }
     
-#if os(macOS)
-    try XCTExpectFailure("Implement level 2 word boundaries") {
-      try _testDSLCaptures(
-        ("can't stop won't stop", ("can't stop won't stop", "can't", "won")),
-        matchType: (Substring, Substring, Substring).self, ==) {
-          Capture {
-            OneOrMore(.word)
-            Anchor.wordBoundary
-          }
-          OneOrMore(.any, .reluctant)
-          "stop"
-          " "
-          
-          Capture {
-            OneOrMore(.word)
-            Anchor.wordBoundary
-          }
-          .wordBoundaryKind(.unicodeLevel1)
-          OneOrMore(.any, .reluctant)
-          "stop"
+    try _testDSLCaptures(
+      ("can't stop won't stop", ("can't stop won't stop", "can't", "won't")),
+      matchType: (Substring, Substring, Substring).self, ==) {
+        Capture {
+          OneOrMore(.word)
+          Anchor.wordBoundary
         }
-    }
-#endif
+        OneOrMore(.any, .reluctant)
+        "stop"
+        " "
+        
+        Capture {
+          OneOrMore(.word)
+          Anchor.wordBoundary
+        }
+        OneOrMore(.any, .reluctant)
+        "stop"
+      }
+    
+    try _testDSLCaptures(
+      ("can't stop won't stop", ("can't stop won't stop", "can", "won")),
+      matchType: (Substring, Substring, Substring).self, ==) {
+        Capture {
+          OneOrMore(.word)
+          Anchor.wordBoundary
+        }
+        OneOrMore(.any, .reluctant)
+        "stop"
+        " "
+        
+        Capture {
+          OneOrMore(.word)
+          Anchor.wordBoundary
+        }
+        .wordBoundaryKind(.unicodeLevel1)
+        OneOrMore(.any, .reluctant)
+        "stop"
+      }
     
     try _testDSLCaptures(
       ("abcdef123", ("abcdef123", "a", "123")),
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
@@ -1069,6 +1069,23 @@ extension RegexTests {
       ("Sol Cafe", nil), xfail: true)
   }
 
+  func testLevel2WordBoundaries() {
+    // MARK: Level 2 Word Boundaries (patterns carry no option flags)
+    firstMatchTest(#"\b😊\b"#, input: "🔥😊👍", match: "😊") // adjacent emoji are expected to be separate words
+    firstMatchTest(#"\b👨🏽\b"#, input: "👩🏻👶🏿👨🏽🧑🏾👩🏼", match: "👨🏽") // likewise for emoji with modifiers
+    firstMatchTest(#"\b🇺🇸\b"#, input: "🇨🇦🇺🇸🇲🇽", match: "🇺🇸") // a flag pair is expected to stay whole
+    firstMatchTest(#"\b.+\b"#, input: "€1 234,56", match: "€1 234,56") // start and end of input are boundaries
+    firstMatchTest(#"〱\B㋞\Bツ"#, input: "〱㋞ツ", match: "〱㋞ツ") // \B: no break expected between these characters
+    firstMatchTest(#"\bhello\b"#, input: "hello〱㋞ツ", match: "hello") // but a break is expected after the Latin word
+    firstMatchTest(#"\bChicago\b"#, input: "나는 Chicago에 산다", match: "Chicago") // Latin word embedded in Hangul text
+    firstMatchTest(#"\blove\b"#, input: "眼睛love食物", match: "love") // Latin word embedded in Han text
+    firstMatchTest(#"\b\u{d}\u{a}\b"#, input: "\u{d}\u{a}", match: "\u{d}\u{a}") // CR LF alone: both ends are boundaries
+    firstMatchTest(#"\bㅋㅋㅋ\b"#, input: "아니ㅋㅋㅋ네", match: "ㅋㅋㅋ") // jamo run between Hangul syllables
+    firstMatchTest(#"Re\B\:\BZero"#, input: "Re:Zero Starting Life in Another World", match: "Re:Zero") // no break around ':' between letters
+    firstMatchTest(#"can\B\'\Bt"#, input: "I can't do that.", match: "can't") // no break around the apostrophe
+    firstMatchTest(#"\b÷\b"#, input: "3 ÷ 3 = 1", match: "÷") // symbol delimited by spaces
+  }
+  
   func testMatchGroups() {
     // MARK: Groups
 
@@ -1475,12 +1492,12 @@ extension RegexTests {
       #"(?W)abcd\b.+"#,
       ("abcd ef", true),
       ("abcdef", false),
-      ("abcdéf", true)) // "dé" matches /d\b./ because "é" isn't ASCII
+      ("abcdéf", false))
     matchTest(
       #"(?P)abcd\b.+"#,
       ("abcd ef", true),
       ("abcdef", false),
-      ("abcdéf", true)) // "dé" matches /d\b./ because "é" isn't ASCII
+      ("abcdéf", false))
 
     // 'S' ASCII-only spaces
     matchTest(

Original file line number	Diff line number	Diff line change
`@@ -311,9 +311,12 @@ extension MatchingOptions.Representation {`
`311`	`311`	`[.reluctantByDefault, .possessiveByDefault]`
`312`	`312`	`}`
`313`	`313`
	`314`	`+ // Uses level 2 Unicode word boundaries`
	`315`	`+ static var unicodeWordBoundaries: Self { .init(.unicodeWordBoundaries) }`
	`316`	`+`
`314`	`317`	`/// The default set of options.`
`315`	`318`	static var `default`: Self {
`316`		`- [.graphemeClusterSemantics, .textSegmentGraphemeMode]`
	`319`	`+ [.graphemeClusterSemantics, .textSegmentGraphemeMode, .unicodeWordBoundaries]`
`317`	`320`	`}`
`318`	`321`	`}`
`319`	`322`