swiftlang · natecook1000 · May 16, 2022 · May 7, 2022 · May 7, 2022 · May 7, 2022
diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift
@@ -399,6 +399,9 @@ extension AST.Atom.CharacterProperty {
     /// Character name in the form `\p{name=...}`
     case named(String)
 
+    /// Character age, as per UnicodeScalar.Properties.age.
+    case age(major: Int, minor: Int)
+
     case posix(Unicode.POSIXProperty)
 
     /// Some special properties implemented by PCRE and Oniguruma.

diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift
@@ -13,17 +13,17 @@ extension Source {
   typealias PropertyKind = AST.Atom.CharacterProperty.Kind
 
   static private func withNormalizedForms<T>(
-    _ str: String, match: (String) -> T?
-  ) -> T? {
+    _ str: String, match: (String) throws -> T?
+  ) rethrows -> T? {
     // This follows the rules provided by UAX44-LM3, including trying to drop an
     // "is" prefix, which isn't required by UTS#18 RL1.2, but is nice for
     // consistency with other engines and the Unicode.Scalar.Properties names.
     let str = str.filter { !$0.isWhitespace && $0 != "_" && $0 != "-" }
                  .lowercased()
-    if let m = match(str) {
+    if let m = try match(str) {
       return m
     }
-    if str.hasPrefix("is"), let m = match(String(str.dropFirst(2))) {
+    if str.hasPrefix("is"), let m = try match(String(str.dropFirst(2))) {
       return m
     }
     return nil
@@ -361,6 +361,27 @@ extension Source {
       }
     }
   }
+
+  /// Parses the value of an `age` property into a Unicode version.
+  ///
+  /// Two spellings are accepted: `major.minor` (for example `3.0`) and
+  /// `Vmajor_minor` (for example `V3_0`). Each component must consist solely
+  /// of ASCII digits.
+  ///
+  /// - Parameter value: The property value as written in the pattern.
+  /// - Returns: The parsed `(major, minor)` pair, or `nil` if `value` is not in
+  ///   one of the supported formats.
+  static func parseAge(_ value: String) -> Unicode.Version? {
+    // Age can be specified in the form '3.0' or 'V3_0'.
+    // Other formats are not supported.
+    var str = value[...]
+
+    let separator: Character
+    if str.first == "V" {
+      str.removeFirst()
+      separator = "_"
+    } else {
+      separator = "."
+    }
+
+    // `Int.init(_:)` on its own accepts a leading "+" or "-", which would let
+    // values such as "-3.0" or "V+3_0" through as ages. Require plain digits
+    // so that those spellings are reported as invalid instead.
+    func component(_ text: Substring) -> Int? {
+      guard !text.isEmpty,
+            text.allSatisfy({ $0.isASCII && $0.isWholeNumber })
+      else { return nil }
+      // Still `nil` if the digits overflow `Int`.
+      return Int(text)
+    }
+
+    guard let sepIndex = str.firstIndex(of: separator),
+          let major = component(str[..<sepIndex]),
+          let minor = component(str[sepIndex...].dropFirst())
+    else { return nil }
+
+    return (major, minor)
+  }
 
   static func classifyCharacterPropertyValueOnly(
     _ value: String
@@ -414,20 +435,28 @@ extension Source {
 
     // This uses the aliases defined in
     // https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt.
-    let match = withNormalizedForms(key) { key -> PropertyKind? in
+    let match = try withNormalizedForms(key) { key -> PropertyKind? in
       switch key {
       case "script", "sc":
-        if let script = classifyScriptProperty(value) {
-          return .script(script)
+        guard let script = classifyScriptProperty(value) else {
+          throw ParseError.unrecognizedScript(value)
         }
+        return .script(script)
       case "scriptextensions", "scx":
-        if let script = classifyScriptProperty(value) {
-          return .scriptExtension(script)
+        guard let script = classifyScriptProperty(value) else {
+          throw ParseError.unrecognizedScript(value)
         }
+        return .scriptExtension(script)
       case "gc", "generalcategory":
-        if let cat = classifyGeneralCategory(value) {
-          return .generalCategory(cat)
+        guard let cat = classifyGeneralCategory(value) else {
+          throw ParseError.unrecognizedCategory(value)
+        }
+        return .generalCategory(cat)
+      case "age":
+        guard let (major, minor) = parseAge(value) else {
+          throw ParseError.invalidAge(value)
         }
+        return .age(major: major, minor: minor)
       case "name", "na":
         return .named(value)
       default:

diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift
@@ -59,6 +59,9 @@ enum ParseError: Error, Hashable {
 
   case emptyProperty
   case unknownProperty(key: String?, value: String)
+  case unrecognizedScript(String)
+  case unrecognizedCategory(String)
+  case invalidAge(String)
 
   case expectedGroupSpecifier
   case unbalancedEndOfGroup
@@ -167,6 +170,12 @@ extension ParseError: CustomStringConvertible {
       return "extended syntax may not be disabled in multi-line mode"
     case .expectedCalloutArgument:
       return "expected argument to callout"
+    case .unrecognizedScript(let value):
+      return "unrecognized script '\(value)'"
+    case .unrecognizedCategory(let value):
+      return "unrecognized category '\(value)'"
+    case .invalidAge(let value):
+      return "invalid age format for '\(value)'. Use '3.0' or 'V3_0' formats."
     }
   }
 }

diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
@@ -170,7 +170,6 @@ extension Compiler.ByteCodeGen {
   mutating func emitCharacter(_ c: Character) throws {
     // Unicode scalar matches the specific scalars that comprise a character
     if options.semanticLevel == .unicodeScalar {
-      print("emitting '\(c)' as a sequence of \(c.unicodeScalars.count) scalars")
       for scalar in c.unicodeScalars {
         try emitScalar(scalar)
       }

diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -145,10 +145,7 @@ extension String {
 }
 
 func consumeName(_ name: String, opts: MatchingOptions) -> MEProgram<String>.ConsumeFunction {
-  let consume = opts.semanticLevel == .graphemeCluster
-    ? consumeCharacterWithSingleScalar
-    : consumeScalar
-
+  let consume = consumeFunction(for: opts)
   return consume(propertyScalarPredicate {
     // FIXME: name aliases not covered by $0.nameAlias are missed
     // e.g. U+FEFF has both 'BYTE ORDER MARK' and 'BOM' as aliases
@@ -491,6 +488,12 @@ extension AST.Atom.CharacterProperty {
       case .named(let n):
         return consumeName(n, opts: opts)
 
+      case .age(let major, let minor):
+        return consume {
+          guard let age = $0.properties.age else { return false }
+          return age <= (major, minor)
+        }
+
       case .posix(let p):
         return p.generateConsumer(opts)
 

diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
@@ -2257,6 +2257,12 @@ extension RegexTests {
     diagnosticTest(#"\p{a=b"#, .unknownProperty(key: "a", value: "b"))
     diagnosticTest(#"\p{aaa[b]}"#, .unknownProperty(key: nil, value: "aaa"))
     diagnosticTest(#"\p{a=b=c}"#, .unknownProperty(key: "a", value: "b"))
+    diagnosticTest(#"\p{script=Not_A_Script}"#, .unrecognizedScript("Not_A_Script"))
+    diagnosticTest(#"\p{scx=Not_A_Script}"#, .unrecognizedScript("Not_A_Script"))
+    diagnosticTest(#"\p{gc=Not_A_Category}"#, .unrecognizedCategory("Not_A_Category"))
+    diagnosticTest(#"\p{age=3}"#, .invalidAge("3"))
+    diagnosticTest(#"\p{age=V3}"#, .invalidAge("V3"))
+    diagnosticTest(#"\p{age=3.0.1}"#, .invalidAge("3.0.1"))
     diagnosticTest(#"(?#"#, .expected(")"))
     diagnosticTest(#"(?x"#, .expected(")"))
 

diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift
@@ -70,6 +70,9 @@ extension UTS18Tests {
   func testHexNotation() {
     expectFirstMatch("ab", regex(#"\u{61}\u{62}"#), "ab")
     expectFirstMatch("𝄞", regex(#"\u{1D11E}"#), "𝄞")
+    // Line feed and carriage return written in hex notation, first one at a
+    // time and then as the two-scalar sequence "\r\n".
+    expectFirstMatch("\n", regex(#"\u{0A}"#), "\n")
+    expectFirstMatch("\r", regex(#"\u{0D}"#), "\r")
+    expectFirstMatch("\r\n", regex(#"\u{0D}\u{0A}"#), "\r\n")
   }
 
   // 1.1.1 Hex Notation and Normalization
@@ -140,9 +143,7 @@ extension UTS18Tests {
   }
 
   func testProperties_XFail() {
-    XCTExpectFailure("Need to support 'age' and 'block' properties") {
-      // XCTAssertFalse("z".contains(#/\p{age=3.1}/#))
-      XCTFail(#"\(#/\p{age=3.1}/#)"#)
+    XCTExpectFailure("Need to support 'block' properties") {
       // XCTAssertTrue("\u{1F00}".contains(#/\p{Block=Greek}/#))
       XCTFail(#"\(#/\p{Block=Greek}/#)"#)
     }
@@ -163,11 +164,16 @@ extension UTS18Tests {
     expectFirstMatch(input, regex(#"[[:xdigit:]]+"#), input[pos: ..<6])
     expectFirstMatch(input, regex(#"[[:alnum:]]+"#), input[pos: ..<11])
     expectFirstMatch(input, regex(#"[[:space:]]+"#), input[pos: 12..<13])
-    // TODO: blank
-    // TODO: cntrl
     expectFirstMatch(input, regex(#"[[:graph:]]+"#), input[pos: ..<11])
     expectFirstMatch(input, regex(#"[[:print:]]+"#), input[...])
     expectFirstMatch(input, regex(#"[[:word:]]+"#), input[pos: ..<11])
+
+    let blankAndControl = """
+     \t\u{01}\u{19}
+    """
+    // \t - tab is in both [:blank:] and [:cntrl:]
+    expectFirstMatch(blankAndControl, regex(#"[[:blank:]]+"#), blankAndControl[pos: ..<2])
+    expectFirstMatch(blankAndControl, regex(#"[[:cntrl:]]+"#), blankAndControl[pos: 1...])
   }
 
   //RL1.3 Subtraction and Intersection
@@ -188,7 +194,7 @@ extension UTS18Tests {
 
     // Non-ASCII lowercase + non-lowercase ASCII
     expectFirstMatch(input, regex(#"[\p{lowercase}~~\p{ascii}]+"#), input[pos: ..<3])
-    XCTAssertTrue("123%&^ABC".contains(regex(#"^[\p{lowercase}~~\p{ascii}]+$"#)))
+    XCTAssertTrue("123%&^ABCDéîøü".contains(regex(#"^[\p{lowercase}~~\p{ascii}]+$"#)))
   }
 
   func testSubtractionAndIntersectionPrecedence() {
@@ -360,12 +366,15 @@ extension UTS18Tests {
     XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef\X$"#)))
     XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef\X$"#).matchingSemantics(.unicodeScalar)))
     XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef.+\y"#).matchingSemantics(.unicodeScalar)))
+    XCTAssertFalse("abcdef🇬🇭".contains(regex(#"abcdef.$"#).matchingSemantics(.unicodeScalar)))
   }
 
   func testCharacterClassesWithStrings() {
     let regex = regex(#"[a-z🧐🇧🇪🇧🇫🇧🇬]"#)
     XCTAssertTrue("🧐".contains(regex))
     XCTAssertTrue("🇧🇫".contains(regex))
+    // Repeat both containment checks with the same regex switched to
+    // Unicode-scalar matching semantics.
+    XCTAssertTrue("🧐".contains(regex.matchingSemantics(.unicodeScalar)))
+    XCTAssertTrue("🇧🇫".contains(regex.matchingSemantics(.unicodeScalar)))
   }
 
   // RL2.3 Default Word Boundaries
@@ -448,7 +457,7 @@ extension UTS18Tests {
       // XCTAssertTrue("^\u{3B1}\u{3B2}$".contains(#/[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+/#))
     }
 
-    XCTExpectFailure("Other named char failures -- investigate") {
+    XCTExpectFailure("Other named char failures -- name aliases") {
       XCTAssertTrue("\u{C}".contains(regex(#"\N{FORM FEED}"#)))
       XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{BYTE ORDER MARK}"#)))
       XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{BOM}"#)))
@@ -478,24 +487,80 @@ extension UTS18Tests {
   func testFullProperties() {
     // MARK: General
     // Name (Name_Alias)
+    XCTAssertTrue("a".contains(regex(#"\p{name=latin small letter a}"#)))
+
     // Block
+    XCTExpectFailure {
+      XCTFail(#"Unsupported: \(#/^\p{block=Block Elements}+$/#)"#)
+      // XCTAssertTrue("▂▃▄▅▆▇".contains(regex(#"^\p{block=Block Elements}+$"#)))
+    }
+
     // Age
+    XCTAssertTrue("a".contains(regex(#"\p{age=1.1}"#)))
+    XCTAssertTrue("a".contains(regex(#"\p{age=V1_1}"#)))
+    XCTAssertTrue("a".contains(regex(#"\p{age=14.0}"#)))
+    XCTAssertTrue("a".contains(regex(#"\p{age=V99_99}"#)))
+
+    XCTAssertTrue("🥱".contains(regex(#"\p{age=12.0}"#)))
+    XCTAssertFalse("🥱".contains(regex(#"\p{age=11.0}"#)))
+
+    XCTAssertTrue("⌁".contains(regex(#"\p{age=3.0}"#)))
+    XCTAssertFalse("⌁".contains(regex(#"\p{age=2.0}"#)))
+    XCTAssertTrue("⌁".contains(regex(#"[\p{age=3.0}--\p{age=2.0}]"#)))
+
     // General_Category
+    XCTAssertTrue("a".contains(regex(#"\p{Ll}"#)))
+    XCTAssertTrue("a".contains(regex(#"\p{gc=Ll}"#)))
+    XCTAssertTrue("a".contains(regex(#"\p{gc=Ll}"#)))
+    XCTAssertFalse("A".contains(regex(#"\p{gc=Ll}"#)))
+    XCTAssertTrue("A".contains(regex(#"\p{gc=L}"#)))
+
+    XCTAssertTrue("a".contains(regex(#"\p{Any}"#)))
+    XCTAssertTrue("a".contains(regex(#"\p{Assigned}"#)))
+    XCTAssertTrue("a".contains(regex(#"\p{ASCII}"#)))
+
     // Script (Script_Extensions)
+    XCTAssertTrue("a".contains(regex(#"\p{script=latin}"#)))
+    XCTAssertTrue("강".contains(regex(#"\p{script=hangul}"#)))
+
     // White_Space
+    XCTAssertTrue(" ".contains(regex(#"\p{whitespace}"#)))
+    XCTAssertTrue("\n".contains(regex(#"\p{White_Space}"#)))
+    XCTAssertFalse("a".contains(regex(#"\p{whitespace}"#)))
+
     // Alphabetic
+    XCTAssertTrue("aéîøüƒ".contains(regex(#"^\p{Alphabetic}+$"#)))
+
     // Hangul_Syllable_Type
+    XCTExpectFailure {
+      XCTFail(#"Unsupported: \(#/\p{Hangul_Syllable_Type=L}/#)"#)
+      // XCTAssertTrue("ㄱ".contains(regex(#"\p{Hangul_Syllable_Type=L}"#)))
+    }
+
     // Noncharacter_Code_Point
+    XCTAssertTrue("\u{10FFFF}".contains(regex(#"\p{Noncharacter_Code_Point}"#)))
+
     // Default_Ignorable_Code_Point
+    XCTAssertTrue("\u{00AD}".contains(regex(#"\p{Default_Ignorable_Code_Point}"#)))
+
     // Deprecated
+    XCTAssertTrue("ŉ".contains(regex(#"\p{Deprecated}"#)))
     // Logical_Order_Exception
+    XCTAssertTrue("ແ".contains(regex(#"\p{Logical_Order_Exception}"#)))
     // Variation_Selector
+    XCTAssertTrue("\u{FE07}".contains(regex(#"\p{Variation_Selector}"#)))
 
     // MARK: Numeric
     // Numeric_Value
     // Numeric_Type
     // Hex_Digit
+    XCTAssertTrue("0123456789abcdef０１２３４５６７８９ＡＢＣＤＥＦ"
+      .contains(regex(#"^\p{Hex_Digit}+$"#)))
+    XCTAssertFalse("0123456789abcdefg".contains(regex(#"^\p{Hex_Digit}+$"#)))
     // ASCII_Hex_Digit
+    XCTAssertTrue("0123456789abcdef".contains(regex(#"^\p{ASCII_Hex_Digit}+$"#)))
+    XCTAssertFalse("0123456789abcdef０１２３４５６７８９ＡＢＣＤＥＦ"
+      .contains(regex(#"^\p{ASCII_Hex_Digit}+$"#)))
 
     // MARK: Identifiers
     // ID_Continue
@@ -528,15 +593,36 @@ extension UTS18Tests {
     // Simple_Case_Folding
     // Soft_Dotted
     // Cased
+    XCTAssertTrue("A".contains(regex(#"\p{Cased}"#)))
+    XCTAssertTrue("A".contains(regex(#"\p{Is_Cased}"#)))
+    XCTAssertFalse("0".contains(regex(#"\p{Cased}"#)))
+
     // Case_Ignorable
+    XCTAssertTrue(":".contains(regex(#"\p{Case_Ignorable}"#)))
+    XCTAssertFalse("a".contains(regex(#"\p{Case_Ignorable}"#)))
+
     // Changes_When_Lowercased
+    XCTAssertTrue("A".contains(regex(#"\p{Changes_When_Lowercased}"#)))
+    XCTAssertTrue("A".contains(regex(#"\p{Changes_When_Lowercased=true}"#)))
+    XCTAssertFalse("a".contains(regex(#"\p{Changes_When_Lowercased}"#)))
+
     // Changes_When_Uppercased
     XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Uppercased}"#)))
     XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Uppercased=true}"#)))
     XCTAssertFalse("A".contains(regex(#"\p{Changes_When_Uppercased}"#)))
+
     // Changes_When_Titlecased
+    XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Titlecased=true}"#)))
+    XCTAssertFalse("A".contains(regex(#"\p{Changes_When_Titlecased}"#)))
+
     // Changes_When_Casefolded
+    XCTAssertTrue("A".contains(regex(#"\p{Changes_When_Casefolded=true}"#)))
+    XCTAssertFalse("a".contains(regex(#"\p{Changes_When_Casefolded}"#)))
+    XCTAssertFalse(":".contains(regex(#"\p{Changes_When_Casefolded}"#)))
+
     // Changes_When_Casemapped
+    XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Casemapped}"#)))
+    XCTAssertFalse(":".contains(regex(#"\p{Changes_When_Casemapped}"#)))
 
     // MARK: Normalization
     // Canonical_Combining_Class