Skip to content

More unicode properties #385

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
May 16, 2022
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Sources/_RegexParser/Regex/AST/Atom.swift
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,9 @@ extension AST.Atom.CharacterProperty {
/// Character name in the form `\p{name=...}`
case named(String)

/// Character age, as per UnicodeScalar.Properties.age.
case age(major: Int, minor: Int)

case posix(Unicode.POSIXProperty)

/// Some special properties implemented by PCRE and Oniguruma.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,17 @@ extension Source {
typealias PropertyKind = AST.Atom.CharacterProperty.Kind

static private func withNormalizedForms<T>(
_ str: String, match: (String) -> T?
) -> T? {
_ str: String, match: (String) throws -> T?
) rethrows -> T? {
// This follows the rules provided by UAX44-LM3, including trying to drop an
// "is" prefix, which isn't required by UTS#18 RL1.2, but is nice for
// consistency with other engines and the Unicode.Scalar.Properties names.
let str = str.filter { !$0.isWhitespace && $0 != "_" && $0 != "-" }
.lowercased()
if let m = match(str) {
if let m = try match(str) {
return m
}
if str.hasPrefix("is"), let m = match(String(str.dropFirst(2))) {
if str.hasPrefix("is"), let m = try match(String(str.dropFirst(2))) {
return m
}
return nil
Expand Down Expand Up @@ -361,6 +361,27 @@ extension Source {
}
}
}

/// Parses the value of an `age` character property into a Unicode version.
///
/// Two spellings are accepted, and nothing else:
///   - `major.minor`, e.g. `3.0`
///   - `Vmajor_minor`, e.g. `V3_0`
///
/// Each component must consist solely of ASCII decimal digits.
///
/// - Parameter value: The raw property value as written in the pattern.
/// - Returns: The `(major, minor)` version, or `nil` if `value` is not in
///   one of the supported formats.
static func parseAge(_ value: String) -> Unicode.Version? {
  var str = value[...]

  // A leading "V" selects the 'V3_0' spelling, which separates the
  // components with an underscore rather than a dot.
  let separator: Character
  if str.first == "V" {
    str.removeFirst()
    separator = "_"
  } else {
    separator = "."
  }

  guard let sepIndex = str.firstIndex(of: separator) else { return nil }
  let majorText = str[..<sepIndex]
  let minorText = str[sepIndex...].dropFirst()

  // `Int.init(_:)` on its own accepts a leading "+" or "-", which would let
  // values such as "-3.0" or "V+3_0" through as (possibly negative) versions.
  // Require plain digits before converting.
  func isDigitsOnly(_ text: Substring) -> Bool {
    return !text.isEmpty && text.allSatisfy { $0.isASCII && $0.isWholeNumber }
  }

  // A second separator (e.g. "3.0.1") ends up in `minorText` and is rejected
  // by the digit check.
  guard isDigitsOnly(majorText), isDigitsOnly(minorText),
        let major = Int(majorText),
        let minor = Int(minorText)
  else { return nil }

  return (major, minor)
}

static func classifyCharacterPropertyValueOnly(
_ value: String
Expand Down Expand Up @@ -414,20 +435,28 @@ extension Source {

// This uses the aliases defined in
// https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt.
let match = withNormalizedForms(key) { key -> PropertyKind? in
let match = try withNormalizedForms(key) { key -> PropertyKind? in
switch key {
case "script", "sc":
if let script = classifyScriptProperty(value) {
return .script(script)
guard let script = classifyScriptProperty(value) else {
throw ParseError.unrecognizedScript(value)
}
return .script(script)
case "scriptextensions", "scx":
if let script = classifyScriptProperty(value) {
return .scriptExtension(script)
guard let script = classifyScriptProperty(value) else {
throw ParseError.unrecognizedScript(value)
}
return .scriptExtension(script)
case "gc", "generalcategory":
if let cat = classifyGeneralCategory(value) {
return .generalCategory(cat)
guard let cat = classifyGeneralCategory(value) else {
throw ParseError.unrecognizedCategory(value)
}
return .generalCategory(cat)
case "age":
guard let (major, minor) = parseAge(value) else {
throw ParseError.invalidAge(value)
}
return .age(major: major, minor: minor)
case "name", "na":
return .named(value)
default:
Expand Down
9 changes: 9 additions & 0 deletions Sources/_RegexParser/Regex/Parse/Diagnostics.swift
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ enum ParseError: Error, Hashable {

case emptyProperty
case unknownProperty(key: String?, value: String)
case unrecognizedScript(String)
case unrecognizedCategory(String)
case invalidAge(String)

case expectedGroupSpecifier
case unbalancedEndOfGroup
Expand Down Expand Up @@ -167,6 +170,12 @@ extension ParseError: CustomStringConvertible {
return "extended syntax may not be disabled in multi-line mode"
case .expectedCalloutArgument:
return "expected argument to callout"
case .unrecognizedScript(let value):
return "unrecognized script '\(value)'"
case .unrecognizedCategory(let value):
return "unrecognized category '\(value)'"
case .invalidAge(let value):
return "invalid age format for '\(value)'. Use '3.0' or 'V3_0' formats."
}
}
}
Expand Down
1 change: 0 additions & 1 deletion Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,6 @@ extension Compiler.ByteCodeGen {
mutating func emitCharacter(_ c: Character) throws {
// Unicode scalar matches the specific scalars that comprise a character
if options.semanticLevel == .unicodeScalar {
print("emitting '\(c)' as a sequence of \(c.unicodeScalars.count) scalars")
for scalar in c.unicodeScalars {
try emitScalar(scalar)
}
Expand Down
11 changes: 7 additions & 4 deletions Sources/_StringProcessing/ConsumerInterface.swift
Original file line number Diff line number Diff line change
Expand Up @@ -145,10 +145,7 @@ extension String {
}

func consumeName(_ name: String, opts: MatchingOptions) -> MEProgram<String>.ConsumeFunction {
let consume = opts.semanticLevel == .graphemeCluster
? consumeCharacterWithSingleScalar
: consumeScalar

let consume = consumeFunction(for: opts)
return consume(propertyScalarPredicate {
// FIXME: name aliases not covered by $0.nameAlias are missed
// e.g. U+FEFF has both 'BYTE ORDER MARK' and 'BOM' as aliases
Expand Down Expand Up @@ -491,6 +488,12 @@ extension AST.Atom.CharacterProperty {
case .named(let n):
return consumeName(n, opts: opts)

case .age(let major, let minor):
return consume {
guard let age = $0.properties.age else { return false }
return age <= (major, minor)
}

case .posix(let p):
return p.generateConsumer(opts)

Expand Down
6 changes: 6 additions & 0 deletions Tests/RegexTests/ParseTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -2257,6 +2257,12 @@ extension RegexTests {
diagnosticTest(#"\p{a=b"#, .unknownProperty(key: "a", value: "b"))
diagnosticTest(#"\p{aaa[b]}"#, .unknownProperty(key: nil, value: "aaa"))
diagnosticTest(#"\p{a=b=c}"#, .unknownProperty(key: "a", value: "b"))
diagnosticTest(#"\p{script=Not_A_Script}"#, .unrecognizedScript("Not_A_Script"))
diagnosticTest(#"\p{scx=Not_A_Script}"#, .unrecognizedScript("Not_A_Script"))
diagnosticTest(#"\p{gc=Not_A_Category}"#, .unrecognizedCategory("Not_A_Category"))
diagnosticTest(#"\p{age=3}"#, .invalidAge("3"))
diagnosticTest(#"\p{age=V3}"#, .invalidAge("V3"))
diagnosticTest(#"\p{age=3.0.1}"#, .invalidAge("3.0.1"))
diagnosticTest(#"(?#"#, .expected(")"))
diagnosticTest(#"(?x"#, .expected(")"))

Expand Down
100 changes: 93 additions & 7 deletions Tests/RegexTests/UTS18Tests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,9 @@ extension UTS18Tests {
/// Checks that `\u{...}` hex escapes in a pattern match the scalars they
/// name. In every case the expected first match is the entire input.
func testHexNotation() {
// Two consecutive escapes (U+0061, U+0062) against the input "ab".
expectFirstMatch("ab", regex(#"\u{61}\u{62}"#), "ab")
// An escape with a five-digit code point, U+1D11E.
expectFirstMatch("𝄞", regex(#"\u{1D11E}"#), "𝄞")
// Line feed and carriage return, each escaped on its own.
expectFirstMatch("\n", regex(#"\u{0A}"#), "\n")
expectFirstMatch("\r", regex(#"\u{0D}"#), "\r")
// CR followed by LF written as two separate escapes; the input "\r\n" is a
// single `Character` in Swift.
expectFirstMatch("\r\n", regex(#"\u{0D}\u{0A}"#), "\r\n")
}

// 1.1.1 Hex Notation and Normalization
Expand Down Expand Up @@ -140,9 +143,7 @@ extension UTS18Tests {
}

func testProperties_XFail() {
XCTExpectFailure("Need to support 'age' and 'block' properties") {
// XCTAssertFalse("z".contains(#/\p{age=3.1}/#))
XCTFail(#"\(#/\p{age=3.1}/#)"#)
XCTExpectFailure("Need to support 'block' properties") {
// XCTAssertTrue("\u{1F00}".contains(#/\p{Block=Greek}/#))
XCTFail(#"\(#/\p{Block=Greek}/#)"#)
}
Expand All @@ -163,11 +164,16 @@ extension UTS18Tests {
expectFirstMatch(input, regex(#"[[:xdigit:]]+"#), input[pos: ..<6])
expectFirstMatch(input, regex(#"[[:alnum:]]+"#), input[pos: ..<11])
expectFirstMatch(input, regex(#"[[:space:]]+"#), input[pos: 12..<13])
// TODO: blank
// TODO: cntrl
expectFirstMatch(input, regex(#"[[:graph:]]+"#), input[pos: ..<11])
expectFirstMatch(input, regex(#"[[:print:]]+"#), input[...])
expectFirstMatch(input, regex(#"[[:word:]]+"#), input[pos: ..<11])

let blankAndControl = """
\t\u{01}\u{19}
"""
// \t - tab is in both [:blank:] and [:cntrl:]
expectFirstMatch(blankAndControl, regex(#"[[:blank:]]+"#), blankAndControl[pos: ..<2])
expectFirstMatch(blankAndControl, regex(#"[[:cntrl:]]+"#), blankAndControl[pos: 1...])
}

//RL1.3 Subtraction and Intersection
Expand All @@ -188,7 +194,7 @@ extension UTS18Tests {

// Non-ASCII lowercase + non-lowercase ASCII
expectFirstMatch(input, regex(#"[\p{lowercase}~~\p{ascii}]+"#), input[pos: ..<3])
XCTAssertTrue("123%&^ABC".contains(regex(#"^[\p{lowercase}~~\p{ascii}]+$"#)))
XCTAssertTrue("123%&^ABCDéîøü".contains(regex(#"^[\p{lowercase}~~\p{ascii}]+$"#)))
}

func testSubtractionAndIntersectionPrecedence() {
Expand Down Expand Up @@ -360,12 +366,15 @@ extension UTS18Tests {
XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef\X$"#)))
XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef\X$"#).matchingSemantics(.unicodeScalar)))
XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef.+\y"#).matchingSemantics(.unicodeScalar)))
XCTAssertFalse("abcdef🇬🇭".contains(regex(#"abcdef.$"#).matchingSemantics(.unicodeScalar)))
}

/// Checks that a custom character class listing emoji alongside the range
/// `a-z` matches those emoji, both with the regex as constructed and after
/// applying `.matchingSemantics(.unicodeScalar)`.
func testCharacterClassesWithStrings() {
// The class contains one single emoji and three flag emoji.
let regex = regex(#"[a-z🧐🇧🇪🇧🇫🇧🇬]"#)
// As constructed, with no explicit matching-semantics call.
XCTAssertTrue("🧐".contains(regex))
XCTAssertTrue("🇧🇫".contains(regex))
// Same inputs, with unicode-scalar matching semantics requested.
XCTAssertTrue("🧐".contains(regex.matchingSemantics(.unicodeScalar)))
XCTAssertTrue("🇧🇫".contains(regex.matchingSemantics(.unicodeScalar)))
}

// RL2.3 Default Word Boundaries
Expand Down Expand Up @@ -448,7 +457,7 @@ extension UTS18Tests {
// XCTAssertTrue("^\u{3B1}\u{3B2}$".contains(#/[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+/#))
}

XCTExpectFailure("Other named char failures -- investigate") {
XCTExpectFailure("Other named char failures -- name aliases") {
XCTAssertTrue("\u{C}".contains(regex(#"\N{FORM FEED}"#)))
XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{BYTE ORDER MARK}"#)))
XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{BOM}"#)))
Expand Down Expand Up @@ -478,24 +487,80 @@ extension UTS18Tests {
func testFullProperties() {
// MARK: General
// Name (Name_Alias)
XCTAssertTrue("a".contains(regex(#"\p{name=latin small letter a}"#)))

// Block
XCTExpectFailure {
XCTFail(#"Unsupported: \(#/^\p{block=Block Elements}+$/#)"#)
// XCTAssertTrue("▂▃▄▅▆▇".contains(regex(#"^\p{block=Block Elements}+$"#)))
}

// Age
XCTAssertTrue("a".contains(regex(#"\p{age=1.1}"#)))
XCTAssertTrue("a".contains(regex(#"\p{age=V1_1}"#)))
XCTAssertTrue("a".contains(regex(#"\p{age=14.0}"#)))
XCTAssertTrue("a".contains(regex(#"\p{age=V99_99}"#)))

XCTAssertTrue("🥱".contains(regex(#"\p{age=12.0}"#)))
XCTAssertFalse("🥱".contains(regex(#"\p{age=11.0}"#)))

XCTAssertTrue("⌁".contains(regex(#"\p{age=3.0}"#)))
XCTAssertFalse("⌁".contains(regex(#"\p{age=2.0}"#)))
XCTAssertTrue("⌁".contains(regex(#"[\p{age=3.0}--\p{age=2.0}]"#)))

// General_Category
XCTAssertTrue("a".contains(regex(#"\p{Ll}"#)))
XCTAssertTrue("a".contains(regex(#"\p{gc=Ll}"#)))
XCTAssertTrue("a".contains(regex(#"\p{gc=Ll}"#)))
XCTAssertFalse("A".contains(regex(#"\p{gc=Ll}"#)))
XCTAssertTrue("A".contains(regex(#"\p{gc=L}"#)))

XCTAssertTrue("a".contains(regex(#"\p{Any}"#)))
XCTAssertTrue("a".contains(regex(#"\p{Assigned}"#)))
XCTAssertTrue("a".contains(regex(#"\p{ASCII}"#)))

// Script (Script_Extensions)
XCTAssertTrue("a".contains(regex(#"\p{script=latin}"#)))
XCTAssertTrue("강".contains(regex(#"\p{script=hangul}"#)))

// White_Space
XCTAssertTrue(" ".contains(regex(#"\p{whitespace}"#)))
XCTAssertTrue("\n".contains(regex(#"\p{White_Space}"#)))
XCTAssertFalse("a".contains(regex(#"\p{whitespace}"#)))

// Alphabetic
XCTAssertTrue("aéîøüƒ".contains(regex(#"^\p{Alphabetic}+$"#)))

// Hangul_Syllable_Type
XCTExpectFailure {
XCTFail(#"Unsupported: \(#/\p{Hangul_Syllable_Type=L}/#)"#)
// XCTAssertTrue("ㄱ".contains(regex(#"\p{Hangul_Syllable_Type=L}"#)))
}

// Noncharacter_Code_Point
XCTAssertTrue("\u{10FFFF}".contains(regex(#"\p{Noncharacter_Code_Point}"#)))

// Default_Ignorable_Code_Point
XCTAssertTrue("\u{00AD}".contains(regex(#"\p{Default_Ignorable_Code_Point}"#)))

// Deprecated
XCTAssertTrue("ʼn".contains(regex(#"\p{Deprecated}"#)))
// Logical_Order_Exception
XCTAssertTrue("ແ".contains(regex(#"\p{Logical_Order_Exception}"#)))
// Variation_Selector
XCTAssertTrue("\u{FE07}".contains(regex(#"\p{Variation_Selector}"#)))

// MARK: Numeric
// Numeric_Value
// Numeric_Type
// Hex_Digit
XCTAssertTrue("0123456789abcdef0123456789ABCDEF"
.contains(regex(#"^\p{Hex_Digit}+$"#)))
XCTAssertFalse("0123456789abcdefg".contains(regex(#"^\p{Hex_Digit}+$"#)))
// ASCII_Hex_Digit
XCTAssertTrue("0123456789abcdef".contains(regex(#"^\p{ASCII_Hex_Digit}+$"#)))
XCTAssertFalse("0123456789abcdef0123456789ABCDEF"
.contains(regex(#"^\p{ASCII_Hex_Digit}+$"#)))

// MARK: Identifiers
// ID_Continue
Expand Down Expand Up @@ -528,15 +593,36 @@ extension UTS18Tests {
// Simple_Case_Folding
// Soft_Dotted
// Cased
XCTAssertTrue("A".contains(regex(#"\p{Cased}"#)))
XCTAssertTrue("A".contains(regex(#"\p{Is_Cased}"#)))
XCTAssertFalse("0".contains(regex(#"\p{Cased}"#)))

// Case_Ignorable
XCTAssertTrue(":".contains(regex(#"\p{Case_Ignorable}"#)))
XCTAssertFalse("a".contains(regex(#"\p{Case_Ignorable}"#)))

// Changes_When_Lowercased
XCTAssertTrue("A".contains(regex(#"\p{Changes_When_Lowercased}"#)))
XCTAssertTrue("A".contains(regex(#"\p{Changes_When_Lowercased=true}"#)))
XCTAssertFalse("a".contains(regex(#"\p{Changes_When_Lowercased}"#)))

// Changes_When_Uppercased
XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Uppercased}"#)))
XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Uppercased=true}"#)))
XCTAssertFalse("A".contains(regex(#"\p{Changes_When_Uppercased}"#)))

// Changes_When_Titlecased
XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Titlecased=true}"#)))
XCTAssertFalse("A".contains(regex(#"\p{Changes_When_Titlecased}"#)))

// Changes_When_Casefolded
XCTAssertTrue("A".contains(regex(#"\p{Changes_When_Casefolded=true}"#)))
XCTAssertFalse("a".contains(regex(#"\p{Changes_When_Casefolded}"#)))
XCTAssertFalse(":".contains(regex(#"\p{Changes_When_Casefolded}"#)))

// Changes_When_Casemapped
XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Casemapped}"#)))
XCTAssertFalse(":".contains(regex(#"\p{Changes_When_Casemapped}"#)))

// MARK: Normalization
// Canonical_Combining_Class
Expand Down