Skip to content

Commit 09a385b

Browse files
authored
Support Unicode scalar names in \p{name=...} (#382)
1 parent 6d833aa commit 09a385b

File tree

5 files changed

+54
-22
lines changed

5 files changed

+54
-22
lines changed

Sources/_RegexParser/Regex/AST/Atom.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -396,6 +396,9 @@ extension AST.Atom.CharacterProperty {
396396
case script(Unicode.Script)
397397
case scriptExtension(Unicode.Script)
398398

399+
/// Character name in the form `\p{name=...}`
400+
case named(String)
401+
399402
case posix(Unicode.POSIXProperty)
400403

401404
/// Some special properties implemented by PCRE and Oniguruma.

Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,8 @@ extension Source {
428428
if let cat = classifyGeneralCategory(value) {
429429
return .generalCategory(cat)
430430
}
431+
case "name", "na":
432+
return .named(value)
431433
default:
432434
break
433435
}

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,19 @@ extension String {
144144
}
145145
}
146146

147+
func consumeName(_ name: String, opts: MatchingOptions) -> MEProgram<String>.ConsumeFunction {
148+
let consume = opts.semanticLevel == .graphemeCluster
149+
? consumeCharacterWithSingleScalar
150+
: consumeScalar
151+
152+
return consume(propertyScalarPredicate {
153+
// FIXME: name aliases not covered by $0.nameAlias are missed
154+
// e.g. U+FEFF has both 'BYTE ORDER MARK' and 'BOM' as aliases
155+
$0.name?.isEqualByUAX44LM2(to: name) == true
156+
|| $0.nameAlias?.isEqualByUAX44LM2(to: name) == true
157+
})
158+
}
159+
147160
// TODO: This is basically an AST interpreter, which would
148161
// be good or interesting to build regardless, and serves
149162
// as a compiler fall-back path
@@ -206,12 +219,7 @@ extension AST.Atom {
206219
return try p.generateConsumer(opts)
207220

208221
case let .namedCharacter(name):
209-
return consumeScalar(propertyScalarPredicate {
210-
// FIXME: name aliases not covered by $0.nameAlias are missed
211-
// e.g. U+FEFF is also 'FORM FEED', 'BYTE ORDER MARK', and 'BOM'
212-
$0.name?.isEqualByUAX44LM2(to: name) == true
213-
|| $0.nameAlias?.isEqualByUAX44LM2(to: name) == true
214-
})
222+
return consumeName(name, opts: opts)
215223

216224
case .any:
217225
assertionFailure(
@@ -479,6 +487,9 @@ extension AST.Atom.CharacterProperty {
479487

480488
case .scriptExtension(let s):
481489
return consume(scriptExtensionScalarPredicate(s))
490+
491+
case .named(let n):
492+
return consumeName(n, opts: opts)
482493

483494
case .posix(let p):
484495
return p.generateConsumer(opts)

Tests/RegexTests/ParseTests.swift

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1219,6 +1219,13 @@ extension RegexTests {
12191219
parseTest(#"\p{word}"#, prop(.posix(.word)))
12201220
parseTest(#"\p{xdigit}"#, prop(.posix(.xdigit)))
12211221

1222+
parseTest(#"\p{name=A}"#, prop(.named("A")))
1223+
parseTest(#"\p{Name=B}"#, prop(.named("B")))
1224+
parseTest(#"\p{isName=C}"#, prop(.named("C")))
1225+
parseTest(#"\p{na=D}"#, prop(.named("D")))
1226+
parseTest(#"\p{NA=E}"#, prop(.named("E")))
1227+
parseTest(#"\p{na=isI}"#, prop(.named("isI")))
1228+
12221229
// MARK: Conditionals
12231230

12241231
parseTest(#"(?(1))"#, conditional(

Tests/RegexTests/UTS18Tests.swift

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -389,25 +389,34 @@ extension UTS18Tests {
389389
//
390390
// To meet this requirement, an implementation shall support individually
391391
// named characters.
392-
func testNameProperty_XFail() {
393-
XCTExpectFailure("Need \\p{name=...} support") {
394-
XCTFail(#"\(#/\p{name=BOM}/#)"#)
395-
// Name property
396-
// XCTAssertTrue("\u{FEFF}".contains(#/\p{name=ZERO WIDTH NO-BREAK SPACE}/#))
397-
// Name property and Matching Rules
398-
// XCTAssertTrue("\u{FEFF}".contains(#/\p{name=zerowidthno breakspace}/#))
392+
func testNameProperty() throws {
393+
// Name property
394+
XCTAssertTrue("\u{FEFF}".contains(regex(#"\p{name=ZERO WIDTH NO-BREAK SPACE}"#)))
395+
// Name property and Matching Rules
396+
XCTAssertTrue("\u{FEFF}".contains(regex(#"\p{name=zerowidthno breakspace}"#)))
397+
398+
// Computed name
399+
XCTAssertTrue("강".contains(regex(#"\p{name=HANGUL SYLLABLE GANG}"#)))
400+
401+
// Graphic symbol
402+
XCTAssertTrue("\u{1F514}".contains(regex(#"\p{name=BELL}"#)))
403+
404+
// Name match failures
405+
XCTAssertFalse("\u{FEFF}".contains(regex(#"\p{name=ZERO WIDTH NO-BRAKE SPACE}"#)))
406+
XCTAssertFalse("\u{FEFF}".contains(regex(#"\p{name=ZERO WIDTH NO-BREAK SPACE ZZZZ}"#)))
407+
XCTAssertFalse("\u{FEFF}".contains(regex(#"\p{name=ZERO WIDTH NO-BREAK}"#)))
408+
XCTAssertFalse("\u{FEFF}".contains(regex(#"\p{name=z}"#)))
409+
}
410+
411+
func testNameProperty_XFail() throws {
412+
XCTExpectFailure("Need more expansive name alias matching") {
399413
// Name_Alias property
400-
// XCTAssertTrue("\u{FEFF}".contains(#/\p{name=BYTE ORDER MARK}/#))
414+
XCTAssertTrue("\u{FEFF}".contains(regex(#"\p{name=BYTE ORDER MARK}"#)))
401415
// Name_Alias property (again)
402-
// XCTAssertTrue("\u{FEFF}".contains(#/\p{name=BOM}/#))
403-
404-
// Computed name
405-
// XCTAssertTrue("강".contains(#/\p{name=HANGUL SYLLABLE GANG}/#))
406-
416+
XCTAssertTrue("\u{FEFF}".contains(regex(#"\p{name=BOM}"#)))
417+
407418
// Control character
408-
// XCTAssertTrue("\u{7}".contains(#/\p{name=BEL}/#))
409-
// Graphic symbol
410-
// XCTAssertTrue("\u{1F514}".contains(#/\p{name=BELL}/#))
419+
XCTAssertTrue("\u{7}".contains(regex(#"\p{name=BEL}"#)))
411420
}
412421
}
413422

0 commit comments

Comments
 (0)