Skip to content

Commit e4ea5d3

Browse files
committed
Parse \P{...}
Handle the following kinds of character properties:
- General category, e.g. `Lu`, `C`
- Binary properties, e.g. `Alphabetic`
- Script and script extensions, e.g. `Greek`
- `Any`, `Assigned` & `Ascii`

This covers the properties required by UTS#18 Level 1, and uses the loose matching specified by UAX44-LM3, including handling of aliases defined by Unicode. This includes a good chunk of the properties supported by Oniguruma, with the exception of the `In_...` properties it supports. For now, I've left them as a separate case that gets parsed without loose matching, but we may want to eventually fold them into some of the other cases as aliases. There are also a couple of PCRE special properties such as `Xps` that get parsed this way. Any properties that aren't currently recognized are parsed into an `other` case, as it's possible we may support them in the future.
1 parent 7563958 commit e4ea5d3

File tree

7 files changed

+1063
-849
lines changed

7 files changed

+1063
-849
lines changed

Sources/_MatchingEngine/Regex/AST/ASTBuilder.swift

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,3 +154,8 @@ public func posixSet(
154154
) -> Atom {
155155
return .namedSet(.init(inverted: inverted, set: set))
156156
}
157+
func prop(
158+
_ kind: Atom.CharacterProperty.Kind, inverted: Bool = false
159+
) -> Atom {
160+
return .property(.init(kind, isInverted: inverted))
161+
}

Sources/_MatchingEngine/Regex/AST/Atom.swift

Lines changed: 43 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ public enum Atom: Hashable {
1212
/// A Unicode property, category, or script
1313
///
1414
/// \p{...}, \p{^...}, \P
15-
case property(Prop, inverted: Bool)
15+
case property(CharacterProperty)
1616

1717
/// A built-in escaped character
1818
///
@@ -306,20 +306,51 @@ extension Atom.EscapedBuiltin {
306306
}
307307

308308
extension Atom {
309-
// TODO: Hamish, I believe you have a formulation of this and have
310-
// thought through the parsing a whole lot more. This is just what
311-
// I have at the time, but let's get something better for the AST
312-
// and parser support.
313-
public enum Prop: Hashable {
314-
case gc(Unicode.GeneralCategory)
315-
case pcreSpecial(PCRESpecialCategory)
309+
public struct CharacterProperty: Hashable {
310+
public var kind: Kind
311+
public var isInverted: Bool
312+
313+
public init(_ kind: Kind, isInverted: Bool) {
314+
self.kind = kind
315+
self.isInverted = isInverted
316+
}
317+
}
318+
}
319+
320+
extension Atom.CharacterProperty {
321+
public enum Kind: Hashable {
322+
/// Matches any character, equivalent to Oniguruma's '\O'.
323+
case any
324+
325+
/// The inverse of 'Unicode.ExtendedGeneralCategory.unassigned'.
326+
case assigned
327+
328+
/// All ASCII characters, U+00...U+7F.
329+
case ascii
330+
331+
/// A general category property.
332+
case generalCategory(Unicode.ExtendedGeneralCategory)
333+
334+
/// Binary character properties. Note that only the following are required
335+
/// by UTS#18 Level 1:
336+
/// - Alphabetic
337+
/// - Uppercase
338+
/// - Lowercase
339+
/// - White_Space
340+
/// - Noncharacter_Code_Point
341+
/// - Default_Ignorable_Code_Point
342+
case binary(Unicode.BinaryProperty, value: Bool = true)
343+
344+
/// Character script and script extensions.
316345
case script(Unicode.Script)
346+
case scriptExtension(Unicode.Script)
317347

318-
// TODO: replace me
319-
case propCheck(PropName, PropValue)
348+
/// Some special properties implemented by PCRE and Oniguruma.
349+
case pcreSpecial(PCRESpecialCategory)
350+
case onigurumaSpecial(OnigurumaSpecialProperty)
320351

321-
// TODO: yuk, let's make sure other cases are a superset of this
322-
case oniguruma(FlattendedOnigurumaUnicodeProperty)
352+
/// Unhandled properties.
353+
case other(key: String?, value: String)
323354

324355
// TODO: erm, separate out or fold into something? splat it in?
325356
public enum PCRESpecialCategory: String, Hashable {
@@ -329,9 +360,6 @@ extension Atom {
329360
case universallyNamed = "Xuc"
330361
case perlWord = "Xwd"
331362
}
332-
333-
public enum PropName: Hashable {}
334-
public enum PropValue: Hashable {}
335363
}
336364
}
337365

Sources/_MatchingEngine/Regex/Parse/CharacterPropertyClassification.swift

Lines changed: 422 additions & 0 deletions
Large diffs are not rendered by default.

Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ enum ParseError: Error, Hashable {
137137
case invalidCharacterClassRangeOperand
138138

139139
case invalidPOSIXSetName(String)
140+
case emptyProperty
140141
}
141142

142143

Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -427,6 +427,38 @@ extension Source {
427427
}
428428
}
429429

430+
/// Try to consume a character property.
431+
///
432+
/// Property -> ('p{' | 'P{') Prop ('=' Prop)? '}'
433+
/// Prop -> [\s\w-]+
434+
///
435+
private mutating func lexCharacterProperty(
436+
) throws -> Value<Atom.CharacterProperty>? {
437+
try recordLoc { src in
438+
// '\P{...}' is the inverted version of '\p{...}'
439+
guard src.starts(with: "p{") || src.starts(with: "P{") else { return nil }
440+
let isInverted = src.peek() == "P"
441+
src.eat(count: 2)
442+
443+
// We should either have:
444+
// - '\p{x=y}' where 'x' is a property key, and 'y' is a value.
445+
// - '\p{y}' where 'y' is a value (or a bool key with an inferred value
446+
// of true), and its key is inferred.
447+
// TODO: We could have better recovery here if we only ate the characters
448+
// that property keys and values can use.
449+
let lhs = try src.lexUntil({ $0.peek() == "}" || $0.peek() == "=" }).value
450+
if src.tryEat("}") {
451+
let prop = try Source.classifyCharacterPropertyValueOnly(lhs)
452+
return .init(prop, isInverted: isInverted)
453+
}
454+
src.eat(asserting: "=")
455+
456+
let rhs = try src.lexUntil(eating: "}").value
457+
let prop = try Source.classifyCharacterProperty(key: lhs, value: rhs)
458+
return .init(prop, isInverted: isInverted)
459+
}
460+
}
461+
430462
/// Consume an escaped atom, starting from after the backslash
431463
///
432464
/// Escaped -> KeyboardModified | Builtin
@@ -453,6 +485,11 @@ extension Source {
453485
return .namedCharacter(try src.lexUntil(eating: "}").value)
454486
}
455487

488+
// Character property \p{...} \P{...}.
489+
if let prop = try src.lexCharacterProperty() {
490+
return .property(prop.value)
491+
}
492+
456493
let char = src.eat()
457494

458495
// Single-character builtins

0 commit comments

Comments
 (0)