Skip to content

Commit e417253

Browse files
authored
Merge pull request #58 from hamishknight/lotta-props
2 parents 843b541 + e4ea5d3 commit e417253

File tree

10 files changed

+1114
-862
lines changed

10 files changed

+1114
-862
lines changed

Sources/_MatchingEngine/Regex/AST/ASTBuilder.swift

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,5 +152,10 @@ public func charClass(
152152
public func posixSet(
153153
_ set: Unicode.POSIXCharacterSet, inverted: Bool = false
154154
) -> Atom {
155-
return .named(.init(inverted: inverted, set: set))
155+
return .namedSet(.init(inverted: inverted, set: set))
156+
}
157+
func prop(
158+
_ kind: Atom.CharacterProperty.Kind, inverted: Bool = false
159+
) -> Atom {
160+
return .property(.init(kind, isInverted: inverted))
156161
}

Sources/_MatchingEngine/Regex/AST/Atom.swift

Lines changed: 52 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ public enum Atom: Hashable {
1212
/// A Unicode property, category, or script
1313
///
1414
/// \p{...}, \p{^...}, \P
15-
case property(Prop, inverted: Bool)
15+
case property(CharacterProperty)
1616

1717
/// A built-in escaped character
1818
///
@@ -31,7 +31,10 @@ public enum Atom: Hashable {
3131
/// A named set (using POSIX syntax)
3232
///
3333
/// [:...:], [:^...:]
34-
case named(POSIXSet)
34+
case namedSet(POSIXSet)
35+
36+
/// A named character \N{...}
37+
case namedCharacter(String)
3538

3639
/// .
3740
case any
@@ -303,20 +306,51 @@ extension Atom.EscapedBuiltin {
303306
}
304307

305308
extension Atom {
306-
// TODO: Hamish, I believe you have a formulation of this and have
307-
// thought through the parsing a whole lot more. This is just what
308-
// I have at the time, but let's get something better for the AST
309-
// and parser support.
310-
public enum Prop: Hashable {
311-
case gc(Unicode.GeneralCategory)
312-
case pcreSpecial(PCRESpecialCategory)
309+
public struct CharacterProperty: Hashable {
310+
public var kind: Kind
311+
public var isInverted: Bool
312+
313+
public init(_ kind: Kind, isInverted: Bool) {
314+
self.kind = kind
315+
self.isInverted = isInverted
316+
}
317+
}
318+
}
319+
320+
extension Atom.CharacterProperty {
321+
public enum Kind: Hashable {
322+
/// Matches any character, equivalent to Oniguruma's '\O'.
323+
case any
324+
325+
/// The inverse of 'Unicode.ExtendedGeneralCategory.unassigned'.
326+
case assigned
327+
328+
/// All ascii characters U+00...U+7F
329+
case ascii
330+
331+
/// A general category property.
332+
case generalCategory(Unicode.ExtendedGeneralCategory)
333+
334+
/// Binary character properties. Note that only the following are required
335+
/// by UTS#18 Level 1:
336+
/// - Alphabetic
337+
/// - Uppercase
338+
/// - Lowercase
339+
/// - White_Space
340+
/// - Noncharacter_Code_Point
341+
/// - Default_Ignorable_Code_Point
342+
case binary(Unicode.BinaryProperty, value: Bool = true)
343+
344+
/// Character script and script extensions.
313345
case script(Unicode.Script)
346+
case scriptExtension(Unicode.Script)
314347

315-
// TODO: replace me
316-
case propCheck(PropName, PropValue)
348+
/// Some special properties implemented by PCRE and Oniguruma.
349+
case pcreSpecial(PCRESpecialCategory)
350+
case onigurumaSpecial(OnigurumaSpecialProperty)
317351

318-
// TODO: yuk, let's make sure other cases are a superset of this
319-
case oniguruma(FlattendedOnigurumaUnicodeProperty)
352+
/// Unhandled properties.
353+
case other(key: String?, value: String)
320354

321355
// TODO: erm, separate out or fold into something? splat it in?
322356
public enum PCRESpecialCategory: String, Hashable {
@@ -326,9 +360,6 @@ extension Atom {
326360
case universallyNamed = "Xuc"
327361
case perlWord = "Xwd"
328362
}
329-
330-
public enum PropName: Hashable {}
331-
public enum PropValue: Hashable {}
332363
}
333364
}
334365

@@ -372,8 +403,10 @@ extension Atom: _ASTPrintable {
372403
fatalError("TODO")
373404
case .keyboardMetaControl(_):
374405
fatalError("TODO")
375-
case .named:
406+
case .namedSet:
376407
fatalError("TODO")
408+
case .namedCharacter(let charName):
409+
return "\\N{\(charName)}"
377410
case .any: return "."
378411
case .startOfLine: return "^"
379412
case .endOfLine: return "$"
@@ -407,8 +440,8 @@ extension Atom {
407440
case .keyboardControl, .keyboardMeta, .keyboardMetaControl:
408441
fatalError("TODO")
409442

410-
case .property, .escaped, .named, .any, .startOfLine, .endOfLine,
411-
.backreference, .subpattern, .condition, .trivia:
443+
case .property, .escaped, .namedSet, .any, .startOfLine, .endOfLine,
444+
.backreference, .subpattern, .condition, .trivia, .namedCharacter:
412445
return nil
413446
}
414447
}

Sources/_MatchingEngine/Regex/Parse/CharacterPropertyClassification.swift

Lines changed: 422 additions & 0 deletions
Large diffs are not rendered by default.

Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ enum ParseError: Error, Hashable {
137137
case invalidCharacterClassRangeOperand
138138

139139
case invalidPOSIXSetName(String)
140+
case emptyProperty
140141
}
141142

142143

Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift

Lines changed: 57 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -230,24 +230,32 @@ extension Source {
230230
}
231231

232232
private mutating func lexUntil(
233-
_ end: String, validate: (String) throws -> Void = { _ in }
233+
_ predicate: (inout Source) -> Bool,
234+
validate: (String) throws -> Void = { _ in }
234235
) throws -> Value<String> {
235236
try recordLoc { src in
236237
var result = ""
237-
while !src.tryEat(sequence: end) {
238+
while !predicate(&src) {
238239
// TODO(diagnostic): expected the terminator, instead of end-of-input
239240

240241
result.append(src.eat())
241242
}
243+
try validate(result)
242244
return result
243245
}
244246
}
245247

248+
private mutating func lexUntil(
249+
eating end: String, validate: (String) throws -> Void = { _ in }
250+
) throws -> Value<String> {
251+
try lexUntil({ src in src.tryEat(sequence: end) }, validate: validate)
252+
}
253+
246254
/// Expect a linear run of non-nested non-empty content
247255
private mutating func expectQuoted(
248256
endingWith end: String
249257
) throws -> Value<String> {
250-
try lexUntil(end, validate: { result in
258+
try lexUntil(eating: end, validate: { result in
251259
guard !result.isEmpty else {
252260
throw ParseError.misc("Expected non-empty contents")
253261
}
@@ -305,7 +313,8 @@ extension Source {
305313
///
306314
/// Does nothing unless `SyntaxOptions.nonSemanticWhitespace` is set
307315
mutating func lexNonSemanticWhitespace() throws -> Value<()>? {
308-
try recordLoc { src in
316+
guard syntax.ignoreWhitespace else { return nil }
317+
return try recordLoc { src in
309318
var didSomething = false
310319
while src.tryEat(" ") {
311320
didSomething = true
@@ -410,14 +419,46 @@ extension Source {
410419
try recordLoc { src in
411420
guard src.tryEat(sequence: "[:") else { return nil }
412421
let inverted = src.tryEat("^")
413-
let name = try src.lexUntil(":]").value
422+
let name = try src.lexUntil(eating: ":]").value
414423
guard let set = Unicode.POSIXCharacterSet(rawValue: name) else {
415424
throw ParseError.invalidPOSIXSetName(name)
416425
}
417426
return Atom.POSIXSet(inverted: inverted, set: set)
418427
}
419428
}
420429

430+
/// Try to consume a character property.
431+
///
432+
/// Property -> ('p{' | 'P{') Prop ('=' Prop)? '}'
433+
/// Prop -> [\s\w-]+
434+
///
435+
private mutating func lexCharacterProperty(
436+
) throws -> Value<Atom.CharacterProperty>? {
437+
try recordLoc { src in
438+
// '\P{...}' is the inverted version of '\p{...}'
439+
guard src.starts(with: "p{") || src.starts(with: "P{") else { return nil }
440+
let isInverted = src.peek() == "P"
441+
src.eat(count: 2)
442+
443+
// We should either have:
444+
// - '\p{x=y}' where 'x' is a property key, and 'y' is a value.
445+
// - '\p{y}' where 'y' is a value (or a bool key with an inferred value
446+
// of true), and its key is inferred.
447+
// TODO: We could have better recovery here if we only ate the characters
448+
// that property keys and values can use.
449+
let lhs = try src.lexUntil({ $0.peek() == "}" || $0.peek() == "=" }).value
450+
if src.tryEat("}") {
451+
let prop = try Source.classifyCharacterPropertyValueOnly(lhs)
452+
return .init(prop, isInverted: isInverted)
453+
}
454+
src.eat(asserting: "=")
455+
456+
let rhs = try src.lexUntil(eating: "}").value
457+
let prop = try Source.classifyCharacterProperty(key: lhs, value: rhs)
458+
return .init(prop, isInverted: isInverted)
459+
}
460+
}
461+
421462
/// Consume an escaped atom, starting from after the backslash
422463
///
423464
/// Escaped -> KeyboardModified | Builtin
@@ -439,6 +480,16 @@ extension Source {
439480
return .keyboardMeta(try src.expectASCII().value)
440481
}
441482

483+
// Named character \N{...}
484+
if src.tryEat(sequence: "N{") {
485+
return .namedCharacter(try src.lexUntil(eating: "}").value)
486+
}
487+
488+
// Character property \p{...} \P{...}.
489+
if let prop = try src.lexCharacterProperty() {
490+
return .property(prop.value)
491+
}
492+
442493
let char = src.eat()
443494

444495
// Single-character builtins
@@ -491,7 +542,7 @@ extension Source {
491542
// TODO: Can we try and recover and diagnose for named sets outside
492543
// character classes?
493544
if customCC, let set = try src.lexPOSIXNamedSet()?.value {
494-
return .named(set)
545+
return .namedSet(set)
495546
}
496547

497548
let char = src.eat()

0 commit comments

Comments
 (0)