Skip to content

Commit f436cca

Browse files
committed
Introduce scalar sequences \u{AA BB CC}
Allow a whitespace-separated list of scalars within the `\u{...}` syntax. This is syntactic sugar that gets implicitly splatted out, for example `\u{A B C}` becomes `\u{A}\u{B}\u{C}`.
1 parent 7752015 commit f436cca

File tree

10 files changed

+242
-31
lines changed

10 files changed

+242
-31
lines changed

Sources/_RegexParser/Regex/AST/Atom.swift

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,12 @@ extension AST {
3131
/// \u{...}, \0dd, \x{...}, ...
3232
case scalar(Scalar)
3333

34+
/// A whitespace-separated sequence of Unicode scalar values which are
35+
/// implicitly splatted out.
36+
///
37+
/// `\u{A B C}` -> `\u{A}\u{B}\u{C}`
38+
case scalarSequence(ScalarSequence)
39+
3440
/// A Unicode property, category, or script, including those written using
3541
/// POSIX syntax.
3642
///
@@ -84,6 +90,7 @@ extension AST.Atom {
8490
switch kind {
8591
case .char(let v): return v
8692
case .scalar(let v): return v
93+
case .scalarSequence(let v): return v
8794
case .property(let v): return v
8895
case .escaped(let v): return v
8996
case .keyboardControl(let v): return v
@@ -116,6 +123,18 @@ extension AST.Atom {
116123
self.location = location
117124
}
118125
}
126+
127+
public struct ScalarSequence: Hashable {
128+
public var scalars: [Scalar]
129+
public var trivia: [AST.Trivia]
130+
131+
public init(_ scalars: [Scalar], trivia: [AST.Trivia]) {
132+
precondition(scalars.count > 1, "Expected multiple scalars")
133+
self.scalars = scalars
134+
self.trivia = trivia
135+
}
136+
public var scalarValues: [Unicode.Scalar] { scalars.map(\.value) }
137+
}
119138
}
120139

121140
extension AST.Atom {
@@ -725,8 +744,9 @@ extension AST.Atom {
725744
// the AST? Or defer for the matching engine?
726745
return nil
727746

728-
case .property, .any, .startOfLine, .endOfLine, .backreference, .subpattern,
729-
.callout, .backtrackingDirective, .changeMatchingOptions:
747+
case .scalarSequence, .property, .any, .startOfLine, .endOfLine,
748+
.backreference, .subpattern, .callout, .backtrackingDirective,
749+
.changeMatchingOptions:
730750
return nil
731751
}
732752
}
@@ -748,13 +768,21 @@ extension AST.Atom {
748768
/// A string literal representation of the atom, if possible.
749769
///
750770
/// Individual characters are returned as-is, and Unicode scalars are
751-
/// presented using "\u{nnnn}" syntax.
771+
/// presented using "\u{nn nn ...}" syntax.
752772
public var literalStringValue: String? {
773+
func scalarLiteral(_ u: [UnicodeScalar]) -> String {
774+
let digits = u.map { String($0.value, radix: 16, uppercase: true) }
775+
.joined(separator: " ")
776+
return "\\u{\(digits)}"
777+
}
753778
switch kind {
754779
case .char(let c):
755780
return String(c)
756781
case .scalar(let s):
757-
return "\\u{\(String(s.value.value, radix: 16, uppercase: true))}"
782+
return scalarLiteral([s.value])
783+
784+
case .scalarSequence(let s):
785+
return scalarLiteral(s.scalarValues)
758786

759787
case .keyboardControl(let x):
760788
return "\\C-\(x)"

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -290,10 +290,54 @@ extension Source {
290290
return try Source.validateUnicodeScalar(str, .hex)
291291
}
292292

293+
/// Try to lex a sequence of hex digit unicode scalars.
294+
///
295+
/// UniScalarSequence -> Whitespace? UniScalarSequenceElt+
296+
/// UniScalarSequenceElt -> HexDigit{1...} Whitespace?
297+
///
298+
mutating func expectUnicodeScalarSequence(
299+
eating ending: Character
300+
) throws -> AST.Atom.Kind {
301+
try recordLoc { src in
302+
var scalars = [AST.Atom.Scalar]()
303+
var trivia = [AST.Trivia]()
304+
305+
// Eat up any leading whitespace.
306+
if let t = src.lexWhitespace() { trivia.append(t) }
307+
308+
while true {
309+
let str = src.lexUntil { src in
310+
// Hit the ending, stop lexing.
311+
if src.isEmpty || src.peek() == ending {
312+
return true
313+
}
314+
// Eat up trailing whitespace, and stop lexing to record the scalar.
315+
if let t = src.lexWhitespace() {
316+
trivia.append(t)
317+
return true
318+
}
319+
// Not the ending or trivia, must be a digit of the scalar.
320+
return false
321+
}
322+
guard !str.value.isEmpty else { break }
323+
scalars.append(try Source.validateUnicodeScalar(str, .hex))
324+
}
325+
guard !scalars.isEmpty else {
326+
throw ParseError.expectedNumber("", kind: .hex)
327+
}
328+
try src.expect(ending)
329+
330+
if scalars.count == 1 {
331+
return .scalar(scalars[0])
332+
}
333+
return .scalarSequence(.init(scalars, trivia: trivia))
334+
}.value
335+
}
336+
293337
/// Eat a scalar off the front, starting from after the
294338
/// backslash and base character (e.g. `\u` or `\x`).
295339
///
296-
/// UniScalar -> 'u{' HexDigit{1...} '}'
340+
/// UniScalar -> 'u{' UniScalarSequence '}'
297341
/// | 'u' HexDigit{4}
298342
/// | 'x{' HexDigit{1...} '}'
299343
/// | 'x' HexDigit{0...2}
@@ -314,7 +358,10 @@ extension Source {
314358
// TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
315359
switch base {
316360
// Hex numbers.
317-
case "u" where src.tryEat("{"), "x" where src.tryEat("{"):
361+
case "u" where src.tryEat("{"):
362+
return try src.expectUnicodeScalarSequence(eating: "}")
363+
364+
case "x" where src.tryEat("{"):
318365
let str = try src.lexUntil(eating: "}")
319366
return .scalar(try Source.validateUnicodeScalar(str, .hex))
320367

@@ -598,6 +645,16 @@ extension Source {
598645
// inside a custom character class (and only treats whitespace as
599646
// non-semantic there for the extra-extended `(?xx)` mode). If we get a
600647
// strict-PCRE mode, we'll need to add a case for that.
648+
return lexWhitespace()
649+
}
650+
651+
/// Try to consume whitespace as trivia
652+
///
653+
/// Whitespace -> WhitespaceChar+
654+
///
655+
/// Unlike `lexNonSemanticWhitespace`, this will always attempt to lex
656+
/// whitespace.
657+
mutating func lexWhitespace() -> AST.Trivia? {
601658
let trivia: Located<String>? = recordLoc { src in
602659
src.tryEatPrefix(\.isPatternWhitespace)?.string
603660
}

Sources/_RegexParser/Regex/Parse/Sema.swift

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ extension RegexValidator {
210210
}
211211
}
212212

213-
func validateAtom(_ atom: AST.Atom) throws {
213+
func validateAtom(_ atom: AST.Atom, inCustomCharacterClass: Bool) throws {
214214
switch atom.kind {
215215
case .escaped(let esc):
216216
try validateEscaped(esc, at: atom.location)
@@ -243,6 +243,13 @@ extension RegexValidator {
243243
// TODO: We should error on unknown Unicode scalar names.
244244
break
245245

246+
case .scalarSequence:
247+
// Not currently supported in a custom character class.
248+
if inCustomCharacterClass {
249+
throw error(.unsupported("scalar sequence in custom character class"),
250+
at: atom.location)
251+
}
252+
246253
case .char, .scalar, .startOfLine, .endOfLine, .any:
247254
break
248255
}
@@ -260,8 +267,8 @@ extension RegexValidator {
260267
let lhs = range.lhs
261268
let rhs = range.rhs
262269

263-
try validateAtom(lhs)
264-
try validateAtom(rhs)
270+
try validateAtom(lhs, inCustomCharacterClass: true)
271+
try validateAtom(rhs, inCustomCharacterClass: true)
265272

266273
guard lhs.isValidCharacterClassRangeBound else {
267274
throw error(.invalidCharacterClassRangeOperand, at: lhs.location)
@@ -297,7 +304,7 @@ extension RegexValidator {
297304
try validateCharacterClassRange(r)
298305

299306
case .atom(let a):
300-
try validateAtom(a)
307+
try validateAtom(a, inCustomCharacterClass: true)
301308

302309
case .setOperation(let lhs, _, let rhs):
303310
for lh in lhs { try validateCharacterClassMember(lh) }
@@ -379,7 +386,7 @@ extension RegexValidator {
379386
try validateQuantification(q)
380387

381388
case .atom(let a):
382-
try validateAtom(a)
389+
try validateAtom(a, inCustomCharacterClass: false)
383390

384391
case .customCharacterClass(let c):
385392
try validateCustomCharacterClass(c)

Sources/_RegexParser/Regex/Printing/DumpAST.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,9 @@ extension AST.Atom {
138138
switch kind {
139139
case .escaped(let c): return "\\\(c.character)"
140140

141+
case .scalarSequence(let s):
142+
return s.scalars.map(\.value.halfWidthCornerQuoted).joined()
143+
141144
case .namedCharacter(let charName):
142145
return "\\N{\(charName)}"
143146

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -230,9 +230,9 @@ extension AST.Atom {
230230
// handled in emitAssertion
231231
return nil
232232

233-
case .escaped, .keyboardControl, .keyboardMeta, .keyboardMetaControl,
234-
.backreference, .subpattern, .callout, .backtrackingDirective,
235-
.changeMatchingOptions:
233+
case .scalarSequence, .escaped, .keyboardControl, .keyboardMeta,
234+
.keyboardMetaControl, .backreference, .subpattern, .callout,
235+
.backtrackingDirective, .changeMatchingOptions:
236236
// FIXME: implement
237237
return nil
238238
}

Sources/_StringProcessing/PrintAsPattern.swift

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -671,13 +671,19 @@ extension AST.Atom {
671671
}
672672

673673
var _dslBase: String {
674+
func scalarLiteral(_ s: UnicodeScalar) -> String {
675+
let hex = String(s.value, radix: 16, uppercase: true)
676+
return "\\u{\(hex)}"
677+
}
674678
switch kind {
675679
case let .char(c):
676680
return String(c)
677681

678682
case let .scalar(s):
679-
let hex = String(s.value.value, radix: 16, uppercase: true)
680-
return "\\u{\(hex)}"
683+
return scalarLiteral(s.value)
684+
685+
case let .scalarSequence(seq):
686+
return seq.scalarValues.map(scalarLiteral).joined()
681687

682688
case let .property(p):
683689
return p._dslBase
@@ -769,13 +775,9 @@ extension AST.Atom {
769775

770776
var _regexBase: String {
771777
switch kind {
772-
case let .char(c):
773-
return String(c)
774-
775-
case let .scalar(s):
776-
let hex = String(s.value.value, radix: 16, uppercase: true)
777-
return "\\u{\(hex)}"
778-
778+
case .char, .scalar, .scalarSequence:
779+
return literalStringValue!
780+
779781
case let .property(p):
780782
return p._regexBase
781783

Sources/_StringProcessing/Regex/ASTConversion.swift

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,15 +60,17 @@ extension AST.Node {
6060
var result = ""
6161
var idx = idx
6262
while idx < astChildren.endIndex {
63-
let atom: AST.Atom? = astChildren[idx].as()
63+
guard let atom: AST.Atom = astChildren[idx].as() else { break }
6464

6565
// TODO: For printing, nice to coalesce
6666
// scalar literals too. We likely need a different
6767
// approach even before we have a better IR.
68-
if let char = atom?.singleCharacter {
68+
if let char = atom.singleCharacter {
6969
result.append(char)
70-
} else if let scalar = atom?.singleScalar {
70+
} else if let scalar = atom.singleScalar {
7171
result.append(Character(scalar))
72+
} else if case .scalarSequence(let seq) = atom.kind {
73+
result += seq.scalarValues.map(Character.init)
7274
} else {
7375
break
7476
}
@@ -136,7 +138,15 @@ extension AST.Node {
136138
return .trivia(v.contents)
137139

138140
case let .atom(v):
139-
return .atom(v.dslTreeAtom)
141+
switch v.kind {
142+
case .scalarSequence(let seq):
143+
// Scalar sequences are splatted into concatenated scalars, which
144+
// become a quoted literal. Sequences nested in concatenations have
145+
// already been coalesced; this just handles the lone atom case.
146+
return .quotedLiteral(String(seq.scalarValues.map(Character.init)))
147+
default:
148+
return .atom(v.dslTreeAtom)
149+
}
140150

141151
case let .customCharacterClass(ccc):
142152
return .customCharacterClass(ccc.dslTreeClass)

Sources/_StringProcessing/Utility/ASTBuilder.swift

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -338,10 +338,26 @@ func escaped(
338338
atom(.escaped(e))
339339
}
340340
func scalar(_ s: Unicode.Scalar) -> AST.Node {
341-
atom(.scalar(.init(s, .fake)))
341+
.atom(scalar_a(s))
342+
}
343+
func scalar_a(_ s: Unicode.Scalar) -> AST.Atom {
344+
atom_a(.scalar(.init(s, .fake)))
342345
}
343346
func scalar_m(_ s: Unicode.Scalar) -> AST.CustomCharacterClass.Member {
344-
atom_m(.scalar(.init(s, .fake)))
347+
.atom(scalar_a(s))
348+
}
349+
350+
func scalarSeq(_ s: Unicode.Scalar...) -> AST.Node {
351+
.atom(scalarSeq_a(s))
352+
}
353+
func scalarSeq_a(_ s: Unicode.Scalar...) -> AST.Atom {
354+
scalarSeq_a(s)
355+
}
356+
func scalarSeq_a(_ s: [Unicode.Scalar]) -> AST.Atom {
357+
atom_a(.scalarSequence(.init(s.map { .init($0, .fake) }, trivia: [])))
358+
}
359+
func scalarSeq_m(_ s: Unicode.Scalar...) -> AST.CustomCharacterClass.Member {
360+
.atom(scalarSeq_a(s))
345361
}
346362

347363
func backreference(_ r: AST.Reference.Kind, recursionLevel: Int? = nil) -> AST.Node {

Tests/RegexTests/MatchTests.swift

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,20 @@ extension RegexTests {
285285
firstMatchTest(#"\0707"#, input: "12387\u{1C7}xyz", match: "\u{1C7}")
286286

287287
// code point sequence
288-
firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc", xfail: true)
288+
firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc")
289+
firstMatchTest(#"3\u{ 61 62 63 }"#, input: "123abcxyz", match: "3abc")
290+
firstMatchTest(#"\u{61 62}\u{63}"#, input: "123abcxyz", match: "abc")
291+
firstMatchTest(#"\u{61}\u{62 63}"#, input: "123abcxyz", match: "abc")
292+
firstMatchTest(#"9|\u{61 62 63}"#, input: "123abcxyz", match: "abc")
293+
firstMatchTest(#"(?:\u{61 62 63})"#, input: "123abcxyz", match: "abc")
294+
firstMatchTest(#"23\u{61 62 63}xy"#, input: "123abcxyz", match: "23abcxy")
295+
296+
// o + horn + dot_below
297+
firstMatchTest(
298+
#"\u{006f 031b 0323}"#,
299+
input: "\u{006f}\u{031b}\u{0323}",
300+
match: "\u{006f}\u{031b}\u{0323}"
301+
)
289302

290303
// Escape sequences that represent scalar values.
291304
firstMatchTest(#"\a[\b]\e\f\n\r\t"#,
@@ -1405,6 +1418,9 @@ extension RegexTests {
14051418
firstMatchTest(#"\u{65}\u{301}$"#, input: eDecomposed, match: eDecomposed)
14061419
firstMatchTest(#"\u{65}\u{301}$"#, input: eComposed, match: eComposed)
14071420

1421+
firstMatchTest(#"\u{65 301}$"#, input: eDecomposed, match: eDecomposed)
1422+
firstMatchTest(#"\u{65 301}$"#, input: eComposed, match: eComposed)
1423+
14081424
// FIXME: Implicit \y at end of match
14091425
firstMatchTest(#"\u{65}"#, input: eDecomposed, match: nil,
14101426
xfail: true)
@@ -1516,7 +1532,8 @@ extension RegexTests {
15161532
firstMatchTest(#"🇰🇷"#, input: flag, match: flag)
15171533
firstMatchTest(#"[🇰🇷]"#, input: flag, match: flag)
15181534
firstMatchTest(#"\u{1F1F0}\u{1F1F7}"#, input: flag, match: flag)
1519-
1535+
firstMatchTest(#"\u{1F1F0 1F1F7}"#, input: flag, match: flag)
1536+
15201537
// First Unicode scalar followed by CCC of regional indicators
15211538
firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag,
15221539
xfail: true)

0 commit comments

Comments
 (0)