Skip to content

Commit 7752015

Browse files
committed
Introduce AST.Atom.Scalar
This allows us to store the source location of the inner scalar value.
1 parent 05e610a commit 7752015

File tree

7 files changed

+73
-34
lines changed

7 files changed

+73
-34
lines changed

Sources/_RegexParser/Regex/AST/Atom.swift

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ extension AST {
2929
/// A Unicode scalar value written as a literal
3030
///
3131
/// \u{...}, \0dd, \x{...}, ...
32-
case scalar(Unicode.Scalar)
32+
case scalar(Scalar)
3333

3434
/// A Unicode property, category, or script, including those written using
3535
/// POSIX syntax.
@@ -106,6 +106,18 @@ extension AST.Atom {
106106
}
107107
}
108108

109+
extension AST.Atom {
110+
public struct Scalar: Hashable {
111+
public var value: UnicodeScalar
112+
public var location: SourceLocation
113+
114+
public init(_ value: UnicodeScalar, _ location: SourceLocation) {
115+
self.value = value
116+
self.location = location
117+
}
118+
}
119+
}
120+
109121
extension AST.Atom {
110122

111123
// TODO: We might scrap this and break out a few categories so
@@ -697,7 +709,7 @@ extension AST.Atom {
697709
case .char(let c):
698710
return c
699711
case .scalar(let s):
700-
return Character(s)
712+
return Character(s.value)
701713

702714
case .escaped(let c):
703715
return c.scalarValue.map(Character.init)
@@ -742,7 +754,7 @@ extension AST.Atom {
742754
case .char(let c):
743755
return String(c)
744756
case .scalar(let s):
745-
return "\\u{\(String(s.value, radix: 16, uppercase: true))}"
757+
return "\\u{\(String(s.value.value, radix: 16, uppercase: true))}"
746758

747759
case .keyboardControl(let x):
748760
return "\\C-\(x)"

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 45 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,19 @@ extension Source {
157157
return .init(start ..< currentPosition)
158158
}
159159

160+
/// Attempt to eat a given prefix that satisfies a given predicate, with the
161+
/// source location recorded.
162+
mutating func tryEatLocatedPrefix(
163+
maxLength: Int? = nil,
164+
_ f: (Char) -> Bool
165+
) -> Located<String>? {
166+
let result = recordLoc { src in
167+
src.tryEatPrefix(maxLength: maxLength, f)
168+
}
169+
guard let result = result else { return nil }
170+
return result.map(\.string)
171+
}
172+
160173
/// Throws an expected ASCII character error if not matched
161174
mutating func expectASCII() throws -> Located<Character> {
162175
try recordLoc { src in
@@ -217,13 +230,13 @@ extension Source {
217230
/// return the scalar value, or throw an error if the string is malformed or
218231
/// would overflow the scalar.
219232
private static func validateUnicodeScalar(
220-
_ str: String, _ kind: RadixKind
221-
) throws -> Unicode.Scalar {
222-
let num = try validateNumber(str, UInt32.self, kind)
233+
_ str: Source.Located<String>, _ kind: RadixKind
234+
) throws -> AST.Atom.Scalar {
235+
let num = try validateNumber(str.value, UInt32.self, kind)
223236
guard let scalar = Unicode.Scalar(num) else {
224237
throw ParseError.misc("Invalid scalar value U+\(num.hexStr)")
225238
}
226-
return scalar
239+
return .init(scalar, str.location)
227240
}
228241

229242
/// Try to eat a number of a particular type and radix off the front.
@@ -266,14 +279,15 @@ extension Source {
266279
/// Eat a scalar value from hexadecimal notation off the front
267280
private mutating func expectUnicodeScalar(
268281
numDigits: Int
269-
) throws -> Located<Unicode.Scalar> {
270-
try recordLoc { src in
282+
) throws -> AST.Atom.Scalar {
283+
let str = try recordLoc { src -> String in
271284
let str = src.eat(upToCount: numDigits).string
272285
guard str.count == numDigits else {
273286
throw ParseError.expectedNumDigits(str, numDigits)
274287
}
275-
return try Source.validateUnicodeScalar(str, .hex)
288+
return str
276289
}
290+
return try Source.validateUnicodeScalar(str, .hex)
277291
}
278292

279293
/// Eat a scalar off the front, starting from after the
@@ -289,49 +303,57 @@ extension Source {
289303
///
290304
mutating func expectUnicodeScalar(
291305
escapedCharacter base: Character
292-
) throws -> Located<Unicode.Scalar> {
306+
) throws -> AST.Atom.Kind {
293307
try recordLoc { src in
308+
309+
func nullScalar() -> AST.Atom.Kind {
310+
let pos = src.currentPosition
311+
return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos)))
312+
}
313+
294314
// TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
295315
switch base {
296316
// Hex numbers.
297317
case "u" where src.tryEat("{"), "x" where src.tryEat("{"):
298-
let str = try src.lexUntil(eating: "}").value
299-
return try Source.validateUnicodeScalar(str, .hex)
318+
let str = try src.lexUntil(eating: "}")
319+
return .scalar(try Source.validateUnicodeScalar(str, .hex))
300320

301321
case "x":
302322
// \x expects *up to* 2 digits.
303-
guard let digits = src.tryEatPrefix(maxLength: 2, \.isHexDigit) else {
323+
guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit)
324+
else {
304325
// In PCRE, \x without any valid hex digits is \u{0}.
305326
// TODO: This doesn't appear to be followed by ICU or Oniguruma, so
306327
// could be changed to throw an error if we had a parsing mode for
307328
// them.
308-
return Unicode.Scalar(0)
329+
return nullScalar()
309330
}
310-
return try Source.validateUnicodeScalar(digits.string, .hex)
331+
return .scalar(try Source.validateUnicodeScalar(digits, .hex))
311332

312333
case "u":
313-
return try src.expectUnicodeScalar(numDigits: 4).value
334+
return .scalar(try src.expectUnicodeScalar(numDigits: 4))
314335
case "U":
315-
return try src.expectUnicodeScalar(numDigits: 8).value
336+
return .scalar(try src.expectUnicodeScalar(numDigits: 8))
316337

317338
// Octal numbers.
318339
case "o" where src.tryEat("{"):
319-
let str = try src.lexUntil(eating: "}").value
320-
return try Source.validateUnicodeScalar(str, .octal)
340+
let str = try src.lexUntil(eating: "}")
341+
return .scalar(try Source.validateUnicodeScalar(str, .octal))
321342

322343
case "0":
323344
// We can read *up to* 3 more octal digits.
324345
// FIXME: PCRE can only read up to 2 octal digits, if we get a strict
325346
// PCRE mode, we should limit it here.
326-
guard let digits = src.tryEatPrefix(maxLength: 3, \.isOctalDigit) else {
327-
return Unicode.Scalar(0)
347+
guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit)
348+
else {
349+
return nullScalar()
328350
}
329-
return try Source.validateUnicodeScalar(digits.string, .octal)
351+
return .scalar(try Source.validateUnicodeScalar(digits, .octal))
330352

331353
default:
332354
fatalError("Unexpected scalar start")
333355
}
334-
}
356+
}.value
335357
}
336358

337359
/// Try to consume a quantifier
@@ -1153,7 +1175,7 @@ extension Source {
11531175

11541176
// We should either have a unicode scalar.
11551177
if src.tryEat(sequence: "U+") {
1156-
let str = try src.lexUntil(eating: "}").value
1178+
let str = try src.lexUntil(eating: "}")
11571179
return .scalar(try Source.validateUnicodeScalar(str, .hex))
11581180
}
11591181

@@ -1581,8 +1603,7 @@ extension Source {
15811603
switch char {
15821604
// Hexadecimal and octal unicode scalars.
15831605
case "u", "x", "U", "o", "0":
1584-
return try .scalar(
1585-
src.expectUnicodeScalar(escapedCharacter: char).value)
1606+
return try src.expectUnicodeScalar(escapedCharacter: char)
15861607
default:
15871608
break
15881609
}

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ extension AST.Atom {
178178

179179
var singleScalar: UnicodeScalar? {
180180
switch kind {
181-
case .scalar(let s): return s
181+
case .scalar(let s): return s.value
182182
default: return nil
183183
}
184184
}
@@ -200,7 +200,7 @@ extension AST.Atom {
200200
case let .scalar(s):
201201
assertionFailure(
202202
"Should have been handled by tree conversion")
203-
return consumeScalar { $0 == s }
203+
return consumeScalar { $0 == s.value }
204204

205205
case let .char(c):
206206
assertionFailure(

Sources/_StringProcessing/PrintAsPattern.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -676,7 +676,7 @@ extension AST.Atom {
676676
return String(c)
677677

678678
case let .scalar(s):
679-
let hex = String(s.value, radix: 16, uppercase: true)
679+
let hex = String(s.value.value, radix: 16, uppercase: true)
680680
return "\\u{\(hex)}"
681681

682682
case let .property(p):
@@ -773,7 +773,7 @@ extension AST.Atom {
773773
return String(c)
774774

775775
case let .scalar(s):
776-
let hex = String(s.value, radix: 16, uppercase: true)
776+
let hex = String(s.value.value, radix: 16, uppercase: true)
777777
return "\\u{\(hex)}"
778778

779779
case let .property(p):

Sources/_StringProcessing/Regex/ASTConversion.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ extension AST.Atom {
211211

212212
switch self.kind {
213213
case let .char(c): return .char(c)
214-
case let .scalar(s): return .char(Character(s))
214+
case let .scalar(s): return .char(Character(s.value))
215215
case .any: return .any
216216
case let .backreference(r): return .backreference(.init(ast: r))
217217
case let .changeMatchingOptions(seq): return .changeMatchingOptions(.init(ast: seq))

Sources/_StringProcessing/Utility/ASTBuilder.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -338,10 +338,10 @@ func escaped(
338338
atom(.escaped(e))
339339
}
340340
func scalar(_ s: Unicode.Scalar) -> AST.Node {
341-
atom(.scalar(s))
341+
atom(.scalar(.init(s, .fake)))
342342
}
343343
func scalar_m(_ s: Unicode.Scalar) -> AST.CustomCharacterClass.Member {
344-
atom_m(.scalar(s))
344+
atom_m(.scalar(.init(s, .fake)))
345345
}
346346

347347
func backreference(_ r: AST.Reference.Kind, recursionLevel: Int? = nil) -> AST.Node {

Tests/RegexTests/ParseTests.swift

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2272,6 +2272,12 @@ extension RegexTests {
22722272
$0.as(CustomCC.self)!.members[0].as(CustomCC.Range.self)!.dashLoc
22732273
})
22742274

2275+
// MARK: Unicode scalars
2276+
2277+
rangeTest(#"\u{65}"#, range(3 ..< 5), at: {
2278+
$0.as(AST.Atom.self)!.as(AST.Atom.Scalar.self)!.location
2279+
})
2280+
22752281
// MARK: References
22762282

22772283
rangeTest(#"\k<a+2>"#, range(3 ..< 6), at: {

0 commit comments

Comments
 (0)