Skip to content

Commit a786546

Browse files
committed
Coalesce adjacent scalars and characters in the DSL
Previously we would emit a series of scalars written in the DSL as a series of individual characters in grapheme semantic mode. Change the behavior such that we coalesce any adjacent scalars and characters, including those in regex literals and nested concatenations. We then perform grapheme breaking over the result, and can emit character matches for scalars that coalesced into a grapheme. This transform subsumes a similar transform we performed for regex literals when converting them to a DSLTree. This has the nice side effect of allowing us to better preserve scalar syntax in the DSL transform. rdar://96942688
1 parent b8a729c commit a786546

File tree

8 files changed

+318
-102
lines changed

8 files changed

+318
-102
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -791,6 +791,41 @@ fileprivate extension Compiler.ByteCodeGen {
791791
}
792792
}
793793

794+
/// Emits the components of a concatenation.
///
/// The children are first flattened (nested concatenations and converted
/// regex literals are expanded in place), then every run of adjacent
/// literal characters, scalars and quoted literals is merged into a single
/// quoted literal, so that grapheme breaking can be performed over the
/// combined contents rather than over each piece individually.
///
/// - Parameter children: The direct children of the concatenation node.
/// - Throws: Whatever `emitConcatenationComponent` throws for a component.
mutating func emitConcatenation(_ children: [DSLTree.Node]) throws {
  // Recursively expands nested concatenations, looking through converted
  // regex literals; any other node is kept as a single element.
  func flatten(_ node: DSLTree.Node) -> [DSLTree.Node] {
    switch node {
    case .concatenation(let nested):
      return nested.flatMap(flatten)
    case .convertedRegexLiteral(let underlying, _):
      return flatten(underlying)
    default:
      return [node]
    }
  }

  let flattened = children.flatMap(flatten)

  // NOTE(review): any node kind not handled below ends the current run, so
  // such a node sitting between two scalars prevents them from coalescing —
  // confirm that is intended for every such node kind.
  let components = flattened.coalescing(
    with: "", into: DSLTree.Node.quotedLiteral
  ) { literal, node in
    if case .quotedLiteral(let text) = node {
      literal += text
      return true
    }
    if case .atom(let atom) = node, let ch = atom.literalCharacterValue {
      literal.append(ch)
      return true
    }
    return false
  }

  for component in components {
    try emitConcatenationComponent(component)
  }
}
828+
794829
@discardableResult
795830
mutating func emitNode(_ node: DSLTree.Node) throws -> ValueRegister? {
796831
switch node {
@@ -799,9 +834,7 @@ fileprivate extension Compiler.ByteCodeGen {
799834
try emitAlternation(children)
800835

801836
case let .concatenation(children):
802-
for child in children {
803-
try emitConcatenationComponent(child)
804-
}
837+
try emitConcatenation(children)
805838

806839
case let .capture(name, refId, child, transform):
807840
options.beginScope()

Sources/_StringProcessing/PrintAsPattern.swift

Lines changed: 69 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -70,16 +70,9 @@ extension PrettyPrinter {
7070
for namedCapture in namedCaptures {
7171
print("let \(namedCapture) = Reference(Substring.self)")
7272
}
73-
74-
switch node {
75-
case .concatenation(_):
76-
printAsPattern(convertedFromAST: node)
77-
case .convertedRegexLiteral(.concatenation(_), _):
78-
printAsPattern(convertedFromAST: node)
79-
default:
80-
printBlock("Regex") { printer in
81-
printer.printAsPattern(convertedFromAST: node)
82-
}
73+
74+
printBlock("Regex") { printer in
75+
printer.printAsPattern(convertedFromAST: node, isTopLevel: true)
8376
}
8477
}
8578

@@ -89,7 +82,7 @@ extension PrettyPrinter {
8982
// to have a non-backing-off pretty-printer that this
9083
// can defer to.
9184
private mutating func printAsPattern(
92-
convertedFromAST node: DSLTree.Node
85+
convertedFromAST node: DSLTree.Node, isTopLevel: Bool = false
9386
) {
9487
if patternBackoff(DSLTree._Tree(node)) {
9588
printBackoff(node)
@@ -106,11 +99,7 @@ extension PrettyPrinter {
10699
}
107100

108101
case let .concatenation(c):
109-
printBlock("Regex") { printer in
110-
c.forEach {
111-
printer.printAsPattern(convertedFromAST: $0)
112-
}
113-
}
102+
printConcatenationAsPattern(c, isTopLevel: isTopLevel)
114103

115104
case let .nonCapturingGroup(kind, child):
116105
switch kind.ast {
@@ -273,7 +262,7 @@ extension PrettyPrinter {
273262
// check above, so it should work out. Need a
274263
// cleaner way to do this. This means the argument
275264
// label is a lie.
276-
printAsPattern(convertedFromAST: n)
265+
printAsPattern(convertedFromAST: n, isTopLevel: isTopLevel)
277266

278267
case let .customCharacterClass(ccc):
279268
printAsPattern(ccc)
@@ -289,6 +278,60 @@ extension PrettyPrinter {
289278
print("/* TODO: absent function */")
290279
}
291280
}
281+
282+
/// An element of a concatenation that is ready to be printed: either a DSL
/// node that still needs pattern printing, or the text of a string literal
/// produced by coalescing adjacent characters, scalars and quoted literals.
enum NodeToPrint {
  /// A DSL tree node to be printed via `printAsPattern(convertedFromAST:)`.
  case dslNode(DSLTree.Node)

  /// Already-rendered string literal text, printed as-is.
  case stringLiteral(String)
}
286+
287+
/// Prints a single concatenation element.
///
/// DSL nodes are forwarded to `printAsPattern(convertedFromAST:)`; string
/// literal elements are printed verbatim.
mutating func printAsPattern(_ node: NodeToPrint) {
  switch node {
  case .stringLiteral(let literal):
    print(literal)
  case .dslNode(let dslNode):
    printAsPattern(convertedFromAST: dslNode)
  }
}
295+
296+
/// Prints the children of a concatenation as DSL pattern components.
///
/// Adjacent character atoms, scalar atoms and quoted literals are merged
/// into one string literal (scalars keep their `\u{...}` spelling). The
/// result is wrapped in a `Regex { ... }` block unless it is being printed
/// at the top level or everything collapsed into a single element.
///
/// - Parameters:
///   - nodes: The direct children of the concatenation.
///   - isTopLevel: Whether the caller already provides the enclosing block.
mutating func printConcatenationAsPattern(
  _ nodes: [DSLTree.Node], isTopLevel: Bool
) {
  // Look through converted literals first so that their contents take part
  // in the coalescing below.
  let unwrapped = nodes.map {
    NodeToPrint.dslNode($0.lookingThroughConvertedLiteral)
  }

  let elements = unwrapped.coalescing(
    with: StringLiteralBuilder(),
    into: { NodeToPrint.stringLiteral($0.result) }
  ) { builder, element in
    switch element {
    case .dslNode(.atom(.char(let character))):
      builder.append(character)
      return true
    case .dslNode(.atom(.scalar(let scalar))):
      // Append the escape sequence itself so the scalar syntax is preserved.
      builder.append(unescaped: scalar._dslBase)
      return true
    case .dslNode(.quotedLiteral(let text)):
      builder.append(text)
      return true
    default:
      return false
    }
  }

  // A surrounding Regex { ... } is only needed for a nested concatenation
  // that still consists of several elements after coalescing.
  let needsRegexBlock = !isTopLevel && elements.count != 1
  guard needsRegexBlock else {
    for element in elements {
      printAsPattern(element)
    }
    return
  }

  printBlock("Regex") { printer in
    for element in elements {
      printer.printAsPattern(element)
    }
  }
}
292335

293336
mutating func printAsPattern(
294337
_ ccc: DSLTree.CustomCharacterClass,
@@ -351,8 +394,7 @@ extension PrettyPrinter {
351394
charMembers.append(c)
352395
return false
353396
case let .scalar(s):
354-
charMembers.append(
355-
unescaped: "\\u{\(String(s.value, radix: 16, uppercase: true))}")
397+
charMembers.append(unescaped: s._dslBase)
356398
return false
357399
case .unconverted(_):
358400
return true
@@ -459,9 +501,9 @@ extension PrettyPrinter {
459501
case let .scalar(s):
460502

461503
if wrap {
462-
output("One(.anyOf(\"\\u{\(String(s.value, radix: 16, uppercase: true))}\"))")
504+
output("One(.anyOf(\(s._dslBase._bareQuoted)))")
463505
} else {
464-
output(".anyOf(\"\\u{\(String(s.value, radix: 16, uppercase: true))}\")")
506+
output(".anyOf(\(s._dslBase._bareQuoted))")
465507
}
466508

467509
case let .unconverted(a):
@@ -635,6 +677,10 @@ extension String {
635677
}
636678
}
637679

680+
extension UnicodeScalar {
  /// The scalar spelled as a Swift string-literal escape, `\u{X}`, where `X`
  /// is the scalar value in uppercase hexadecimal without padding.
  var _dslBase: String {
    let hexDigits = String(value, radix: 16, uppercase: true)
    return "\\u{" + hexDigits + "}"
  }
}
683+
638684
/// A helper for building string literals, which handles escaping the contents
639685
/// appended.
640686
fileprivate struct StringLiteralBuilder {
@@ -861,19 +907,15 @@ extension AST.Atom {
861907
}
862908

863909
var _dslBase: (String, canBeWrapped: Bool) {
864-
func scalarLiteral(_ s: UnicodeScalar) -> String {
865-
let hex = String(s.value, radix: 16, uppercase: true)
866-
return "\\u{\(hex)}"
867-
}
868910
switch kind {
869911
case let .char(c):
870912
return (String(c), false)
871913

872914
case let .scalar(s):
873-
return (scalarLiteral(s.value), false)
915+
return (s.value._dslBase, false)
874916

875917
case let .scalarSequence(seq):
876-
return (seq.scalarValues.map(scalarLiteral).joined(), false)
918+
return (seq.scalarValues.map(\._dslBase).joined(), false)
877919

878920
case let .property(p):
879921
return (p._dslBase, true)

Sources/_StringProcessing/Regex/ASTConversion.swift

Lines changed: 4 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -43,61 +43,7 @@ extension AST.Node {
4343
return .orderedChoice(children)
4444

4545
case let .concatenation(v):
46-
// Coalesce adjacent children who can produce a
47-
// string literal representation
48-
let astChildren = v.children
49-
func coalesce(
50-
_ idx: Array<AST>.Index
51-
) -> (Array<AST>.Index, String)? {
52-
var result = ""
53-
var idx = idx
54-
while idx < astChildren.endIndex {
55-
guard let atom: AST.Atom = astChildren[idx].as() else { break }
56-
57-
// TODO: For printing, nice to coalesce
58-
// scalars literals too. We likely need a different
59-
// approach even before we have a better IR.
60-
if let char = atom.singleCharacter {
61-
result.append(char)
62-
} else if let scalar = atom.singleScalar {
63-
result.append(Character(scalar))
64-
} else if case .scalarSequence(let seq) = atom.kind {
65-
result += seq.scalarValues.map(Character.init)
66-
} else {
67-
break
68-
}
69-
70-
astChildren.formIndex(after: &idx)
71-
}
72-
return result.isEmpty ? nil : (idx, result)
73-
}
74-
75-
// No need to nest single children concatenations
76-
if astChildren.count == 1 {
77-
return astChildren.first!.dslTreeNode
78-
}
79-
80-
// Check for a single child post-coalescing
81-
if let (idx, str) = coalesce(astChildren.startIndex),
82-
idx == astChildren.endIndex
83-
{
84-
return .quotedLiteral(str)
85-
}
86-
87-
// Coalesce adjacent string children
88-
var curIdx = astChildren.startIndex
89-
var children = Array<DSLTree.Node>()
90-
while curIdx < astChildren.endIndex {
91-
if let (nextIdx, str) = coalesce(curIdx) {
92-
// TODO: Track source info...
93-
children.append(.quotedLiteral(str))
94-
curIdx = nextIdx
95-
} else {
96-
children.append(astChildren[curIdx].dslTreeNode)
97-
astChildren.formIndex(after: &curIdx)
98-
}
99-
}
100-
return .concatenation(children)
46+
return .concatenation(v.children.map(\.dslTreeNode))
10147

10248
case let .group(v):
10349
let child = v.child.dslTreeNode
@@ -135,10 +81,9 @@ extension AST.Node {
13581
case let .atom(v):
13682
switch v.kind {
13783
case .scalarSequence(let seq):
138-
// Scalar sequences are splatted into concatenated scalars, which
139-
// becomes a quoted literal. Sequences nested in concatenations have
140-
// already been coalesced, this just handles the lone atom case.
141-
return .quotedLiteral(String(seq.scalarValues.map(Character.init)))
84+
// The DSL doesn't have an equivalent node for scalar sequences. Splat
85+
// them into a concatenation of scalars.
86+
return .concatenation(seq.scalarValues.map { .atom(.scalar($0)) })
14287
default:
14388
return .atom(v.dslTreeAtom)
14489
}

Sources/_StringProcessing/Regex/DSLTree.swift

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,14 @@ extension DSLTree.Node {
334334
default: return nil
335335
}
336336
}
337+
338+
/// The node wrapped by a converted regex literal, or the node itself when it
/// is any other kind. Only one level of wrapping is removed, and the literal's
/// second associated value is discarded.
var lookingThroughConvertedLiteral: Self {
  if case .convertedRegexLiteral(let underlying, _) = self {
    return underlying
  }
  return self
}
337345
}
338346

339347
extension DSLTree.Atom {
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2022 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
extension Array {
  /// Returns a copy of the array in which each run of adjacent elements
  /// accepted by `accumulate` is replaced by a single element.
  ///
  /// Elements are visited in order. Each one is offered to `accumulate`
  /// together with the current accumulator; if it returns `true` the element
  /// has been absorbed and is not copied to the result. The first element
  /// that is not absorbed ends the run: the accumulator is converted with
  /// `finish`, appended, and reset to `initialAccumulator`, and then the
  /// element itself is appended unchanged. A run still in progress at the end
  /// of the array is finished the same way.
  ///
  /// - Parameters:
  ///   - initialAccumulator: The accumulator value each run starts from.
  ///   - finish: Converts a completed accumulator into an array element.
  ///   - accumulate: Returns `true` if it coalesced the element into the
  ///     accumulator, `false` otherwise.
  /// - Returns: The array with every accepted run collapsed.
  func coalescing<T>(
    with initialAccumulator: T, into finish: (T) -> Element,
    accumulate: (inout T, Element) -> Bool
  ) -> Self {
    var pending = initialAccumulator
    var hasPending = false
    var output: [Element] = []

    for element in self {
      if accumulate(&pending, element) {
        // Absorbed into the current run; nothing to emit yet.
        hasPending = true
      } else {
        // The run (if any) ends here and must precede this element.
        if hasPending {
          output.append(finish(pending))
          pending = initialAccumulator
          hasPending = false
        }
        output.append(element)
      }
    }

    // Emit a run that extends to the end of the array.
    if hasPending {
      output.append(finish(pending))
    }
    return output
  }
}

0 commit comments

Comments
 (0)