Skip to content

Commit 0360b9f

Browse files
committed
[pseudo] (trivial) bracket-matching
Error-tolerant bracket matching enables our error-tolerant parsing strategies. The implementation here is *not* yet error tolerant: this patch sets up the APIs and plumbing, and describes the planned approach. Differential Revision: https://reviews.llvm.org/D125911
1 parent f371019 commit 0360b9f

File tree

8 files changed

+342
-2
lines changed

8 files changed

+342
-2
lines changed

clang-tools-extra/pseudo/benchmarks/Benchmark.cpp

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
//===----------------------------------------------------------------------===//
2121

2222
#include "benchmark/benchmark.h"
23+
#include "clang-pseudo/Bracket.h"
2324
#include "clang-pseudo/DirectiveTree.h"
2425
#include "clang-pseudo/Forest.h"
2526
#include "clang-pseudo/GLR.h"
@@ -89,7 +90,9 @@ TokenStream lexAndPreprocess() {
8990
chooseConditionalBranches(DirectiveStructure, RawStream);
9091
TokenStream Cook =
9192
cook(DirectiveStructure.stripDirectives(RawStream), LangOpts);
92-
return stripComments(Cook);
93+
auto Stream = stripComments(Cook);
94+
pairBrackets(Stream);
95+
return Stream;
9396
}
9497

9598
static void lex(benchmark::State &State) {
@@ -101,6 +104,16 @@ static void lex(benchmark::State &State) {
101104
}
102105
BENCHMARK(lex);
103106

107+
// Benchmarks bracket pairing on the token stream produced by lexing
// SourceText. Lexing happens once, outside the timed loop; every iteration
// re-runs the pairing on that same stream.
static void pairBrackets(benchmark::State &State) {
  clang::LangOptions LangOpts = genericLangOpts();
  TokenStream Lexed = clang::pseudo::lex(*SourceText, LangOpts);
  for (auto _ : State)
    clang::pseudo::pairBrackets(Lexed);
  // Report throughput as source bytes handled across all iterations.
  const auto Iterations = static_cast<uint64_t>(State.iterations());
  State.SetBytesProcessed(Iterations * SourceText->size());
}
115+
BENCHMARK(pairBrackets);
116+
104117
static void preprocess(benchmark::State &State) {
105118
clang::LangOptions LangOpts = genericLangOpts();
106119
TokenStream RawStream = clang::pseudo::lex(*SourceText, LangOpts);
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
//===--- Bracket.h - Analyze bracket structure --------------------*-C++-*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// Bracket structure (particularly braces) is key to isolating broken regions
10+
// of code and preventing parsing from going "off the rails".
11+
//
12+
// For correct C++ code, brackets are well-nested and identifying pairs and
13+
// therefore blocks is simple. In broken code, brackets are not properly nested.
14+
// We cannot match them all and must choose which pairs to form.
15+
//
16+
// Rather than have the grammar-based parser make these choices, we pair
17+
// brackets up-front based on textual features like indentation.
18+
// This mirrors the way humans read code, and so is likely to produce the
19+
// "correct" interpretation of broken code.
20+
//
21+
// This interpretation then guides the parse: a rule containing a bracket pair
22+
// must match against paired bracket tokens.
23+
//
24+
//===----------------------------------------------------------------------===//
25+
26+
#ifndef CLANG_PSEUDO_BRACKET_H
27+
#define CLANG_PSEUDO_BRACKET_H
28+
29+
#include "clang-pseudo/Token.h"
30+
31+
namespace clang {
32+
namespace pseudo {
33+
34+
/// Identifies bracket tokens in the stream which should be paired.
35+
/// Sets Token::Pair accordingly.
36+
void pairBrackets(TokenStream &);
37+
38+
} // namespace pseudo
39+
} // namespace clang
40+
41+
#endif

clang-tools-extra/pseudo/include/clang-pseudo/Token.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,11 +88,15 @@ struct Token {
8888
while (T->Kind == tok::comment);
8989
return *T;
9090
}
91+
/// Returns the bracket paired with this one, if any.
92+
const Token *pair() const { return Pair == 0 ? nullptr : this + Pair; }
9193

9294
/// The type of token as determined by clang's lexer.
9395
clang::tok::TokenKind Kind = clang::tok::unknown;
96+
/// If this token is a paired bracket, the offset of the pair in the stream.
97+
int32_t Pair = 0;
9498
};
95-
static_assert(sizeof(Token) <= sizeof(char *) + 16, "Careful with layout!");
99+
static_assert(sizeof(Token) <= sizeof(char *) + 20, "Careful with layout!");
96100
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &);
97101

98102
/// A half-open range of tokens within a stream.
@@ -155,6 +159,11 @@ class TokenStream {
155159
return tokens().slice(R.Begin, R.End - R.Begin);
156160
}
157161

162+
/// Mutable view of the stream's tokens, letting callers update them in place.
/// The stream must already be finalized; this is checked by the assert.
MutableArrayRef<Token> tokens() {
163+
assert(isFinalized());
164+
return Tokens;
165+
}
166+
158167
/// May return the end sentinel if the stream is empty.
159168
const Token &front() const {
160169
assert(isFinalized());
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
//===--- Bracket.cpp - Analyze bracket structure --------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// The basic phases of our bracket matching are:
10+
//
11+
// 1) A simple "greedy" match looks for well-nested subsequences.
12+
//
13+
// We can't fully trust the results of this, consider:
14+
// while (1) { // A
15+
// if (true) { // B
16+
// break;
17+
// } // C
18+
// Greedy matching will match B=C, when we should at least consider A=C.
19+
// However for the correct parts of the file, the greedy match gives the
20+
// right answer. It produces useful candidates for phase 2.
21+
//
22+
// simplePairBrackets handles this step.
23+
//
24+
// 2) Try to identify places where formatting indicates that the greedy match
25+
// was correct. This is similar to how a human would scan a large file.
26+
//
27+
// For example:
28+
// int foo() { // X
29+
// // indented
30+
// while (1) {
31+
// // valid code
32+
// }
33+
// return bar(42);
34+
// } // Y
35+
// We can "verify" that X..Y looks like a braced block, and the greedy match
36+
// tells us that substring is perfectly nested.
37+
// We trust the pairings of those brackets and don't examine them further.
38+
// However in the first example above, we do not trust B=C because the brace
39+
// indentation is suspect.
40+
//
41+
// FIXME: implement this step.
42+
//
43+
// 3) Run full best-match optimization on remaining brackets.
44+
//
45+
// Conceptually, this considers all possible matchings and optimizes cost:
46+
// - there is a cost for failing to match a bracket
47+
// - there is a variable cost for matching two brackets.
48+
// (For example if brace indentation doesn't match).
49+
//
50+
// In the first example we have three alternatives, and they are ranked:
51+
// 1) A=C, skip B
52+
// 2) B=C, skip A
53+
// 3) skip A, skip B, skip C
54+
// The cost for skipping a bracket is high, so option 3 is worst.
55+
// B=C costs more than A=C, because the indentation doesn't match.
56+
//
57+
// It would be correct to run this step alone, but it would be too slow.
58+
// The implementation is dynamic programming in N^3 time and N^2 space.
59+
// Having earlier steps filter out most brackets is key to performance.
60+
//
61+
// FIXME: implement this step.
62+
//
63+
//===----------------------------------------------------------------------===//
64+
65+
#include "clang-pseudo/Bracket.h"
66+
67+
namespace clang {
68+
namespace pseudo {
69+
namespace {
70+
71+
struct Bracket {
72+
using Index = unsigned;
73+
constexpr static Index None = -1;
74+
75+
enum BracketKind : char { Paren, Brace, Square } Kind;
76+
enum Direction : bool { Open, Close } Dir;
77+
unsigned Line;
78+
unsigned Indent;
79+
Token::Index Tok;
80+
Bracket::Index Pair = None;
81+
};
82+
83+
// Find brackets in the stream and convert to Bracket struct.
84+
std::vector<Bracket> findBrackets(const TokenStream &Stream) {
85+
std::vector<Bracket> Brackets;
86+
auto Add = [&](const pseudo::Token &Tok, Bracket::BracketKind K,
87+
Bracket::Direction D) {
88+
Brackets.push_back(
89+
{K, D, Tok.Line, Tok.Indent, Stream.index(Tok), Bracket::None});
90+
};
91+
for (const auto &Tok : Stream.tokens()) {
92+
switch (Tok.Kind) {
93+
case clang::tok::l_paren:
94+
Add(Tok, Bracket::Paren, Bracket::Open);
95+
break;
96+
case clang::tok::r_paren:
97+
Add(Tok, Bracket::Paren, Bracket::Close);
98+
break;
99+
case clang::tok::l_brace:
100+
Add(Tok, Bracket::Brace, Bracket::Open);
101+
break;
102+
case clang::tok::r_brace:
103+
Add(Tok, Bracket::Brace, Bracket::Close);
104+
break;
105+
case clang::tok::l_square:
106+
Add(Tok, Bracket::Square, Bracket::Open);
107+
break;
108+
case clang::tok::r_square:
109+
Add(Tok, Bracket::Square, Bracket::Close);
110+
break;
111+
default:
112+
break;
113+
}
114+
}
115+
return Brackets;
116+
}
117+
118+
// Write the bracket pairings from Brackets back to Tokens.
119+
void applyPairings(ArrayRef<Bracket> Brackets, TokenStream &Tokens) {
120+
for (const auto &B : Brackets)
121+
Tokens.tokens()[B.Tok].Pair =
122+
(B.Pair == Bracket::None) ? 0 : (int32_t)Brackets[B.Pair].Tok - B.Tok;
123+
}
124+
125+
// Find perfect pairings (ignoring whitespace) via greedy algorithm.
126+
// This means two brackets are paired if they match and the brackets between
127+
// them nest perfectly, with no skipped or crossed brackets.
128+
void simplePairBrackets(MutableArrayRef<Bracket> Brackets) {
129+
std::vector<unsigned> Stack;
130+
for (unsigned I = 0; I < Brackets.size(); ++I) {
131+
if (Brackets[I].Dir == Bracket::Open) {
132+
Stack.push_back(I);
133+
} else if (!Stack.empty() &&
134+
Brackets[Stack.back()].Kind == Brackets[I].Kind) {
135+
Brackets[Stack.back()].Pair = I;
136+
Brackets[I].Pair = Stack.back();
137+
Stack.pop_back();
138+
} else {
139+
// Unpaired closer, no brackets on stack are part of a perfect sequence.
140+
Stack.clear();
141+
}
142+
}
143+
// Any remaining brackets on the stack stay unpaired.
144+
}
145+
146+
} // namespace
147+
148+
// Pairs up the brackets of Stream and records the result in Token::Pair.
// Only the greedy phase exists so far: extract the brackets, pair the
// perfectly nested ones, then write the pairings back to the tokens.
void pairBrackets(TokenStream &Stream) {
  std::vector<Bracket> Found = findBrackets(Stream);
  simplePairBrackets(Found);
  applyPairings(Found, Stream);
}
153+
154+
} // namespace pseudo
155+
} // namespace clang

clang-tools-extra/pseudo/lib/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
set(LLVM_LINK_COMPONENTS Support)
22

33
add_clang_library(clangPseudo
4+
Bracket.cpp
45
DirectiveTree.cpp
56
Forest.cpp
67
GLR.cpp

clang-tools-extra/pseudo/tool/ClangPseudo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
//
77
//===----------------------------------------------------------------------===//
88

9+
#include "clang-pseudo/Bracket.h"
910
#include "clang-pseudo/DirectiveTree.h"
1011
#include "clang-pseudo/GLR.h"
1112
#include "clang-pseudo/Grammar.h"
@@ -89,6 +90,7 @@ int main(int argc, char *argv[]) {
8990
llvm::outs() << DirectiveStructure;
9091

9192
ParseableStream = clang::pseudo::stripComments(cook(*Stream, LangOpts));
93+
pairBrackets(*ParseableStream);
9294
}
9395

9496
if (Grammar.getNumOccurrences()) {

0 commit comments

Comments
 (0)