Skip to content

Commit bd5cc65

Browse files
committed
[pseudo] Start rules are _ := start-symbol EOF, improve recovery.
Previously we were calling glrRecover() ad-hoc at the end of input. Two main problems with this: - glrRecover() on two separate code paths is inelegant - We may have to recover several times in succession (e.g. to exit from nested scopes), so we need a loop at end-of-file Having an actual shift action for an EOF terminal allows us to handle both concerns in the main shift/recover/reduce loop. This revealed a recovery design bug where recovery could enter a loop by repeatedly choosing the same parent to identically recover from. Addressed this by allowing each node to be used as a recovery base once. Differential Revision: https://reviews.llvm.org/D130550
1 parent 4d931b6 commit bd5cc65

File tree

9 files changed

+122
-73
lines changed

9 files changed

+122
-73
lines changed

clang-tools-extra/pseudo/include/clang-pseudo/GLR.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ struct GSS {
7171
LRTable::StateID State;
7272
// Used internally to track reachability during garbage collection.
7373
bool GCParity;
74+
// Have we already used this node for error recovery? (prevents loops)
75+
mutable bool Recovered = false;
7476
// Number of the parents of this node.
7577
// The parents hold previous parsed symbols, and may resume control after
7678
// this node is reduced.

clang-tools-extra/pseudo/lib/Forest.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,14 +178,20 @@ std::string ForestNode::dumpRecursive(const Grammar &G,
178178

179179
llvm::ArrayRef<ForestNode>
180180
ForestArena::createTerminals(const TokenStream &Code) {
181-
ForestNode *Terminals = Arena.Allocate<ForestNode>(Code.tokens().size());
181+
ForestNode *Terminals = Arena.Allocate<ForestNode>(Code.tokens().size() + 1);
182182
size_t Index = 0;
183183
for (const auto &T : Code.tokens()) {
184184
new (&Terminals[Index])
185185
ForestNode(ForestNode::Terminal, tokenSymbol(T.Kind),
186186
/*Start=*/Index, /*TerminalData*/ 0);
187187
++Index;
188188
}
189+
// Include an `eof` terminal.
190+
// This is important to drive the final shift/recover/reduce loop.
191+
new (&Terminals[Index])
192+
ForestNode(ForestNode::Terminal, tokenSymbol(tok::eof),
193+
/*Start=*/Index, /*TerminalData*/ 0);
194+
++Index;
189195
NodeCount = Index;
190196
return llvm::makeArrayRef(Terminals, Index);
191197
}

clang-tools-extra/pseudo/lib/GLR.cpp

Lines changed: 32 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -95,17 +95,19 @@ void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
9595
auto WalkUp = [&](const GSS::Node *N, Token::Index NextTok, auto &WalkUp) {
9696
if (!Seen.insert(N).second)
9797
return;
98-
for (auto Strategy : Lang.Table.getRecovery(N->State)) {
99-
Options.push_back(PlaceholderRecovery{
100-
NextTok,
101-
Strategy.Result,
102-
Strategy.Strategy,
103-
N,
104-
Path,
105-
});
106-
LLVM_DEBUG(llvm::dbgs()
107-
<< "Option: recover " << Lang.G.symbolName(Strategy.Result)
108-
<< " at token " << NextTok << "\n");
98+
if (!N->Recovered) { // Don't recover the same way twice!
99+
for (auto Strategy : Lang.Table.getRecovery(N->State)) {
100+
Options.push_back(PlaceholderRecovery{
101+
NextTok,
102+
Strategy.Result,
103+
Strategy.Strategy,
104+
N,
105+
Path,
106+
});
107+
LLVM_DEBUG(llvm::dbgs()
108+
<< "Option: recover " << Lang.G.symbolName(Strategy.Result)
109+
<< " at token " << NextTok << "\n");
110+
}
109111
}
110112
Path.push_back(N->Payload);
111113
for (const GSS::Node *Parent : N->parents())
@@ -180,6 +182,7 @@ void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
180182
// There are various options, including simply breaking ties between options.
181183
// For now it's obscure enough to ignore.
182184
for (const PlaceholderRecovery *Option : BestOptions) {
185+
Option->RecoveryNode->Recovered = true;
183186
const ForestNode &Placeholder =
184187
Params.Forest.createOpaque(Option->Symbol, RecoveryRange->Begin);
185188
LRTable::StateID OldState = Option->RecoveryNode->State;
@@ -587,6 +590,9 @@ class GLRReduce {
587590
auto NextState = Lang.Table.getGoToState(Base->State, Rule.Target);
588591
assert(NextState.has_value() && "goto must succeed after reduce!");
589592
Heads->push_back(Params.GSStack.addNode(*NextState, Parsed, {Base}));
593+
LLVM_DEBUG(llvm::dbgs()
594+
<< " Reduce (trivial) " << Lang.G.dumpRule(*RID) << "\n"
595+
<< " --> S" << Heads->back()->State << "\n");
590596
return true;
591597
}
592598
};
@@ -638,7 +644,7 @@ const ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
638644
// We discard all heads formed by reduction, and recreate them without
639645
// this constraint. This may duplicate some nodes, but it's rare.
640646
LLVM_DEBUG(llvm::dbgs() << "Shift failed, will attempt recovery. "
641-
"Re-reducing without lookahead.");
647+
"Re-reducing without lookahead.\n");
642648
Heads.resize(HeadsPartition);
643649
Reduce(Heads, /*allow all reductions*/ tokenSymbol(tok::unknown));
644650

@@ -662,34 +668,26 @@ const ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
662668
}
663669
LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n"));
664670

665-
// The parse was successful if we're in state `_ := start-symbol .`
666-
auto AcceptState = Lang.Table.getGoToState(StartState, StartSymbol);
667-
assert(AcceptState.has_value() && "goto must succeed after start symbol!");
671+
// The parse was successful if in state `_ := start-symbol EOF .`
672+
// The GSS parent has `_ := start-symbol . EOF`; its payload is the parse.
673+
auto AfterStart = Lang.Table.getGoToState(StartState, StartSymbol);
674+
assert(AfterStart.has_value() && "goto must succeed after start symbol!");
675+
auto Accept = Lang.Table.getShiftState(*AfterStart, tokenSymbol(tok::eof));
676+
assert(Accept.has_value() && "shift EOF must succeed!");
668677
auto SearchForAccept = [&](llvm::ArrayRef<const GSS::Node *> Heads) {
669678
const ForestNode *Result = nullptr;
670679
for (const auto *Head : Heads) {
671-
if (Head->State == *AcceptState) {
672-
assert(Head->Payload->symbol() == StartSymbol);
680+
if (Head->State == *Accept) {
681+
assert(Head->Payload->symbol() == tokenSymbol(tok::eof));
673682
assert(Result == nullptr && "multiple results!");
674-
Result = Head->Payload;
683+
Result = Head->parents().front()->Payload;
684+
assert(Result->symbol() == StartSymbol);
675685
}
676686
}
677687
return Result;
678688
};
679689
if (auto *Result = SearchForAccept(Heads))
680690
return *Result;
681-
// Failed to parse the input, attempt to run recovery.
682-
// FIXME: this awkwardly repeats the recovery in the loop, when shift fails.
683-
// More elegant is to include EOF in the token stream, and make the
684-
// augmented rule: `_ := translation-unit EOF`. In this way recovery at EOF
685-
// would not be a special case: it show up as a failure to shift the EOF
686-
// token.
687-
unsigned I = Terminals.size();
688-
glrRecover(Heads, I, Params, Lang, NextHeads);
689-
Reduce(NextHeads, tokenSymbol(tok::eof));
690-
if (auto *Result = SearchForAccept(NextHeads))
691-
return *Result;
692-
693691
// We failed to parse the input, returning an opaque forest node for recovery.
694692
// FIXME: as above, we can add fallback error handling so this is impossible.
695693
return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
@@ -704,8 +702,10 @@ void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
704702
const GSS::Node *GSS::addNode(LRTable::StateID State, const ForestNode *Symbol,
705703

706704
llvm::ArrayRef<const Node *> Parents) {
707-
Node *Result = new (allocate(Parents.size()))
708-
Node({State, GCParity, static_cast<uint16_t>(Parents.size())});
705+
Node *Result = new (allocate(Parents.size())) Node();
706+
Result->State = State;
707+
Result->GCParity = GCParity;
708+
Result->ParentCount = Parents.size();
709709
Alive.push_back(Result);
710710
++NodesCreated;
711711
Result->Payload = Symbol;

clang-tools-extra/pseudo/lib/cxx/cxx.bnf

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,9 @@
2929
# We list important nonterminals as start symbols, rather than doing it for all
3030
# nonterminals by default, this reduces the number of states by 30% and LRTable
3131
# actions by 16%.
32-
_ := translation-unit
33-
_ := statement-seq
34-
_ := declaration-seq
32+
_ := translation-unit EOF
33+
_ := statement-seq EOF
34+
_ := declaration-seq EOF
3535

3636
# gram.key
3737
#! we don't distinguish between namespaces and namespace aliases, as it's hard

clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -240,8 +240,9 @@ LRGraph LRGraph::buildLR0(const Grammar &G) {
240240
PendingStates.push_back(Result.first);
241241

242242
const Rule &StartRule = G.lookupRule(RID);
243-
assert(StartRule.Size == 1 &&
244-
"Start rule must have exactly one symbol in its body!");
243+
assert(StartRule.Size == 2 &&
244+
StartRule.seq().back() == tokenSymbol(tok::eof) &&
245+
"Start rule must be of the form `_ := start-symbol EOF`!");
245246
Builder.addStartState(StartRule.seq().front(), Result.first);
246247
}
247248

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,21 @@
1-
_ := expr
1+
_ := expr EOF
22
expr := id
33
id := IDENTIFIER
44

55
# RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH
66
# GRAPH: States:
77
# GRAPH-NEXT: State 0
8-
# GRAPH-NEXT: _ := • expr
8+
# GRAPH-NEXT: _ := • expr EOF
99
# GRAPH-NEXT: expr := • id
1010
# GRAPH-NEXT: id := • IDENTIFIER
1111
# GRAPH-NEXT: State 1
12-
# GRAPH-NEXT: _ := expr •
12+
# GRAPH-NEXT: _ := expr • EOF
1313
# GRAPH-NEXT: State 2
1414
# GRAPH-NEXT: expr := id •
1515
# GRAPH-NEXT: State 3
1616
# GRAPH-NEXT: id := IDENTIFIER •
17+
# GRAPH-NEXT: State 4
18+
# GRAPH-NEXT: _ := expr EOF •
1719

1820
# RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE
1921
# TABLE: LRTable:
@@ -22,7 +24,9 @@ id := IDENTIFIER
2224
# TABLE-NEXT: expr: go to state 1
2325
# TABLE-NEXT: id: go to state 2
2426
# TABLE-NEXT: State 1
27+
# TABLE-NEXT: EOF: shift state 4
2528
# TABLE-NEXT: State 2
26-
# TABLE-NEXT: EOF: reduce by rule 1 'expr := id'
29+
# TABLE-NEXT: EOF: reduce by rule 2 'expr := id'
2730
# TABLE-NEXT: State 3
28-
# TABLE-NEXT: EOF: reduce by rule 0 'id := IDENTIFIER'
31+
# TABLE-NEXT: EOF: reduce by rule 1 'id := IDENTIFIER'
32+
# TABLE-NEXT: State 4
Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,49 @@
1-
_ := expr
1+
_ := expr EOF
22
expr := expr - expr # S/R conflict at state 4 on '-' token
33
expr := IDENTIFIER
44

55
# RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH
66
# GRAPH: States
77
# GRAPH-NEXT: State 0
8+
# GRAPH-NEXT: _ := • expr EOF
89
# GRAPH-NEXT: expr := • expr - expr
9-
# GRAPH-NEXT: _ := • expr
1010
# GRAPH-NEXT: expr := • IDENTIFIER
1111
# GRAPH-NEXT: State 1
12-
# GRAPH-NEXT: _ := expr •
12+
# GRAPH-NEXT: _ := expr • EOF
1313
# GRAPH-NEXT: expr := expr • - expr
1414
# GRAPH-NEXT: State 2
1515
# GRAPH-NEXT: expr := IDENTIFIER •
1616
# GRAPH-NEXT: State 3
17+
# GRAPH-NEXT: _ := expr EOF •
18+
# GRAPH-NEXT: State 4
1719
# GRAPH-NEXT: expr := • expr - expr
1820
# GRAPH-NEXT: expr := expr - • expr
1921
# GRAPH-NEXT: expr := • IDENTIFIER
20-
# GRAPH-NEXT: State 4
22+
# GRAPH-NEXT: State 5
2123
# GRAPH-NEXT: expr := expr - expr •
2224
# GRAPH-NEXT: expr := expr • - expr
2325
# GRAPH-NEXT: 0 ->[expr] 1
2426
# GRAPH-NEXT: 0 ->[IDENTIFIER] 2
25-
# GRAPH-NEXT: 1 ->[-] 3
26-
# GRAPH-NEXT: 3 ->[expr] 4
27-
# GRAPH-NEXT: 3 ->[IDENTIFIER] 2
28-
# GRAPH-NEXT: 4 ->[-] 3
27+
# GRAPH-NEXT: 1 ->[EOF] 3
28+
# GRAPH-NEXT: 1 ->[-] 4
29+
# GRAPH-NEXT: 4 ->[expr] 5
30+
# GRAPH-NEXT: 4 ->[IDENTIFIER] 2
31+
# GRAPH-NEXT: 5 ->[-] 4
2932

3033
# RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE
3134
# TABLE: LRTable:
3235
# TABLE-NEXT: State 0
3336
# TABLE-NEXT: IDENTIFIER: shift state 2
3437
# TABLE-NEXT: expr: go to state 1
3538
# TABLE-NEXT: State 1
36-
# TABLE-NEXT: -: shift state 3
39+
# TABLE-NEXT: EOF: shift state 3
40+
# TABLE-NEXT: -: shift state 4
3741
# TABLE-NEXT: State 2
38-
# TABLE-NEXT: EOF -: reduce by rule 1 'expr := IDENTIFIER'
42+
# TABLE-NEXT: EOF -: reduce by rule 2 'expr := IDENTIFIER'
3943
# TABLE-NEXT: State 3
40-
# TABLE-NEXT: IDENTIFIER: shift state 2
41-
# TABLE-NEXT: expr: go to state 4
4244
# TABLE-NEXT: State 4
43-
# TABLE-NEXT: -: shift state 3
44-
# TABLE-NEXT: EOF -: reduce by rule 0 'expr := expr - expr'
45+
# TABLE-NEXT: IDENTIFIER: shift state 2
46+
# TABLE-NEXT: expr: go to state 5
47+
# TABLE-NEXT: State 5
48+
# TABLE-NEXT: -: shift state 4
49+
# TABLE-NEXT: EOF -: reduce by rule 1 'expr := expr - expr'

clang-tools-extra/pseudo/unittests/ForestTest.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ class ForestTest : public ::testing::Test {
5454

5555
TEST_F(ForestTest, DumpBasic) {
5656
build(R"cpp(
57-
_ := add-expression
57+
_ := add-expression EOF
5858
add-expression := id-expression + id-expression
5959
id-expression := IDENTIFIER
6060
)cpp");
@@ -64,7 +64,7 @@ TEST_F(ForestTest, DumpBasic) {
6464
cook(lex("a + b", clang::LangOptions()), clang::LangOptions());
6565

6666
auto T = Arena.createTerminals(TS);
67-
ASSERT_EQ(T.size(), 3u);
67+
ASSERT_EQ(T.size(), 4u);
6868
const auto *Left = &Arena.createSequence(
6969
symbol("id-expression"), ruleFor("id-expression"), {&T.front()});
7070
const auto *Right = &Arena.createSequence(symbol("id-expression"),
@@ -89,9 +89,9 @@ TEST_F(ForestTest, DumpBasic) {
8989

9090
TEST_F(ForestTest, DumpAmbiguousAndRefs) {
9191
build(R"cpp(
92-
_ := type
93-
type := class-type # rule 3
94-
type := enum-type # rule 4
92+
_ := type EOF
93+
type := class-type # rule 4
94+
type := enum-type # rule 5
9595
class-type := shared-type
9696
enum-type := shared-type
9797
shared-type := IDENTIFIER)cpp");
@@ -100,7 +100,7 @@ TEST_F(ForestTest, DumpAmbiguousAndRefs) {
100100
const auto &TS = cook(lex("abc", clang::LangOptions()), clang::LangOptions());
101101

102102
auto Terminals = Arena.createTerminals(TS);
103-
ASSERT_EQ(Terminals.size(), 1u);
103+
ASSERT_EQ(Terminals.size(), 2u);
104104

105105
const auto *SharedType = &Arena.createSequence(
106106
symbol("shared-type"), ruleFor("shared-type"), {Terminals.begin()});
@@ -109,9 +109,9 @@ TEST_F(ForestTest, DumpAmbiguousAndRefs) {
109109
const auto *EnumType = &Arena.createSequence(
110110
symbol("enum-type"), ruleFor("enum-type"), {SharedType});
111111
const auto *Alternative1 =
112-
&Arena.createSequence(symbol("type"), /*RuleID=*/3, {ClassType});
112+
&Arena.createSequence(symbol("type"), /*RuleID=*/4, {ClassType});
113113
const auto *Alternative2 =
114-
&Arena.createSequence(symbol("type"), /*RuleID=*/4, {EnumType});
114+
&Arena.createSequence(symbol("type"), /*RuleID=*/5, {EnumType});
115115
const auto *Type =
116116
&Arena.createAmbiguous(symbol("type"), {Alternative1, Alternative2});
117117
EXPECT_EQ(Type->dumpRecursive(G),

0 commit comments

Comments
 (0)