Skip to content

Commit 05c751d

Browse files
[clang-format] Limit how much work guessLanguage() can do
guessLanguage() uses UnwrappedLineParser to process the different preprocessor variants of a file. For large files with many preprocessor branches, the number of variants can be very large, and the operation can hang for a long time and eventually run out of memory (OOM). (This has been observed particularly with single-header libraries such as miniaudio.h.) This patch limits how many variants guessLanguage() analyzes, to avoid such a performance cliff. The limit is expressed as a maximum number of lines to process, summed over all preprocessor variants, so shorter files can have more variants processed than longer files. Fixes clangd/clangd#719. Fixes clangd/clangd#1384. Fixes #70945. This patch does not fix the broader problem of actually formatting such large headers, which involves UnwrappedLineParser call sites other than guessLanguage(); the approach in this patch could be extended to those call sites as well.
1 parent a9f39ff commit 05c751d

File tree

5 files changed

+36
-8
lines changed

5 files changed

+36
-8
lines changed

clang/lib/Format/Format.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2830,7 +2830,7 @@ class Cleaner : public TokenAnalyzer {
28302830
class ObjCHeaderStyleGuesser : public TokenAnalyzer {
28312831
public:
28322832
ObjCHeaderStyleGuesser(const Environment &Env, const FormatStyle &Style)
2833-
: TokenAnalyzer(Env, Style), IsObjC(false) {}
2833+
: TokenAnalyzer(Env, Style, MaxLinesToProcess), IsObjC(false) {}
28342834

28352835
std::pair<tooling::Replacements, unsigned>
28362836
analyze(TokenAnnotator &Annotator,
@@ -2846,6 +2846,12 @@ class ObjCHeaderStyleGuesser : public TokenAnalyzer {
28462846
// Accessor for the IsObjC flag, which the constructor initializes to false.
bool isObjC() { return IsObjC; }
28472847

28482848
private:
2849+
// Limit the number of variants of the file TokenAnalyzer processes for
2850+
// the purpose of guessing the language. An inaccurate guess is better than
2851+
// hanging for a long time or OOMing, which has been observed with real
2852+
// libraries which are single-header with many preprocessor branches.
2853+
// The budget is 2^20 lines, summed over all parser runs. It is constexpr
// (implicitly inline in C++17) so that an odr-use never needs a separate
// out-of-class definition.
static constexpr unsigned MaxLinesToProcess = 1u << 20;
2854+
28492855
static bool
28502856
guessIsObjC(const SourceManager &SourceManager,
28512857
const SmallVectorImpl<AnnotatedLine *> &AnnotatedLines,

clang/lib/Format/TokenAnalyzer.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,12 +83,14 @@ Environment::Environment(StringRef Code, StringRef FileName,
8383
ID(VirtualSM->get().getMainFileID()), FirstStartColumn(FirstStartColumn),
8484
NextStartColumn(NextStartColumn), LastStartColumn(LastStartColumn) {}
8585

86-
TokenAnalyzer::TokenAnalyzer(const Environment &Env, const FormatStyle &Style)
86+
TokenAnalyzer::TokenAnalyzer(const Environment &Env, const FormatStyle &Style,
87+
unsigned MaxLinesToProcess)
8788
: Style(Style), Env(Env),
8889
AffectedRangeMgr(Env.getSourceManager(), Env.getCharRanges()),
8990
UnwrappedLines(1),
9091
Encoding(encoding::detectEncoding(
91-
Env.getSourceManager().getBufferData(Env.getFileID()))) {
92+
Env.getSourceManager().getBufferData(Env.getFileID()))),
93+
MaxLinesToProcess(MaxLinesToProcess) {
9294
LLVM_DEBUG(
9395
llvm::dbgs() << "File encoding: "
9496
<< (Encoding == encoding::Encoding_UTF8 ? "UTF8" : "unknown")
@@ -109,7 +111,7 @@ TokenAnalyzer::process(bool SkipAnnotation) {
109111
SmallVector<FormatToken *, 10> Tokens(Toks.begin(), Toks.end());
110112
UnwrappedLineParser Parser(Env.getSourceManager(), Style, Lex.getKeywords(),
111113
Env.getFirstStartColumn(), Tokens, *this,
112-
Allocator, IdentTable);
114+
Allocator, IdentTable, MaxLinesToProcess);
113115
Parser.parse();
114116
assert(UnwrappedLines.back().empty());
115117
unsigned Penalty = 0;

clang/lib/Format/TokenAnalyzer.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,8 @@ class Environment {
8787

8888
class TokenAnalyzer : public UnwrappedLineConsumer {
8989
public:
90-
TokenAnalyzer(const Environment &Env, const FormatStyle &Style);
90+
// MaxLinesToProcess is handed to the UnwrappedLineParser created in
// process(); the default of 0 leaves the amount of work unlimited.
TokenAnalyzer(const Environment &Env, const FormatStyle &Style,
91+
unsigned MaxLinesToProcess = 0);
9192

9293
std::pair<tooling::Replacements, unsigned>
9394
process(bool SkipAnnotation = false);
@@ -109,6 +110,7 @@ class TokenAnalyzer : public UnwrappedLineConsumer {
109110
AffectedRangeManager AffectedRangeMgr;
110111
SmallVector<SmallVector<UnwrappedLine, 16>, 2> UnwrappedLines;
111112
encoding::Encoding Encoding;
113+
// Line budget passed to UnwrappedLineParser in process(); 0 means no limit.
unsigned MaxLinesToProcess;
112114
};
113115

114116
} // end namespace format

clang/lib/Format/UnwrappedLineParser.cpp

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ UnwrappedLineParser::UnwrappedLineParser(
151151
const AdditionalKeywords &Keywords, unsigned FirstStartColumn,
152152
ArrayRef<FormatToken *> Tokens, UnwrappedLineConsumer &Callback,
153153
llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
154-
IdentifierTable &IdentTable)
154+
IdentifierTable &IdentTable, unsigned MaxLinesToProcess)
155155
: Line(new UnwrappedLine), MustBreakBeforeNextToken(false),
156156
CurrentLines(&Lines), Style(Style), Keywords(Keywords),
157157
CommentPragmasRegex(Style.CommentPragmas), Tokens(nullptr),
@@ -160,7 +160,8 @@ UnwrappedLineParser::UnwrappedLineParser(
160160
? IG_Rejected
161161
: IG_Inited),
162162
IncludeGuardToken(nullptr), FirstStartColumn(FirstStartColumn),
163-
Macros(Style.Macros, SourceMgr, Style, Allocator, IdentTable) {}
163+
Macros(Style.Macros, SourceMgr, Style, Allocator, IdentTable),
164+
MaxLinesToProcess(MaxLinesToProcess) {}
164165

165166
void UnwrappedLineParser::reset() {
166167
PPBranchLevel = -1;
@@ -194,6 +195,8 @@ void UnwrappedLineParser::reset() {
194195
void UnwrappedLineParser::parse() {
195196
IndexedTokenSource TokenSource(AllTokens);
196197
Line->FirstStartColumn = FirstStartColumn;
198+
size_t TotalLinesProcessed = 0;
199+
size_t LastReport = 0;
197200
do {
198201
LLVM_DEBUG(llvm::dbgs() << "----\n");
199202
reset();
@@ -235,6 +238,7 @@ void UnwrappedLineParser::parse() {
235238
Callback.consumeUnwrappedLine(Line);
236239
}
237240
Callback.finishRun();
241+
TotalLinesProcessed += ExpandedLines.size();
238242
}
239243

240244
LLVM_DEBUG(llvm::dbgs() << "Unwrapped lines:\n");
@@ -243,6 +247,14 @@ void UnwrappedLineParser::parse() {
243247
Callback.consumeUnwrappedLine(Line);
244248
}
245249
Callback.finishRun();
250+
TotalLinesProcessed += Lines.size();
251+
if ((TotalLinesProcessed / 10000) > LastReport) {
252+
llvm::errs() << "Processed " << TotalLinesProcessed << " lines\n";
253+
LastReport = (TotalLinesProcessed / 10000);
254+
}
255+
if (MaxLinesToProcess > 0 && TotalLinesProcessed >= MaxLinesToProcess) {
256+
break;
257+
}
246258
Lines.clear();
247259
while (!PPLevelBranchIndex.empty() &&
248260
PPLevelBranchIndex.back() + 1 >= PPLevelBranchCount.back()) {

clang/lib/Format/UnwrappedLineParser.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,8 @@ class UnwrappedLineParser {
111111
unsigned FirstStartColumn, ArrayRef<FormatToken *> Tokens,
112112
UnwrappedLineConsumer &Callback,
113113
llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
114-
IdentifierTable &IdentTable);
114+
IdentifierTable &IdentTable,
115+
unsigned MaxLinesToProcess = 0);
115116

116117
void parse();
117118

@@ -406,6 +407,11 @@ class UnwrappedLineParser {
406407

407408
MacroExpander Macros;
408409

410+
// If set to a nonzero value, stop generating new runs after this
411+
// many lines (summed over all the runs generated so far) have been
412+
// processed.
413+
unsigned MaxLinesToProcess;
414+
409415
friend class ScopedLineState;
410416
friend class CompoundStatementIndenter;
411417
};

0 commit comments

Comments
 (0)