Skip to content

Commit ee8ed0b

Browse files
committed
[clang][deps] Teach dep directive scanner about _Pragma
While we cannot handle `_Pragma` used inside macros, we can handle it at the top level, and some projects use the `_Pragma("once")` spelling like that, which was causing spurious failures in the scanner. Limitations: * Cannot handle #define ONCE _Pragma("once"); this is the same issue as using @import in a macro -- ideally we should diagnose this in obvious cases. * Our LangOpts are currently fixed, so we are not handling u"" strings or R"()" strings that require C11/C++11. rdar://108629982 Differential Revision: https://reviews.llvm.org/D149884
1 parent ec77d1f commit ee8ed0b

File tree

5 files changed

+271
-41
lines changed

5 files changed

+271
-41
lines changed

clang/include/clang/Lex/Pragma.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,13 @@ class PragmaNamespace : public PragmaHandler {
123123
PragmaNamespace *getIfNamespace() override { return this; }
124124
};
125125

126+
/// Destringize a \c _Pragma("") string according to C11 6.10.9.1:
127+
/// "The string literal is destringized by deleting any encoding prefix,
128+
/// deleting the leading and trailing double-quotes, replacing each escape
129+
/// sequence \" by a double-quote, and replacing each escape sequence \\ by a
130+
/// single backslash."
131+
void prepare_PragmaString(SmallVectorImpl<char> &StrVal);
132+
126133
} // namespace clang
127134

128135
#endif // LLVM_CLANG_LEX_PRAGMA_H

clang/lib/Lex/DependencyDirectivesScanner.cpp

Lines changed: 105 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "clang/Basic/Diagnostic.h"
2020
#include "clang/Lex/LexDiagnostic.h"
2121
#include "clang/Lex/Lexer.h"
22+
#include "clang/Lex/Pragma.h"
2223
#include "llvm/ADT/ScopeExit.h"
2324
#include "llvm/ADT/SmallString.h"
2425
#include "llvm/ADT/StringMap.h"
@@ -72,6 +73,8 @@ struct Scanner {
7273
// Set the lexer to use 'tok::at' for '@', instead of 'tok::unknown'.
7374
LangOpts.ObjC = true;
7475
LangOpts.LineComment = true;
76+
// FIXME: we do not enable C11 or C++11, so we are missing u/u8/U"" and
77+
// R"()" literals.
7578
return LangOpts;
7679
}
7780

@@ -91,6 +94,10 @@ struct Scanner {
9194
void skipLine(const char *&First, const char *const End);
9295
void skipDirective(StringRef Name, const char *&First, const char *const End);
9396

97+
/// Returns the spelling of a string literal or identifier after performing
98+
/// any processing needed to handle \c clang::Token::NeedsCleaning.
99+
StringRef cleanStringIfNeeded(const dependency_directives_scan::Token &Tok);
100+
94101
/// Lexes next token and if it is identifier returns its string, otherwise
95102
/// it skips the current line and returns \p std::nullopt.
96103
///
@@ -112,13 +119,30 @@ struct Scanner {
112119
const char *&First,
113120
const char *const End);
114121

122+
/// Lexes next token and returns true iff it matches the kind \p K.
123+
/// Otherwise it skips the current line and returns false.
124+
///
125+
/// In any case (whatever the token kind) \p First and the \p Lexer will
126+
/// advance beyond the token.
127+
[[nodiscard]] bool isNextTokenOrSkipLine(tok::TokenKind K, const char *&First,
128+
const char *const End);
129+
130+
/// Lexes next token and if it is string literal, returns its string.
131+
/// Otherwise, it skips the current line and returns \p std::nullopt.
132+
///
133+
/// In any case (whatever the token kind) \p First and the \p Lexer will
134+
/// advance beyond the token.
135+
[[nodiscard]] std::optional<StringRef>
136+
tryLexStringLiteralOrSkipLine(const char *&First, const char *const End);
137+
115138
[[nodiscard]] bool scanImpl(const char *First, const char *const End);
116139
[[nodiscard]] bool lexPPLine(const char *&First, const char *const End);
117140
[[nodiscard]] bool lexAt(const char *&First, const char *const End);
118141
[[nodiscard]] bool lexModule(const char *&First, const char *const End);
119142
[[nodiscard]] bool lexDefine(const char *HashLoc, const char *&First,
120143
const char *const End);
121144
[[nodiscard]] bool lexPragma(const char *&First, const char *const End);
145+
[[nodiscard]] bool lex_Pragma(const char *&First, const char *const End);
122146
[[nodiscard]] bool lexEndif(const char *&First, const char *const End);
123147
[[nodiscard]] bool lexDefault(DirectiveKind Kind, const char *&First,
124148
const char *const End);
@@ -525,22 +549,18 @@ void Scanner::lexPPDirectiveBody(const char *&First, const char *const End) {
525549
}
526550
}
527551

528-
[[nodiscard]] std::optional<StringRef>
529-
Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) {
530-
const dependency_directives_scan::Token &Tok = lexToken(First, End);
531-
if (Tok.isNot(tok::raw_identifier)) {
532-
if (!Tok.is(tok::eod))
533-
skipLine(First, End);
534-
return std::nullopt;
535-
}
536-
552+
StringRef
553+
Scanner::cleanStringIfNeeded(const dependency_directives_scan::Token &Tok) {
537554
bool NeedsCleaning = Tok.Flags & clang::Token::NeedsCleaning;
538555
if (LLVM_LIKELY(!NeedsCleaning))
539556
return Input.slice(Tok.Offset, Tok.getEnd());
540557

541558
SmallString<64> Spelling;
542559
Spelling.resize(Tok.Length);
543560

561+
// FIXME: C++11 raw string literals need special handling (see getSpellingSlow
562+
// in the Lexer). Currently we cannot see them due to our LangOpts.
563+
544564
unsigned SpellingLength = 0;
545565
const char *BufPtr = Input.begin() + Tok.Offset;
546566
const char *AfterIdent = Input.begin() + Tok.getEnd();
@@ -555,6 +575,18 @@ Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) {
555575
.first->first();
556576
}
557577

578+
std::optional<StringRef>
579+
Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) {
580+
const dependency_directives_scan::Token &Tok = lexToken(First, End);
581+
if (Tok.isNot(tok::raw_identifier)) {
582+
if (!Tok.is(tok::eod))
583+
skipLine(First, End);
584+
return std::nullopt;
585+
}
586+
587+
return cleanStringIfNeeded(Tok);
588+
}
589+
558590
StringRef Scanner::lexIdentifier(const char *&First, const char *const End) {
559591
std::optional<StringRef> Id = tryLexIdentifierOrSkipLine(First, End);
560592
assert(Id && "expected identifier token");
@@ -572,6 +604,28 @@ bool Scanner::isNextIdentifierOrSkipLine(StringRef Id, const char *&First,
572604
return false;
573605
}
574606

607+
bool Scanner::isNextTokenOrSkipLine(tok::TokenKind K, const char *&First,
608+
const char *const End) {
609+
const dependency_directives_scan::Token &Tok = lexToken(First, End);
610+
if (Tok.is(K))
611+
return true;
612+
skipLine(First, End);
613+
return false;
614+
}
615+
616+
std::optional<StringRef>
617+
Scanner::tryLexStringLiteralOrSkipLine(const char *&First,
618+
const char *const End) {
619+
const dependency_directives_scan::Token &Tok = lexToken(First, End);
620+
if (!tok::isStringLiteral(Tok.Kind)) {
621+
if (!Tok.is(tok::eod))
622+
skipLine(First, End);
623+
return std::nullopt;
624+
}
625+
626+
return cleanStringIfNeeded(Tok);
627+
}
628+
575629
bool Scanner::lexAt(const char *&First, const char *const End) {
576630
// Handle "@import".
577631

@@ -629,6 +683,41 @@ bool Scanner::lexModule(const char *&First, const char *const End) {
629683
return lexModuleDirectiveBody(Kind, First, End);
630684
}
631685

686+
bool Scanner::lex_Pragma(const char *&First, const char *const End) {
687+
if (!isNextTokenOrSkipLine(tok::l_paren, First, End))
688+
return false;
689+
690+
std::optional<StringRef> Str = tryLexStringLiteralOrSkipLine(First, End);
691+
692+
if (!Str || !isNextTokenOrSkipLine(tok::r_paren, First, End))
693+
return false;
694+
695+
SmallString<64> Buffer(*Str);
696+
prepare_PragmaString(Buffer);
697+
698+
// Use a new scanner instance since the tokens will be inside the allocated
699+
// string. We should already have captured all the relevant tokens in the
700+
// current scanner.
701+
SmallVector<dependency_directives_scan::Token> DiscardTokens;
702+
const char *Begin = Buffer.c_str();
703+
Scanner PragmaScanner{StringRef(Begin, Buffer.size()), DiscardTokens, Diags,
704+
InputSourceLoc};
705+
706+
PragmaScanner.TheLexer.setParsingPreprocessorDirective(true);
707+
if (PragmaScanner.lexPragma(Begin, Buffer.end()))
708+
return true;
709+
710+
DirectiveKind K = PragmaScanner.topDirective();
711+
if (K == pp_none) {
712+
skipLine(First, End);
713+
return false;
714+
}
715+
716+
assert(Begin == Buffer.end());
717+
pushDirective(K);
718+
return false;
719+
}
720+
632721
bool Scanner::lexPragma(const char *&First, const char *const End) {
633722
std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);
634723
if (!FoundId)
@@ -713,6 +802,7 @@ static bool isStartOfRelevantLine(char First) {
713802
case 'i':
714803
case 'e':
715804
case 'm':
805+
case '_':
716806
return true;
717807
}
718808
return false;
@@ -749,6 +839,12 @@ bool Scanner::lexPPLine(const char *&First, const char *const End) {
749839
if (*First == 'i' || *First == 'e' || *First == 'm')
750840
return lexModule(First, End);
751841

842+
if (*First == '_') {
843+
if (isNextIdentifierOrSkipLine("_Pragma", First, End))
844+
return lex_Pragma(First, End);
845+
return false;
846+
}
847+
752848
// Handle preprocessing directives.
753849

754850
TheLexer.setParsingPreprocessorDirective(true);

clang/lib/Lex/Pragma.cpp

Lines changed: 40 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -262,17 +262,48 @@ void Preprocessor::Handle_Pragma(Token &Tok) {
262262

263263
SourceLocation RParenLoc = Tok.getLocation();
264264
bool Invalid = false;
265-
std::string StrVal = getSpelling(StrTok, &Invalid);
265+
SmallString<64> StrVal;
266+
StrVal.resize(StrTok.getLength());
267+
StringRef StrValRef = getSpelling(StrTok, StrVal, &Invalid);
266268
if (Invalid) {
267269
Diag(PragmaLoc, diag::err__Pragma_malformed);
268270
return;
269271
}
270272

271-
// The _Pragma is lexically sound. Destringize according to C11 6.10.9.1:
272-
// "The string literal is destringized by deleting any encoding prefix,
273-
// deleting the leading and trailing double-quotes, replacing each escape
274-
// sequence \" by a double-quote, and replacing each escape sequence \\ by a
275-
// single backslash."
273+
assert(StrValRef.size() <= StrVal.size());
274+
275+
// If the token was spelled somewhere else, copy it.
276+
if (StrValRef.begin() != StrVal.begin())
277+
StrVal.assign(StrValRef);
278+
// Truncate if necessary.
279+
else if (StrValRef.size() != StrVal.size())
280+
StrVal.resize(StrValRef.size());
281+
282+
// The _Pragma is lexically sound. Destringize according to C11 6.10.9.1.
283+
prepare_PragmaString(StrVal);
284+
285+
// Plop the string (including the newline and trailing null) into a buffer
286+
// where we can lex it.
287+
Token TmpTok;
288+
TmpTok.startToken();
289+
CreateString(StrVal, TmpTok);
290+
SourceLocation TokLoc = TmpTok.getLocation();
291+
292+
// Make and enter a lexer object so that we lex and expand the tokens just
293+
// like any others.
294+
Lexer *TL = Lexer::Create_PragmaLexer(TokLoc, PragmaLoc, RParenLoc,
295+
StrVal.size(), *this);
296+
297+
EnterSourceFileWithLexer(TL, nullptr);
298+
299+
// With everything set up, lex this as a #pragma directive.
300+
HandlePragmaDirective({PIK__Pragma, PragmaLoc});
301+
302+
// Finally, return whatever came after the pragma directive.
303+
return Lex(Tok);
304+
}
305+
306+
void clang::prepare_PragmaString(SmallVectorImpl<char> &StrVal) {
276307
if (StrVal[0] == 'L' || StrVal[0] == 'U' ||
277308
(StrVal[0] == 'u' && StrVal[1] != '8'))
278309
StrVal.erase(StrVal.begin());
@@ -296,8 +327,8 @@ void Preprocessor::Handle_Pragma(Token &Tok) {
296327

297328
// Remove 'R " d-char-sequence' and 'd-char-sequence "'. We'll replace the
298329
// parens below.
299-
StrVal.erase(0, 2 + NumDChars);
300-
StrVal.erase(StrVal.size() - 1 - NumDChars);
330+
StrVal.erase(StrVal.begin(), StrVal.begin() + 2 + NumDChars);
331+
StrVal.erase(StrVal.end() - 1 - NumDChars, StrVal.end());
301332
} else {
302333
assert(StrVal[0] == '"' && StrVal[StrVal.size()-1] == '"' &&
303334
"Invalid string token!");
@@ -319,27 +350,7 @@ void Preprocessor::Handle_Pragma(Token &Tok) {
319350
StrVal[0] = ' ';
320351

321352
// Replace the terminating quote with a \n.
322-
StrVal[StrVal.size()-1] = '\n';
323-
324-
// Plop the string (including the newline and trailing null) into a buffer
325-
// where we can lex it.
326-
Token TmpTok;
327-
TmpTok.startToken();
328-
CreateString(StrVal, TmpTok);
329-
SourceLocation TokLoc = TmpTok.getLocation();
330-
331-
// Make and enter a lexer object so that we lex and expand the tokens just
332-
// like any others.
333-
Lexer *TL = Lexer::Create_PragmaLexer(TokLoc, PragmaLoc, RParenLoc,
334-
StrVal.size(), *this);
335-
336-
EnterSourceFileWithLexer(TL, nullptr);
337-
338-
// With everything set up, lex this as a #pragma directive.
339-
HandlePragmaDirective({PIK__Pragma, PragmaLoc});
340-
341-
// Finally, return whatever came after the pragma directive.
342-
return Lex(Tok);
353+
StrVal[StrVal.size() - 1] = '\n';
343354
}
344355

345356
/// HandleMicrosoft__pragma - Like Handle_Pragma except the pragma text
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
// Test scanning deps works with _Pragma syntax when not inside a macro.
2+
3+
// RUN: rm -rf %t
4+
// RUN: split-file %s %t
5+
// RUN: sed "s|DIR|%/t|g" %t/cdb.json.template > %t/cdb.json
6+
7+
// RUN: clang-scan-deps -compilation-database %t/cdb.json -j 1
8+
9+
//--- cdb.json.template
10+
[{
11+
"directory": "DIR",
12+
"command": "clang -fsyntax-only DIR/tu.c",
13+
"file": "DIR/tu.c"
14+
}]
15+
16+
//--- a.h
17+
_Pragma("once")
18+
#include "b.h"
19+
20+
//--- b.h
21+
#include "a.h"
22+
23+
//--- tu.c
24+
#include "a.h"

0 commit comments

Comments
 (0)