Skip to content

Commit ee8ed0b

Browse files
committed
[clang][deps] Teach dep directive scanner about _Pragma
While we cannot handle `_Pragma` used inside macros, we can handle it at the top level, and some projects use the `_Pragma("once")` spelling like that, which was causing spurious failures in the scanner. Limitations: * Cannot handle #define ONCE _Pragma("once"); this is the same issue as using @import in a macro -- ideally we should diagnose this in obvious cases. * Our LangOpts are currently fixed, so we are not handling u"" strings or R"()" strings that require C11/C++11. rdar://108629982 Differential Revision: https://reviews.llvm.org/D149884
1 parent ec77d1f commit ee8ed0b

File tree

5 files changed

+271
-41
lines changed

5 files changed

+271
-41
lines changed

clang/include/clang/Lex/Pragma.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,13 @@ class PragmaNamespace : public PragmaHandler {
123123
PragmaNamespace *getIfNamespace() override { return this; }
124124
};
125125

126+
/// Destringize a \c _Pragma("") string according to C11 6.10.9.1:
127+
/// "The string literal is destringized by deleting any encoding prefix,
128+
/// deleting the leading and trailing double-quotes, replacing each escape
129+
/// sequence \" by a double-quote, and replacing each escape sequence \\ by a
130+
/// single backslash."
131+
void prepare_PragmaString(SmallVectorImpl<char> &StrVal);
132+
126133
} // namespace clang
127134

128135
#endif // LLVM_CLANG_LEX_PRAGMA_H

clang/lib/Lex/DependencyDirectivesScanner.cpp

Lines changed: 105 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "clang/Basic/Diagnostic.h"
2020
#include "clang/Lex/LexDiagnostic.h"
2121
#include "clang/Lex/Lexer.h"
22+
#include "clang/Lex/Pragma.h"
2223
#include "llvm/ADT/ScopeExit.h"
2324
#include "llvm/ADT/SmallString.h"
2425
#include "llvm/ADT/StringMap.h"
@@ -72,6 +73,8 @@ struct Scanner {
7273
// Set the lexer to use 'tok::at' for '@', instead of 'tok::unknown'.
7374
LangOpts.ObjC = true;
7475
LangOpts.LineComment = true;
76+
// FIXME: we do not enable C11 or C++11, so we are missing u/u8/U"" and
77+
// R"()" literals.
7578
return LangOpts;
7679
}
7780

@@ -91,6 +94,10 @@ struct Scanner {
9194
void skipLine(const char *&First, const char *const End);
9295
void skipDirective(StringRef Name, const char *&First, const char *const End);
9396

97+
/// Returns the spelling of a string literal or identifier after performing
98+
/// any processing needed to handle \c clang::Token::NeedsCleaning.
99+
StringRef cleanStringIfNeeded(const dependency_directives_scan::Token &Tok);
100+
94101
/// Lexes next token and if it is identifier returns its string, otherwise
95102
/// it skips the current line and returns \p std::nullopt.
96103
///
@@ -112,13 +119,30 @@ struct Scanner {
112119
const char *&First,
113120
const char *const End);
114121

122+
/// Lexes next token and returns true iff it matches the kind \p K.
123+
/// Otherwise it skips the current line and returns false.
124+
///
125+
/// In any case (whatever the token kind) \p First and the \p Lexer will
126+
/// advance beyond the token.
127+
[[nodiscard]] bool isNextTokenOrSkipLine(tok::TokenKind K, const char *&First,
128+
const char *const End);
129+
130+
/// Lexes next token and if it is string literal, returns its string.
131+
/// Otherwise, it skips the current line and returns \p std::nullopt.
132+
///
133+
/// In any case (whatever the token kind) \p First and the \p Lexer will
134+
/// advance beyond the token.
135+
[[nodiscard]] std::optional<StringRef>
136+
tryLexStringLiteralOrSkipLine(const char *&First, const char *const End);
137+
115138
[[nodiscard]] bool scanImpl(const char *First, const char *const End);
116139
[[nodiscard]] bool lexPPLine(const char *&First, const char *const End);
117140
[[nodiscard]] bool lexAt(const char *&First, const char *const End);
118141
[[nodiscard]] bool lexModule(const char *&First, const char *const End);
119142
[[nodiscard]] bool lexDefine(const char *HashLoc, const char *&First,
120143
const char *const End);
121144
[[nodiscard]] bool lexPragma(const char *&First, const char *const End);
145+
[[nodiscard]] bool lex_Pragma(const char *&First, const char *const End);
122146
[[nodiscard]] bool lexEndif(const char *&First, const char *const End);
123147
[[nodiscard]] bool lexDefault(DirectiveKind Kind, const char *&First,
124148
const char *const End);
@@ -525,22 +549,18 @@ void Scanner::lexPPDirectiveBody(const char *&First, const char *const End) {
525549
}
526550
}
527551

528-
[[nodiscard]] std::optional<StringRef>
529-
Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) {
530-
const dependency_directives_scan::Token &Tok = lexToken(First, End);
531-
if (Tok.isNot(tok::raw_identifier)) {
532-
if (!Tok.is(tok::eod))
533-
skipLine(First, End);
534-
return std::nullopt;
535-
}
536-
552+
StringRef
553+
Scanner::cleanStringIfNeeded(const dependency_directives_scan::Token &Tok) {
537554
bool NeedsCleaning = Tok.Flags & clang::Token::NeedsCleaning;
538555
if (LLVM_LIKELY(!NeedsCleaning))
539556
return Input.slice(Tok.Offset, Tok.getEnd());
540557

541558
SmallString<64> Spelling;
542559
Spelling.resize(Tok.Length);
543560

561+
// FIXME: C++11 raw string literals need special handling (see getSpellingSlow
562+
// in the Lexer). Currently we cannot see them due to our LangOpts.
563+
544564
unsigned SpellingLength = 0;
545565
const char *BufPtr = Input.begin() + Tok.Offset;
546566
const char *AfterIdent = Input.begin() + Tok.getEnd();
@@ -555,6 +575,18 @@ Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) {
555575
.first->first();
556576
}
557577

578+
std::optional<StringRef>
579+
Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) {
580+
const dependency_directives_scan::Token &Tok = lexToken(First, End);
581+
if (Tok.isNot(tok::raw_identifier)) {
582+
if (!Tok.is(tok::eod))
583+
skipLine(First, End);
584+
return std::nullopt;
585+
}
586+
587+
return cleanStringIfNeeded(Tok);
588+
}
589+
558590
StringRef Scanner::lexIdentifier(const char *&First, const char *const End) {
559591
std::optional<StringRef> Id = tryLexIdentifierOrSkipLine(First, End);
560592
assert(Id && "expected identifier token");
@@ -572,6 +604,28 @@ bool Scanner::isNextIdentifierOrSkipLine(StringRef Id, const char *&First,
572604
return false;
573605
}
574606

607+
bool Scanner::isNextTokenOrSkipLine(tok::TokenKind K, const char *&First,
608+
const char *const End) {
609+
const dependency_directives_scan::Token &Tok = lexToken(First, End);
610+
if (Tok.is(K))
611+
return true;
612+
skipLine(First, End);
613+
return false;
614+
}
615+
616+
std::optional<StringRef>
617+
Scanner::tryLexStringLiteralOrSkipLine(const char *&First,
618+
const char *const End) {
619+
const dependency_directives_scan::Token &Tok = lexToken(First, End);
620+
if (!tok::isStringLiteral(Tok.Kind)) {
621+
if (!Tok.is(tok::eod))
622+
skipLine(First, End);
623+
return std::nullopt;
624+
}
625+
626+
return cleanStringIfNeeded(Tok);
627+
}
628+
575629
bool Scanner::lexAt(const char *&First, const char *const End) {
576630
// Handle "@import".
577631

@@ -629,6 +683,41 @@ bool Scanner::lexModule(const char *&First, const char *const End) {
629683
return lexModuleDirectiveBody(Kind, First, End);
630684
}
631685

686+
bool Scanner::lex_Pragma(const char *&First, const char *const End) {
687+
if (!isNextTokenOrSkipLine(tok::l_paren, First, End))
688+
return false;
689+
690+
std::optional<StringRef> Str = tryLexStringLiteralOrSkipLine(First, End);
691+
692+
if (!Str || !isNextTokenOrSkipLine(tok::r_paren, First, End))
693+
return false;
694+
695+
SmallString<64> Buffer(*Str);
696+
prepare_PragmaString(Buffer);
697+
698+
// Use a new scanner instance since the tokens will be inside the allocated
699+
// string. We should already have captured all the relevant tokens in the
700+
// current scanner.
701+
SmallVector<dependency_directives_scan::Token> DiscardTokens;
702+
const char *Begin = Buffer.c_str();
703+
Scanner PragmaScanner{StringRef(Begin, Buffer.size()), DiscardTokens, Diags,
704+
InputSourceLoc};
705+
706+
PragmaScanner.TheLexer.setParsingPreprocessorDirective(true);
707+
if (PragmaScanner.lexPragma(Begin, Buffer.end()))
708+
return true;
709+
710+
DirectiveKind K = PragmaScanner.topDirective();
711+
if (K == pp_none) {
712+
skipLine(First, End);
713+
return false;
714+
}
715+
716+
assert(Begin == Buffer.end());
717+
pushDirective(K);
718+
return false;
719+
}
720+
632721
bool Scanner::lexPragma(const char *&First, const char *const End) {
633722
std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);
634723
if (!FoundId)
@@ -713,6 +802,7 @@ static bool isStartOfRelevantLine(char First) {
713802
case 'i':
714803
case 'e':
715804
case 'm':
805+
case '_':
716806
return true;
717807
}
718808
return false;
@@ -749,6 +839,12 @@ bool Scanner::lexPPLine(const char *&First, const char *const End) {
749839
if (*First == 'i' || *First == 'e' || *First == 'm')
750840
return lexModule(First, End);
751841

842+
if (*First == '_') {
843+
if (isNextIdentifierOrSkipLine("_Pragma", First, End))
844+
return lex_Pragma(First, End);
845+
return false;
846+
}
847+
752848
// Handle preprocessing directives.
753849

754850
TheLexer.setParsingPreprocessorDirective(true);

clang/lib/Lex/Pragma.cpp

Lines changed: 40 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -262,17 +262,48 @@ void Preprocessor::Handle_Pragma(Token &Tok) {
262262

263263
SourceLocation RParenLoc = Tok.getLocation();
264264
bool Invalid = false;
265-
std::string StrVal = getSpelling(StrTok, &Invalid);
265+
SmallString<64> StrVal;
266+
StrVal.resize(StrTok.getLength());
267+
StringRef StrValRef = getSpelling(StrTok, StrVal, &Invalid);
266268
if (Invalid) {
267269
Diag(PragmaLoc, diag::err__Pragma_malformed);
268270
return;
269271
}
270272

271-
// The _Pragma is lexically sound. Destringize according to C11 6.10.9.1:
272-
// "The string literal is destringized by deleting any encoding prefix,
273-
// deleting the leading and trailing double-quotes, replacing each escape
274-
// sequence \" by a double-quote, and replacing each escape sequence \\ by a
275-
// single backslash."
273+
assert(StrValRef.size() <= StrVal.size());
274+
275+
// If the token was spelled somewhere else, copy it.
276+
if (StrValRef.begin() != StrVal.begin())
277+
StrVal.assign(StrValRef);
278+
// Truncate if necessary.
279+
else if (StrValRef.size() != StrVal.size())
280+
StrVal.resize(StrValRef.size());
281+
282+
// The _Pragma is lexically sound. Destringize according to C11 6.10.9.1.
283+
prepare_PragmaString(StrVal);
284+
285+
// Plop the string (including the newline and trailing null) into a buffer
286+
// where we can lex it.
287+
Token TmpTok;
288+
TmpTok.startToken();
289+
CreateString(StrVal, TmpTok);
290+
SourceLocation TokLoc = TmpTok.getLocation();
291+
292+
// Make and enter a lexer object so that we lex and expand the tokens just
293+
// like any others.
294+
Lexer *TL = Lexer::Create_PragmaLexer(TokLoc, PragmaLoc, RParenLoc,
295+
StrVal.size(), *this);
296+
297+
EnterSourceFileWithLexer(TL, nullptr);
298+
299+
// With everything set up, lex this as a #pragma directive.
300+
HandlePragmaDirective({PIK__Pragma, PragmaLoc});
301+
302+
// Finally, return whatever came after the pragma directive.
303+
return Lex(Tok);
304+
}
305+
306+
void clang::prepare_PragmaString(SmallVectorImpl<char> &StrVal) {
276307
if (StrVal[0] == 'L' || StrVal[0] == 'U' ||
277308
(StrVal[0] == 'u' && StrVal[1] != '8'))
278309
StrVal.erase(StrVal.begin());
@@ -296,8 +327,8 @@ void Preprocessor::Handle_Pragma(Token &Tok) {
296327

297328
// Remove 'R " d-char-sequence' and 'd-char-sequence "'. We'll replace the
298329
// parens below.
299-
StrVal.erase(0, 2 + NumDChars);
300-
StrVal.erase(StrVal.size() - 1 - NumDChars);
330+
StrVal.erase(StrVal.begin(), StrVal.begin() + 2 + NumDChars);
331+
StrVal.erase(StrVal.end() - 1 - NumDChars, StrVal.end());
301332
} else {
302333
assert(StrVal[0] == '"' && StrVal[StrVal.size()-1] == '"' &&
303334
"Invalid string token!");
@@ -319,27 +350,7 @@ void Preprocessor::Handle_Pragma(Token &Tok) {
319350
StrVal[0] = ' ';
320351

321352
// Replace the terminating quote with a \n.
322-
StrVal[StrVal.size()-1] = '\n';
323-
324-
// Plop the string (including the newline and trailing null) into a buffer
325-
// where we can lex it.
326-
Token TmpTok;
327-
TmpTok.startToken();
328-
CreateString(StrVal, TmpTok);
329-
SourceLocation TokLoc = TmpTok.getLocation();
330-
331-
// Make and enter a lexer object so that we lex and expand the tokens just
332-
// like any others.
333-
Lexer *TL = Lexer::Create_PragmaLexer(TokLoc, PragmaLoc, RParenLoc,
334-
StrVal.size(), *this);
335-
336-
EnterSourceFileWithLexer(TL, nullptr);
337-
338-
// With everything set up, lex this as a #pragma directive.
339-
HandlePragmaDirective({PIK__Pragma, PragmaLoc});
340-
341-
// Finally, return whatever came after the pragma directive.
342-
return Lex(Tok);
353+
StrVal[StrVal.size() - 1] = '\n';
343354
}
344355

345356
/// HandleMicrosoft__pragma - Like Handle_Pragma except the pragma text
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
// Test scanning deps works with _Pragma syntax when not inside a macro.
2+
3+
// RUN: rm -rf %t
4+
// RUN: split-file %s %t
5+
// RUN: sed "s|DIR|%/t|g" %t/cdb.json.template > %t/cdb.json
6+
7+
// RUN: clang-scan-deps -compilation-database %t/cdb.json -j 1
8+
9+
//--- cdb.json.template
10+
[{
11+
"directory": "DIR",
12+
"command": "clang -fsyntax-only DIR/tu.c",
13+
"file": "DIR/tu.c"
14+
}]
15+
16+
//--- a.h
17+
_Pragma("once")
18+
#include "b.h"
19+
20+
//--- b.h
21+
#include "a.h"
22+
23+
//--- tu.c
24+
#include "a.h"

0 commit comments

Comments
 (0)