|
10 | 10 | //
|
11 | 11 | //===----------------------------------------------------------------------===//
|
12 | 12 | #include "clang/Tooling/Transformer/SourceCode.h"
|
| 13 | +#include "clang/AST/ASTContext.h" |
| 14 | +#include "clang/AST/Attr.h" |
| 15 | +#include "clang/AST/Comment.h" |
| 16 | +#include "clang/AST/Decl.h" |
| 17 | +#include "clang/AST/DeclCXX.h" |
| 18 | +#include "clang/AST/DeclTemplate.h" |
| 19 | +#include "clang/AST/Expr.h" |
13 | 20 | #include "clang/Lex/Lexer.h"
|
14 | 21 | #include "llvm/Support/Errc.h"
|
15 | 22 |
|
@@ -84,3 +91,302 @@ clang::tooling::getRangeForEdit(const CharSourceRange &EditRange,
|
84 | 91 | return Range;
|
85 | 92 |
|
86 | 93 | }
|
| 94 | + |
| 95 | +static bool startsWithNewline(const SourceManager &SM, const Token &Tok) { |
| 96 | + return isVerticalWhitespace(SM.getCharacterData(Tok.getLocation())[0]); |
| 97 | +} |
| 98 | + |
| 99 | +static bool contains(const std::set<tok::TokenKind> &Terminators, |
| 100 | + const Token &Tok) { |
| 101 | + return Terminators.count(Tok.getKind()) > 0; |
| 102 | +} |
| 103 | + |
// Returns the exclusive, *file* end location of the entity whose last token is
// at location 'EntityLast'. That is, it returns the location one past the last
// relevant character.
//
// Associated tokens include comments, horizontal whitespace and 'Terminators'
// -- optional tokens, which, if any are found, will be included; if
// 'Terminators' is empty, we will not include any extra tokens beyond comments
// and horizontal whitespace.
static SourceLocation
getEntityEndLoc(const SourceManager &SM, SourceLocation EntityLast,
                const std::set<tok::TokenKind> &Terminators,
                const LangOptions &LangOpts) {
  assert(EntityLast.isValid() && "Invalid end location found.");

  // We remember the last location of a non-horizontal-whitespace token we have
  // lexed; this is the location up to which we will want to delete.
  // FIXME: Support using the spelling loc here for cases where we want to
  // analyze the macro text.

  CharSourceRange ExpansionRange = SM.getExpansionRange(EntityLast);
  // FIXME: Should check isTokenRange(), for the (rare) case that
  // `ExpansionRange` is a character range.
  //
  // Build a raw lexer positioned at the *start* of the entity's last token
  // (the expansion end), so the first LexFromRawLexer call below re-lexes
  // that token. NOTE: the local `Lexer` shadows the class name, hence the
  // explicit `clang::Lexer` qualification inside the lambda.
  std::unique_ptr<Lexer> Lexer = [&]() {
    bool Invalid = false;
    auto FileOffset = SM.getDecomposedLoc(ExpansionRange.getEnd());
    llvm::StringRef File = SM.getBufferData(FileOffset.first, &Invalid);
    assert(!Invalid && "Cannot get file/offset");
    return std::make_unique<clang::Lexer>(
        SM.getLocForStartOfFile(FileOffset.first), LangOpts, File.begin(),
        File.data() + FileOffset.second, File.end());
  }();

  // Tell Lexer to return whitespace as pseudo-tokens (kind is tok::unknown).
  Lexer->SetKeepWhitespaceMode(true);

  // Generally, the code we want to include looks like this ([] are optional),
  // If Terminators is empty:
  //   [ <comment> ] [ <newline> ]
  // Otherwise:
  //   ... <terminator> [ <comment> ] [ <newline> ]

  Token Tok;
  bool Terminated = false;

  // First, lex to the current token (which is the last token of the range that
  // is definitely associated with the decl).  Then, we process the first token
  // separately from the rest based on conditions that hold specifically for
  // that first token.
  //
  // We do not search for a terminator if none is required or we've already
  // encountered it. Otherwise, if the original `EntityLast` location was in a
  // macro expansion, we don't have visibility into the text, so we assume we've
  // already terminated. However, we note this assumption with
  // `TerminatedByMacro`, because we'll want to handle it somewhat differently
  // for the terminators semicolon and comma. These terminators can be safely
  // associated with the entity when they appear after the macro -- extra
  // semicolons have no effect on the program and a well-formed program won't
  // have multiple commas in a row, so we're guaranteed that there is only one.
  //
  // FIXME: This handling of macros is more conservative than necessary. When
  // the end of the expansion coincides with the end of the node, we can still
  // safely analyze the code. But, it is more complicated, because we need to
  // start by lexing the spelling loc for the first token and then switch to
  // the expansion loc.
  bool TerminatedByMacro = false;
  Lexer->LexFromRawLexer(Tok);
  if (Terminators.empty() || contains(Terminators, Tok))
    Terminated = true;
  else if (EntityLast.isMacroID()) {
    Terminated = true;
    TerminatedByMacro = true;
  }

  // We save the most recent candidate for the exclusive end location.
  SourceLocation End = Tok.getEndLoc();

  // Phase 1: scan forward until a terminator is found (skipped entirely when
  // we are already `Terminated`). Structural tokens that could not belong to
  // the entity end the search immediately at the last good location.
  while (!Terminated) {
    // Lex the next token we want to possibly expand the range with.
    Lexer->LexFromRawLexer(Tok);

    switch (Tok.getKind()) {
    case tok::eof:
    // Unexpected separators.
    case tok::l_brace:
    case tok::r_brace:
    case tok::comma:
      return End;
    // Whitespace pseudo-tokens.
    case tok::unknown:
      if (startsWithNewline(SM, Tok))
        // Include at least until the end of the line.
        End = Tok.getEndLoc();
      break;
    default:
      if (contains(Terminators, Tok))
        Terminated = true;
      End = Tok.getEndLoc();
      break;
    }
  }

  // Phase 2: past the terminator, absorb trailing same-line comments, a late
  // real terminator after a macro expansion, and finally the newline that
  // ends the entity's line.
  do {
    // Lex the next token we want to possibly expand the range with.
    Lexer->LexFromRawLexer(Tok);

    switch (Tok.getKind()) {
    case tok::unknown:
      if (startsWithNewline(SM, Tok))
        // We're done, but include this newline.
        return Tok.getEndLoc();
      break;
    case tok::comment:
      // Include any comments we find on the way.
      End = Tok.getEndLoc();
      break;
    case tok::semi:
    case tok::comma:
      if (TerminatedByMacro && contains(Terminators, Tok)) {
        End = Tok.getEndLoc();
        // We've found a real terminator.
        TerminatedByMacro = false;
        break;
      }
      // Found an unrelated token; stop and don't include it.
      return End;
    default:
      // Found an unrelated token; stop and don't include it.
      return End;
    }
  } while (true);
}
| 235 | + |
| 236 | +// Returns the expected terminator tokens for the given declaration. |
| 237 | +// |
| 238 | +// If we do not know the correct terminator token, returns an empty set. |
| 239 | +// |
| 240 | +// There are cases where we have more than one possible terminator (for example, |
| 241 | +// we find either a comma or a semicolon after a VarDecl). |
| 242 | +static std::set<tok::TokenKind> getTerminators(const Decl &D) { |
| 243 | + if (llvm::isa<RecordDecl>(D) || llvm::isa<UsingDecl>(D)) |
| 244 | + return {tok::semi}; |
| 245 | + |
| 246 | + if (llvm::isa<FunctionDecl>(D) || llvm::isa<LinkageSpecDecl>(D)) |
| 247 | + return {tok::r_brace, tok::semi}; |
| 248 | + |
| 249 | + if (llvm::isa<VarDecl>(D) || llvm::isa<FieldDecl>(D)) |
| 250 | + return {tok::comma, tok::semi}; |
| 251 | + |
| 252 | + return {}; |
| 253 | +} |
| 254 | + |
| 255 | +// Starting from `Loc`, skips whitespace up to, and including, a single |
| 256 | +// newline. Returns the (exclusive) end of any skipped whitespace (that is, the |
| 257 | +// location immediately after the whitespace). |
| 258 | +static SourceLocation skipWhitespaceAndNewline(const SourceManager &SM, |
| 259 | + SourceLocation Loc, |
| 260 | + const LangOptions &LangOpts) { |
| 261 | + const char *LocChars = SM.getCharacterData(Loc); |
| 262 | + int i = 0; |
| 263 | + while (isHorizontalWhitespace(LocChars[i])) |
| 264 | + ++i; |
| 265 | + if (isVerticalWhitespace(LocChars[i])) |
| 266 | + ++i; |
| 267 | + return Loc.getLocWithOffset(i); |
| 268 | +} |
| 269 | + |
| 270 | +// Is `Loc` separated from any following decl by something meaningful (e.g. an |
| 271 | +// empty line, a comment), ignoring horizontal whitespace? Since this is a |
| 272 | +// heuristic, we return false when in doubt. `Loc` cannot be the first location |
| 273 | +// in the file. |
| 274 | +static bool atOrBeforeSeparation(const SourceManager &SM, SourceLocation Loc, |
| 275 | + const LangOptions &LangOpts) { |
| 276 | + // If the preceding character is a newline, we'll check for an empty line as a |
| 277 | + // separator. However, we can't identify an empty line using tokens, so we |
| 278 | + // analyse the characters. If we try to use tokens, we'll just end up with a |
| 279 | + // whitespace token, whose characters we'd have to analyse anyhow. |
| 280 | + bool Invalid = false; |
| 281 | + const char *LocChars = |
| 282 | + SM.getCharacterData(Loc.getLocWithOffset(-1), &Invalid); |
| 283 | + assert(!Invalid && |
| 284 | + "Loc must be a valid character and not the first of the source file."); |
| 285 | + if (isVerticalWhitespace(LocChars[0])) { |
| 286 | + for (int i = 1; isWhitespace(LocChars[i]); ++i) |
| 287 | + if (isVerticalWhitespace(LocChars[i])) |
| 288 | + return true; |
| 289 | + } |
| 290 | + // We didn't find an empty line, so lex the next token, skipping past any |
| 291 | + // whitespace we just scanned. |
| 292 | + Token Tok; |
| 293 | + bool Failed = Lexer::getRawToken(Loc, Tok, SM, LangOpts, |
| 294 | + /*IgnoreWhiteSpace=*/true); |
| 295 | + if (Failed) |
| 296 | + // Any text that confuses the lexer seems fair to consider a separation. |
| 297 | + return true; |
| 298 | + |
| 299 | + switch (Tok.getKind()) { |
| 300 | + case tok::comment: |
| 301 | + case tok::l_brace: |
| 302 | + case tok::r_brace: |
| 303 | + case tok::eof: |
| 304 | + return true; |
| 305 | + default: |
| 306 | + return false; |
| 307 | + } |
| 308 | +} |
| 309 | + |
// Returns the "associated" source range for `Decl`: the decl's own range
// widened to also cover (a) the `template<...>` header of a described class
// or function template, (b) leading attributes together with their `[[` /
// `__attribute__((` prefixes, (c) a preceding doc comment attached to the
// decl, and (d) the trailing terminator token, trailing comments, and the
// newline ending the decl's line. The result is a file char range.
CharSourceRange tooling::getAssociatedRange(const Decl &Decl,
                                            ASTContext &Context) {
  const SourceManager &SM = Context.getSourceManager();
  const LangOptions &LangOpts = Context.getLangOpts();
  CharSourceRange Range = CharSourceRange::getTokenRange(Decl.getSourceRange());

  // First, expand to the start of the template<> declaration if necessary.
  if (const auto *Record = llvm::dyn_cast<CXXRecordDecl>(&Decl)) {
    if (const auto *T = Record->getDescribedClassTemplate())
      if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
        Range.setBegin(T->getBeginLoc());
  } else if (const auto *F = llvm::dyn_cast<FunctionDecl>(&Decl)) {
    if (const auto *T = F->getDescribedFunctionTemplate())
      if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
        Range.setBegin(T->getBeginLoc());
  }

  // Next, expand the end location past trailing comments to include a
  // potential newline at the end of the decl's line. From here on the range
  // is a char range, not a token range.
  Range.setEnd(
      getEntityEndLoc(SM, Decl.getEndLoc(), getTerminators(Decl), LangOpts));
  Range.setTokenRange(false);

  // Expand to include preceding associated comments. We ignore any comments
  // that are not preceding the decl, since we've already skipped trailing
  // comments with getEntityEndLoc.
  if (const RawComment *Comment =
          Decl.getASTContext().getRawCommentForDeclNoCache(&Decl))
    // Only include a preceding comment if:
    // * it is *not* separate from the declaration (not including any newline
    //   that immediately follows the comment),
    // * the decl *is* separate from any following entity (so, there are no
    //   other entities the comment could refer to), and
    // * it is not a IfThisThenThat lint check.
    if (SM.isBeforeInTranslationUnit(Comment->getBeginLoc(),
                                     Range.getBegin()) &&
        !atOrBeforeSeparation(
            SM, skipWhitespaceAndNewline(SM, Comment->getEndLoc(), LangOpts),
            LangOpts) &&
        atOrBeforeSeparation(SM, Range.getEnd(), LangOpts)) {
      const StringRef CommentText = Comment->getRawText(SM);
      if (!CommentText.contains("LINT.IfChange") &&
          !CommentText.contains("LINT.ThenChange"))
        Range.setBegin(Comment->getBeginLoc());
    }
  // Add leading attributes.
  for (auto *Attr : Decl.attrs()) {
    if (Attr->getLocation().isInvalid() ||
        !SM.isBeforeInTranslationUnit(Attr->getLocation(), Range.getBegin()))
      continue;
    Range.setBegin(Attr->getLocation());

    // Extend to the left '[[' or '__attribute((' if we saw the attribute,
    // unless it is not a valid location.
    bool Invalid;
    StringRef Source =
        SM.getBufferData(SM.getFileID(Range.getBegin()), &Invalid);
    if (Invalid)
      continue;
    llvm::StringRef BeforeAttr =
        Source.substr(0, SM.getFileOffset(Range.getBegin()));
    llvm::StringRef BeforeAttrStripped = BeforeAttr.rtrim();

    for (llvm::StringRef Prefix : {"[[", "__attribute__(("}) {
      // Handle whitespace between attribute prefix and attribute value.
      if (BeforeAttrStripped.endswith(Prefix)) {
        // Move start to start position of prefix, which is
        // length(BeforeAttr) - length(BeforeAttrStripped) + length(Prefix)
        // positions to the left.
        Range.setBegin(Range.getBegin().getLocWithOffset(static_cast<int>(
            -BeforeAttr.size() + BeforeAttrStripped.size() - Prefix.size())));
        break;
        // If we didn't see '[[' or '__attribute' it's probably coming from a
        // macro expansion which is already handled by makeFileCharRange(),
        // below.
      }
    }
  }

  // Range.getEnd() is already fully un-expanded by getEntityEndLoc. But,
  // Range.getBegin() may be inside an expansion.
  return Lexer::makeFileCharRange(Range, SM, LangOpts);
}
0 commit comments