Skip to content

Commit 001e88a

Browse files
committed
[clangd] Performance improvements and cleanup
- Inline SymbolID hashing to header - Don't collect references for symbols without a SymbolID - Store referenced symbols, rather than separately storing decls and macros. - Don't defer ref collection to end of translation unit - Perform const_cast when updating reference counts (~0.5% saving) - Introduce caching for getSymbolID in SymbolCollector. (~30% saving) - Don't modify symbolslab if there's no definition location - Don't lex the whole file to deduce spelled tokens, just lex the relevant piece (~8%) Overall this achieves ~38% reduction in time spent inside SymbolCollector compared to baseline (on my machine :)). I'd expect the last optimization to affect dynamic index a lot more, I was testing with clangd-indexer on clangd subfolder of LLVM. As clangd-indexer runs indexing of whole TU at once, we indeed see almost every token from every source included in the TU (hence lexing full files vs just lexing referenced tokens are almost the same), whereas during dynamic indexing we mostly index main file symbols, but we would touch the files defining/declaring those symbols, and lex complete files for nothing, rather than just the token location. The last optimization is also a functional change (added test), previously we used raw tokens from syntax::tokenize, which didn't canonicalize trigraphs/newlines in identifiers, wheres Lexer::getSpelling canonicalizes them. Differential Revision: https://reviews.llvm.org/D122894
1 parent 5ef0ed7 commit 001e88a

File tree

5 files changed

+134
-119
lines changed

5 files changed

+134
-119
lines changed

clang-tools-extra/clangd/index/SymbolCollector.cpp

Lines changed: 92 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,14 @@
2121
#include "clang/AST/DeclBase.h"
2222
#include "clang/AST/DeclObjC.h"
2323
#include "clang/AST/DeclTemplate.h"
24+
#include "clang/AST/DeclarationName.h"
25+
#include "clang/Basic/LangOptions.h"
2426
#include "clang/Basic/SourceLocation.h"
2527
#include "clang/Basic/SourceManager.h"
2628
#include "clang/Index/IndexSymbol.h"
2729
#include "clang/Lex/Preprocessor.h"
28-
#include "clang/Tooling/Syntax/Tokens.h"
30+
#include "clang/Lex/Token.h"
31+
#include "llvm/ADT/ArrayRef.h"
2932
#include "llvm/Support/Casting.h"
3033
#include "llvm/Support/FileSystem.h"
3134
#include "llvm/Support/Path.h"
@@ -171,6 +174,22 @@ const Decl *getRefContainer(const Decl *Enclosing,
171174
return Enclosing;
172175
}
173176

177+
// Check if there is an exact spelling of \p ND at \p Loc.
178+
bool isSpelled(SourceLocation Loc, const NamedDecl &ND) {
179+
auto Name = ND.getDeclName();
180+
const auto NameKind = Name.getNameKind();
181+
if (NameKind != DeclarationName::Identifier &&
182+
NameKind != DeclarationName::CXXConstructorName)
183+
return false;
184+
const auto &AST = ND.getASTContext();
185+
const auto &SM = AST.getSourceManager();
186+
const auto &LO = AST.getLangOpts();
187+
clang::Token Tok;
188+
if (clang::Lexer::getRawToken(Loc, Tok, SM, LO))
189+
return false;
190+
auto StrName = Name.getAsString();
191+
return clang::Lexer::getSpelling(Tok, SM, LO) == StrName;
192+
}
174193
} // namespace
175194

176195
// Encapsulates decisions about how to record header paths in the index,
@@ -545,17 +564,17 @@ bool SymbolCollector::handleDeclOccurrence(
545564
if (!ND)
546565
return true;
547566

567+
auto ID = getSymbolIDCached(ND);
568+
if (!ID)
569+
return true;
570+
548571
// Mark D as referenced if this is a reference coming from the main file.
549572
// D may not be an interesting symbol, but it's cheaper to check at the end.
550573
auto &SM = ASTCtx->getSourceManager();
551574
if (Opts.CountReferences &&
552575
(Roles & static_cast<unsigned>(index::SymbolRole::Reference)) &&
553576
SM.getFileID(SM.getSpellingLoc(Loc)) == SM.getMainFileID())
554-
ReferencedDecls.insert(ND);
555-
556-
auto ID = getSymbolID(ND);
557-
if (!ID)
558-
return true;
577+
ReferencedSymbols.insert(ID);
559578

560579
// ND is the canonical (i.e. first) declaration. If it's in the main file
561580
// (which is not a header), then no public declaration was visible, so assume
@@ -576,27 +595,25 @@ bool SymbolCollector::handleDeclOccurrence(
576595
processRelations(*ND, ID, Relations);
577596

578597
bool CollectRef = static_cast<bool>(Opts.RefFilter & toRefKind(Roles));
579-
bool IsOnlyRef =
580-
!(Roles & (static_cast<unsigned>(index::SymbolRole::Declaration) |
581-
static_cast<unsigned>(index::SymbolRole::Definition)));
582-
583-
if (IsOnlyRef && !CollectRef)
584-
return true;
585-
586598
// Unlike other fields, e.g. Symbols (which use spelling locations), we use
587599
// file locations for references (as it aligns the behavior of clangd's
588600
// AST-based xref).
589601
// FIXME: we should try to use the file locations for other fields.
590602
if (CollectRef &&
591603
(!IsMainFileOnly || Opts.CollectMainFileRefs ||
592604
ND->isExternallyVisible()) &&
593-
!isa<NamespaceDecl>(ND) &&
594-
(Opts.RefsInHeaders ||
595-
SM.getFileID(SM.getFileLoc(Loc)) == SM.getMainFileID()))
596-
DeclRefs[ND].push_back(SymbolRef{SM.getFileLoc(Loc), Roles,
597-
getRefContainer(ASTNode.Parent, Opts)});
605+
!isa<NamespaceDecl>(ND)) {
606+
auto FileLoc = SM.getFileLoc(Loc);
607+
auto FID = SM.getFileID(FileLoc);
608+
if (Opts.RefsInHeaders || FID == SM.getMainFileID()) {
609+
addRef(ID, SymbolRef{FileLoc, FID, Roles,
610+
getRefContainer(ASTNode.Parent, Opts),
611+
isSpelled(FileLoc, *ND)});
612+
}
613+
}
598614
// Don't continue indexing if this is a mere reference.
599-
if (IsOnlyRef)
615+
if (!(Roles & (static_cast<unsigned>(index::SymbolRole::Declaration) |
616+
static_cast<unsigned>(index::SymbolRole::Definition))))
600617
return true;
601618

602619
// FIXME: ObjCPropertyDecl are not properly indexed here:
@@ -682,7 +699,7 @@ bool SymbolCollector::handleMacroOccurrence(const IdentifierInfo *Name,
682699
Name->getName() == "__GCC_HAVE_DWARF2_CFI_ASM")
683700
return true;
684701

685-
auto ID = getSymbolID(Name->getName(), MI, SM);
702+
auto ID = getSymbolIDCached(Name->getName(), MI, SM);
686703
if (!ID)
687704
return true;
688705

@@ -693,9 +710,13 @@ bool SymbolCollector::handleMacroOccurrence(const IdentifierInfo *Name,
693710
ASTCtx->getLangOpts());
694711
// Do not store references to main-file macros.
695712
if ((static_cast<unsigned>(Opts.RefFilter) & Roles) && !IsMainFileOnly &&
696-
(Opts.RefsInHeaders || SM.getFileID(SpellingLoc) == SM.getMainFileID()))
713+
(Opts.RefsInHeaders || SM.getFileID(SpellingLoc) == SM.getMainFileID())) {
697714
// FIXME: Populate container information for macro references.
698-
MacroRefs[ID].push_back({Loc, Roles, /*Container=*/nullptr});
715+
// FIXME: All MacroRefs are marked as Spelled now, but this should be
716+
// checked.
717+
addRef(ID, SymbolRef{Loc, SM.getFileID(Loc), Roles, /*Container=*/nullptr,
718+
/*Spelled=*/true});
719+
}
699720

700721
// Collect symbols.
701722
if (!Opts.CollectMacro)
@@ -711,7 +732,7 @@ bool SymbolCollector::handleMacroOccurrence(const IdentifierInfo *Name,
711732
if (Opts.CountReferences &&
712733
(Roles & static_cast<unsigned>(index::SymbolRole::Reference)) &&
713734
SM.getFileID(SpellingLoc) == SM.getMainFileID())
714-
ReferencedMacros.insert(Name);
735+
ReferencedSymbols.insert(ID);
715736

716737
// Don't continue indexing if this is a mere reference.
717738
// FIXME: remove macro with ID if it is undefined.
@@ -761,7 +782,7 @@ void SymbolCollector::processRelations(
761782
continue;
762783
const Decl *Object = R.RelatedSymbol;
763784

764-
auto ObjectID = getSymbolID(Object);
785+
auto ObjectID = getSymbolIDCached(Object);
765786
if (!ObjectID)
766787
continue;
767788

@@ -792,33 +813,25 @@ void SymbolCollector::setIncludeLocation(const Symbol &S, SourceLocation Loc) {
792813

793814
void SymbolCollector::finish() {
794815
// At the end of the TU, add 1 to the refcount of all referenced symbols.
795-
auto IncRef = [this](const SymbolID &ID) {
816+
for (const auto &ID : ReferencedSymbols) {
796817
if (const auto *S = Symbols.find(ID)) {
797-
Symbol Inc = *S;
798-
++Inc.References;
799-
Symbols.insert(Inc);
800-
}
801-
};
802-
for (const NamedDecl *ND : ReferencedDecls) {
803-
if (auto ID = getSymbolID(ND)) {
804-
IncRef(ID);
818+
// SymbolSlab::Builder returns const symbols because strings are interned
819+
// and modifying returned symbols without inserting again wouldn't go
820+
// well. const_cast is safe here as we're modifying a data owned by the
821+
// Symbol. This reduces time spent in SymbolCollector by ~1%.
822+
++const_cast<Symbol *>(S)->References;
805823
}
806824
}
807825
if (Opts.CollectMacro) {
808826
assert(PP);
809827
// First, drop header guards. We can't identify these until EOF.
810828
for (const IdentifierInfo *II : IndexedMacros) {
811829
if (const auto *MI = PP->getMacroDefinition(II).getMacroInfo())
812-
if (auto ID = getSymbolID(II->getName(), MI, PP->getSourceManager()))
830+
if (auto ID =
831+
getSymbolIDCached(II->getName(), MI, PP->getSourceManager()))
813832
if (MI->isUsedForHeaderGuard())
814833
Symbols.erase(ID);
815834
}
816-
// Now increment refcounts.
817-
for (const IdentifierInfo *II : ReferencedMacros) {
818-
if (const auto *MI = PP->getMacroDefinition(II).getMacroInfo())
819-
if (auto ID = getSymbolID(II->getName(), MI, PP->getSourceManager()))
820-
IncRef(ID);
821-
}
822835
}
823836
// Fill in IncludeHeaders.
824837
// We delay this until end of TU so header guards are all resolved.
@@ -852,58 +865,7 @@ void SymbolCollector::finish() {
852865
}
853866
}
854867

855-
const auto &SM = ASTCtx->getSourceManager();
856-
auto CollectRef = [&](SymbolID ID, const SymbolRef &LocAndRole,
857-
bool Spelled = false) {
858-
auto FileID = SM.getFileID(LocAndRole.Loc);
859-
// FIXME: use the result to filter out references.
860-
shouldIndexFile(FileID);
861-
if (const auto *FE = SM.getFileEntryForID(FileID)) {
862-
auto Range = getTokenRange(LocAndRole.Loc, SM, ASTCtx->getLangOpts());
863-
Ref R;
864-
R.Location.Start = Range.first;
865-
R.Location.End = Range.second;
866-
R.Location.FileURI = HeaderFileURIs->toURI(FE).c_str();
867-
R.Kind = toRefKind(LocAndRole.Roles, Spelled);
868-
R.Container = getSymbolID(LocAndRole.Container);
869-
Refs.insert(ID, R);
870-
}
871-
};
872-
// Populate Refs slab from MacroRefs.
873-
// FIXME: All MacroRefs are marked as Spelled now, but this should be checked.
874-
for (const auto &IDAndRefs : MacroRefs)
875-
for (const auto &LocAndRole : IDAndRefs.second)
876-
CollectRef(IDAndRefs.first, LocAndRole, /*Spelled=*/true);
877-
// Populate Refs slab from DeclRefs.
878-
llvm::DenseMap<FileID, std::vector<syntax::Token>> FilesToTokensCache;
879-
for (auto &DeclAndRef : DeclRefs) {
880-
if (auto ID = getSymbolID(DeclAndRef.first)) {
881-
for (auto &LocAndRole : DeclAndRef.second) {
882-
const auto FileID = SM.getFileID(LocAndRole.Loc);
883-
// FIXME: It's better to use TokenBuffer by passing spelled tokens from
884-
// the caller of SymbolCollector.
885-
if (!FilesToTokensCache.count(FileID))
886-
FilesToTokensCache[FileID] =
887-
syntax::tokenize(FileID, SM, ASTCtx->getLangOpts());
888-
llvm::ArrayRef<syntax::Token> Tokens = FilesToTokensCache[FileID];
889-
// Check if the referenced symbol is spelled exactly the same way the
890-
// corresponding NamedDecl is. If it is, mark this reference as spelled.
891-
const auto *IdentifierToken =
892-
spelledIdentifierTouching(LocAndRole.Loc, Tokens);
893-
DeclarationName Name = DeclAndRef.first->getDeclName();
894-
const auto NameKind = Name.getNameKind();
895-
bool IsTargetKind = NameKind == DeclarationName::Identifier ||
896-
NameKind == DeclarationName::CXXConstructorName;
897-
bool Spelled = IdentifierToken && IsTargetKind &&
898-
Name.getAsString() == IdentifierToken->text(SM);
899-
CollectRef(ID, LocAndRole, Spelled);
900-
}
901-
}
902-
}
903-
904-
ReferencedDecls.clear();
905-
ReferencedMacros.clear();
906-
DeclRefs.clear();
868+
ReferencedSymbols.clear();
907869
IncludeFiles.clear();
908870
}
909871

@@ -983,16 +945,18 @@ void SymbolCollector::addDefinition(const NamedDecl &ND,
983945
const Symbol &DeclSym) {
984946
if (DeclSym.Definition)
985947
return;
948+
const auto &SM = ND.getASTContext().getSourceManager();
949+
auto Loc = nameLocation(ND, SM);
950+
shouldIndexFile(SM.getFileID(Loc));
951+
auto DefLoc = getTokenLocation(Loc);
986952
// If we saw some forward declaration, we end up copying the symbol.
987953
// This is not ideal, but avoids duplicating the "is this a definition" check
988954
// in clang::index. We should only see one definition.
955+
if (!DefLoc)
956+
return;
989957
Symbol S = DeclSym;
990-
const auto &SM = ND.getASTContext().getSourceManager();
991-
auto Loc = nameLocation(ND, SM);
992958
// FIXME: use the result to filter out symbols.
993-
shouldIndexFile(SM.getFileID(Loc));
994-
if (auto DefLoc = getTokenLocation(Loc))
995-
S.Definition = *DefLoc;
959+
S.Definition = *DefLoc;
996960
Symbols.insert(S);
997961
}
998962

@@ -1005,5 +969,36 @@ bool SymbolCollector::shouldIndexFile(FileID FID) {
1005969
return I.first->second;
1006970
}
1007971

972+
void SymbolCollector::addRef(SymbolID ID, const SymbolRef &SR) {
973+
const auto &SM = ASTCtx->getSourceManager();
974+
// FIXME: use the result to filter out references.
975+
shouldIndexFile(SR.FID);
976+
if (const auto *FE = SM.getFileEntryForID(SR.FID)) {
977+
auto Range = getTokenRange(SR.Loc, SM, ASTCtx->getLangOpts());
978+
Ref R;
979+
R.Location.Start = Range.first;
980+
R.Location.End = Range.second;
981+
R.Location.FileURI = HeaderFileURIs->toURI(FE).c_str();
982+
R.Kind = toRefKind(SR.Roles, SR.Spelled);
983+
R.Container = getSymbolIDCached(SR.Container);
984+
Refs.insert(ID, R);
985+
}
986+
}
987+
988+
SymbolID SymbolCollector::getSymbolIDCached(const Decl *D) {
989+
auto It = DeclToIDCache.try_emplace(D, SymbolID{});
990+
if (It.second)
991+
It.first->second = getSymbolID(D);
992+
return It.first->second;
993+
}
994+
995+
SymbolID SymbolCollector::getSymbolIDCached(const llvm::StringRef MacroName,
996+
const MacroInfo *MI,
997+
const SourceManager &SM) {
998+
auto It = MacroToIDCache.try_emplace(MI, SymbolID{});
999+
if (It.second)
1000+
It.first->second = getSymbolID(MacroName, MI, SM);
1001+
return It.first->second;
1002+
}
10081003
} // namespace clangd
10091004
} // namespace clang

clang-tools-extra/clangd/index/SymbolCollector.h

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,12 @@
88
#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_SYMBOLCOLLECTOR_H
99
#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_SYMBOLCOLLECTOR_H
1010

11-
#include "index/CanonicalIncludes.h"
1211
#include "CollectMacros.h"
12+
#include "index/CanonicalIncludes.h"
1313
#include "index/Ref.h"
1414
#include "index/Relation.h"
1515
#include "index/Symbol.h"
16+
#include "index/SymbolID.h"
1617
#include "index/SymbolOrigin.h"
1718
#include "clang/AST/ASTContext.h"
1819
#include "clang/AST/Decl.h"
@@ -21,6 +22,7 @@
2122
#include "clang/Index/IndexDataConsumer.h"
2223
#include "clang/Index/IndexSymbol.h"
2324
#include "clang/Sema/CodeCompleteConsumer.h"
25+
#include "llvm/ADT/ArrayRef.h"
2426
#include "llvm/ADT/DenseMap.h"
2527
#include <functional>
2628

@@ -142,6 +144,10 @@ class SymbolCollector : public index::IndexDataConsumer {
142144

143145
llvm::Optional<std::string> getIncludeHeader(const Symbol &S, FileID);
144146

147+
SymbolID getSymbolIDCached(const Decl *D);
148+
SymbolID getSymbolIDCached(const llvm::StringRef MacroName,
149+
const MacroInfo *MI, const SourceManager &SM);
150+
145151
// All Symbols collected from the AST.
146152
SymbolSlab::Builder Symbols;
147153
// File IDs for Symbol.IncludeHeaders.
@@ -164,14 +170,14 @@ class SymbolCollector : public index::IndexDataConsumer {
164170
Options Opts;
165171
struct SymbolRef {
166172
SourceLocation Loc;
173+
FileID FID;
167174
index::SymbolRoleSet Roles;
168175
const Decl *Container;
176+
bool Spelled;
169177
};
178+
void addRef(SymbolID ID, const SymbolRef &SR);
170179
// Symbols referenced from the current TU, flushed on finish().
171-
llvm::DenseSet<const NamedDecl *> ReferencedDecls;
172-
llvm::DenseSet<const IdentifierInfo *> ReferencedMacros;
173-
llvm::DenseMap<const NamedDecl *, std::vector<SymbolRef>> DeclRefs;
174-
llvm::DenseMap<SymbolID, std::vector<SymbolRef>> MacroRefs;
180+
llvm::DenseSet<SymbolID> ReferencedSymbols;
175181
// Maps canonical declaration provided by clang to canonical declaration for
176182
// an index symbol, if clangd prefers a different declaration than that
177183
// provided by clang. For example, friend declaration might be considered
@@ -184,6 +190,8 @@ class SymbolCollector : public index::IndexDataConsumer {
184190
// to insert for which symbol, etc.
185191
class HeaderFileURICache;
186192
std::unique_ptr<HeaderFileURICache> HeaderFileURIs;
193+
llvm::DenseMap<const Decl *, SymbolID> DeclToIDCache;
194+
llvm::DenseMap<const MacroInfo *, SymbolID> MacroToIDCache;
187195
};
188196

189197
} // namespace clangd

clang-tools-extra/clangd/index/SymbolID.cpp

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -46,14 +46,5 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const SymbolID &ID) {
4646
return OS << llvm::toHex(ID.raw());
4747
}
4848

49-
llvm::hash_code hash_value(const SymbolID &ID) {
50-
// We already have a good hash, just return the first bytes.
51-
static_assert(sizeof(size_t) <= SymbolID::RawSize,
52-
"size_t longer than SHA1!");
53-
size_t Result;
54-
memcpy(&Result, ID.raw().data(), sizeof(size_t));
55-
return llvm::hash_code(Result);
56-
}
57-
5849
} // namespace clangd
5950
} // namespace clang

0 commit comments

Comments
 (0)