Skip to content

Commit 86f82f6

Browse files
committed
Merge remote-tracking branch 'origin/main' into AMX-TRANSPOSE
2 parents f822950 + c3edeaa commit 86f82f6

File tree

4,887 files changed

+267150
-74451
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

4,887 files changed

+267150
-74451
lines changed

bolt/include/bolt/Core/BinaryFunction.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -386,6 +386,9 @@ class BinaryFunction {
386386
/// Raw branch count for this function in the profile.
387387
uint64_t RawBranchCount{0};
388388

389+
/// Dynamically executed function bytes, used for density computation.
390+
uint64_t SampleCountInBytes{0};
391+
389392
/// Indicates the type of profile the function is using.
390393
uint16_t ProfileFlags{PF_NONE};
391394

@@ -1844,6 +1847,9 @@ class BinaryFunction {
18441847
/// to this function.
18451848
void setRawBranchCount(uint64_t Count) { RawBranchCount = Count; }
18461849

1850+
/// Return the number of dynamically executed bytes, from raw perf data.
1851+
uint64_t getSampleCountInBytes() const { return SampleCountInBytes; }
1852+
18471853
/// Return the execution count for functions with known profile.
18481854
/// Return 0 if the function has no profile.
18491855
uint64_t getKnownExecutionCount() const {

bolt/include/bolt/Utils/CommandLineOpts.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ extern llvm::cl::opt<bool> PrintSections;
5555
enum ProfileFormatKind { PF_Fdata, PF_YAML };
5656

5757
extern llvm::cl::opt<ProfileFormatKind> ProfileFormat;
58+
extern llvm::cl::opt<bool> ShowDensity;
5859
extern llvm::cl::opt<bool> SplitEH;
5960
extern llvm::cl::opt<bool> StrictMode;
6061
extern llvm::cl::opt<bool> TimeOpts;

bolt/lib/Core/BinaryFunction.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2577,6 +2577,7 @@ struct CFISnapshot {
25772577
case MCCFIInstruction::OpAdjustCfaOffset:
25782578
case MCCFIInstruction::OpWindowSave:
25792579
case MCCFIInstruction::OpNegateRAState:
2580+
case MCCFIInstruction::OpNegateRAStateWithPC:
25802581
case MCCFIInstruction::OpLLVMDefAspaceCfa:
25812582
case MCCFIInstruction::OpLabel:
25822583
llvm_unreachable("unsupported CFI opcode");
@@ -2715,6 +2716,7 @@ struct CFISnapshotDiff : public CFISnapshot {
27152716
case MCCFIInstruction::OpAdjustCfaOffset:
27162717
case MCCFIInstruction::OpWindowSave:
27172718
case MCCFIInstruction::OpNegateRAState:
2719+
case MCCFIInstruction::OpNegateRAStateWithPC:
27182720
case MCCFIInstruction::OpLLVMDefAspaceCfa:
27192721
case MCCFIInstruction::OpLabel:
27202722
llvm_unreachable("unsupported CFI opcode");
@@ -2864,6 +2866,7 @@ BinaryFunction::unwindCFIState(int32_t FromState, int32_t ToState,
28642866
case MCCFIInstruction::OpAdjustCfaOffset:
28652867
case MCCFIInstruction::OpWindowSave:
28662868
case MCCFIInstruction::OpNegateRAState:
2869+
case MCCFIInstruction::OpNegateRAStateWithPC:
28672870
case MCCFIInstruction::OpLLVMDefAspaceCfa:
28682871
case MCCFIInstruction::OpLabel:
28692872
llvm_unreachable("unsupported CFI opcode");

bolt/lib/Passes/BinaryPasses.cpp

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "bolt/Core/ParallelUtilities.h"
1616
#include "bolt/Passes/ReorderAlgorithm.h"
1717
#include "bolt/Passes/ReorderFunctions.h"
18+
#include "bolt/Utils/CommandLineOpts.h"
1819
#include "llvm/Support/CommandLine.h"
1920
#include <atomic>
2021
#include <mutex>
@@ -223,6 +224,18 @@ static cl::opt<unsigned> TopCalledLimit(
223224
"functions section"),
224225
cl::init(100), cl::Hidden, cl::cat(BoltCategory));
225226

227+
// Profile density options, synced with llvm-profgen/ProfileGenerator.cpp
228+
// Share of the total sample count that the densest functions must cover when
// the profile density is reported. The value is expressed in millionths:
// 1000000 means 100%, so the default of 990000 corresponds to 99%.
static cl::opt<int> ProfileDensityCutOffHot(
229+
"profile-density-cutoff-hot", cl::init(990000),
230+
cl::desc("Total samples cutoff for functions used to calculate "
231+
"profile density."));
232+
233+
// Minimum acceptable profile density (default 60). When the density found at
// the hot cutoff is non-zero but below this value, the program statistics
// pass prints a warning that reports threshold / density as the estimated
// multiple of additional samples.
static cl::opt<double> ProfileDensityThreshold(
234+
"profile-density-threshold", cl::init(60),
235+
cl::desc("If the profile density is below the given threshold, it "
236+
"will be suggested to increase the sampling rate."),
237+
cl::Optional);
238+
226239
} // namespace opts
227240

228241
namespace llvm {
@@ -1383,6 +1396,7 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
13831396
uint64_t StaleSampleCount = 0;
13841397
uint64_t InferredSampleCount = 0;
13851398
std::vector<const BinaryFunction *> ProfiledFunctions;
1399+
std::vector<std::pair<double, uint64_t>> FuncDensityList;
13861400
const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
13871401
for (auto &BFI : BC.getBinaryFunctions()) {
13881402
const BinaryFunction &Function = BFI.second;
@@ -1441,6 +1455,22 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
14411455
StaleSampleCount += SampleCount;
14421456
++NumAllStaleFunctions;
14431457
}
1458+
1459+
if (opts::ShowDensity) {
1460+
uint64_t Size = Function.getSize();
1461+
// In case of BOLT split functions registered in BAT, executed traces are
1462+
// automatically attributed to the main fragment. Add up function sizes
1463+
// for all fragments.
1464+
if (IsHotParentOfBOLTSplitFunction)
1465+
for (const BinaryFunction *Fragment : Function.getFragments())
1466+
Size += Fragment->getSize();
1467+
double Density = (double)1.0 * Function.getSampleCountInBytes() / Size;
1468+
FuncDensityList.emplace_back(Density, SampleCount);
1469+
LLVM_DEBUG(BC.outs() << Function << ": executed bytes "
1470+
<< Function.getSampleCountInBytes() << ", size (b) "
1471+
<< Size << ", density " << Density
1472+
<< ", sample count " << SampleCount << '\n');
1473+
}
14441474
}
14451475
BC.NumProfiledFuncs = ProfiledFunctions.size();
14461476
BC.NumStaleProfileFuncs = NumStaleProfileFunctions;
@@ -1684,6 +1714,50 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
16841714
BC.outs() << ". Use -print-unknown to see the list.";
16851715
BC.outs() << '\n';
16861716
}
1717+
1718+
if (opts::ShowDensity) {
1719+
double Density = 0.0;
1720+
// Sorted by the density in descending order.
1721+
llvm::stable_sort(FuncDensityList,
1722+
[&](const std::pair<double, uint64_t> &A,
1723+
const std::pair<double, uint64_t> &B) {
1724+
if (A.first != B.first)
1725+
return A.first > B.first;
1726+
return A.second < B.second;
1727+
});
1728+
1729+
uint64_t AccumulatedSamples = 0;
1730+
uint32_t I = 0;
1731+
assert(opts::ProfileDensityCutOffHot <= 1000000 &&
1732+
"The cutoff value is greater than 1000000(100%)");
1733+
while (AccumulatedSamples <
1734+
TotalSampleCount *
1735+
static_cast<float>(opts::ProfileDensityCutOffHot) /
1736+
1000000 &&
1737+
I < FuncDensityList.size()) {
1738+
AccumulatedSamples += FuncDensityList[I].second;
1739+
Density = FuncDensityList[I].first;
1740+
I++;
1741+
}
1742+
if (Density == 0.0) {
1743+
BC.errs() << "BOLT-WARNING: the output profile is empty or the "
1744+
"--profile-density-cutoff-hot option is "
1745+
"set too low. Please check your command.\n";
1746+
} else if (Density < opts::ProfileDensityThreshold) {
1747+
BC.errs()
1748+
<< "BOLT-WARNING: BOLT is estimated to optimize better with "
1749+
<< format("%.1f", opts::ProfileDensityThreshold / Density)
1750+
<< "x more samples. Please consider increasing sampling rate or "
1751+
"profiling for longer duration to get more samples.\n";
1752+
}
1753+
1754+
BC.outs() << "BOLT-INFO: Functions with density >= "
1755+
<< format("%.1f", Density) << " account for "
1756+
<< format("%.2f",
1757+
static_cast<double>(opts::ProfileDensityCutOffHot) /
1758+
10000)
1759+
<< "% total sample counts.\n";
1760+
}
16871761
return Error::success();
16881762
}
16891763

bolt/lib/Profile/DataAggregator.cpp

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -638,8 +638,12 @@ void DataAggregator::processProfile(BinaryContext &BC) {
638638
: BinaryFunction::PF_LBR;
639639
for (auto &BFI : BC.getBinaryFunctions()) {
640640
BinaryFunction &BF = BFI.second;
641-
if (getBranchData(BF) || getFuncSampleData(BF.getNames()))
641+
FuncBranchData *FBD = getBranchData(BF);
642+
if (FBD || getFuncSampleData(BF.getNames())) {
642643
BF.markProfiled(Flags);
644+
if (FBD)
645+
BF.RawBranchCount = FBD->getNumExecutedBranches();
646+
}
643647
}
644648

645649
for (auto &FuncBranches : NamesToBranches)
@@ -845,6 +849,12 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
845849
return false;
846850
}
847851

852+
// Set ParentFunc to BAT parent function or FromFunc itself.
853+
BinaryFunction *ParentFunc = getBATParentFunction(*FromFunc);
854+
if (!ParentFunc)
855+
ParentFunc = FromFunc;
856+
ParentFunc->SampleCountInBytes += Count * (Second.From - First.To);
857+
848858
std::optional<BoltAddressTranslation::FallthroughListTy> FTs =
849859
BAT ? BAT->getFallthroughsInTrace(FromFunc->getAddress(), First.To,
850860
Second.From)
@@ -864,13 +874,12 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
864874
<< FromFunc->getPrintName() << ":"
865875
<< Twine::utohexstr(First.To) << " to "
866876
<< Twine::utohexstr(Second.From) << ".\n");
867-
BinaryFunction *ParentFunc = getBATParentFunction(*FromFunc);
868877
for (auto [From, To] : *FTs) {
869878
if (BAT) {
870879
From = BAT->translate(FromFunc->getAddress(), From, /*IsBranchSrc=*/true);
871880
To = BAT->translate(FromFunc->getAddress(), To, /*IsBranchSrc=*/false);
872881
}
873-
doIntraBranch(ParentFunc ? *ParentFunc : *FromFunc, From, To, Count, false);
882+
doIntraBranch(*ParentFunc, From, To, Count, false);
874883
}
875884

876885
return true;

bolt/lib/Utils/CommandLineOpts.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,10 @@ cl::opt<std::string> SaveProfile("w",
175175
cl::desc("save recorded profile to a file"),
176176
cl::cat(BoltOutputCategory));
177177

178+
// Enables the profile density report. Declared extern in
// bolt/Utils/CommandLineOpts.h so that passes can read it; perf2boltMode()
// in the llvm-bolt driver sets it to true.
cl::opt<bool> ShowDensity("show-density",
179+
cl::desc("show profile density details"),
180+
cl::Optional, cl::cat(AggregatorCategory));
181+
178182
cl::opt<bool> SplitEH("split-eh", cl::desc("split C++ exception handling code"),
179183
cl::Hidden, cl::cat(BoltOptCategory));
180184

bolt/test/X86/pre-aggregated-perf.test

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,21 @@ REQUIRES: system-linux
1111

1212
RUN: yaml2obj %p/Inputs/blarge.yaml &> %t.exe
1313
RUN: perf2bolt %t.exe -o %t --pa -p %p/Inputs/pre-aggregated.txt -w %t.new \
14-
RUN: --profile-use-dfs | FileCheck %s
14+
RUN: --show-density \
15+
RUN: --profile-density-threshold=9 --profile-density-cutoff-hot=970000 \
16+
RUN: --profile-use-dfs | FileCheck %s --check-prefix=CHECK-P2B
17+
18+
CHECK-P2B: BOLT-INFO: 4 out of 7 functions in the binary (57.1%) have non-empty execution profile
19+
CHECK-P2B: BOLT-INFO: Functions with density >= 21.7 account for 97.00% total sample counts.
20+
21+
RUN: perf2bolt %t.exe -o %t --pa -p %p/Inputs/pre-aggregated.txt -w %t.new \
22+
RUN: --show-density \
23+
RUN: --profile-density-cutoff-hot=970000 \
24+
RUN: --profile-use-dfs 2>&1 | FileCheck %s --check-prefix=CHECK-WARNING
25+
26+
CHECK-WARNING: BOLT-INFO: 4 out of 7 functions in the binary (57.1%) have non-empty execution profile
27+
CHECK-WARNING: BOLT-WARNING: BOLT is estimated to optimize better with 2.8x more samples.
28+
CHECK-WARNING: BOLT-INFO: Functions with density >= 21.7 account for 97.00% total sample counts.
1529

1630
RUN: llvm-bolt %t.exe -data %t -o %t.null | FileCheck %s
1731
RUN: llvm-bolt %t.exe -data %t.new -o %t.null | FileCheck %s

bolt/tools/driver/llvm-bolt.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ void perf2boltMode(int argc, char **argv) {
129129
exit(1);
130130
}
131131
opts::AggregateOnly = true;
132+
opts::ShowDensity = true;
132133
}
133134

134135
void boltDiffMode(int argc, char **argv) {

clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
#include "MultipleStatementMacroCheck.h"
5050
#include "NoEscapeCheck.h"
5151
#include "NonZeroEnumToBoolConversionCheck.h"
52+
#include "NondeterministicPointerIterationOrderCheck.h"
5253
#include "NotNullTerminatedResultCheck.h"
5354
#include "OptionalValueConversionCheck.h"
5455
#include "ParentVirtualCallCheck.h"
@@ -174,6 +175,8 @@ class BugproneModule : public ClangTidyModule {
174175
"bugprone-multiple-new-in-one-expression");
175176
CheckFactories.registerCheck<MultipleStatementMacroCheck>(
176177
"bugprone-multiple-statement-macro");
178+
CheckFactories.registerCheck<NondeterministicPointerIterationOrderCheck>(
179+
"bugprone-nondeterministic-pointer-iteration-order");
177180
CheckFactories.registerCheck<OptionalValueConversionCheck>(
178181
"bugprone-optional-value-conversion");
179182
CheckFactories.registerCheck<PointerArithmeticOnPolymorphicObjectCheck>(

clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ add_clang_library(clangTidyBugproneModule STATIC
4545
MultipleNewInOneExpressionCheck.cpp
4646
MultipleStatementMacroCheck.cpp
4747
NoEscapeCheck.cpp
48+
NondeterministicPointerIterationOrderCheck.cpp
4849
NonZeroEnumToBoolConversionCheck.cpp
4950
NotNullTerminatedResultCheck.cpp
5051
OptionalValueConversionCheck.cpp
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
//===----- NondeterministicPointerIterationOrderCheck.cpp - clang-tidy ----===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "NondeterministicPointerIterationOrderCheck.h"
10+
#include "clang/AST/ASTContext.h"
11+
#include "clang/Lex/Lexer.h"
12+
13+
using namespace clang::ast_matchers;
14+
15+
namespace clang::tidy::bugprone {
16+
17+
// Registers the two AST matchers this check reacts to: range-based for loops
// over unordered standard containers, and calls to sorting-like algorithms.
void NondeterministicPointerIterationOrderCheck::registerMatchers(
18+
MatchFinder *Finder) {
19+
20+
// Loop variable whose canonical type is a reference or a pointer.
auto LoopVariable = varDecl(hasType(
21+
qualType(hasCanonicalType(anyOf(referenceType(), pointerType())))));
22+
23+
// Range expression that names a variable whose type is one of the listed
// unordered containers. The container's record declaration is bound as
// "recorddecl" so that check() can inspect its template arguments.
auto RangeInit = declRefExpr(to(varDecl(
24+
hasType(recordDecl(hasAnyName("std::unordered_set", "std::unordered_map",
25+
"std::unordered_multiset",
26+
"std::unordered_multimap"))
27+
.bind("recorddecl")))));
28+
29+
// First matcher: a range-based for statement combining the two pieces above.
// The statement is bound as "cxxForRangeStmt", its range as "rangeinit".
Finder->addMatcher(cxxForRangeStmt(hasLoopVariable(LoopVariable),
30+
hasRangeInit(RangeInit.bind("rangeinit")))
31+
.bind("cxxForRangeStmt"),
32+
this);
33+
34+
// The callee is one of the listed std algorithms.
auto SortFuncM = callee(functionDecl(hasAnyName(
35+
"std::is_sorted", "std::nth_element", "std::sort", "std::partial_sort",
36+
"std::partition", "std::stable_partition", "std::stable_sort")));
37+
38+
// The first call argument is a member call on an object whose class declares
// a field that, after canonicalization, points to a pointer type.
// NOTE(review): presumably this is meant to recognize container.begin() on a
// container of pointers via the container's internal storage field; confirm
// against the standard library implementations the check is tested with.
auto IteratesPointerEltsM = hasArgument(
39+
0,
40+
cxxMemberCallExpr(on(hasType(cxxRecordDecl(has(fieldDecl(hasType(qualType(
41+
hasCanonicalType(pointsTo(hasCanonicalType(pointerType()))))))))))));
42+
43+
// Second matcher: a call that satisfies both conditions, bound as
// "sortsemantic".
Finder->addMatcher(
44+
callExpr(allOf(SortFuncM, IteratesPointerEltsM)).bind("sortsemantic"),
45+
this);
46+
}
47+
48+
// Emits a diagnostic for a match produced by one of the two registered
// matchers: either a range-based for loop (bound as "cxxForRangeStmt") or a
// sorting-like call (bound as "sortsemantic").
void NondeterministicPointerIterationOrderCheck::check(
49+
const MatchFinder::MatchResult &Result) {
50+
const auto *ForRangePointers =
51+
Result.Nodes.getNodeAs<CXXForRangeStmt>("cxxForRangeStmt");
52+
53+
// Range-based for loop path; loops whose begin location is a macro location
// are not handled here.
if ((ForRangePointers) && !(ForRangePointers->getBeginLoc().isMacroID())) {
54+
const auto *RangeInit = Result.Nodes.getNodeAs<Stmt>("rangeinit");
55+
// Nothing is reported unless the bound container declaration is a class
// template specialization.
if (const auto *ClassTemplate =
56+
Result.Nodes.getNodeAs<ClassTemplateSpecializationDecl>(
57+
"recorddecl")) {
58+
const TemplateArgumentList &TemplateArgs =
59+
ClassTemplate->getTemplateArgs();
60+
// Only the container's first template argument is examined.
const bool IsAlgoArgPointer =
61+
TemplateArgs[0].getAsType()->isPointerType();
62+
63+
if (IsAlgoArgPointer) {
64+
// The diagnostic is placed on, and highlights, the range expression.
SourceRange R = RangeInit->getSourceRange();
65+
diag(R.getBegin(), "iteration of pointers is nondeterministic") << R;
66+
}
67+
}
68+
// A for-range match never falls through to the sorting path below.
return;
69+
}
70+
const auto *SortPointers = Result.Nodes.getNodeAs<Stmt>("sortsemantic");
71+
72+
// Sorting-like call path, with the same macro-location guard; the whole call
// expression is highlighted.
if ((SortPointers) && !(SortPointers->getBeginLoc().isMacroID())) {
73+
SourceRange R = SortPointers->getSourceRange();
74+
diag(R.getBegin(), "sorting pointers is nondeterministic") << R;
75+
}
76+
}
77+
78+
} // namespace clang::tidy::bugprone
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
//=== NondeterministicPointerIterationOrderCheck.h - clang-tidy -*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NONDETERMINISTIC_POINTER_ITERATION_ORDER_CHECK_H
10+
#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NONDETERMINISTIC_POINTER_ITERATION_ORDER_CHECK_H
11+
12+
#include "../ClangTidyCheck.h"
13+
14+
namespace clang::tidy::bugprone {
15+
16+
/// Finds nondeterministic usages of pointers in unordered containers. The
17+
/// check also finds calls to sorting-like algorithms on a container of
18+
/// pointers.
19+
///
20+
/// For the user-facing documentation see:
21+
/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/nondeterministic-pointer-iteration-order.html
22+
class NondeterministicPointerIterationOrderCheck : public ClangTidyCheck {
23+
public:
24+
// Forwards the check name and the clang-tidy context to ClangTidyCheck.
NondeterministicPointerIterationOrderCheck(StringRef Name,
25+
ClangTidyContext *Context)
26+
: ClangTidyCheck(Name, Context) {}
27+
// The check is supported only when the C++ language option is set.
bool isLanguageVersionSupported(const LangOptions &LangOpts) const override {
28+
return LangOpts.CPlusPlus;
29+
}
30+
// Registers the range-based for loop matcher and the sorting-call matcher.
void registerMatchers(ast_matchers::MatchFinder *Finder) override;
31+
// Emits the diagnostic for a matched loop or sorting call.
void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
32+
// Selects TK_IgnoreUnlessSpelledInSource as the traversal kind for this
// check's matchers.
std::optional<TraversalKind> getCheckTraversalKind() const override {
33+
return TK_IgnoreUnlessSpelledInSource;
34+
}
35+
};
36+
37+
} // namespace clang::tidy::bugprone
38+
39+
#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_NONDETERMINISTIC_POINTER_ITERATION_ORDER_CHECK_H

0 commit comments

Comments
 (0)