Skip to content

Commit 73eb9b3

Browse files
authored
[InstrProf] Evaluate function order using test traces (#92451)
The `llvm-profdata order` command is used to compute a function order using traces from the input profile. Add the `--num-test-traces` flag to keep aside N traces to evaluate this order. These test traces are assumed to be the actual function execution order in some experiment. The output is a number that represents how many page faults we got. Lower is better.

I tested on a large profile I already had.

```
llvm-profdata order default.profdata --num-test-traces=30
# Ordered 149103 functions
# Total area under the page fault curve: 2.271827e+09
...
```

I also improved `TemporalProfTraceTy::createBPFunctionNodes()` in a few ways:

* Simplified how `UN`s are computed
* Changed how the initial `Node` order is computed
* Filtered out rare and common `UN`s
* The output vector is now passed as a reference argument instead of being returned

These changes slightly improved the evaluation in my test.

```
llvm-profdata order default.profdata --num-test-traces=30
# Ordered 149103 functions
# Total area under the page fault curve: 2.268586e+09
...
```
1 parent dd2d132 commit 73eb9b3

File tree

6 files changed

+144
-51
lines changed

6 files changed

+144
-51
lines changed

llvm/include/llvm/ProfileData/InstrProf.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -385,8 +385,9 @@ struct TemporalProfTraceTy {
385385
/// Use a set of temporal profile traces to create a list of balanced
386386
/// partitioning function nodes used by BalancedPartitioning to generate a
387387
/// function order that reduces page faults during startup
388-
static std::vector<BPFunctionNode>
389-
createBPFunctionNodes(ArrayRef<TemporalProfTraceTy> Traces);
388+
static void createBPFunctionNodes(ArrayRef<TemporalProfTraceTy> Traces,
389+
std::vector<BPFunctionNode> &Nodes,
390+
bool RemoveOutlierUNs = true);
390391
};
391392

392393
inline std::error_code make_error_code(instrprof_error E) {

llvm/lib/ProfileData/InstrProf.cpp

Lines changed: 45 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1002,46 +1002,60 @@ void InstrProfRecord::addValueData(uint32_t ValueKind, uint32_t Site,
10021002
ValueSites.emplace_back(VData, VData + N);
10031003
}
10041004

1005-
std::vector<BPFunctionNode> TemporalProfTraceTy::createBPFunctionNodes(
1006-
ArrayRef<TemporalProfTraceTy> Traces) {
1005+
void TemporalProfTraceTy::createBPFunctionNodes(
1006+
ArrayRef<TemporalProfTraceTy> Traces, std::vector<BPFunctionNode> &Nodes,
1007+
bool RemoveOutlierUNs) {
10071008
using IDT = BPFunctionNode::IDT;
10081009
using UtilityNodeT = BPFunctionNode::UtilityNodeT;
1009-
// Collect all function IDs ordered by their smallest timestamp. This will be
1010-
// used as the initial FunctionNode order.
1011-
SetVector<IDT> FunctionIds;
1012-
size_t LargestTraceSize = 0;
1013-
for (auto &Trace : Traces)
1014-
LargestTraceSize =
1015-
std::max(LargestTraceSize, Trace.FunctionNameRefs.size());
1016-
for (size_t Timestamp = 0; Timestamp < LargestTraceSize; Timestamp++)
1017-
for (auto &Trace : Traces)
1018-
if (Timestamp < Trace.FunctionNameRefs.size())
1019-
FunctionIds.insert(Trace.FunctionNameRefs[Timestamp]);
1020-
1021-
const int N = Log2_64(LargestTraceSize) + 1;
1022-
1010+
UtilityNodeT MaxUN = 0;
1011+
DenseMap<IDT, size_t> IdToFirstTimestamp;
1012+
DenseMap<IDT, UtilityNodeT> IdToFirstUN;
1013+
DenseMap<IDT, SmallVector<UtilityNodeT>> IdToUNs;
10231014
// TODO: We need to use the Trace.Weight field to give more weight to more
10241015
// important utilities
1025-
DenseMap<IDT, SmallVector<UtilityNodeT, 4>> FuncGroups;
1026-
for (size_t TraceIdx = 0; TraceIdx < Traces.size(); TraceIdx++) {
1027-
auto &Trace = Traces[TraceIdx].FunctionNameRefs;
1028-
for (size_t Timestamp = 0; Timestamp < Trace.size(); Timestamp++) {
1029-
for (int I = Log2_64(Timestamp + 1); I < N; I++) {
1030-
auto FunctionId = Trace[Timestamp];
1031-
UtilityNodeT GroupId = TraceIdx * N + I;
1032-
FuncGroups[FunctionId].push_back(GroupId);
1016+
for (auto &Trace : Traces) {
1017+
size_t CutoffTimestamp = 1;
1018+
for (size_t Timestamp = 0; Timestamp < Trace.FunctionNameRefs.size();
1019+
Timestamp++) {
1020+
IDT Id = Trace.FunctionNameRefs[Timestamp];
1021+
auto [It, WasInserted] = IdToFirstTimestamp.try_emplace(Id, Timestamp);
1022+
if (!WasInserted)
1023+
It->getSecond() = std::min<size_t>(It->getSecond(), Timestamp);
1024+
if (Timestamp >= CutoffTimestamp) {
1025+
++MaxUN;
1026+
CutoffTimestamp = 2 * Timestamp;
10331027
}
1028+
IdToFirstUN.try_emplace(Id, MaxUN);
10341029
}
1030+
for (auto &[Id, FirstUN] : IdToFirstUN)
1031+
for (auto UN = FirstUN; UN <= MaxUN; ++UN)
1032+
IdToUNs[Id].push_back(UN);
1033+
++MaxUN;
1034+
IdToFirstUN.clear();
10351035
}
10361036

1037-
std::vector<BPFunctionNode> Nodes;
1038-
for (auto Id : FunctionIds) {
1039-
auto &UNs = FuncGroups[Id];
1040-
llvm::sort(UNs);
1041-
UNs.erase(std::unique(UNs.begin(), UNs.end()), UNs.end());
1042-
Nodes.emplace_back(Id, UNs);
1037+
if (RemoveOutlierUNs) {
1038+
DenseMap<UtilityNodeT, unsigned> UNFrequency;
1039+
for (auto &[Id, UNs] : IdToUNs)
1040+
for (auto &UN : UNs)
1041+
++UNFrequency[UN];
1042+
// Filter out utility nodes that are too infrequent or too prevalent to make
1043+
// BalancedPartitioning more effective.
1044+
for (auto &[Id, UNs] : IdToUNs)
1045+
llvm::erase_if(UNs, [&](auto &UN) {
1046+
return UNFrequency[UN] <= 1 || 2 * UNFrequency[UN] > IdToUNs.size();
1047+
});
10431048
}
1044-
return Nodes;
1049+
1050+
for (auto &[Id, UNs] : IdToUNs)
1051+
Nodes.emplace_back(Id, UNs);
1052+
1053+
// Since BalancedPartitioning is sensitive to the initial order, we explicitly
1054+
// order nodes by their earliest timestamp.
1055+
llvm::sort(Nodes, [&](auto &L, auto &R) {
1056+
return std::make_pair(IdToFirstTimestamp[L.Id], L.Id) <
1057+
std::make_pair(IdToFirstTimestamp[R.Id], R.Id);
1058+
});
10451059
}
10461060

10471061
#define INSTR_PROF_COMMON_API_IMPL
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# RUN: not llvm-profdata order %s --num-test-traces=10 2>&1 | FileCheck %s
2+
3+
# CHECK: --num-test-traces must be smaller than the total number of traces
4+
5+
# Header
6+
:ir
7+
:temporal_prof_traces
8+
# Num Traces
9+
1
10+
# Trace Stream Size:
11+
1
12+
# Weight
13+
1
14+
a, b
15+
16+
a
17+
# Func Hash:
18+
0x1234
19+
# Num Counters:
20+
1
21+
# Counter Values:
22+
101
23+
24+
b
25+
0x5678
26+
1
27+
202

llvm/test/tools/llvm-profdata/show-order.proftext

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
# RUN: llvm-profdata order %s | FileCheck %s
1+
# RUN: llvm-profdata order %s --num-test-traces=1 | FileCheck %s
2+
3+
# CHECK: # Total area under the page fault curve: 4.000000e+00
24

35
# CHECK: a
46
# CHECK: b
@@ -9,9 +11,9 @@
911
:ir
1012
:temporal_prof_traces
1113
# Num Traces
12-
3
14+
4
1315
# Trace Stream Size:
14-
3
16+
4
1517
# Weight
1618
1
1719
a, main.c:b, c
@@ -21,6 +23,9 @@ a, x, main.c:b, c
2123
# Weight
2224
1
2325
a, main.c:b, c
26+
# Weight
27+
1
28+
a, main.c:b, c, x
2429

2530
a
2631
# Func Hash:

llvm/tools/llvm-profdata/llvm-profdata.cpp

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -340,7 +340,7 @@ cl::opt<unsigned long long> OverlapValueCutoff(
340340
"profile with max count value greater then the parameter value"),
341341
cl::sub(OverlapSubcommand));
342342

343-
// Options unique to show subcommand.
343+
// Options specific to show subcommand.
344344
cl::opt<bool> ShowCounts("counts", cl::init(false),
345345
cl::desc("Show counter values for shown functions"),
346346
cl::sub(ShowSubcommand));
@@ -439,6 +439,14 @@ cl::opt<bool> ShowProfileVersion("profile-version", cl::init(false),
439439
cl::desc("Show profile version. "),
440440
cl::sub(ShowSubcommand));
441441

442+
// Options specific to order subcommand.
443+
cl::opt<unsigned>
444+
NumTestTraces("num-test-traces", cl::init(0),
445+
cl::desc("Keep aside the last <num-test-traces> traces in "
446+
"the profile when computing the function order and "
447+
"instead use them to evaluate that order"),
448+
cl::sub(OrderSubcommand));
449+
442450
// We use this string to indicate that there are
443451
// multiple static functions map to the same name.
444452
const std::string DuplicateNameStr = "----";
@@ -3277,13 +3285,42 @@ static int order_main() {
32773285
// Read all entries
32783286
(void)I;
32793287
}
3280-
auto &Traces = Reader->getTemporalProfTraces();
3281-
auto Nodes = TemporalProfTraceTy::createBPFunctionNodes(Traces);
3288+
ArrayRef Traces = Reader->getTemporalProfTraces();
3289+
if (NumTestTraces && NumTestTraces >= Traces.size())
3290+
exitWithError(
3291+
"--" + NumTestTraces.ArgStr +
3292+
" must be smaller than the total number of traces: expected: < " +
3293+
Twine(Traces.size()) + ", actual: " + Twine(NumTestTraces));
3294+
ArrayRef TestTraces = Traces.take_back(NumTestTraces);
3295+
Traces = Traces.drop_back(NumTestTraces);
3296+
3297+
std::vector<BPFunctionNode> Nodes;
3298+
TemporalProfTraceTy::createBPFunctionNodes(Traces, Nodes);
32823299
BalancedPartitioningConfig Config;
32833300
BalancedPartitioning BP(Config);
32843301
BP.run(Nodes);
32853302

32863303
OS << "# Ordered " << Nodes.size() << " functions\n";
3304+
if (!TestTraces.empty()) {
3305+
// Since we don't know the symbol sizes, we assume 32 functions per page.
3306+
DenseMap<BPFunctionNode::IDT, unsigned> IdToPageNumber;
3307+
for (auto &Node : Nodes)
3308+
IdToPageNumber[Node.Id] = IdToPageNumber.size() / 32;
3309+
3310+
SmallSet<unsigned, 0> TouchedPages;
3311+
unsigned Area = 0;
3312+
for (auto &Trace : TestTraces) {
3313+
for (auto Id : Trace.FunctionNameRefs) {
3314+
auto It = IdToPageNumber.find(Id);
3315+
if (It == IdToPageNumber.end())
3316+
continue;
3317+
TouchedPages.insert(It->getSecond());
3318+
Area += TouchedPages.size();
3319+
}
3320+
TouchedPages.clear();
3321+
}
3322+
OS << "# Total area under the page fault curve: " << (float)Area << "\n";
3323+
}
32873324
OS << "# Warning: Mach-O may prefix symbols with \"_\" depending on the "
32883325
"linkage and this output does not take that into account. Some "
32893326
"post-processing may be required before passing to the linker via "

llvm/unittests/ProfileData/BPFunctionNodeTest.cpp

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88

99
#include "llvm/ProfileData/InstrProf.h"
1010
#include "llvm/Support/BalancedPartitioning.h"
11-
#include "llvm/Testing/Support/SupportHelpers.h"
1211
#include "gmock/gmock.h"
1312
#include "gtest/gtest.h"
1413

@@ -31,22 +30,32 @@ TEST(BPFunctionNodeTest, Basic) {
3130
UnorderedElementsAreArray(UNs)));
3231
};
3332

34-
auto Nodes = TemporalProfTraceTy::createBPFunctionNodes({
35-
TemporalProfTraceTy({0, 1, 2, 3}),
36-
});
33+
std::vector<BPFunctionNode> Nodes;
34+
TemporalProfTraceTy::createBPFunctionNodes(
35+
{TemporalProfTraceTy({0, 1, 2, 3})}, Nodes, /*RemoveOutlierUNs=*/false);
36+
// Outlier filtering is disabled here, so every computed utility node is kept.
3737
EXPECT_THAT(Nodes,
3838
UnorderedElementsAre(NodeIs(0, {0, 1, 2}), NodeIs(1, {1, 2}),
39-
NodeIs(2, {1, 2}), NodeIs(3, {2})));
39+
NodeIs(2, {2}), NodeIs(3, {2})));
4040

41-
Nodes = TemporalProfTraceTy::createBPFunctionNodes({
42-
TemporalProfTraceTy({0, 1, 2, 3, 4}),
43-
TemporalProfTraceTy({4, 2}),
44-
});
41+
Nodes.clear();
42+
TemporalProfTraceTy::createBPFunctionNodes(
43+
{TemporalProfTraceTy({0, 1, 2, 3, 4}), TemporalProfTraceTy({4, 2})},
44+
Nodes, /*RemoveOutlierUNs=*/false);
4545

4646
EXPECT_THAT(Nodes,
47-
UnorderedElementsAre(NodeIs(0, {0, 1, 2}), NodeIs(1, {1, 2}),
48-
NodeIs(2, {1, 2, 4, 5}), NodeIs(3, {2}),
49-
NodeIs(4, {2, 3, 4, 5})));
47+
UnorderedElementsAre(NodeIs(0, {0, 1, 2, 3}),
48+
NodeIs(1, {1, 2, 3}), NodeIs(2, {2, 3, 5}),
49+
NodeIs(3, {2, 3}), NodeIs(4, {3, 4, 5})));
50+
51+
Nodes.clear();
52+
TemporalProfTraceTy::createBPFunctionNodes(
53+
{TemporalProfTraceTy({0, 1, 2, 3, 4}), TemporalProfTraceTy({4, 2})},
54+
Nodes, /*RemoveOutlierUNs=*/true);
55+
56+
EXPECT_THAT(Nodes, UnorderedElementsAre(NodeIs(0, {1}), NodeIs(1, {1}),
57+
NodeIs(2, {5}), NodeIs(3, {}),
58+
NodeIs(4, {5})));
5059
}
5160

5261
} // end namespace llvm

0 commit comments

Comments
 (0)