Skip to content

Commit 93a2c29

Browse files
spupyrev authored and htyu committed
profi - a flow-based profile inference algorithm: Part III (out of 3)
This is a continuation of D109860 and D109903. An important challenge for profile inference is caused by the fact that the sample profile is collected on a fully optimized binary, while the block and edge frequencies are consumed at an early stage of the compilation that operates with a non-optimized IR. As a result, some of the basic blocks may not have associated sample counts, and it is up to the algorithm to deduce missing frequencies. The problem is illustrated in the figure where three basic blocks are not present in the optimized binary and hence receive no samples during profiling. We found that it is beneficial to treat all such blocks equally. Otherwise, the compiler may decide that some blocks are “cold” and apply undesirable optimizations (e.g., hot-cold splitting) that regress performance. Therefore, we want to distribute the counts evenly along the blocks with missing samples. This is achieved by a post-processing step that identifies "dangling" subgraphs consisting of basic blocks with no sampled counts; once the subgraphs are found, we rebalance the flow so that every branch probability is 50:50 within the subgraphs. Our experiments indicate up to 1% performance win using the optimization on some binaries and a significant improvement in the quality of profile counts (when compared to ground-truth instrumentation-based counts). {F19093045} Reviewed By: hoy Differential Revision: https://reviews.llvm.org/D109980
1 parent b87fe58 commit 93a2c29

File tree

6 files changed

+564
-9
lines changed

6 files changed

+564
-9
lines changed

llvm/lib/Transforms/Utils/SampleProfileInference.cpp

Lines changed: 184 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ class MinCostMaxFlow {
220220
Now = Pred;
221221
}
222222

223-
assert(PathCapacity > 0 && "found incorrect augmenting path");
223+
assert(PathCapacity > 0 && "found an incorrect augmenting path");
224224

225225
// Update the flow along the path
226226
Now = Target;
@@ -271,7 +271,22 @@ class MinCostMaxFlow {
271271
uint64_t Target;
272272
};
273273

274-
/// Post-processing adjustment of the control flow.
274+
/// A post-processing adjustment of control flow. It applies two steps by
275+
/// rerouting some flow and making it more realistic:
276+
///
277+
/// - First, it removes all isolated components ("islands") with a positive flow
278+
/// that are unreachable from the entry block. For every such component, we
279+
/// find the shortest path from the entry to an exit passing through the component,
280+
/// and increase the flow by one unit along the path.
281+
///
282+
/// - Second, it identifies all "unknown subgraphs" consisting of basic blocks
283+
/// with no sampled counts. Then it rebalances the flow that goes through such
284+
/// a subgraph so that each branch is taken with probability 50%.
285+
/// An unknown subgraph is such that for every two nodes u and v:
286+
/// - u dominates v and u is not unknown;
287+
/// - v post-dominates u; and
288+
/// - all inner-nodes of all (u,v)-paths are unknown.
289+
///
275290
class FlowAdjuster {
276291
public:
277292
FlowAdjuster(FlowFunction &Func) : Func(Func) {
@@ -281,14 +296,16 @@ class FlowAdjuster {
281296

282297
// Run the post-processing
283298
void run() {
284-
/// We adjust the control flow in a function so as to remove all
285-
/// "isolated" components with positive flow that are unreachable
286-
/// from the entry block. For every such component, we find the shortest
287-
/// path from the entry to an exit passing through the component, and
288-
/// increase the flow by one unit along the path.
299+
/// Adjust the flow to get rid of isolated components.
289300
joinIsolatedComponents();
301+
302+
/// Rebalance the flow inside unknown subgraphs.
303+
rebalanceUnknownSubgraphs();
290304
}
291305

306+
/// The probability for the first successor of an unknown subgraph
307+
static constexpr double UnknownFirstSuccProbability = 0.5;
308+
292309
private:
293310
void joinIsolatedComponents() {
294311
// Find blocks that are reachable from the source
@@ -315,7 +332,7 @@ class FlowAdjuster {
315332
}
316333
}
317334

318-
/// Run bfs from a given block along the jumps with a positive flow and mark
335+
/// Run BFS from a given block along the jumps with a positive flow and mark
319336
/// all reachable blocks.
320337
void findReachable(uint64_t Src, std::vector<bool> &Visited) {
321338
if (Visited[Src])
@@ -435,6 +452,164 @@ class FlowAdjuster {
435452

436453
/// Return the number of blocks of the function being adjusted, i.e. the size
/// of Func.Blocks.
uint64_t NumBlocks() const { return Func.Blocks.size(); }
437454

455+
/// Rebalance unknown subgraphs so that each branch splits with probabilities
456+
/// UnknownFirstSuccProbability and 1 - UnknownFirstSuccProbability.
///
/// Every block of Func.Blocks that is not unknown and has a non-zero flow is
/// tried as the source of a subgraph. A subgraph is rebalanced only when
/// findUnknownSubgraph() accepts it and isAcyclicSubgraph() finds no cycle;
/// otherwise the flow around that source block is left untouched.
457+
void rebalanceUnknownSubgraphs() {
458+
assert(UnknownFirstSuccProbability >= 0.0 &&
459+
UnknownFirstSuccProbability <= 1.0 &&
460+
"the share of the unknown successor should be between 0 and 1");
461+
// Try to find unknown subgraphs from each non-unknown block
462+
for (uint64_t I = 0; I < Func.Blocks.size(); I++) {
463+
auto SrcBlock = &Func.Blocks[I];
464+
// Do not attempt to find unknown successors from an unknown or a
465+
// zero-flow block
466+
if (SrcBlock->UnknownWeight || SrcBlock->Flow == 0)
467+
continue;
468+
469+
// Out-parameters filled by findUnknownSubgraph: the unknown blocks reachable
// from SrcBlock and the single non-unknown block that ends the subgraph
std::vector<FlowBlock *> UnknownSuccs;
470+
FlowBlock *DstBlock = nullptr;
471+
// Find an unknown subgraph starting at block SrcBlock
472+
if (!findUnknownSubgraph(SrcBlock, DstBlock, UnknownSuccs))
473+
continue;
474+
// At the moment, we do not rebalance subgraphs containing cycles among
475+
// unknown blocks
// (on success the call also reorders UnknownSuccs topologically)
476+
if (!isAcyclicSubgraph(SrcBlock, DstBlock, UnknownSuccs))
477+
continue;
478+
479+
// Rebalance the flow
480+
rebalanceUnknownSubgraph(SrcBlock, DstBlock, UnknownSuccs);
481+
}
482+
}
483+
484+
/// Find an unknown subgraph starting at block SrcBlock.
485+
/// If the search is successful, the method sets DstBlock and UnknownSuccs.
///
/// \param SrcBlock     the block the search starts from.
/// \param DstBlock     [out] reset to nullptr on entry; on success, the only
///                     non-unknown block reached from SrcBlock.
/// \param UnknownSuccs [out] the unknown blocks reached from SrcBlock, in BFS
///                     discovery order. Blocks are appended; the vector is not
///                     cleared here.
/// \returns true iff at least one unknown block was reached, exactly one
///          non-unknown block was reached, and no reached unknown block is an
///          exit block. When false is returned, DstBlock and UnknownSuccs may
///          hold partial results.
486+
bool findUnknownSubgraph(FlowBlock *SrcBlock, FlowBlock *&DstBlock,
487+
std::vector<FlowBlock *> &UnknownSuccs) {
488+
// Run BFS from SrcBlock and make sure all paths are going through unknown
489+
// blocks and end at a non-unknown DstBlock
490+
auto Visited = std::vector<bool>(NumBlocks(), false);
491+
std::queue<uint64_t> Queue;
492+
DstBlock = nullptr;
493+
494+
Queue.push(SrcBlock->Index);
495+
// SrcBlock is marked as visited up front, so jumps leading back to it are
// skipped by this search
Visited[SrcBlock->Index] = true;
496+
while (!Queue.empty()) {
497+
auto &Block = Func.Blocks[Queue.front()];
498+
Queue.pop();
499+
// Process blocks reachable from Block
500+
for (auto Jump : Block.SuccJumps) {
501+
uint64_t Dst = Jump->Target;
502+
if (Visited[Dst])
503+
continue;
504+
Visited[Dst] = true;
505+
if (!Func.Blocks[Dst].UnknownWeight) {
506+
// If we see a non-unique non-unknown block reachable from SrcBlock,
507+
// stop processing and skip rebalancing
508+
FlowBlock *CandidateDstBlock = &Func.Blocks[Dst];
509+
if (DstBlock != nullptr && DstBlock != CandidateDstBlock)
510+
return false;
511+
DstBlock = CandidateDstBlock;
512+
} else {
513+
// Only unknown blocks are expanded further; the search does not continue
// past a non-unknown block
Queue.push(Dst);
514+
UnknownSuccs.push_back(&Func.Blocks[Dst]);
515+
}
516+
}
517+
}
518+
519+
// If the list of unknown blocks is empty, we don't need rebalancing
520+
if (UnknownSuccs.empty())
521+
return false;
522+
// If all reachable nodes from SrcBlock are unknown, skip rebalancing
523+
if (DstBlock == nullptr)
524+
return false;
525+
// If any of the unknown blocks is an exit block, skip rebalancing
526+
for (auto Block : UnknownSuccs) {
527+
if (Block->isExit())
528+
return false;
529+
}
530+
531+
return true;
532+
}
533+
534+
/// Verify if the given unknown subgraph is acyclic, and if yes, reorder
535+
/// UnknownSuccs in the topological order (so that all jumps are "forward").
///
/// \param SrcBlock     the source block of the subgraph.
/// \param DstBlock     the block at which the traversal stops; it is
///                     dereferenced unconditionally, so it must not be null.
/// \param UnknownSuccs [in,out] the unknown blocks of the subgraph. On success
///                     it is replaced by the computed order, whose first
///                     element is SrcBlock; on failure it is left unchanged.
/// \returns true iff the traversal emitted SrcBlock and every block of
///          UnknownSuccs, i.e. no cycle was found.
536+
bool isAcyclicSubgraph(FlowBlock *SrcBlock, FlowBlock *DstBlock,
537+
std::vector<FlowBlock *> &UnknownSuccs) {
538+
// Extract local in-degrees in the considered subgraph
// (only jumps leaving SrcBlock or one of the unknown blocks are counted)
539+
auto LocalInDegree = std::vector<uint64_t>(NumBlocks(), 0);
540+
for (auto Jump : SrcBlock->SuccJumps) {
541+
LocalInDegree[Jump->Target]++;
542+
}
543+
for (uint64_t I = 0; I < UnknownSuccs.size(); I++) {
544+
for (auto Jump : UnknownSuccs[I]->SuccJumps) {
545+
LocalInDegree[Jump->Target]++;
546+
}
547+
}
548+
// A loop containing SrcBlock
549+
if (LocalInDegree[SrcBlock->Index] > 0)
550+
return false;
551+
552+
// Emit blocks in the order in which their local in-degree drops to zero,
// starting from SrcBlock
std::vector<FlowBlock *> AcyclicOrder;
553+
std::queue<uint64_t> Queue;
554+
Queue.push(SrcBlock->Index);
555+
while (!Queue.empty()) {
556+
auto &Block = Func.Blocks[Queue.front()];
557+
Queue.pop();
558+
// Stop propagation once we reach DstBlock
559+
if (Block.Index == DstBlock->Index)
560+
break;
561+
562+
AcyclicOrder.push_back(&Block);
563+
// Add to the queue all successors with zero local in-degree
564+
for (auto Jump : Block.SuccJumps) {
565+
uint64_t Dst = Jump->Target;
566+
LocalInDegree[Dst]--;
567+
if (LocalInDegree[Dst] == 0) {
568+
Queue.push(Dst);
569+
}
570+
}
571+
}
572+
573+
// If there is a cycle in the subgraph, AcyclicOrder contains only a subset
574+
// of all blocks
// (the "+ 1" accounts for SrcBlock, which is emitted into AcyclicOrder in
// addition to the unknown blocks)
575+
if (UnknownSuccs.size() + 1 != AcyclicOrder.size())
576+
return false;
577+
UnknownSuccs = AcyclicOrder;
578+
return true;
579+
}
580+
581+
/// Rebalance a given subgraph.
///
/// Blocks are processed in the order given by UnknownSuccs. For SrcBlock the
/// existing flow is kept; for every other block the flow is recomputed as the
/// sum of the flows of its incoming jumps. The block's flow is then split over
/// its successor jumps: every jump but the last receives the share
/// UnknownFirstSuccProbability of the flow still undistributed (truncated to
/// an integer), and the last jump receives the remainder, so the outgoing
/// flows add up to the block's flow.
///
/// \param SrcBlock     the source block; must have a positive flow and be the
///                     first element of UnknownSuccs (both are asserted).
/// \param DstBlock     not referenced by this implementation.
/// \param UnknownSuccs the blocks to process; the split of a block relies on
///                     the flows of its incoming jumps being final already.
582+
void rebalanceUnknownSubgraph(FlowBlock *SrcBlock, FlowBlock *DstBlock,
583+
std::vector<FlowBlock *> &UnknownSuccs) {
584+
assert(SrcBlock->Flow > 0 && "zero-flow block in unknown subgraph");
585+
assert(UnknownSuccs.front() == SrcBlock && "incorrect order of unknowns");
586+
587+
for (auto Block : UnknownSuccs) {
588+
// Block's flow is the sum of incoming flows
589+
uint64_t TotalFlow = 0;
590+
if (Block == SrcBlock) {
591+
// The flow of the source block is taken as is and is not modified
TotalFlow = Block->Flow;
592+
} else {
593+
for (auto Jump : Block->PredJumps) {
594+
TotalFlow += Jump->Flow;
595+
}
596+
Block->Flow = TotalFlow;
597+
}
598+
599+
// Process all successor jumps and update corresponding flow values
600+
for (uint64_t I = 0; I < Block->SuccJumps.size(); I++) {
601+
auto Jump = Block->SuccJumps[I];
602+
// The last successor takes whatever flow is left
if (I + 1 == Block->SuccJumps.size()) {
603+
Jump->Flow = TotalFlow;
604+
continue;
605+
}
606+
// The conversion back to uint64_t truncates the fractional part
uint64_t Flow = uint64_t(TotalFlow * UnknownFirstSuccProbability);
607+
Jump->Flow = Flow;
608+
TotalFlow -= Flow;
609+
}
610+
}
611+
}
612+
438613
/// A constant indicating an arbitrary exit block of a function.
439614
static constexpr uint64_t AnyExitBlock = uint64_t(-1);
440615

@@ -622,7 +797,7 @@ void verifyWeights(const FlowFunction &Func) {
622797
}
623798
}
624799

625-
// Run bfs from the source along edges with positive flow
800+
// Run BFS from the source along edges with positive flow
626801
std::queue<uint64_t> Queue;
627802
auto Visited = std::vector<bool>(NumBlocks, false);
628803
Queue.push(Func.Entry);
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
test_4:100:13293
2+
0: 100
3+
1: 100
4+
2: 40
5+
6+
empty:100:13293
7+
0: 7
8+
1: 100
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
countMultipliers:37078302:0
2+
2: 65536
3+
3: 10
4+
4: 65536
5+
7: 65536
6+
9: 65536
7+
10: 65536
8+
!CFGChecksum: 223598586707
9+
10+
countMultipliers2:37078302:0
11+
1: 2100
12+
2: 2000
13+
6: 2100
14+
!CFGChecksum: 2235985
15+
16+
countMultipliers3:37078302:0
17+
1: 100
18+
2: 100
19+
3: 100
20+
!CFGChecksum: 22985
21+
22+
countMultipliers4:37078302:0
23+
1: 100
24+
2: 50
25+
3: 50
26+
5: 100
27+
!CFGChecksum: 2298578
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
; RUN: opt < %s -passes=sample-profile -sample-profile-use-profi -sample-profile-file=%S/Inputs/profile-inference-noprobes.prof -S | FileCheck %s
2+
; RUN: opt < %s -passes=sample-profile -sample-profile-use-profi -sample-profile-file=%S/Inputs/profile-inference-noprobes.prof | opt -analyze -block-freq -enable-new-pm=0 | FileCheck %s --check-prefix=CHECK2
3+
4+
5+
; The test verifies that profile inference can be applied for non-probe-based
6+
; profiles.
7+
;
8+
; +---------+ +----------+
9+
; | b3 [40] | <-- | b1 [100] |
10+
; +---------+ +----------+
11+
; |
12+
; |
13+
; v
14+
; +----------+
15+
; | b2 [0] |
16+
; +----------+
17+
18+
@yydebug = dso_local global i32 0, align 4
19+
20+
define void @test_4() #0 !dbg !4 {
21+
;entry:
22+
; ret void, !dbg !9
23+
b1:
24+
%0 = load i32, i32* @yydebug, align 4
25+
%cmp = icmp ne i32 %0, 0, !dbg !9
26+
br i1 %cmp, label %b2, label %b3, !dbg !9
27+
; CHECK2: - b1: float = {{.*}}, int = {{.*}}, count = 100
28+
29+
b2:
30+
ret void
31+
; CHECK2: - b2: float = {{.*}}, int = {{.*}}, count = 60
32+
33+
b3:
34+
ret void, !dbg !10
35+
; CHECK2: - b3: float = {{.*}}, int = {{.*}}, count = 40
36+
}
37+
38+
; CHECK: {{.*}} = !{!"function_entry_count", i64 100}
39+
; CHECK: {{.*}} = !{!"branch_weights", i32 60, i32 40}
40+
41+
attributes #0 = { noinline nounwind uwtable "use-sample-profile"}
42+
43+
!llvm.module.flags = !{!6, !7}
44+
45+
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.7.0 (trunk 237249) (llvm/trunk 237261)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
46+
!1 = !DIFile(filename: "entry_counts.c", directory: ".")
47+
!2 = !{}
48+
!4 = distinct !DISubprogram(name: "test_4", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: false, unit: !0, retainedNodes: !2)
49+
!5 = !DISubroutineType(types: !2)
50+
!6 = !{i32 2, !"Dwarf Version", i32 4}
51+
!7 = !{i32 2, !"Debug Info Version", i32 3}
52+
!8 = !{!"clang version 3.7.0 (trunk 237249) (llvm/trunk 237261)"}
53+
!9 = !DILocation(line: 1, column: 15, scope: !4)
54+
!10 = !DILocation(line: 3, column: 15, scope: !4)

0 commit comments

Comments
 (0)