Skip to content

Commit eb07c55

Browse files
SC llvm teamSC llvm team
authored andcommitted
Merged main:3bd517205799 into amd-gfx:404f2546852a
Local branch amd-gfx 404f254 Merged main:ca66df3b0210 into amd-gfx:a98fd9a07d19 Remote branch main 3bd5172 Reland "[CodeGen] Port SafeStack to new pass manager (llvm#74027)
2 parents 404f254 + 3bd5172 commit eb07c55

File tree

82 files changed

+752
-118
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

82 files changed

+752
-118
lines changed

bolt/lib/Passes/SplitFunctions.cpp

Lines changed: 200 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,16 @@ static cl::opt<double> CallScale(
114114
"call-scale",
115115
cl::desc("Call score scale coefficient (when --split-strategy=cdsplit)"),
116116
cl::init(0.95), cl::ReallyHidden, cl::cat(BoltOptCategory));
117+
118+
// Exponent applied to (call length + 1) in the denominator of the call edge
// score. Only consulted when --split-strategy=cdsplit.
static cl::opt<double> CallPower(
    "call-power",
    cl::desc("Call score power (when --split-strategy=cdsplit)"),
    cl::init(0.05), cl::ReallyHidden, cl::cat(BoltOptCategory));
122+
123+
// Exponent applied to (jump length + 1) in the denominator of the jump edge
// score. Only consulted when --split-strategy=cdsplit.
static cl::opt<double> JumpPower(
    "jump-power",
    cl::desc("Jump score power (when --split-strategy=cdsplit)"),
    cl::init(0.15), cl::ReallyHidden, cl::cat(BoltOptCategory));
117127
} // namespace opts
118128

119129
namespace {
@@ -195,6 +205,13 @@ struct SplitCacheDirected final : public SplitStrategy {
195205
size_t Count;
196206
};
197207

208+
/// Outcome of evaluating one candidate split index.
struct SplitScore {
  /// Candidate index of the last hot block. Initialized to 0 so that a
  /// default-constructed score never carries an indeterminate index, e.g.
  /// when it is returned without any candidate having been evaluated.
  size_t SplitIndex = 0;
  /// Estimated reduction of the hot fragment size for this candidate.
  size_t HotSizeReduction = 0;
  /// Sum of the call and jump scores local to the function.
  double LocalScore = 0;
  /// Sum of the cover call scores; remains 0 when it was not computed.
  double CoverCallScore = 0;
};
214+
198215
// Auxiliary variables used by the algorithm.
199216
size_t TotalNumBlocks{0};
200217
size_t OrigHotSectionSize{0};
@@ -340,8 +357,9 @@ struct SplitCacheDirected final : public SplitStrategy {
340357
// We only care about new addresses of blocks in hot/warm.
341358
if (BB->getFragmentNum() == FragmentNum::cold())
342359
break;
360+
const size_t NewSize = BB->getOutputSize();
343361
BB->setOutputStartAddress(CurrentAddr);
344-
CurrentAddr += BB->getOutputSize();
362+
CurrentAddr += NewSize;
345363
BB->setOutputEndAddress(CurrentAddr);
346364
if (BB->getLayoutIndex() == SplitIndex) {
347365
NewHotEndAddr = CurrentAddr;
@@ -402,13 +420,192 @@ struct SplitCacheDirected final : public SplitStrategy {
402420
return CoverCalls;
403421
}
404422

423+
/// Compute the edge score of a call edge.
424+
double computeCallScore(uint64_t CallCount, size_t CallLength) {
425+
// Increase call lengths by 1 to avoid raising 0 to a negative power.
426+
return opts::CallScale * static_cast<double>(CallCount) /
427+
std::pow(static_cast<double>(CallLength + 1), opts::CallPower);
428+
}
429+
430+
/// Compute the edge score of a jump (branch) edge.
431+
double computeJumpScore(uint64_t JumpCount, size_t JumpLength) {
432+
// Increase jump lengths by 1 to avoid raising 0 to a negative power.
433+
return static_cast<double>(JumpCount) /
434+
std::pow(static_cast<double>(JumpLength + 1), opts::JumpPower);
435+
}
436+
437+
/// Compute sum of scores over jumps within \p BlockOrder given \p SplitIndex.
438+
/// Increament Score.LocalScore in place by the sum.
439+
void computeJumpScore(const BasicBlockOrder &BlockOrder,
440+
const size_t SplitIndex, SplitScore &Score) {
441+
442+
for (const BinaryBasicBlock *SrcBB : BlockOrder) {
443+
if (SrcBB->getKnownExecutionCount() == 0)
444+
continue;
445+
446+
const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;
447+
448+
for (const auto Pair : zip(SrcBB->successors(), SrcBB->branch_info())) {
449+
const BinaryBasicBlock *DstBB = std::get<0>(Pair);
450+
const BinaryBasicBlock::BinaryBranchInfo &Branch = std::get<1>(Pair);
451+
const size_t JumpCount = Branch.Count;
452+
453+
if (JumpCount == 0)
454+
continue;
455+
456+
const size_t DstBBStartAddr = DstBB->getOutputAddressRange().first;
457+
const size_t NewJumpLength =
458+
AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
459+
Score.LocalScore += computeJumpScore(JumpCount, NewJumpLength);
460+
}
461+
}
462+
}
463+
464+
/// Compute sum of scores over calls originated in the current function
465+
/// given \p SplitIndex. Increament Score.LocalScore in place by the sum.
466+
void computeLocalCallScore(const BasicBlockOrder &BlockOrder,
467+
const size_t SplitIndex, SplitScore &Score) {
468+
if (opts::CallScale == 0)
469+
return;
470+
471+
// Global index of the last block in the current function.
472+
// This is later used to determine whether a call originated in the current
473+
// function is to a function that comes after the current function.
474+
const size_t LastGlobalIndex = GlobalIndices[BlockOrder.back()];
475+
476+
// The length of calls originated in the input function can increase /
477+
// decrease depending on the splitting decision.
478+
for (const BinaryBasicBlock *SrcBB : BlockOrder) {
479+
const size_t CallCount = SrcBB->getKnownExecutionCount();
480+
// If SrcBB does not call any functions, skip it.
481+
if (CallCount == 0)
482+
continue;
483+
484+
// Obtain an estimate on the end address of the src basic block
485+
// after splitting at SplitIndex.
486+
const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;
487+
488+
for (const BinaryBasicBlock *DstBB : Callees[GlobalIndices[SrcBB]]) {
489+
// Obtain an estimate on the start address of the dst basic block
490+
// after splitting at SplitIndex. If DstBB is in a function before
491+
// the current function, then its start address remains unchanged.
492+
size_t DstBBStartAddr = BBOffsets[DstBB];
493+
// If DstBB is in a function after the current function, then its
494+
// start address should be adjusted based on the reduction in hot size.
495+
if (GlobalIndices[DstBB] > LastGlobalIndex) {
496+
assert(DstBBStartAddr >= Score.HotSizeReduction);
497+
DstBBStartAddr -= Score.HotSizeReduction;
498+
}
499+
const size_t NewCallLength =
500+
AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
501+
Score.LocalScore += computeCallScore(CallCount, NewCallLength);
502+
}
503+
}
504+
}
505+
506+
/// Compute sum of splitting scores for cover calls of the input function.
507+
/// Increament Score.CoverCallScore in place by the sum.
508+
void computeCoverCallScore(const BasicBlockOrder &BlockOrder,
509+
const size_t SplitIndex,
510+
const std::vector<CallInfo> &CoverCalls,
511+
SplitScore &Score) {
512+
if (opts::CallScale == 0)
513+
return;
514+
515+
for (const CallInfo CI : CoverCalls) {
516+
assert(CI.Length >= Score.HotSizeReduction &&
517+
"Length of cover calls must exceed reduced size of hot fragment.");
518+
// Compute the new length of the call, which is shorter than the original
519+
// one by the size of the splitted fragment minus the total size increase.
520+
const size_t NewCallLength = CI.Length - Score.HotSizeReduction;
521+
Score.CoverCallScore += computeCallScore(CI.Count, NewCallLength);
522+
}
523+
}
524+
525+
/// Compute the split score of splitting a function at a given index.
526+
/// The split score consists of local score and cover score. Cover call score
527+
/// is expensive to compute. As a result, we pass in a \p ReferenceScore and
528+
/// compute cover score only when the local score exceeds that in the
529+
/// ReferenceScore or that the size reduction of the hot fragment is larger
530+
/// than that achieved by the split index of the ReferenceScore. This function
531+
/// returns \p Score of SplitScore type. It contains the local score and cover
532+
/// score (if computed) of the current splitting index. For easier book
533+
/// keeping and comparison, it also stores the split index and the resulting
534+
/// reduction in hot fragment size.
535+
SplitScore computeSplitScore(const BinaryFunction &BF,
536+
const BasicBlockOrder &BlockOrder,
537+
const size_t SplitIndex,
538+
const std::vector<CallInfo> &CoverCalls,
539+
const SplitScore &ReferenceScore) {
540+
// Populate BinaryBasicBlock::OutputAddressRange with estimated
541+
// new start and end addresses after hot-warm splitting at SplitIndex.
542+
size_t OldHotEnd;
543+
size_t NewHotEnd;
544+
std::tie(OldHotEnd, NewHotEnd) =
545+
estimatePostSplitBBAddress(BlockOrder, SplitIndex);
546+
547+
SplitScore Score;
548+
Score.SplitIndex = SplitIndex;
549+
550+
// It's not worth splitting if OldHotEnd < NewHotEnd.
551+
if (OldHotEnd < NewHotEnd)
552+
return Score;
553+
554+
// Hot fragment size reduction due to splitting.
555+
Score.HotSizeReduction = OldHotEnd - NewHotEnd;
556+
557+
// First part of LocalScore is the sum over call edges originated in the
558+
// input function. These edges can get shorter or longer depending on
559+
// SplitIndex. Score.LocalScore is increamented in place.
560+
computeLocalCallScore(BlockOrder, SplitIndex, Score);
561+
562+
// Second part of LocalScore is the sum over jump edges with src basic block
563+
// and dst basic block in the current function. Score.LocalScore is
564+
// increamented in place.
565+
computeJumpScore(BlockOrder, SplitIndex, Score);
566+
567+
// There is no need to compute CoverCallScore if we have already found
568+
// another split index with a bigger LocalScore and bigger HotSizeReduction.
569+
if (Score.LocalScore <= ReferenceScore.LocalScore &&
570+
Score.HotSizeReduction <= ReferenceScore.HotSizeReduction)
571+
return Score;
572+
573+
// Compute CoverCallScore and store in Score in place.
574+
computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Score);
575+
return Score;
576+
}
577+
405578
/// Find the best index for splitting. The returned value is the index of the
406579
/// last hot basic block. Hence, "no splitting" is equivalent to returning the
407580
/// value which is one less than the size of the function.
408581
size_t findSplitIndex(const BinaryFunction &BF,
409582
const BasicBlockOrder &BlockOrder) {
410-
// Placeholder: hot-warm split after entry block.
411-
return 0;
583+
// Find all function calls that can be shortened if we move blocks of the
584+
// current function to warm/cold
585+
const std::vector<CallInfo> CoverCalls = extractCoverCalls(BF);
586+
587+
// Try all possible split indices (blocks with Index <= SplitIndex are in
588+
// hot) and find the one maximizing the splitting score.
589+
SplitScore BestScore;
590+
double BestScoreSum = -1.0;
591+
SplitScore ReferenceScore;
592+
for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
593+
const BinaryBasicBlock *LastHotBB = BlockOrder[Index];
594+
// No need to keep cold blocks in the hot section.
595+
if (LastHotBB->getFragmentNum() == FragmentNum::cold())
596+
break;
597+
const SplitScore Score =
598+
computeSplitScore(BF, BlockOrder, Index, CoverCalls, ReferenceScore);
599+
double ScoreSum = Score.LocalScore + Score.CoverCallScore;
600+
if (ScoreSum > BestScoreSum) {
601+
BestScoreSum = ScoreSum;
602+
BestScore = Score;
603+
}
604+
if (Score.LocalScore > ReferenceScore.LocalScore)
605+
ReferenceScore = Score;
606+
}
607+
608+
return BestScore.SplitIndex;
412609
}
413610
};
414611

bolt/test/X86/cdsplit-call-scale.s

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
# Test the control of aggressiveness of 3-way splitting by -call-scale.
2+
# When -call-scale=0.0, the tested function is split 2-way.
3+
# When -call-scale=1.0, the tested function is split 3-way with 5 blocks
4+
# in warm because of the increased benefit of shortening the call edges.
5+
# When -call-scale=1000.0, the tested function is split 3-way with 7 blocks
6+
# in warm because of the strong benefit of shortening the call edges.
7+
8+
# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o
9+
# RUN: link_fdata %s %t.o %t.fdata
10+
# RUN: llvm-strip --strip-unneeded %t.o
11+
# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
12+
# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=cdsplit \
13+
# RUN: --call-scale=0.0 --print-split --print-only=chain \
14+
# RUN: --data=%t.fdata --reorder-blocks=ext-tsp \
15+
# RUN: 2>&1 | FileCheck --check-prefix=LOWINCENTIVE %s
16+
# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=cdsplit \
17+
# RUN: --call-scale=1.0 --print-split --print-only=chain \
18+
# RUN: --data=%t.fdata --reorder-blocks=ext-tsp \
19+
# RUN: 2>&1 | FileCheck --check-prefix=MEDINCENTIVE %s
20+
# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=cdsplit \
21+
# RUN: --call-scale=1000.0 --print-split --print-only=chain \
22+
# RUN: --data=%t.fdata --reorder-blocks=ext-tsp \
23+
# RUN: 2>&1 | FileCheck --check-prefix=HIGHINCENTIVE %s
24+
25+
# LOWINCENTIVE: Binary Function "chain" after split-functions
26+
# LOWINCENTIVE: {{^\.Ltmp5}}
27+
# LOWINCENTIVE: ------- HOT-COLD SPLIT POINT -------
28+
# LOWINCENTIVE: {{^\.LFT1}}
29+
30+
# MEDINCENTIVE: Binary Function "chain" after split-functions
31+
# MEDINCENTIVE: {{^\.Ltmp1}}
32+
# MEDINCENTIVE: ------- HOT-COLD SPLIT POINT -------
33+
# MEDINCENTIVE: {{^\.LFT1}}
34+
# MEDINCENTIVE: ------- HOT-COLD SPLIT POINT -------
35+
# MEDINCENTIVE: {{^\.Ltmp0}}
36+
# MEDINCENTIVE: {{^\.Ltmp2}}
37+
# MEDINCENTIVE: {{^\.Ltmp3}}
38+
# MEDINCENTIVE: {{^\.Ltmp4}}
39+
# MEDINCENTIVE: {{^\.Ltmp5}}
40+
41+
# HIGHINCENTIVE: Binary Function "chain" after split-functions
42+
# HIGHINCENTIVE: {{^\.LBB00}}
43+
# HIGHINCENTIVE: ------- HOT-COLD SPLIT POINT -------
44+
# HIGHINCENTIVE: {{^\.LFT1}}
45+
# HIGHINCENTIVE: ------- HOT-COLD SPLIT POINT -------
46+
# HIGHINCENTIVE: {{^\.LFT0}}
47+
# HIGHINCENTIVE: {{^\.Ltmp1}}
48+
# HIGHINCENTIVE: {{^\.Ltmp0}}
49+
# HIGHINCENTIVE: {{^\.Ltmp2}}
50+
# HIGHINCENTIVE: {{^\.Ltmp3}}
51+
# HIGHINCENTIVE: {{^\.Ltmp4}}
52+
# HIGHINCENTIVE: {{^\.Ltmp5}}
53+
54+
55+
56+
.text
57+
.globl chain
58+
.type chain, @function
59+
chain:
60+
pushq %rbp
61+
movq %rsp, %rbp
62+
cmpl $2, %edi
63+
LLentry_LLchain_start:
64+
jge LLchain_start
65+
# FDATA: 1 chain #LLentry_LLchain_start# 1 chain #LLchain_start# 0 10
66+
# FDATA: 1 chain #LLentry_LLchain_start# 1 chain #LLfast# 0 500
67+
LLfast:
68+
movl $5, %eax
69+
LLfast_LLexit:
70+
jmp LLexit
71+
# FDATA: 1 chain #LLfast_LLexit# 1 chain #LLexit# 0 500
72+
LLchain_start:
73+
movl $10, %eax
74+
LLchain_start_LLchain1:
75+
jge LLchain1
76+
# FDATA: 1 chain #LLchain_start_LLchain1# 1 chain #LLchain1# 0 10
77+
# FDATA: 1 chain #LLchain_start_LLchain1# 1 chain #LLcold# 0 0
78+
LLcold:
79+
addl $1, %eax
80+
addl $1, %eax
81+
addl $1, %eax
82+
addl $1, %eax
83+
addl $1, %eax
84+
addl $1, %eax
85+
LLchain1:
86+
addl $1, %eax
87+
LLchain1_LLchain2:
88+
jmp LLchain2
89+
# FDATA: 1 chain #LLchain1_LLchain2# 1 chain #LLchain2# 0 10
90+
LLchain2:
91+
addl $1, %eax
92+
LLchain2_LLchain3:
93+
jmp LLchain3
94+
# FDATA: 1 chain #LLchain2_LLchain3# 1 chain #LLchain3# 0 10
95+
LLchain3:
96+
addl $1, %eax
97+
addl $1, %eax
98+
addl $1, %eax
99+
addl $1, %eax
100+
addl $1, %eax
101+
LLchain3_LLchain4:
102+
jmp LLchain4
103+
# FDATA: 1 chain #LLchain3_LLchain4# 1 chain #LLchain4# 0 10
104+
LLchain4:
105+
addl $1, %eax
106+
addl $1, %eax
107+
addl $1, %eax
108+
addl $1, %eax
109+
addl $1, %eax
110+
LLchain4_LLexit:
111+
jmp LLexit
112+
# FDATA: 1 chain #LLchain4_LLexit# 1 chain #LLexit# 0 10
113+
LLexit:
114+
popq %rbp
115+
ret
116+
LLchain_end:
117+
.size chain, LLchain_end-chain
118+
119+
120+
.globl main
121+
.type main, @function
122+
main:
123+
pushq %rbp
124+
movq %rsp, %rbp
125+
movl $1, %edi
126+
LLmain_chain1:
127+
call chain
128+
# FDATA: 1 main #LLmain_chain1# 1 chain 0 0 500
129+
movl $4, %edi
130+
LLmain_chain2:
131+
call chain
132+
# FDATA: 1 main #LLmain_chain2# 1 chain 0 0 10
133+
xorl %eax, %eax
134+
popq %rbp
135+
retq
136+
.Lmain_end:
137+
.size main, .Lmain_end-main

0 commit comments

Comments
 (0)