-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[MachineSink] Sink into consistent blocks for optsize funcs #115367
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-x86 Author: Ellis Hoag (ellishg) ChangesDo not consider profile data when choosing a successor block to sink into for optsize functions. This should result in more consistent instruction sequences which will improve outlining and ICF. We've observed a slight codesize improvement in a large binary. This is similar reasoning to #114607. Using profile data to select a block to sink into was original added in d04f759. Full diff: https://github.com/llvm/llvm-project/pull/115367.diff 2 Files Affected:
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 658ebd47488c71..3293f7c982749d 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -26,6 +26,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
@@ -38,6 +39,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
@@ -122,6 +124,7 @@ namespace {
MachineDominatorTree *DT = nullptr; // Machine dominator tree
MachinePostDominatorTree *PDT = nullptr; // Machine post dominator tree
MachineCycleInfo *CI = nullptr;
+ ProfileSummaryInfo *PSI = nullptr;
MachineBlockFrequencyInfo *MBFI = nullptr;
const MachineBranchProbabilityInfo *MBPI = nullptr;
AliasAnalysis *AA = nullptr;
@@ -198,6 +201,7 @@ namespace {
AU.addRequired<MachineBranchProbabilityInfoWrapperPass>();
AU.addPreserved<MachineCycleInfoWrapperPass>();
AU.addPreserved<MachineLoopInfoWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
if (UseBlockFreqInfo)
AU.addRequired<MachineBlockFrequencyInfoWrapperPass>();
AU.addRequired<TargetPassConfig>();
@@ -284,6 +288,7 @@ char &llvm::MachineSinkingID = MachineSinking::ID;
INITIALIZE_PASS_BEGIN(MachineSinking, DEBUG_TYPE,
"Machine code sinking", false, false)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass)
@@ -722,6 +727,7 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
DT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
CI = &getAnalysis<MachineCycleInfoWrapperPass>().getCycleInfo();
+ PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
MBFI = UseBlockFreqInfo
? &getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI()
: nullptr;
@@ -1217,12 +1223,12 @@ MachineSinking::GetAllSortedSuccessors(MachineInstr &MI, MachineBasicBlock *MBB,
// Sort Successors according to their cycle depth or block frequency info.
llvm::stable_sort(
- AllSuccs, [this](const MachineBasicBlock *L, const MachineBasicBlock *R) {
+ AllSuccs, [&](const MachineBasicBlock *L, const MachineBasicBlock *R) {
uint64_t LHSFreq = MBFI ? MBFI->getBlockFreq(L).getFrequency() : 0;
uint64_t RHSFreq = MBFI ? MBFI->getBlockFreq(R).getFrequency() : 0;
- bool HasBlockFreq = LHSFreq != 0 || RHSFreq != 0;
- return HasBlockFreq ? LHSFreq < RHSFreq
- : CI->getCycleDepth(L) < CI->getCycleDepth(R);
+ if (llvm::shouldOptimizeForSize(MBB, PSI, MBFI) || !LHSFreq || !RHSFreq)
+ return CI->getCycleDepth(L) < CI->getCycleDepth(R);
+ return LHSFreq < RHSFreq;
});
auto it = AllSuccessors.insert(std::make_pair(MBB, AllSuccs));
diff --git a/llvm/test/CodeGen/X86/sink-blockfreq.ll b/llvm/test/CodeGen/X86/sink-blockfreq.ll
index cad9cf81905cd4..c2653a86f53aff 100644
--- a/llvm/test/CodeGen/X86/sink-blockfreq.ll
+++ b/llvm/test/CodeGen/X86/sink-blockfreq.ll
@@ -1,12 +1,13 @@
; RUN: llc -disable-preheader-prot=true -disable-machine-licm -machine-sink-bfi=true -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_BFI
; RUN: llc -disable-preheader-prot=true -disable-machine-licm -machine-sink-bfi=false -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_NOBFI
+; RUN: llc -disable-preheader-prot=true -disable-machine-licm -machine-sink-bfi=true -force-pgso -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_NOBFI
; Test that by changing BlockFrequencyInfo we change the order in which
; machine-sink looks for successor blocks. By not using BFI, both G and B
; have the same loop depth and no instructions is sinked - B is selected but
; can't be used as to avoid breaking a non profitable critical edge. By using
; BFI, "mul" is sinked into the less frequent block G.
-define i32 @sink_freqinfo(i32 %a, i32 %b) nounwind uwtable ssp {
+define i32 @sink_freqinfo(i32 %a, i32 %b) nounwind uwtable ssp !prof !14 {
; MSINK_BFI-LABEL: sink_freqinfo
; MSINK_BFI: jl
; MSINK_BFI-NEXT: ## %bb.
@@ -22,24 +23,40 @@ B:
%ee = phi i32 [ 0, %entry ], [ %inc, %F ]
%xx = sub i32 %a, %ee
%cond0 = icmp slt i32 %xx, 0
- br i1 %cond0, label %F, label %exit, !prof !0
+ br i1 %cond0, label %F, label %exit, !prof !15
F:
%inc = add nsw i32 %xx, 2
%aa = mul nsw i32 %b, %inc
%exitcond = icmp slt i32 %inc, %a
- br i1 %exitcond, label %B, label %G, !prof !1
+ br i1 %exitcond, label %B, label %G, !prof !16
G:
%ii = add nsw i32 %aa, %a
%ll = add i32 %b, 45
%exitcond2 = icmp sge i32 %ii, %b
- br i1 %exitcond2, label %G, label %exit, !prof !2
+ br i1 %exitcond2, label %G, label %exit, !prof !17
exit:
ret i32 0
}
-!0 = !{!"branch_weights", i32 4, i32 1}
-!1 = !{!"branch_weights", i32 128, i32 1}
-!2 = !{!"branch_weights", i32 1, i32 1}
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 1000}
+!15 = !{!"branch_weights", i32 4, i32 1}
+!16 = !{!"branch_weights", i32 128, i32 1}
+!17 = !{!"branch_weights", i32 1, i32 1}
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks good to me, but let others review it as well.
The new comparator passed to stable-sort doesn't follow a strict-weak ordering, and so when running under Address Sanitizer, we can (and do!) fail at runtime with |
(I'm not sure how to give you a repro for this -- I've only seen it happen once, on a Google-proprietary JIT test that I don't think I can easily provide IR for. That said, assuming the diagnostic re: strict-weak-ordering is correct, it should be simple to verify and fix via inspection...) |
Thanks! I'm looking into it now. |
Should be fixed by #116705. Sorry this was just a logic bug. Let me know if your test is fixed. |
Thanks -- this does indeed fix the failure case I have! LGTM |
Fix the comparator in `stable_sort()` to satisfy the strict weak ordering requirement. In #115367 this comparator was changed to use `getCycleDepth()` when `shouldOptimizeForSize()` is true. However, I mistakenly changed to logic so that we use `LHSFreq < RHSFreq` if **either** of them are zero. This causes us to fail the last requirment (https://en.cppreference.com/w/cpp/named_req/Compare). > if comp(a, b) == true and comp(b, c) == true then comp(a, c) == true
Do not consider profile data when choosing a successor block to sink into for optsize functions. This should result in more consistent instruction sequences which will improve outlining and ICF. We've observed a slight codesize improvement in a large binary. This is similar reasoning to #114607.
Using profile data to select a block to sink into was original added in d04f759.