-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[regalloc][basic] Change spill weight for optsize funcs #112960
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
29e080e
to
d5ff0ed
Compare
@@ -113,14 +114,18 @@ class LiveIntervals { | |||
~LiveIntervals(); | |||
|
|||
/// Calculate the spill weight to assign to a single instruction. | |||
/// If \p PSI is provided the calculation is altered for optsize functions. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I understand that this may be confusing, but I thought it was less disruptive than adding a separate `ConsiderOptSize` parameter.
d5ff0ed
to
750531e
Compare
@llvm/pr-subscribers-llvm-regalloc @llvm/pr-subscribers-backend-aarch64 Author: Ellis Hoag (ellishg) Changes: Change the spill weight calculations for `optsize` functions to remove the block frequency multiplier. For those functions, we do not want to consider the runtime cost of spilling, only the codesize cost. I built a large app with the basic and greedy (default) register allocator enabled.
Since I only saw a size win with the basic register allocator, I decided to only change the behavior for that type. Full diff: https://github.com/llvm/llvm-project/pull/112960.diff 6 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/CalcSpillWeights.h b/llvm/include/llvm/CodeGen/CalcSpillWeights.h
index 41b7f10cfc38ac..acb8b762efc643 100644
--- a/llvm/include/llvm/CodeGen/CalcSpillWeights.h
+++ b/llvm/include/llvm/CodeGen/CalcSpillWeights.h
@@ -18,6 +18,7 @@ class LiveIntervals;
class MachineBlockFrequencyInfo;
class MachineFunction;
class MachineLoopInfo;
+class ProfileSummaryInfo;
class VirtRegMap;
/// Normalize the spill weight of a live interval
@@ -47,6 +48,7 @@ class VirtRegMap;
LiveIntervals &LIS;
const VirtRegMap &VRM;
const MachineLoopInfo &Loops;
+ ProfileSummaryInfo *PSI;
const MachineBlockFrequencyInfo &MBFI;
/// Returns true if Reg of live interval LI is used in instruction with many
@@ -56,8 +58,9 @@ class VirtRegMap;
public:
VirtRegAuxInfo(MachineFunction &MF, LiveIntervals &LIS,
const VirtRegMap &VRM, const MachineLoopInfo &Loops,
- const MachineBlockFrequencyInfo &MBFI)
- : MF(MF), LIS(LIS), VRM(VRM), Loops(Loops), MBFI(MBFI) {}
+ const MachineBlockFrequencyInfo &MBFI,
+ ProfileSummaryInfo *PSI = nullptr)
+ : MF(MF), LIS(LIS), VRM(VRM), Loops(Loops), PSI(PSI), MBFI(MBFI) {}
virtual ~VirtRegAuxInfo() = default;
diff --git a/llvm/include/llvm/CodeGen/LiveIntervals.h b/llvm/include/llvm/CodeGen/LiveIntervals.h
index 4c45a9676d6bd1..161bb247a0e968 100644
--- a/llvm/include/llvm/CodeGen/LiveIntervals.h
+++ b/llvm/include/llvm/CodeGen/LiveIntervals.h
@@ -47,6 +47,7 @@ class MachineDominatorTree;
class MachineFunction;
class MachineInstr;
class MachineRegisterInfo;
+class ProfileSummaryInfo;
class raw_ostream;
class TargetInstrInfo;
class VirtRegMap;
@@ -113,14 +114,18 @@ class LiveIntervals {
~LiveIntervals();
/// Calculate the spill weight to assign to a single instruction.
+ /// If \p PSI is provided the calculation is altered for optsize functions.
static float getSpillWeight(bool isDef, bool isUse,
const MachineBlockFrequencyInfo *MBFI,
- const MachineInstr &MI);
+ const MachineInstr &MI,
+ ProfileSummaryInfo *PSI = nullptr);
/// Calculate the spill weight to assign to a single instruction.
+ /// If \p PSI is provided the calculation is altered for optsize functions.
static float getSpillWeight(bool isDef, bool isUse,
const MachineBlockFrequencyInfo *MBFI,
- const MachineBasicBlock *MBB);
+ const MachineBasicBlock *MBB,
+ ProfileSummaryInfo *PSI = nullptr);
LiveInterval &getInterval(Register Reg) {
if (hasInterval(Reg))
diff --git a/llvm/lib/CodeGen/CalcSpillWeights.cpp b/llvm/lib/CodeGen/CalcSpillWeights.cpp
index 88ed2291313c95..f361c956092e88 100644
--- a/llvm/lib/CodeGen/CalcSpillWeights.cpp
+++ b/llvm/lib/CodeGen/CalcSpillWeights.cpp
@@ -199,8 +199,10 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start,
// localLI = COPY other
// ...
// other = COPY localLI
- TotalWeight += LiveIntervals::getSpillWeight(true, false, &MBFI, LocalMBB);
- TotalWeight += LiveIntervals::getSpillWeight(false, true, &MBFI, LocalMBB);
+ TotalWeight +=
+ LiveIntervals::getSpillWeight(true, false, &MBFI, LocalMBB, PSI);
+ TotalWeight +=
+ LiveIntervals::getSpillWeight(false, true, &MBFI, LocalMBB, PSI);
NumInstr += 2;
}
@@ -272,7 +274,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start,
// Calculate instr weight.
bool Reads, Writes;
std::tie(Reads, Writes) = MI->readsWritesVirtualRegister(LI.reg());
- Weight = LiveIntervals::getSpillWeight(Writes, Reads, &MBFI, *MI);
+ Weight = LiveIntervals::getSpillWeight(Writes, Reads, &MBFI, *MI, PSI);
// Give extra weight to what looks like a loop induction variable update.
if (Writes && IsExiting && LIS.isLiveOutOfMBB(LI, MBB))
diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp
index 7ddaaaa915ef17..21a316cf99a217 100644
--- a/llvm/lib/CodeGen/LiveIntervals.cpp
+++ b/llvm/lib/CodeGen/LiveIntervals.cpp
@@ -30,6 +30,7 @@
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/StackMaps.h"
@@ -37,6 +38,7 @@
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/ProfileSummary.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -875,14 +877,23 @@ LiveIntervals::hasPHIKill(const LiveInterval &LI, const VNInfo *VNI) const {
float LiveIntervals::getSpillWeight(bool isDef, bool isUse,
const MachineBlockFrequencyInfo *MBFI,
- const MachineInstr &MI) {
- return getSpillWeight(isDef, isUse, MBFI, MI.getParent());
+ const MachineInstr &MI,
+ ProfileSummaryInfo *PSI) {
+ return getSpillWeight(isDef, isUse, MBFI, MI.getParent(), PSI);
}
float LiveIntervals::getSpillWeight(bool isDef, bool isUse,
const MachineBlockFrequencyInfo *MBFI,
- const MachineBasicBlock *MBB) {
- return (isDef + isUse) * MBFI->getBlockFreqRelativeToEntryBlock(MBB);
+ const MachineBasicBlock *MBB,
+ ProfileSummaryInfo *PSI) {
+ float Weight = isDef + isUse;
+ const auto *MF = MBB->getParent();
+ // When optimizing for size we only consider the codesize impact of spilling
+ // the register, not the runtime impact.
+ if (PSI && (MF->getFunction().hasOptSize() ||
+ llvm::shouldOptimizeForSize(MF, PSI, MBFI)))
+ return Weight;
+ return Weight * MBFI->getBlockFreqRelativeToEntryBlock(MBB);
}
LiveRange::Segment
diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp
index caf9c32a5a3498..046784c386e301 100644
--- a/llvm/lib/CodeGen/RegAllocBasic.cpp
+++ b/llvm/lib/CodeGen/RegAllocBasic.cpp
@@ -14,6 +14,7 @@
#include "AllocationOrder.h"
#include "RegAllocBase.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/CalcSpillWeights.h"
#include "llvm/CodeGen/LiveDebugVariables.h"
#include "llvm/CodeGen/LiveIntervals.h"
@@ -140,6 +141,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_END(RABasic, "regallocbasic", "Basic Register Allocator", false,
false)
@@ -182,6 +184,7 @@ void RABasic::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<LiveDebugVariables>();
AU.addRequired<LiveStacks>();
AU.addPreserved<LiveStacks>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
AU.addRequired<MachineBlockFrequencyInfoWrapperPass>();
AU.addPreserved<MachineBlockFrequencyInfoWrapperPass>();
AU.addRequiredID(MachineDominatorsID);
@@ -312,7 +315,8 @@ bool RABasic::runOnMachineFunction(MachineFunction &mf) {
getAnalysis<LiveRegMatrix>());
VirtRegAuxInfo VRAI(
*MF, *LIS, *VRM, getAnalysis<MachineLoopInfoWrapperPass>().getLI(),
- getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI());
+ getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI(),
+ &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI());
VRAI.calculateSpillWeightsAndHints();
SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM, VRAI));
diff --git a/llvm/test/CodeGen/AArch64/regalloc-spill-weight-basic.ll b/llvm/test/CodeGen/AArch64/regalloc-spill-weight-basic.ll
new file mode 100644
index 00000000000000..5c3bd984087ec1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/regalloc-spill-weight-basic.ll
@@ -0,0 +1,168 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+
+; RUN: llc < %s -mtriple=aarch64 -regalloc=basic | FileCheck %s
+
+; Test that the register allocator behaves differently with minsize functions.
+
+declare void @foo(i32, ptr)
+
+define void @optsize(i32 %arg, i32 %arg1, ptr %arg2, ptr %arg3, ptr %arg4, i32 %arg5, i1 %arg6) minsize {
+; CHECK-LABEL: optsize:
+; CHECK: // %bb.0: // %bb
+; CHECK-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w21, -24
+; CHECK-NEXT: .cfi_offset w22, -32
+; CHECK-NEXT: .cfi_offset w23, -40
+; CHECK-NEXT: .cfi_offset w30, -48
+; CHECK-NEXT: mov w23, w5
+; CHECK-NEXT: mov x22, x4
+; CHECK-NEXT: mov x21, x3
+; CHECK-NEXT: mov x20, x2
+; CHECK-NEXT: mov w19, w1
+; CHECK-NEXT: .LBB0_1: // %bb8
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: cbz w19, .LBB0_1
+; CHECK-NEXT: // %bb.2: // %bb8
+; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: cmp w19, #39
+; CHECK-NEXT: b.eq .LBB0_6
+; CHECK-NEXT: // %bb.3: // %bb8
+; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: cmp w19, #34
+; CHECK-NEXT: b.eq .LBB0_6
+; CHECK-NEXT: // %bb.4: // %bb8
+; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: cmp w19, #10
+; CHECK-NEXT: b.ne .LBB0_1
+; CHECK-NEXT: // %bb.5: // %bb9
+; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: str wzr, [x20]
+; CHECK-NEXT: b .LBB0_1
+; CHECK-NEXT: .LBB0_6: // %bb10
+; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: mov w0, w23
+; CHECK-NEXT: mov x1, x21
+; CHECK-NEXT: str wzr, [x22]
+; CHECK-NEXT: bl foo
+; CHECK-NEXT: b .LBB0_1
+bb:
+ br label %bb7
+
+bb7: ; preds = %bb13, %bb
+ %phi = phi i32 [ 0, %bb ], [ %spec.select, %bb13 ]
+ br label %bb8
+
+bb8: ; preds = %bb10, %bb9, %bb8, %bb7
+ switch i32 %arg1, label %bb8 [
+ i32 10, label %bb9
+ i32 1, label %bb16
+ i32 0, label %bb13
+ i32 39, label %bb10
+ i32 34, label %bb10
+ ]
+
+bb9: ; preds = %bb8
+ store i32 0, ptr %arg2, align 4
+ br label %bb8
+
+bb10: ; preds = %bb8, %bb8
+ store i32 0, ptr %arg4, align 4
+ tail call void @foo(i32 %arg5, ptr %arg3)
+ br label %bb8
+
+bb13: ; preds = %bb8
+ %not.arg6 = xor i1 %arg6, true
+ %spec.select = zext i1 %not.arg6 to i32
+ br label %bb7
+
+bb16: ; preds = %bb8
+ unreachable
+}
+
+define void @optspeed(i32 %arg, i32 %arg1, ptr %arg2, ptr %arg3, ptr %arg4, i32 %arg5, i1 %arg6) {
+; CHECK-LABEL: optspeed:
+; CHECK: // %bb.0: // %bb
+; CHECK-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w21, -24
+; CHECK-NEXT: .cfi_offset w22, -32
+; CHECK-NEXT: .cfi_offset w23, -40
+; CHECK-NEXT: .cfi_offset w30, -48
+; CHECK-NEXT: mov w22, w5
+; CHECK-NEXT: mov x21, x4
+; CHECK-NEXT: mov x20, x3
+; CHECK-NEXT: mov x23, x2
+; CHECK-NEXT: mov w19, w1
+; CHECK-NEXT: b .LBB1_2
+; CHECK-NEXT: .LBB1_1: // %bb10
+; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT: mov w0, w22
+; CHECK-NEXT: mov x1, x20
+; CHECK-NEXT: str wzr, [x21]
+; CHECK-NEXT: bl foo
+; CHECK-NEXT: .LBB1_2: // %bb8
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: cmp w19, #33
+; CHECK-NEXT: b.gt .LBB1_6
+; CHECK-NEXT: // %bb.3: // %bb8
+; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT: cbz w19, .LBB1_2
+; CHECK-NEXT: // %bb.4: // %bb8
+; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT: cmp w19, #10
+; CHECK-NEXT: b.ne .LBB1_2
+; CHECK-NEXT: // %bb.5: // %bb9
+; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT: str wzr, [x23]
+; CHECK-NEXT: b .LBB1_2
+; CHECK-NEXT: .LBB1_6: // %bb8
+; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT: cmp w19, #34
+; CHECK-NEXT: b.eq .LBB1_1
+; CHECK-NEXT: // %bb.7: // %bb8
+; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT: cmp w19, #39
+; CHECK-NEXT: b.eq .LBB1_1
+; CHECK-NEXT: b .LBB1_2
+bb:
+ br label %bb7
+
+bb7: ; preds = %bb13, %bb
+ %phi = phi i32 [ 0, %bb ], [ %spec.select, %bb13 ]
+ br label %bb8
+
+bb8: ; preds = %bb10, %bb9, %bb8, %bb7
+ switch i32 %arg1, label %bb8 [
+ i32 10, label %bb9
+ i32 1, label %bb16
+ i32 0, label %bb13
+ i32 39, label %bb10
+ i32 34, label %bb10
+ ]
+
+bb9: ; preds = %bb8
+ store i32 0, ptr %arg2, align 4
+ br label %bb8
+
+bb10: ; preds = %bb8, %bb8
+ store i32 0, ptr %arg4, align 4
+ tail call void @foo(i32 %arg5, ptr %arg3)
+ br label %bb8
+
+bb13: ; preds = %bb8
+ %not.arg6 = xor i1 %arg6, true
+ %spec.select = zext i1 %not.arg6 to i32
+ br label %bb7
+
+bb16: ; preds = %bb8
+ unreachable
+}
|
; CHECK-NEXT: mov w23, w5 | ||
; CHECK-NEXT: mov x22, x4 | ||
; CHECK-NEXT: mov x21, x3 | ||
; CHECK-NEXT: mov x20, x2 | ||
; CHECK-NEXT: mov w19, w1 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's interesting that the minsize spill weight seems to produce more regular assembly. This might improve outlining which would explain the size win.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In fact, I'd consider the basic RA for the size optimization in general when the target enables the machine outliner by default.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/154/builds/6237 Here is the relevant piece of the build log for the reference
|
Change the spill weight calculations for `optsize` functions to remove the block frequency multiplier. For those functions, we do not want to consider the runtime cost of spilling, only the codesize cost. I built a large app with the basic and greedy (default) register allocator enabled.
Since I only saw a size win with the basic register allocator, I decided to only change the behavior for that type.