
[InlineSpiller] Check rematerialization before folding operand #134015


Merged: 2 commits, Apr 28, 2025 (changes shown from all commits)
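In brief, summarizing the diff below: when InlineSpiller rematerializes a value for a single use, it previously preferred folding the rematerializable definition into the user as a memory operand whenever the target allowed it. This patch threads the allocator's AllocationOrder and LiveRegMatrix into the spiller so it can first check whether a physical register is actually free across the use; if one is, plain rematerialization wins and the fold is skipped. A minimal sketch of that decision, written as a standalone helper (the function name and parameter list are illustrative assumptions; the real change adds InlineSpiller::hasPhysRegAvailable and a call site in reMaterializeFor, as shown below):

    // Sketch only. Mirrors the logic in the diff; not the actual patch API.
    #include "AllocationOrder.h"            // in-tree header under llvm/lib/CodeGen
    #include "llvm/CodeGen/LiveIntervals.h"
    #include "llvm/CodeGen/LiveRegMatrix.h"
    #include "llvm/CodeGen/MachineInstr.h"
    using namespace llvm;

    // True when folding DefMI into UseMI should be skipped because some
    // physical register is interference-free across the use, so the value
    // can be rematerialized into a register instead of loaded from a slot.
    static bool preferRematOverFold(const MachineInstr &UseMI,
                                    const MachineInstr &DefMI,
                                    AllocationOrder *Order,
                                    LiveRegMatrix *Matrix, LiveIntervals &LIS) {
      if (!DefMI.canFoldAsLoad())
        return false; // Folding was never an option.
      if (DefMI.mayLoad())
        return false; // A genuine load is still worth folding.
      if (!Order || !Matrix)
        return false; // No availability data; keep the old behavior.
      // Probe the slot range covering the use; any interference-free
      // candidate register means remat can be assigned somewhere.
      SlotIndex UseIdx = LIS.getInstructionIndex(UseMI).getRegSlot(true);
      SlotIndex PrevIdx = UseIdx.getPrevSlot();
      for (MCPhysReg PhysReg : *Order)
        if (!Matrix->checkInterference(PrevIdx, UseIdx, PhysReg))
          return true;
      return false;
    }

The X86 test updates at the bottom show the intended effect: constants that used to be folded as constant-pool loads (e.g. vucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0) are now rematerialized into a free register (vxorpd %xmm1, %xmm1, %xmm1) and used register-to-register.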
6 changes: 4 additions & 2 deletions llvm/include/llvm/CodeGen/Spiller.h
@@ -23,6 +23,7 @@
 class LiveIntervals;
 class LiveStacks;
 class MachineDominatorTree;
 class MachineBlockFrequencyInfo;
+class AllocationOrder;

 /// Spiller interface.
 ///
@@ -35,7 +36,7 @@ class Spiller {
   virtual ~Spiller() = 0;

   /// spill - Spill the LRE.getParent() live interval.
-  virtual void spill(LiveRangeEdit &LRE) = 0;
+  virtual void spill(LiveRangeEdit &LRE, AllocationOrder *Order = nullptr) = 0;

   /// Return the registers that were spilled.
   virtual ArrayRef<Register> getSpilledRegs() = 0;
@@ -58,7 +59,8 @@ class Spiller {
 /// of deferring though VirtRegMap.
 Spiller *createInlineSpiller(const Spiller::RequiredAnalyses &Analyses,
                              MachineFunction &MF, VirtRegMap &VRM,
-                             VirtRegAuxInfo &VRAI);
+                             VirtRegAuxInfo &VRAI,
+                             LiveRegMatrix *Matrix = nullptr);

 } // end namespace llvm

38 changes: 31 additions & 7 deletions llvm/lib/CodeGen/InlineSpiller.cpp
@@ -11,6 +11,7 @@
 //
 //===----------------------------------------------------------------------===//

+#include "AllocationOrder.h"
 #include "SplitKit.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
@@ -23,6 +24,7 @@
 #include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
 #include "llvm/CodeGen/LiveStacks.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
@@ -149,12 +151,14 @@ class InlineSpiller : public Spiller {
   MachineRegisterInfo &MRI;
   const TargetInstrInfo &TII;
   const TargetRegisterInfo &TRI;
+  LiveRegMatrix *Matrix = nullptr;

   // Variables that are valid during spill(), but used by multiple methods.
   LiveRangeEdit *Edit = nullptr;
   LiveInterval *StackInt = nullptr;
   int StackSlot;
   Register Original;
+  AllocationOrder *Order = nullptr;

   // All registers to spill to StackSlot, including the main register.
   SmallVector<Register, 8> RegsToSpill;
@@ -184,13 +188,13 @@

 public:
   InlineSpiller(const Spiller::RequiredAnalyses &Analyses, MachineFunction &MF,
-                VirtRegMap &VRM, VirtRegAuxInfo &VRAI)
+                VirtRegMap &VRM, VirtRegAuxInfo &VRAI, LiveRegMatrix *Matrix)
       : MF(MF), LIS(Analyses.LIS), LSS(Analyses.LSS), VRM(VRM),
         MRI(MF.getRegInfo()), TII(*MF.getSubtarget().getInstrInfo()),
-        TRI(*MF.getSubtarget().getRegisterInfo()), HSpiller(Analyses, MF, VRM),
-        VRAI(VRAI) {}
+        TRI(*MF.getSubtarget().getRegisterInfo()), Matrix(Matrix),
+        HSpiller(Analyses, MF, VRM), VRAI(VRAI) {}

-  void spill(LiveRangeEdit &) override;
+  void spill(LiveRangeEdit &, AllocationOrder *Order = nullptr) override;
   ArrayRef<Register> getSpilledRegs() override { return RegsToSpill; }
   ArrayRef<Register> getReplacedRegs() override { return RegsReplaced; }
   void postOptimization() override;
@@ -207,6 +211,7 @@

   void markValueUsed(LiveInterval*, VNInfo*);
   bool canGuaranteeAssignmentAfterRemat(Register VReg, MachineInstr &MI);
+  bool hasPhysRegAvailable(const MachineInstr &MI);
   bool reMaterializeFor(LiveInterval &, MachineInstr &MI);
   void reMaterializeAll();
@@ -229,8 +234,8 @@ void Spiller::anchor() {}
 Spiller *
 llvm::createInlineSpiller(const InlineSpiller::RequiredAnalyses &Analyses,
                           MachineFunction &MF, VirtRegMap &VRM,
-                          VirtRegAuxInfo &VRAI) {
-  return new InlineSpiller(Analyses, MF, VRM, VRAI);
+                          VirtRegAuxInfo &VRAI, LiveRegMatrix *Matrix) {
+  return new InlineSpiller(Analyses, MF, VRM, VRAI, Matrix);
 }

 //===----------------------------------------------------------------------===//
@@ -615,6 +620,23 @@ bool InlineSpiller::canGuaranteeAssignmentAfterRemat(Register VReg,
   return true;
 }

+/// hasPhysRegAvailable - Check if there is an available physical register for
+/// rematerialization.
+bool InlineSpiller::hasPhysRegAvailable(const MachineInstr &MI) {
+  if (!Order || !Matrix)
+    return false;
Comment on lines +626 to +627

Contributor: Should not be optional.

Contributor Author: The InlineSpiller is used by multiple register allocators, but not all of them use AllocationOrder, so we may not always get a valid Order. Maybe that allocator (PBQP) should be fixed.

+  SlotIndex UseIdx = LIS.getInstructionIndex(MI).getRegSlot(true);
+  SlotIndex PrevIdx = UseIdx.getPrevSlot();
+
+  for (MCPhysReg PhysReg : *Order) {
+    if (!Matrix->checkInterference(PrevIdx, UseIdx, PhysReg))
+      return true;
+  }
Comment on lines +632 to +635

Contributor: The inline spiller should not be doing full availability checks. This is algorithmically really bad. The existing tryAssign loop is bad enough.

Contributor Author: Do you worry about compile time, or something else?

Contributor: Yes. This is quite likely the single slowest operation in LLVM. The inline spiller should not need to do this, and you should find another approach.

Contributor Author: This function is called only when an MI has the following properties:

    MI->isRematerializable() && MI->canFoldAsLoad() && !MI->mayLoad()

I don't expect many instructions to satisfy them. I will run a compile-time test for it.

Contributor Author: ping

Contributor Author: ping, @arsenm

Contributor: Basically nothing here is going to matter for X86, but this could be disastrous on a GPU. I also think this is conceptually not the way to address this problem; please revert.

+  return false;
+}
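(To make the cost concern above concrete — a rough estimate, not from the PR: hasPhysRegAvailable issues one checkInterference query per register in the allocation order until it finds a free one, so a function with K qualifying remat candidates can issue up to K·|Order| queries. |Order| is a few dozen allocatable registers on x86 but can be several hundred on a GPU target, and each query walks per-register-unit live interval unions.)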

 /// reMaterializeFor - Attempt to rematerialize before MI instead of reloading.
 bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) {
   // Analyze instruction

@@ -661,6 +683,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) {
   // Before rematerializing into a register for a single instruction, try to
   // fold a load into the instruction. That avoids allocating a new register.
   if (RM.OrigMI->canFoldAsLoad() &&
+      (RM.OrigMI->mayLoad() || !hasPhysRegAvailable(MI)) &&
       foldMemoryOperand(Ops, RM.OrigMI)) {
     Edit->markRematerialized(RM.ParentVNI);
     ++NumFoldedLoads;
@@ -1282,9 +1305,10 @@ void InlineSpiller::spillAll() {
     Edit->eraseVirtReg(Reg);
 }

-void InlineSpiller::spill(LiveRangeEdit &edit) {
+void InlineSpiller::spill(LiveRangeEdit &edit, AllocationOrder *order) {
   ++NumSpilledRanges;
   Edit = &edit;
+  Order = order;
   assert(!edit.getReg().isStack() && "Trying to spill a stack slot.");
   // Share a stack slot among all descendants of Original.
   Original = VRM.getOriginal(edit.getReg());
6 changes: 3 additions & 3 deletions llvm/lib/CodeGen/RegAllocGreedy.cpp
Expand Up @@ -2664,7 +2664,7 @@ MCRegister RAGreedy::selectOrSplitImpl(const LiveInterval &VirtReg,
NamedRegionTimer T("spill", "Spiller", TimerGroupName,
TimerGroupDescription, TimePassesIsEnabled);
LiveRangeEdit LRE(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
spiller().spill(LRE);
spiller().spill(LRE, &Order);
ExtraInfo->setStage(NewVRegs.begin(), NewVRegs.end(), RS_Done);

// Tell LiveDebugVariables about the new ranges. Ranges not being covered by
@@ -2908,8 +2908,8 @@ bool RAGreedy::run(MachineFunction &mf) {
   PriorityAdvisor = PriorityProvider->getAdvisor(*MF, *this, *Indexes);

   VRAI = std::make_unique<VirtRegAuxInfo>(*MF, *LIS, *VRM, *Loops, *MBFI);
-  SpillerInstance.reset(
-      createInlineSpiller({*LIS, *LSS, *DomTree, *MBFI}, *MF, *VRM, *VRAI));
+  SpillerInstance.reset(createInlineSpiller({*LIS, *LSS, *DomTree, *MBFI}, *MF,
+                                            *VRM, *VRAI, Matrix));

   VRAI->calculateSpillWeightsAndHints();

3 changes: 2 additions & 1 deletion llvm/test/CodeGen/X86/avx-cmp.ll
@@ -43,7 +43,8 @@ define void @render(double %a0) nounwind {
 ; CHECK-NEXT:    # in Loop: Header=BB2_2 Depth=1
 ; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vucomisd %xmm1, %xmm0
 ; CHECK-NEXT:    jne .LBB2_4
 ; CHECK-NEXT:    jnp .LBB2_2
 ; CHECK-NEXT:  .LBB2_4: # %if.then
3 changes: 2 additions & 1 deletion llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll
@@ -111,7 +111,8 @@ define <4 x i32> @eq_or_eq_ult_2_fail_multiuse(<4 x i32> %x) {
 ; AVX512-NEXT:    callq use.v4.i32@PLT
 ; AVX512-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
 ; AVX512-NEXT:    vpcmpltud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1
-; AVX512-NEXT:    vmovdqa32 {{.*#+}} xmm0 {%k1} {z} = [4294967295,4294967295,4294967295,4294967295]
+; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    addq $24, %rsp
 ; AVX512-NEXT:    .cfi_def_cfa_offset 8
 ; AVX512-NEXT:    retq
3 changes: 2 additions & 1 deletion llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -195,7 +195,8 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
 ; CHECK-SSE-NEXT:    callq __truncsfhf2@PLT
 ; CHECK-SSE-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; CHECK-SSE-NEXT:    pxor %xmm1, %xmm1
+; CHECK-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; CHECK-SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; CHECK-SSE-NEXT:    callq __truncsfhf2@PLT
 ; CHECK-SSE-NEXT:    callq __extendhfsf2@PLT
21 changes: 14 additions & 7 deletions llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll
@@ -567,7 +567,8 @@ define <8 x i1> @test_signed_v8i1_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    cmovbl %ebp, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    ucomiss %xmm1, %xmm0
 ; CHECK-NEXT:    cmoval %ebx, %eax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
 ; CHECK-NEXT:    cmovpl %ebx, %eax
@@ -581,7 +582,8 @@ define <8 x i1> @test_signed_v8i1_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    cmovbl %ebp, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    ucomiss %xmm1, %xmm0
 ; CHECK-NEXT:    cmoval %ebx, %eax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
 ; CHECK-NEXT:    cmovpl %ebx, %eax
@@ -593,7 +595,8 @@ define <8 x i1> @test_signed_v8i1_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    cmovbl %ebp, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    ucomiss %xmm1, %xmm0
 ; CHECK-NEXT:    cmoval %ebx, %eax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
 ; CHECK-NEXT:    cmovpl %ebx, %eax
@@ -609,7 +612,8 @@ define <8 x i1> @test_signed_v8i1_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    cmovbl %ebp, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    ucomiss %xmm1, %xmm0
 ; CHECK-NEXT:    cmoval %ebx, %eax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
 ; CHECK-NEXT:    cmovpl %ebx, %eax
@@ -621,7 +625,8 @@ define <8 x i1> @test_signed_v8i1_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    cmovbl %ebp, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    ucomiss %xmm1, %xmm0
 ; CHECK-NEXT:    cmoval %ebx, %eax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
 ; CHECK-NEXT:    cmovpl %ebx, %eax
@@ -634,7 +639,8 @@ define <8 x i1> @test_signed_v8i1_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    cmovbl %ebp, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    ucomiss %xmm1, %xmm0
 ; CHECK-NEXT:    cmoval %ebx, %eax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
 ; CHECK-NEXT:    cmovpl %ebx, %eax
@@ -646,7 +652,8 @@ define <8 x i1> @test_signed_v8i1_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    cvttss2si %xmm0, %eax
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    cmovbl %ebp, %eax
-; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    ucomiss %xmm1, %xmm0
 ; CHECK-NEXT:    cmoval %ebx, %eax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
 ; CHECK-NEXT:    cmovpl %ebx, %eax