[InlineSpiller] Check rematerialization before folding operand #134015
Conversation
The current implementation tries to fold the operand before rematerialization because it can save one register. But if a physical register is available, we can still rematerialize without causing high register pressure. This patch adds this check to find the better choice.
@llvm/pr-subscribers-llvm-regalloc Author: None (weiguozhi)

Changes

The current implementation tries to fold the operand before rematerialization because it can save one register. But if a physical register is available, we can still rematerialize without causing high register pressure. This patch adds this check to find the better choice. Then we can produce

  xorps %xmm1, %xmm1
  ucomiss %xmm1, %xmm0

instead of

  ucomiss LCPI0_1(%rip), %xmm0

Patch is 42.27 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/134015.diff

10 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/Spiller.h b/llvm/include/llvm/CodeGen/Spiller.h
index 3132cefeb6c68..84fc872a07606 100644
--- a/llvm/include/llvm/CodeGen/Spiller.h
+++ b/llvm/include/llvm/CodeGen/Spiller.h
@@ -23,6 +23,7 @@ class LiveIntervals;
class LiveStacks;
class MachineDominatorTree;
class MachineBlockFrequencyInfo;
+class AllocationOrder;
/// Spiller interface.
///
@@ -35,7 +36,7 @@ class Spiller {
virtual ~Spiller() = 0;
/// spill - Spill the LRE.getParent() live interval.
- virtual void spill(LiveRangeEdit &LRE) = 0;
+ virtual void spill(LiveRangeEdit &LRE, AllocationOrder *Order = nullptr) = 0;
/// Return the registers that were spilled.
virtual ArrayRef<Register> getSpilledRegs() = 0;
@@ -58,7 +59,8 @@ class Spiller {
/// of deferring though VirtRegMap.
Spiller *createInlineSpiller(const Spiller::RequiredAnalyses &Analyses,
MachineFunction &MF, VirtRegMap &VRM,
- VirtRegAuxInfo &VRAI);
+ VirtRegAuxInfo &VRAI,
+ LiveRegMatrix *Matrix = nullptr);
} // end namespace llvm
diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index 920873c739f46..f384740be2e33 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -11,6 +11,7 @@
//
//===----------------------------------------------------------------------===//
+#include "AllocationOrder.h"
#include "SplitKit.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
@@ -23,6 +24,7 @@
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/LiveStacks.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
@@ -149,12 +151,14 @@ class InlineSpiller : public Spiller {
MachineRegisterInfo &MRI;
const TargetInstrInfo &TII;
const TargetRegisterInfo &TRI;
+ LiveRegMatrix *Matrix = nullptr;
// Variables that are valid during spill(), but used by multiple methods.
LiveRangeEdit *Edit = nullptr;
LiveInterval *StackInt = nullptr;
int StackSlot;
Register Original;
+ AllocationOrder *Order = nullptr;
// All registers to spill to StackSlot, including the main register.
SmallVector<Register, 8> RegsToSpill;
@@ -184,13 +188,13 @@ class InlineSpiller : public Spiller {
public:
InlineSpiller(const Spiller::RequiredAnalyses &Analyses, MachineFunction &MF,
- VirtRegMap &VRM, VirtRegAuxInfo &VRAI)
+ VirtRegMap &VRM, VirtRegAuxInfo &VRAI, LiveRegMatrix *Matrix)
: MF(MF), LIS(Analyses.LIS), LSS(Analyses.LSS), VRM(VRM),
MRI(MF.getRegInfo()), TII(*MF.getSubtarget().getInstrInfo()),
- TRI(*MF.getSubtarget().getRegisterInfo()), HSpiller(Analyses, MF, VRM),
- VRAI(VRAI) {}
+ TRI(*MF.getSubtarget().getRegisterInfo()), Matrix(Matrix),
+ HSpiller(Analyses, MF, VRM), VRAI(VRAI) {}
- void spill(LiveRangeEdit &) override;
+ void spill(LiveRangeEdit &, AllocationOrder *Order = nullptr) override;
ArrayRef<Register> getSpilledRegs() override { return RegsToSpill; }
ArrayRef<Register> getReplacedRegs() override { return RegsReplaced; }
void postOptimization() override;
@@ -207,6 +211,7 @@ class InlineSpiller : public Spiller {
void markValueUsed(LiveInterval*, VNInfo*);
bool canGuaranteeAssignmentAfterRemat(Register VReg, MachineInstr &MI);
+ bool hasPhysRegAvailable(const MachineInstr &MI);
bool reMaterializeFor(LiveInterval &, MachineInstr &MI);
void reMaterializeAll();
@@ -229,8 +234,8 @@ void Spiller::anchor() {}
Spiller *
llvm::createInlineSpiller(const InlineSpiller::RequiredAnalyses &Analyses,
MachineFunction &MF, VirtRegMap &VRM,
- VirtRegAuxInfo &VRAI) {
- return new InlineSpiller(Analyses, MF, VRM, VRAI);
+ VirtRegAuxInfo &VRAI, LiveRegMatrix *Matrix) {
+ return new InlineSpiller(Analyses, MF, VRM, VRAI, Matrix);
}
//===----------------------------------------------------------------------===//
@@ -615,6 +620,23 @@ bool InlineSpiller::canGuaranteeAssignmentAfterRemat(Register VReg,
return true;
}
+/// hasPhysRegAvailable - Check if there is an available physical register for
+/// rematerialization.
+bool InlineSpiller::hasPhysRegAvailable(const MachineInstr &MI) {
+ if (!Order || !Matrix)
+ return false;
+
+ SlotIndex UseIdx = LIS.getInstructionIndex(MI).getRegSlot(true);
+ SlotIndex PrevIdx = UseIdx.getPrevSlot();
+
+ for (MCPhysReg PhysReg : *Order) {
+ if (!Matrix->checkInterference(PrevIdx, UseIdx, PhysReg))
+ return true;
+ }
+
+ return false;
+}
+
/// reMaterializeFor - Attempt to rematerialize before MI instead of reloading.
bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) {
// Analyze instruction
@@ -661,6 +683,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) {
// Before rematerializing into a register for a single instruction, try to
// fold a load into the instruction. That avoids allocating a new register.
if (RM.OrigMI->canFoldAsLoad() &&
+ (RM.OrigMI->mayLoad() || !hasPhysRegAvailable(MI)) &&
foldMemoryOperand(Ops, RM.OrigMI)) {
Edit->markRematerialized(RM.ParentVNI);
++NumFoldedLoads;
@@ -1282,9 +1305,10 @@ void InlineSpiller::spillAll() {
Edit->eraseVirtReg(Reg);
}
-void InlineSpiller::spill(LiveRangeEdit &edit) {
+void InlineSpiller::spill(LiveRangeEdit &edit, AllocationOrder *order) {
++NumSpilledRanges;
Edit = &edit;
+ Order = order;
assert(!edit.getReg().isStack() && "Trying to spill a stack slot.");
// Share a stack slot among all descendants of Original.
Original = VRM.getOriginal(edit.getReg());
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index a5cd9fc7a5360..f86a3ea8dc55e 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -2664,7 +2664,7 @@ MCRegister RAGreedy::selectOrSplitImpl(const LiveInterval &VirtReg,
NamedRegionTimer T("spill", "Spiller", TimerGroupName,
TimerGroupDescription, TimePassesIsEnabled);
LiveRangeEdit LRE(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
- spiller().spill(LRE);
+ spiller().spill(LRE, &Order);
ExtraInfo->setStage(NewVRegs.begin(), NewVRegs.end(), RS_Done);
// Tell LiveDebugVariables about the new ranges. Ranges not being covered by
@@ -2909,7 +2909,8 @@ bool RAGreedy::run(MachineFunction &mf) {
VRAI = std::make_unique<VirtRegAuxInfo>(*MF, *LIS, *VRM, *Loops, *MBFI);
SpillerInstance.reset(
- createInlineSpiller({*LIS, *LSS, *DomTree, *MBFI}, *MF, *VRM, *VRAI));
+ createInlineSpiller({*LIS, *LSS, *DomTree, *MBFI}, *MF, *VRM, *VRAI,
+ Matrix));
VRAI->calculateSpillWeightsAndHints();
diff --git a/llvm/test/CodeGen/X86/avx-cmp.ll b/llvm/test/CodeGen/X86/avx-cmp.ll
index d31107bfeb7bb..3ced4f71bad8c 100644
--- a/llvm/test/CodeGen/X86/avx-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx-cmp.ll
@@ -43,7 +43,8 @@ define void @render(double %a0) nounwind {
; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1
; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
-; CHECK-NEXT: vucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vucomisd %xmm1, %xmm0
; CHECK-NEXT: jne .LBB2_4
; CHECK-NEXT: jnp .LBB2_2
; CHECK-NEXT: .LBB2_4: # %if.then
diff --git a/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll b/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll
index 95a7a10d50f59..3243d950740ca 100644
--- a/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll
+++ b/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll
@@ -111,7 +111,8 @@ define <4 x i32> @eq_or_eq_ult_2_fail_multiuse(<4 x i32> %x) {
; AVX512-NEXT: callq use.v4.i32@PLT
; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX512-NEXT: vpcmpltud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1
-; AVX512-NEXT: vmovdqa32 {{.*#+}} xmm0 {%k1} {z} = [4294967295,4294967295,4294967295,4294967295]
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: addq $24, %rsp
; AVX512-NEXT: .cfi_def_cfa_offset 8
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index 67c9e7cc22236..59a61722927de 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -195,7 +195,8 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; CHECK-SSE-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
diff --git a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll
index 536a1ae3b918d..7f6d64c21724a 100644
--- a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll
+++ b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll
@@ -567,7 +567,8 @@ define <8 x i1> @test_signed_v8i1_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmovbl %ebp, %eax
-; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
; CHECK-NEXT: cmoval %ebx, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovpl %ebx, %eax
@@ -581,7 +582,8 @@ define <8 x i1> @test_signed_v8i1_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmovbl %ebp, %eax
-; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
; CHECK-NEXT: cmoval %ebx, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovpl %ebx, %eax
@@ -593,7 +595,8 @@ define <8 x i1> @test_signed_v8i1_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmovbl %ebp, %eax
-; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
; CHECK-NEXT: cmoval %ebx, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovpl %ebx, %eax
@@ -609,7 +612,8 @@ define <8 x i1> @test_signed_v8i1_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmovbl %ebp, %eax
-; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
; CHECK-NEXT: cmoval %ebx, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovpl %ebx, %eax
@@ -621,7 +625,8 @@ define <8 x i1> @test_signed_v8i1_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmovbl %ebp, %eax
-; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
; CHECK-NEXT: cmoval %ebx, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovpl %ebx, %eax
@@ -634,7 +639,8 @@ define <8 x i1> @test_signed_v8i1_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmovbl %ebp, %eax
-; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
; CHECK-NEXT: cmoval %ebx, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovpl %ebx, %eax
@@ -646,7 +652,8 @@ define <8 x i1> @test_signed_v8i1_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmovbl %ebp, %eax
-; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
; CHECK-NEXT: cmoval %ebx, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovpl %ebx, %eax
diff --git a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll
index 4305886168abe..ffbdd66529f5c 100644
--- a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll
+++ b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll
@@ -280,11 +280,12 @@ define <4 x i128> @test_unsigned_v4i128_v4f32(<4 x float> %f) nounwind {
; CHECK-NEXT: callq __fixunssfti@PLT
; CHECK-NEXT: movq %rax, %r12
; CHECK-NEXT: movq %rdx, %r13
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: ucomiss %xmm0, %xmm1
; CHECK-NEXT: cmovbq %r14, %r13
; CHECK-NEXT: cmovbq %r14, %r12
-; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-NEXT: cmovaq %rbp, %r12
; CHECK-NEXT: cmovaq %rbp, %r13
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
@@ -293,19 +294,21 @@ define <4 x i128> @test_unsigned_v4i128_v4f32(<4 x float> %f) nounwind {
; CHECK-NEXT: callq __fixunssfti@PLT
; CHECK-NEXT: movq %rax, %rbp
; CHECK-NEXT: movq %rdx, %r14
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: ucomiss %xmm0, %xmm1
; CHECK-NEXT: movl $0, %eax
; CHECK-NEXT: cmovbq %rax, %r14
; CHECK-NEXT: cmovbq %rax, %rbp
-; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-NEXT: movq $-1, %rax
; CHECK-NEXT: cmovaq %rax, %rbp
; CHECK-NEXT: cmovaq %rax, %r14
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: callq __fixunssfti@PLT
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
; CHECK-NEXT: movl $0, %ecx
; CHECK-NEXT: cmovbq %rcx, %rdx
; CHECK-NEXT: cmovbq %rcx, %rax
@@ -506,7 +509,8 @@ define <2 x i128> @test_unsigned_v2i128_v2f64(<2 x double> %f) nounwind {
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: callq __fixunsdfti@PLT
; CHECK-NEXT: movapd (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: xorpd %xmm1, %xmm1
+; CHECK-NEXT: ucomisd %xmm1, %xmm0
; CHECK-NEXT: cmovbq %r12, %rdx
; CHECK-NEXT: cmovbq %r12, %rax
; CHECK-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -562,7 +566,8 @@ define <8 x i1> @test_unsigned_v8i1_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
-; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmoval %ebp, %eax
@@ -574,7 +579,8 @@ define <8 x i1> @test_unsigned_v8i1_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
-; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmoval %ebp, %eax
@@ -584,7 +590,8 @@ define <8 x i1> @test_unsigned_v8i1_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
-; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmoval %ebp, %eax
@@ -598,7 +605,8 @@ define <8 x i1> @test_unsigned_v8i1_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: psrlq $48, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
-; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmoval %ebp, %eax
@@ -608,7 +616,8 @@ define <8 x i1> @test_unsigned_v8i1_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
-; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmoval %ebp, %eax
@@ -619,7 +628,8 @@ define <8 x i1> @test_unsigned_v8i1_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
-; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmoval %ebp, %eax
@@ -629,7 +639,8 @@ define <8 x i1> @test_unsigned_v8i1_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: psrld $16, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
-; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmoval %ebp, %eax
@@ -673,7 +684,8 @@ define <8 x i8> @test_unsigned_v8i8_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
-; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmoval %ebp, %eax
@@ -682,7 +694,8 @@ define <8 x i8> @test_unsigned_v8i8_v8f16(<8 x half> %f) nounwind {
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
-; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
; CHECK-NEXT: cmovbl %ebx, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmoval %ebp, %eax
@@ -691,7 +704,8 @@ define <8 x i...
[truncated]
@llvm/pr-subscribers-backend-x86 Author: None (weiguozhi)
✅ With the latest revision this PR passed the C/C++ code formatter.
if (!Order || !Matrix)
  return false;
Should not be optional
The InlineSpiller is used by multiple register allocators, but not all of them use AllocationOrder, so we may not always get a valid Order. Maybe that allocator (PBQP) should be fixed.
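For context, a minimal sketch of the two kinds of call sites under this patch, based on the Spiller.h and RegAllocGreedy.cpp hunks above (the first call is a hypothetical stand-in for an allocator such as PBQP that has no AllocationOrder):

  // Allocators without an AllocationOrder keep the old behavior: Order defaults
  // to nullptr, so hasPhysRegAvailable() conservatively returns false and the
  // fold-before-remat path is taken as before.
  spiller().spill(LRE);

  // RAGreedy passes its current AllocationOrder, enabling the new check.
  spiller().spill(LRE, &Order);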
for (MCPhysReg PhysReg : *Order) {
  if (!Matrix->checkInterference(PrevIdx, UseIdx, PhysReg))
    return true;
}
The inline spiller should not be doing full availability checks. This is algorithmically really bad. The existing tryAssign loop is bad enough
Do you worry about compile time or something else?
Yes. This is quite likely the single slowest operation in LLVM. The inline spiller should not need to do this and you should find another approach
This function is called only when an MI has the following properties:

MI->isRematerializable() && MI->canFoldAsLoad() && !MI->mayLoad()

I don't expect many instructions to satisfy all of them. I will run a compile-time test for it.
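For reference, the combined guard from the InlineSpiller.cpp hunk above, lightly simplified, showing where hasPhysRegAvailable() is consulted:

  // Fold the load only if the remat candidate itself reads memory, or if no
  // physical register is free across this use; otherwise prefer rematerializing
  // into a register.
  if (RM.OrigMI->canFoldAsLoad() &&
      (RM.OrigMI->mayLoad() || !hasPhysRegAvailable(MI)) &&
      foldMemoryOperand(Ops, RM.OrigMI)) {
    Edit->markRematerialized(RM.ParentVNI);
    ++NumFoldedLoads;
    // ... (fold succeeded; no new register is needed for this use)
  }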
It has almost no impact on compile time:
http://llvm-compile-time-tracker.com/compare.php?from=1407f5bee9aa8e2a8a4fcab63ab0a3030a8b0dcf&to=3b1c437f19f43ec5b4c6579eb8f7b9c66003a063&stat=instructions:u
ping
ping, @arsenm
Basically nothing here is going to matter for X86, but this could be disastrous on a GPU.
I also think this is conceptually not the way to address this problem, please revert
Test changes LG. @arsenm should take another look for his concern though.
LGTM
I object to the approach, revert in #137801
[InlineSpiller] Check rematerialization before folding operand (#134015)

Current implementation tries to fold the operand before rematerialization because it can reduce one register usage. But if there is a physical register available we can still rematerialize it without causing high register pressure. This patch does this check to find the better choice. Then we can produce

  xorps %xmm1, %xmm1
  ucomiss %xmm1, %xmm0

instead of

  ucomiss LCPI0_1(%rip), %xmm0