[AMDGPU][SILoadStoreOptimizer] Merge constrained sloads #96162
Conversation
This stack of pull requests is managed by Graphite.
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel

Author: Christudasan Devadasan (cdevadas)

Changes: Consider the constrained multi-dword loads while merging individual loads to a single multi-dword load.

Patch is 1023.60 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/96162.diff

116 Files Affected:
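At a glance, the functional change is in how the opcode for a merged scalar load is chosen: on subtargets that support XNACK replay, the early-clobber (_ec) variants are selected instead of the plain S_LOAD_DWORDXn_IMM forms. The standalone sketch below condenses that selection before the (truncated) diff; the helper and enum names are invented for illustration, and the real logic is in SILoadStoreOptimizer::getNewOpcode below.

// Illustrative sketch only, not the actual LLVM sources. It mirrors the
// S_LOAD_IMM handling added to getNewOpcode(): combined widths of 2/3/4/8
// dwords map to either the plain or the constrained (_ec) opcode, depending
// on whether the subtarget supports XNACK replay.
enum MergedSLoadOpc {
  SLoadX2, SLoadX3, SLoadX4, SLoadX8,
  SLoadX2_ec, SLoadX3_ec, SLoadX4_ec, SLoadX8_ec,
  Invalid
};

MergedSLoadOpc pickMergedSLoadOpc(unsigned Width, bool HasXnackReplay) {
  switch (Width) {
  case 2: return HasXnackReplay ? SLoadX2_ec : SLoadX2;
  case 3: return HasXnackReplay ? SLoadX3_ec : SLoadX3;
  case 4: return HasXnackReplay ? SLoadX4_ec : SLoadX4;
  case 8: return HasXnackReplay ? SLoadX8_ec : SLoadX8;
  default: return Invalid; // no merged opcode for other widths
  }
}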
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index db5b467f22389..19d5b950d7142 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -967,6 +967,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; }
+ bool hasXnackReplay() const { return GFX8Insts; }
/// \returns true if the subtarget has the v_permlanex16_b32 instruction.
bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 8b42d4a1dee7a..0b285d52b539e 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -216,7 +216,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
CombineInfo &Paired, bool Modify = false);
static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
const CombineInfo &Paired);
- static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
+ static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired,
+ const GCNSubtarget *STI = nullptr);
static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
const CombineInfo &Paired);
const TargetRegisterClass *
@@ -343,6 +344,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_LOAD_DWORD_IMM:
+ case AMDGPU::S_LOAD_DWORD_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORD:
case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
case AMDGPU::GLOBAL_STORE_DWORD:
@@ -353,6 +355,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORDX2:
case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
case AMDGPU::GLOBAL_STORE_DWORDX2:
@@ -363,6 +366,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_LOAD_DWORDX3_IMM:
+ case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORDX3:
case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
case AMDGPU::GLOBAL_STORE_DWORDX3:
@@ -373,6 +377,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:
+ case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
case AMDGPU::GLOBAL_LOAD_DWORDX4:
case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
case AMDGPU::GLOBAL_STORE_DWORDX4:
@@ -383,6 +388,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
return 8;
case AMDGPU::DS_READ_B32:
case AMDGPU::DS_READ_B32_gfx9:
@@ -507,6 +513,11 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_LOAD_DWORD_IMM_ec:
+ case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
return S_LOAD_IMM;
case AMDGPU::DS_READ_B32:
case AMDGPU::DS_READ_B32_gfx9:
@@ -591,6 +602,11 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_LOAD_DWORD_IMM_ec:
+ case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
return AMDGPU::S_LOAD_DWORD_IMM;
case AMDGPU::GLOBAL_LOAD_DWORD:
case AMDGPU::GLOBAL_LOAD_DWORDX2:
@@ -703,6 +719,11 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_LOAD_DWORD_IMM_ec:
+ case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
+ case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
+ case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
+ case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
Result.SBase = true;
return Result;
case AMDGPU::DS_READ_B32:
@@ -1212,8 +1233,17 @@ void SILoadStoreOptimizer::copyToDestRegs(
// Copy to the old destination registers.
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
- const auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
- const auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
+ auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
+ auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
+
+ // The constrained sload instructions in S_LOAD_IMM class will have
+ // `early-clobber` flag in the dst operand. Remove the flag before using the
+ // MOs in copies.
+ if (Dest0->isEarlyClobber())
+ Dest0->setIsEarlyClobber(false);
+
+ if (Dest1->isEarlyClobber())
+ Dest1->setIsEarlyClobber(false);
BuildMI(*MBB, InsertBefore, DL, CopyDesc)
.add(*Dest0) // Copy to same destination including flags and sub reg.
@@ -1446,7 +1476,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
MachineBasicBlock::iterator InsertBefore) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
- const unsigned Opcode = getNewOpcode(CI, Paired);
+ const unsigned Opcode = getNewOpcode(CI, Paired, STM);
const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
@@ -1658,7 +1688,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
}
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
- const CombineInfo &Paired) {
+ const CombineInfo &Paired,
+ const GCNSubtarget *STI) {
const unsigned Width = CI.Width + Paired.Width;
switch (getCommonInstClass(CI, Paired)) {
@@ -1701,17 +1732,33 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
}
case S_LOAD_IMM:
- switch (Width) {
- default:
- return 0;
- case 2:
- return AMDGPU::S_LOAD_DWORDX2_IMM;
- case 3:
- return AMDGPU::S_LOAD_DWORDX3_IMM;
- case 4:
- return AMDGPU::S_LOAD_DWORDX4_IMM;
- case 8:
- return AMDGPU::S_LOAD_DWORDX8_IMM;
+ // For targets that support XNACK replay, use the constrained load opcode.
+ if (STI && STI->hasXnackReplay()) {
+ switch (Width) {
+ default:
+ return 0;
+ case 2:
+ return AMDGPU::S_LOAD_DWORDX2_IMM_ec;
+ case 3:
+ return AMDGPU::S_LOAD_DWORDX3_IMM_ec;
+ case 4:
+ return AMDGPU::S_LOAD_DWORDX4_IMM_ec;
+ case 8:
+ return AMDGPU::S_LOAD_DWORDX8_IMM_ec;
+ }
+ } else {
+ switch (Width) {
+ default:
+ return 0;
+ case 2:
+ return AMDGPU::S_LOAD_DWORDX2_IMM;
+ case 3:
+ return AMDGPU::S_LOAD_DWORDX3_IMM;
+ case 4:
+ return AMDGPU::S_LOAD_DWORDX4_IMM;
+ case 8:
+ return AMDGPU::S_LOAD_DWORDX8_IMM;
+ }
}
case GLOBAL_LOAD:
switch (Width) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index eb20178f9f4d8..3f034eaca4997 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -468,18 +468,18 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add
;
; VI-LABEL: load_i8_to_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v1, s6
+; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -513,16 +513,16 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr
;
; VI-LABEL: load_v2i8_to_v2f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v1, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
@@ -562,16 +562,16 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr
;
; VI-LABEL: load_v3i8_to_v3f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s1
-; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v3, s4
+; VI-NEXT: v_mov_b32_e32 v4, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
@@ -612,16 +612,16 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr
;
; VI-LABEL: load_v4i8_to_v4f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
@@ -679,11 +679,11 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
;
; VI-LABEL: load_v4i8_to_v4f32_unaligned:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
@@ -706,12 +706,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_or_b32_e32 v1, v2, v3
; VI-NEXT: v_or_b32_e32 v3, v1, v0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v3
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -770,6 +770,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; VI-LABEL: load_v4i8_to_v4f32_2_uses:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v6, 9
; VI-NEXT: v_mov_b32_e32 v7, 8
@@ -779,11 +780,9 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v1, v[0:1]
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0xff
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v1
; VI-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -804,8 +803,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v6
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_or_b32_e32 v2, v0, v2
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
@@ -858,11 +857,11 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
;
; VI-LABEL: load_v7i8_to_v7f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
@@ -884,10 +883,10 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
; VI-NEXT: flat_load_ubyte v4, v[8:9]
; VI-NEXT: flat_load_ubyte v5, v[10:11]
; VI-NEXT: flat_load_ubyte v6, v[12:13]
-; VI-NEXT: v_mov_b32_e32 v8, s1
-; VI-NEXT: v_mov_b32_e32 v7, s0
-; VI-NEXT: s_add_u32 s0, s0, 16
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s4, 16
+; VI-NEXT: v_mov_b32_e32 v8, s5
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v7, s4
; VI-NEXT: v_mov_b32_e32 v10, s1
; VI-NEXT: v_mov_b32_e32 v9, s0
; VI-NEXT: s_waitcnt vmcnt(6)
@@ -949,18 +948,18 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
;
; VI-LABEL: load_v8i8_to_v8f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v9, s1
-; VI-NEXT: v_mov_b32_e32 v8, s0
-; VI-NEXT: s_add_u32 s0, s0, 16
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s4, 16
+; VI-NEXT: v_mov_b32_e32 v9, s5
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v8, s4
; VI-NEXT: v_mov_b32_e32 v11, s1
; VI-NEXT: v_mov_b32_e32 v10, s0
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1005,19 +1004,19 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou
;
; VI-LABEL: i8_zext_inreg_i32_to_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1051,18 +1050,18 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou
;
; VI-LABEL: i8_zext_inreg_hi1_to_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1096,18 +1095,18 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr
;
; VI-LABEL: i8_zext_i32_to_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v1, s6
+; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1157,11 +1156,1...
[truncated]
This looks like it is affecting codegen even when xnack is disabled? That should not happen.
It shouldn't. I put the xnack replay subtarget check before using *_ec equivalents. See the code here: |
I'm still not sure why we have so much in this pass. The load and store vectorization should have happened in the IR. This pass was originally for the multi-offset DS instructions.
You're checking |
@@ -183,10 +183,10 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) {
; GFX940-LABEL: local_atomic_fadd_v2bf16_noret:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
Why does this patch have so many test diffs? Before this patch, we would have just missed out on a few folds after the _ec variants were introduced?
Earlier I wrongly used the dword size (Width) in the alignment check here, as Jay pointed out. Now I fixed it to use the byte size while comparing it with the existing alignment of the first load.
e7e6cbc#diff-35f4d1b6c4c17815f6989f86abbac2e606ca760f9d93f501ff503449048bf760R1730
These tests actually needed the *_ec variant due to the under-alignment which is fixed now.
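To make the corrected check concrete, here is a minimal standalone sketch of the comparison described in this comment (the hunk quoted earlier in the diff predates this fix; the function name and parameters below are hypothetical):

// Minimal sketch, assuming each element of the merged load is a 32-bit dword.
// On an XNACK-replay target, the merged load needs the constrained (_ec)
// opcode only when the first load's known alignment does not cover the
// combined access size in bytes (not in dwords, which was the earlier bug).
bool needsConstrainedOpc(unsigned CombinedWidthInDwords,
                         unsigned KnownAlignInBytes) {
  unsigned CombinedSizeInBytes = CombinedWidthInDwords * 4;
  return KnownAlignInBytes < CombinedSizeInBytes;
}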
But surely we aren't merging this many scalar loads in MIR? The IR vectorizer should have gotten most of these?
Unfortunately, that's not happening. The IR load-store-vectorizer doesn't combine the two loads.
I still see the two loads after the IR vectorizer, and they become two loads in the selected code. Can this happen because the alignments of the two loads differ and the IR vectorizer safely ignores them?
*** IR Dump before Selection ***
define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) #0 {
%local_atomic_fadd_v2bf16_noret.kernarg.segment = call nonnull align 16 dereferenceable(44) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
%ptr.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %local_atomic_fadd_v2bf16_noret.kernarg.segment, i64 36, !amdgpu.uniform !0
%ptr.load = load ptr addrspace(3), ptr addrspace(4) %ptr.kernarg.offset, align 4, !invariant.load !0
%data.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %local_atomic_fadd_v2bf16_noret.kernarg.segment, i64 40, !amdgpu.uniform !0
%data.load = load <2 x i16>, ptr addrspace(4) %data.kernarg.offset, align 8, !invariant.load !0
%ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr.load, <2 x i16> %data.load)
ret void
}
*** IR Dump After selection ***
Machine code for function local_atomic_fadd_v2bf16_noret: IsSSA, TracksLiveness
Function Live Ins: $sgpr0_sgpr1 in %1
bb.0 (%ir-block.0):
liveins: $sgpr0_sgpr1
%1:sgpr_64(p4) = COPY $sgpr0_sgpr1
%3:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1:sgpr_64(p4), 36, 0 :: (dereferenceable invariant load (s32) from %ir.ptr.kernarg.offset, addrspace 4)
%4:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1:sgpr_64(p4), 40, 0 :: (dereferenceable invariant load (s32) from %ir.data.kernarg.offset, align 8, addrspace 4)
%5:vgpr_32 = COPY %3:sreg_32_xm0_xexec
%6:vgpr_32 = COPY %4:sreg_32_xm0_xexec
DS_PK_ADD_BF16 killed %5:vgpr_32, killed %6:vgpr_32, 0, 0, implicit $m0, implicit $exec
S_ENDPGM 0
LSV should have gotten this case; I don't see why it didn't. Someone should look into this.
Can you open an issue for this?
Opened #97715.
Ping
Dest0->setIsEarlyClobber(false);
Dest1->setIsEarlyClobber(false);
It's a bit ugly to modify in-place the operands of CI.I and Paired.I. But I guess it is harmless since they will be erased soon, when the merged load instruction is created.
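A hypothetical way to address this, sketched below under the assumption that the copies are then passed to the existing BuildMI calls, would be to clear the flag on detached copies of the operands rather than on CI.I and Paired.I themselves; this is not what the patch does, just an illustration of the alternative.

// Hypothetical alternative sketch (not part of the patch): take detached
// copies of the destination operands and drop the early-clobber flag on the
// copies, leaving the operands of CI.I and Paired.I untouched.
MachineOperand Dest0Copy = *TII->getNamedOperand(*CI.I, OpName);
MachineOperand Dest1Copy = *TII->getNamedOperand(*Paired.I, OpName);
Dest0Copy.setIsEarlyClobber(false);
Dest1Copy.setIsEarlyClobber(false);
// The copies would then be added to the new COPY instructions via
// BuildMI(...).add(Dest0Copy) / .add(Dest1Copy) instead of *Dest0 / *Dest1.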
@@ -658,17 +658,17 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
;
; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign:
; GFX1013: ; %bb.0:
; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
I guess this code changes because xnack is enabled by default for GFX10.1? Is there anything we could do to add known alignment info here, to avoid the code pessimization?
I guess this code changes because xnack is enabled by default for GFX10.1?
Yes.
Is there anything we could do to add known alignment info here, to avoid the code pessimization?
I'm not sure what can be done for it.
I still think it is terribly surprising that all of the test diff shows up in this commit, and not the selection case.
Because the selection support is done in the next PR of the review stack, #96162. This patch takes care of choosing the right opcode while merging the loads.
It's more the lack of vectorization of these loads in the IR that's surprising. Ideally we wouldn't have to rely on any of this post-isel load merging.
@@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1), i32)

; GCN-LABEL: {{^}}global_atomic_csub_rtn:
; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9:]+}}, s{{\[[0-9]+:[0-9]+\]}} glc
; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} th:TH_ATOMIC_RETURN
You shouldn't need any changes in this file.
Consider the constrained multi-dword loads while merging individual loads to a single multi-dword load.
Summary: Consider the constrained multi-dword loads while merging individual loads to a single multi-dword load.

Test Plan:
Reviewers:
Subscribers:
Tasks:
Tags:

Differential Revision: https://phabricator.intern.facebook.com/D60251148
Consider the constrained multi-dword loads while merging
individual loads to a single multi-dword load.