[AMDGPU][True16][CodeGen] Implement sgpr folding in true16 #128929
@llvm/pr-subscribers-backend-amdgpu

Author: Brox Chen (broxigarchen)

Changes

16-bit SGPRs are not implemented yet. This patch allows 32-bit SGPRs to be folded into True16 instructions that take 16-bit values, which improves the quality of the generated code.

Patch is 129.68 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/128929.diff

26 Files Affected:
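For a concrete sense of the effect, here is a trimmed before/after excerpt from the s_select_v2bf16 update in llvm/test/CodeGen/AMDGPU/bf16.ll further down in this diff. Before the patch, both select sources had to be staged through v_mov_b16 copies into 16-bit VGPR halves:

  s_lshr_b32 s2, s0, 16
  s_lshr_b32 s3, s1, 16
  v_mov_b16_e32 v0.l, s3
  v_mov_b16_e32 v0.h, s2
  v_cndmask_b16 v0.h, v0.l, v0.h, vcc_lo

With SGPR folding, one source of the v_cndmask_b16 is read directly from a 32-bit SGPR, so one of the v_mov_b16 copies disappears:

  s_lshr_b32 s2, s0, 16
  v_mov_b16_e32 v1.l, s2
  s_lshr_b32 s0, s1, 16
  v_cndmask_b16 v0.h, s0, v1.l, vcc_lo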
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 91df516b80857..049228edc7ba8 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -12,8 +12,11 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"
@@ -576,6 +579,11 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
}
MachineOperand *New = Fold.OpToFold;
+ // TODO: Temporarily allow folding from SGPRs to 16-bit VGPRs.
+ // Rework once the VS_16 register class is updated to include proper
+ // 16-bit SGPRs instead of 32-bit ones.
+ if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
+ Old.setSubReg(AMDGPU::NoSubRegister);
Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
Old.setIsUndef(New->isUndef());
return true;
@@ -947,9 +955,15 @@ void SIFoldOperandsImpl::foldOperand(
return;
// FIXME: Fold operands with subregs.
- if (UseOp->isReg() && OpToFold.isReg() &&
- (UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
- return;
+ if (UseOp->isReg() && OpToFold.isReg()) {
+ if (UseOp->isImplicit())
+ return;
+ // Allow folding from SGPRs to 16-bit VGPRs.
+ if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
+ (UseOp->getSubReg() != AMDGPU::lo16 ||
+ !TRI->isSGPRReg(*MRI, OpToFold.getReg())))
+ return;
+ }
// Special case for REG_SEQUENCE: We can't fold literals into
// REG_SEQUENCE instructions, so we have to fold them into the
@@ -1040,6 +1054,14 @@ void SIFoldOperandsImpl::foldOperand(
}
}
+ // Allow immediates COPYd into sgpr_lo16 to be further folded while
+ // still being legal if not further folded
+ if (DestRC == &AMDGPU::SGPR_LO16RegClass) {
+ assert(ST->useRealTrue16Insts());
+ MRI->setRegClass(DestReg, &AMDGPU::SGPR_32RegClass);
+ DestRC = &AMDGPU::SGPR_32RegClass;
+ }
+
// In order to fold immediates into copies, we need to change the
// copy to a MOV.
@@ -1073,9 +1095,43 @@ void SIFoldOperandsImpl::foldOperand(
UseMI->getOperand(0).getReg().isVirtual() &&
!UseMI->getOperand(1).getSubReg()) {
LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
+ unsigned Size = TII->getOpSize(*UseMI, 1);
Register UseReg = OpToFold.getReg();
UseMI->getOperand(1).setReg(UseReg);
- UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
+ unsigned SubRegIdx = OpToFold.getSubReg();
+ // Hack to allow 32-bit SGPRs to be folded into True16 instructions
+ // Remove this if 16-bit SGPRs (i.e. SGPR_LO16) are added to the
+ // VS_16RegClass
+ //
+ // Excerpt from AMDGPUGenRegisterInfo.inc
+ // NoSubRegister, //0
+ // hi16, // 1
+ // lo16, // 2
+ // sub0, // 3
+ // ...
+ // sub1, // 11
+ // sub1_hi16, // 12
+ // sub1_lo16, // 13
+ static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");
+ if (Size == 2 && TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+ TRI->isSGPRReg(*MRI, UseReg)) {
+ // Produce the 32 bit subregister index to which the 16-bit subregister
+ // is aligned.
+ if (SubRegIdx > AMDGPU::sub1) {
+ LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
+ M |= M.getLane(M.getHighestLane() - 1);
+ SmallVector<unsigned, 4> Indexes;
+ TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
+ Indexes);
+ assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
+ SubRegIdx = Indexes[0];
+ // 32-bit registers do not have a sub0 index
+ } else if (TII->getOpSize(*UseMI, 1) == 4)
+ SubRegIdx = 0;
+ else
+ SubRegIdx = AMDGPU::sub0;
+ }
+ UseMI->getOperand(1).setSubReg(SubRegIdx);
UseMI->getOperand(1).setIsKill(false);
CopiesToReplace.push_back(UseMI);
OpToFold.setIsKill(false);
@@ -1713,6 +1769,31 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
return false;
+ // True16: Fix malformed 16-bit sgpr COPY produced by peephole-opt
+ // Can remove this code if proper 16-bit SGPRs are implemented
+ // Example: Pre-peephole-opt
+ // %29:sgpr_lo16 = COPY %16.lo16:sreg_32
+ // %32:sreg_32 = COPY %29:sgpr_lo16
+ // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
+ // Post-peephole-opt and DCE
+ // %32:sreg_32 = COPY %16.lo16:sreg_32
+ // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
+ // After this transform
+ // %32:sreg_32 = COPY %16:sreg_32
+ // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
+ // After the fold operands pass
+ // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32
+ if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
+ OpToFold.getSubReg()) {
+ const TargetRegisterClass *DstRC =
+ MRI->getRegClass(MI.getOperand(0).getReg());
+ if (DstRC == &AMDGPU::SReg_32RegClass &&
+ DstRC == MRI->getRegClass(OpToFold.getReg())) {
+ assert(OpToFold.getSubReg() == AMDGPU::lo16);
+ OpToFold.setSubReg(0);
+ }
+ }
+
// Prevent folding operands backwards in the function. For example,
// the COPY opcode must not be replaced by 1 in this example:
//
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index def06c1e9a0d7..db5b1e5c9a035 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -776,6 +776,7 @@ let SubtargetPredicate = isGFX11Plus in {
// Restrict src0 to be VGPR
def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS,
[], /*VOP1Only=*/ 1>;
+ let isAsCheapAsAMove = 1 in
defm V_MOV_B16 : VOP1Inst_t16<"v_mov_b16", VOP_I16_I16>;
defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>;
defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index efcaa8807367b..ec7d7379a4233 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -38259,16 +38259,14 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
; GFX11TRUE16-LABEL: s_select_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_lshr_b32 s2, s0, 16
-; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s2
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, vcc_lo
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s2
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
+; GFX11TRUE16-NEXT: s_lshr_b32 s0, s1, 16
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, s0, v1.l, vcc_lo
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v0.l, vcc_lo
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11TRUE16-NEXT: ; return to shader part epilog
;
@@ -38376,19 +38374,17 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
;
; GFX11TRUE16-LABEL: s_vselect_v2bf16:
; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16
-; GFX11TRUE16-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11TRUE16-NEXT: s_lshr_b32 s3, s0, 16
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v1
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s4
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0
+; GFX11TRUE16-NEXT: s_lshr_b32 s0, s1, 16
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, s0, v0.l, s2
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, s1, v0.h, vcc_lo
+; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v1
; GFX11TRUE16-NEXT: ; return to shader part epilog
;
; GFX11FAKE16-LABEL: s_vselect_v2bf16:
@@ -40095,30 +40091,25 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
;
; GFX11TRUE16-LABEL: s_vselect_v4bf16:
; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_lshr_b32 s7, s3, 16
+; GFX11TRUE16-NEXT: s_lshr_b32 s7, s1, 16
+; GFX11TRUE16-NEXT: s_lshr_b32 s9, s0, 16
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 0, v1
-; GFX11TRUE16-NEXT: s_lshr_b32 s8, s1, 16
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s3
-; GFX11TRUE16-NEXT: s_lshr_b32 s3, s2, 16
-; GFX11TRUE16-NEXT: s_lshr_b32 s7, s0, 16
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 0, v2
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 0, v3
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s8
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s3
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s7
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, s2
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s0
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, s1
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s6
-; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v1.h, v2.l, s4
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v2.h, v3.l, vcc_lo
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v3.h, s5
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s9
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s1
+; GFX11TRUE16-NEXT: s_lshr_b32 s8, s3, 16
+; GFX11TRUE16-NEXT: s_lshr_b32 s0, s2, 16
+; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, s8, v0.l, s6
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, s0, v0.h, s4
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s2, v1.l, vcc_lo
+; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, s3, v1.h, s5
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v4
-; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v2
; GFX11TRUE16-NEXT: ; return to shader part epilog
;
; GFX11FAKE16-LABEL: s_vselect_v4bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll
index a95a1aba0c914..e70cd2400172d 100644
--- a/llvm/test/CodeGen/AMDGPU/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/bswap.ll
@@ -303,18 +303,32 @@ define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: test_bswap_i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v1, 0, s4, 0x10203
-; GFX11-NEXT: v_perm_b32 v0, 0, s5, 0x10203
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-REAL16-LABEL: test_bswap_i64:
+; GFX11-REAL16: ; %bb.0:
+; GFX11-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-REAL16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-REAL16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-REAL16-NEXT: v_perm_b32 v0, 0, s2, 0x10203
+; GFX11-REAL16-NEXT: s_mov_b32 s2, -1
+; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-REAL16-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-REAL16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: test_bswap_i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, 0, s4, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, 0, s5, 0x10203
+; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-FAKE16-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in, align 8
%bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone
store i64 %bswap, ptr addrspace(1) %out, align 8
@@ -364,20 +378,36 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: test_bswap_v2i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v3, 0, s6, 0x10203
-; GFX11-NEXT: v_perm_b32 v2, 0, s7, 0x10203
-; GFX11-NEXT: v_perm_b32 v1, 0, s4, 0x10203
-; GFX11-NEXT: v_perm_b32 v0, 0, s5, 0x10203
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-REAL16-LABEL: test_bswap_v2i64:
+; GFX11-REAL16: ; %bb.0:
+; GFX11-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-REAL16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX11-REAL16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-REAL16-NEXT: s_mov_b32 s2, -1
+; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-REAL16-NEXT: v_perm_b32 v0, 0, s4, 0x10203
+; GFX11-REAL16-NEXT: v_perm_b32 v2, 0, s6, 0x10203
+; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-REAL16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-REAL16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: test_bswap_v2i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, 0, s6, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, 0, s7, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, 0, s4, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, 0, s5, 0x10203
+; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-FAKE16-NEXT: s_endpgm
%val = load <2 x i64>, ptr addrspace(1) %in, align 16
%bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone
store <2 x i64> %bswap, ptr addrspace(1) %out, align 16
@@ -445,26 +475,49 @@ define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: test_bswap_v4i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v7, 0, s6, 0x10203
-; GFX11-NEXT: v_perm_b32 v6, 0, s7, 0x10203
-; GFX11-NEXT: v_perm_b32 v5, 0, s4, 0x10203
-; GFX11-NEXT: v_perm_b32 v4, 0, s5, 0x10203
-; GFX11-NEXT: v_perm_b32 v3, 0, s2, 0x10203
-; GFX11-NEXT: v_perm_b32 v2, 0, s3, 0x10203
-; GFX11-NEXT: v_perm_b32 v1, 0, s0, 0x10203
-; GFX11-NEXT: v_perm_b32 v0, 0, s1, 0x10203
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[8:11], 0 offset:16
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[8:11], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-REAL16-LABEL: test_bswap_v4i64:
+; GFX11-REAL16: ; %bb.0:
+; GFX11-REAL16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
+; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-REAL16-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX11-REAL16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-REAL16-NEXT: s_mov_b32 s10, -1
+; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-REAL16-NEXT: v_perm_b32 v0, 0, s4, 0x10203
+; GFX11-REAL16-NEXT: v_perm_b32 v2, 0, s6, 0x10203
+; GFX11-REAL16-NEXT: v_perm_b32 v4, 0, s0, 0x10203
+; GFX11-REAL16-NEXT: v_perm_b32 v6, 0, s2, 0x10203
+; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v5, v4
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v7, v6
+; GFX11-REAL16-NEXT: s_clause 0x1
+; GFX11-REAL16-NEXT: buffer_store_b128 v[0:3], off, s[8:11], 0 offset:16
+; GFX11-REAL16-NEXT: buffer_store_b128 v[4:7], off, s[8:11], 0
+; GFX11-REAL16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: test_bswap_v4i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, 0, s6, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, 0, s7, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, 0, s4, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, 0, s5, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, 0, s2, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, 0, s3, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, 0, s0, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, 0, s1, 0x10203
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: buffer_store_b128 v[4:7], off, s[8:11], 0 offset:16
+; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_endpgm
%val = load <4 x i64>, ptr addrspace(1) %in, align 32
%bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone
store <4 x i64> %bswap, ptr addrspace(1) %out, align 32
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
index 14ddf7daad1c6..03cb3b28480c4 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
@@ -216,34 +216,19 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:2
; VI-NEXT: s_endpgm
;
-; GFX11-TRUE16-LA...
[truncated]
In the commit title I would change workaround to 'Implement'
LGTM besides that and comment change.
I will let someone else approve it.
// sub1, // 11
// sub1_hi16, // 12
// sub1_lo16, // 13
static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");
Is there no way to avoid the hardcoded value 12 here? These fields are autogenerated and they are bound to change.
The point of the hardcoded value here is as an alarm that something has changed. The folding code below will not work if the indices change, and it is difficult to predict how they would change. In that sense being brittle is a feature, rather than a bug. That said, the code has existed for almost 2 years downstream, and these values haven't changed in that time.
I see your point, and I don't have a better solution to recommend here. But it looks like any sub-reg layout change in the future will need a fixup to this hardcoded value.
I don't see how the code below depends on the exact values cited in this comment. Is it just the condition SubRegIdx > AMDGPU::sub1, which could be written as SubRegIdx == AMDGPU::sub1_hi16 || SubRegIdx == AMDGPU::sub1_lo16?
No, SubRegIdx can be lo16, sub0_lo16, sub1_lo16, sub2_lo16, sub3_lo16, sub4_lo16, etc. (I'm pretty sure SubRegIdx is always some lo16 index.) Also, it requires that the order goes ... subX, subX_hi16, subX_lo16 ..., so we add subX_hi16 to subX_lo16 to form M.
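To make the ordering assumption in this thread concrete, here is a small standalone C++ sketch. It is not LLVM code: the enum and its index values are invented, and it resolves the covering 32-bit index with plain arithmetic instead of the lane-mask and getCoveringSubRegIndexes query the patch actually uses. It only illustrates the invariant the static_assert guards, namely that each 32-bit piece after sub0 contributes consecutive subN, subN_hi16, subN_lo16 indices.

// Hypothetical, simplified subregister index table. The values are NOT the
// real AMDGPUGenRegisterInfo.inc values; only the relative ordering
// "... subN, subN_hi16, subN_lo16 ..." mirrors the layout discussed above.
#include <cassert>

enum SubRegIdx {
  NoSubRegister = 0,
  hi16,       // 16-bit halves of the first 32 bits
  lo16,
  sub0,       // 32-bit pieces
  sub1,
  sub1_hi16,  // 16-bit halves of the second 32 bits
  sub1_lo16,
  sub2,
  sub2_hi16,
  sub2_lo16,
};

// Map a 16-bit subregister index onto the 32-bit index that contains it. In
// this simplified table each 32-bit piece after sub0 owns exactly three
// consecutive indices, so integer arithmetic recovers subN; the real pass
// instead builds a lane mask from the lo16 and adjacent hi16 lanes and asks
// the register info for the single 32-bit index that covers it.
SubRegIdx covering32BitSubReg(SubRegIdx Idx) {
  if (Idx == lo16 || Idx == hi16)
    return sub0;                        // first 32 bits
  assert(Idx > sub1 && "expected a subN_lo16 or subN_hi16 index");
  unsigned Group = (static_cast<unsigned>(Idx) - sub1) / 3; // 0 -> sub1, 1 -> sub2, ...
  return static_cast<SubRegIdx>(sub1 + Group * 3);
}

int main() {
  assert(covering32BitSubReg(sub1_lo16) == sub1);
  assert(covering32BitSubReg(sub2_hi16) == sub2);
  assert(covering32BitSubReg(lo16) == sub0);
  return 0;
}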
rebased
Gentle ping!
@@ -1040,6 +1053,14 @@ void SIFoldOperandsImpl::foldOperand(
}
}

// Allow immediates COPYd into sgpr_lo16 to be further folded while
Is COPYd intentional?
Yes, to indicate the use of the COPY instruction. But if it's confusing I'd be happy to change it.
rebased
Hi everyone, another gentle ping! Any other comments for this patch? Thanks!
The hardcoded subreg indexes are still a hack, but LGTM as this was discussed and approved downstream a long time ago.
rebased
16-bit SGPRs are not implemented yet. Allow 32-bit SGPRs to be folded into True16 instructions that take 16-bit values. Also use sgpr_32 when an immediate is copied into sgpr_lo16 so that it can be folded further. This improves generated code quality.