Skip to content

Commit fb0e7b5

Browse files
authored
[AMDGPU][True16][CodeGen] Implement sgpr folding in true16 (#128929)
16-bit SGPRs are not implemented yet. For now, allow 32-bit SGPRs to be folded into True16 instructions that take 16-bit values. Also use sgpr_32 when an immediate is copied to sgpr_lo16 so that it can be folded further. This improves generated code quality.
1 parent 6f1347d commit fb0e7b5

27 files changed

+764
-1055
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 84 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,11 @@
1212
#include "AMDGPU.h"
1313
#include "GCNSubtarget.h"
1414
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
15+
#include "SIInstrInfo.h"
1516
#include "SIMachineFunctionInfo.h"
17+
#include "SIRegisterInfo.h"
1618
#include "llvm/ADT/DepthFirstIterator.h"
19+
#include "llvm/CodeGen/MachineFunction.h"
1720
#include "llvm/CodeGen/MachineFunctionPass.h"
1821
#include "llvm/CodeGen/MachineOperand.h"
1922

@@ -576,6 +579,10 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
576579
}
577580

578581
MachineOperand *New = Fold.OpToFold;
582+
// Rework once the VS_16 register class is updated to include proper
583+
// 16-bit SGPRs instead of 32-bit ones.
584+
if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
585+
Old.setSubReg(AMDGPU::NoSubRegister);
579586
Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
580587
Old.setIsUndef(New->isUndef());
581588
return true;
@@ -947,9 +954,15 @@ void SIFoldOperandsImpl::foldOperand(
947954
return;
948955

949956
// FIXME: Fold operands with subregs.
950-
if (UseOp->isReg() && OpToFold.isReg() &&
951-
(UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
952-
return;
957+
if (UseOp->isReg() && OpToFold.isReg()) {
958+
if (UseOp->isImplicit())
959+
return;
960+
// Allow folding from SGPRs to 16-bit VGPRs.
961+
if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
962+
(UseOp->getSubReg() != AMDGPU::lo16 ||
963+
!TRI->isSGPRReg(*MRI, OpToFold.getReg())))
964+
return;
965+
}
953966

954967
// Special case for REG_SEQUENCE: We can't fold literals into
955968
// REG_SEQUENCE instructions, so we have to fold them into the
@@ -1040,6 +1053,14 @@ void SIFoldOperandsImpl::foldOperand(
10401053
}
10411054
}
10421055

1056+
// Allow immediates COPYd into sgpr_lo16 to be further folded while
1057+
// still being legal if not further folded
1058+
if (DestRC == &AMDGPU::SGPR_LO16RegClass) {
1059+
assert(ST->useRealTrue16Insts());
1060+
MRI->setRegClass(DestReg, &AMDGPU::SGPR_32RegClass);
1061+
DestRC = &AMDGPU::SGPR_32RegClass;
1062+
}
1063+
10431064
// In order to fold immediates into copies, we need to change the
10441065
// copy to a MOV.
10451066

@@ -1073,9 +1094,43 @@ void SIFoldOperandsImpl::foldOperand(
10731094
UseMI->getOperand(0).getReg().isVirtual() &&
10741095
!UseMI->getOperand(1).getSubReg()) {
10751096
LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
1097+
unsigned Size = TII->getOpSize(*UseMI, 1);
10761098
Register UseReg = OpToFold.getReg();
10771099
UseMI->getOperand(1).setReg(UseReg);
1078-
UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
1100+
unsigned SubRegIdx = OpToFold.getSubReg();
1101+
// Hack to allow 32-bit SGPRs to be folded into True16 instructions
1102+
// Remove this if 16-bit SGPRs (i.e. SGPR_LO16) are added to the
1103+
// VS_16RegClass
1104+
//
1105+
// Excerpt from AMDGPUGenRegisterInfo.inc
1106+
// NoSubRegister, //0
1107+
// hi16, // 1
1108+
// lo16, // 2
1109+
// sub0, // 3
1110+
// ...
1111+
// sub1, // 11
1112+
// sub1_hi16, // 12
1113+
// sub1_lo16, // 13
1114+
static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");
1115+
if (Size == 2 && TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
1116+
TRI->isSGPRReg(*MRI, UseReg)) {
1117+
// Produce the 32 bit subregister index to which the 16-bit subregister
1118+
// is aligned.
1119+
if (SubRegIdx > AMDGPU::sub1) {
1120+
LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
1121+
M |= M.getLane(M.getHighestLane() - 1);
1122+
SmallVector<unsigned, 4> Indexes;
1123+
TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
1124+
Indexes);
1125+
assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
1126+
SubRegIdx = Indexes[0];
1127+
// 32-bit registers do not have a sub0 index
1128+
} else if (TII->getOpSize(*UseMI, 1) == 4)
1129+
SubRegIdx = 0;
1130+
else
1131+
SubRegIdx = AMDGPU::sub0;
1132+
}
1133+
UseMI->getOperand(1).setSubReg(SubRegIdx);
10791134
UseMI->getOperand(1).setIsKill(false);
10801135
CopiesToReplace.push_back(UseMI);
10811136
OpToFold.setIsKill(false);
@@ -1713,6 +1768,31 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
17131768
if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
17141769
return false;
17151770

1771+
// True16: Fix malformed 16-bit sgpr COPY produced by peephole-opt
1772+
// Can remove this code if proper 16-bit SGPRs are implemented
1773+
// Example: Pre-peephole-opt
1774+
// %29:sgpr_lo16 = COPY %16.lo16:sreg_32
1775+
// %32:sreg_32 = COPY %29:sgpr_lo16
1776+
// %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
1777+
// Post-peephole-opt and DCE
1778+
// %32:sreg_32 = COPY %16.lo16:sreg_32
1779+
// %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
1780+
// After this transform
1781+
// %32:sreg_32 = COPY %16:sreg_32
1782+
// %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
1783+
// After the fold operands pass
1784+
// %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32
1785+
if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
1786+
OpToFold.getSubReg()) {
1787+
const TargetRegisterClass *DstRC =
1788+
MRI->getRegClass(MI.getOperand(0).getReg());
1789+
if (DstRC == &AMDGPU::SReg_32RegClass &&
1790+
DstRC == MRI->getRegClass(OpToFold.getReg())) {
1791+
assert(OpToFold.getSubReg() == AMDGPU::lo16);
1792+
OpToFold.setSubReg(0);
1793+
}
1794+
}
1795+
17161796
// Prevent folding operands backwards in the function. For example,
17171797
// the COPY opcode must not be replaced by 1 in this example:
17181798
//

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -776,6 +776,7 @@ let SubtargetPredicate = isGFX11Plus in {
776776
// Restrict src0 to be VGPR
777777
def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS,
778778
[], /*VOP1Only=*/ 1>;
779+
let isAsCheapAsAMove = 1 in
779780
defm V_MOV_B16 : VOP1Inst_t16<"v_mov_b16", VOP_I16_I16>;
780781
defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>;
781782
defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>;

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 29 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -38819,16 +38819,14 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
3881938819
; GFX11TRUE16-LABEL: s_select_v2bf16:
3882038820
; GFX11TRUE16: ; %bb.0:
3882138821
; GFX11TRUE16-NEXT: s_lshr_b32 s2, s0, 16
38822-
; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16
3882338822
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
38824-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
38825-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s2
38826-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1
38827-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0
38828-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
38829-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, vcc_lo
38830-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, vcc_lo
38831-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
38823+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s2
38824+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
38825+
; GFX11TRUE16-NEXT: s_lshr_b32 s0, s1, 16
38826+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
38827+
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, s0, v1.l, vcc_lo
38828+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
38829+
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v0.l, vcc_lo
3883238830
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
3883338831
; GFX11TRUE16-NEXT: ; return to shader part epilog
3883438832
;
@@ -38936,19 +38934,17 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
3893638934
;
3893738935
; GFX11TRUE16-LABEL: s_vselect_v2bf16:
3893838936
; GFX11TRUE16: ; %bb.0:
38939-
; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16
38940-
; GFX11TRUE16-NEXT: s_lshr_b32 s4, s0, 16
38937+
; GFX11TRUE16-NEXT: s_lshr_b32 s3, s0, 16
3894138938
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
3894238939
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v1
3894338940
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
38944-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s4
38945-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1
38946-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0
38947-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
38948-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2
38949-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, vcc_lo
38950-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
38951-
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
38941+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0
38942+
; GFX11TRUE16-NEXT: s_lshr_b32 s0, s1, 16
38943+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
38944+
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, s0, v0.l, s2
38945+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
38946+
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, s1, v0.h, vcc_lo
38947+
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v1
3895238948
; GFX11TRUE16-NEXT: ; return to shader part epilog
3895338949
;
3895438950
; GFX11FAKE16-LABEL: s_vselect_v2bf16:
@@ -40655,30 +40651,25 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
4065540651
;
4065640652
; GFX11TRUE16-LABEL: s_vselect_v4bf16:
4065740653
; GFX11TRUE16: ; %bb.0:
40658-
; GFX11TRUE16-NEXT: s_lshr_b32 s7, s3, 16
40654+
; GFX11TRUE16-NEXT: s_lshr_b32 s7, s1, 16
40655+
; GFX11TRUE16-NEXT: s_lshr_b32 s9, s0, 16
4065940656
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
4066040657
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 0, v1
40661-
; GFX11TRUE16-NEXT: s_lshr_b32 s8, s1, 16
40662-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7
40663-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s3
40664-
; GFX11TRUE16-NEXT: s_lshr_b32 s3, s2, 16
40665-
; GFX11TRUE16-NEXT: s_lshr_b32 s7, s0, 16
4066640658
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 0, v2
4066740659
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 0, v3
40668-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s8
40669-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s3
40670-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s7
40671-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, s2
40672-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s0
40673-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, s1
40674-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s6
40675-
; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v1.h, v2.l, s4
40676-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
40677-
; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v2.h, v3.l, vcc_lo
40678-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v3.h, s5
40660+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7
40661+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s9
40662+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s0
40663+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s1
40664+
; GFX11TRUE16-NEXT: s_lshr_b32 s8, s3, 16
40665+
; GFX11TRUE16-NEXT: s_lshr_b32 s0, s2, 16
40666+
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, s8, v0.l, s6
40667+
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, s0, v0.h, s4
40668+
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s2, v1.l, vcc_lo
40669+
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, s3, v1.h, s5
4067940670
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
40680-
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v4
40681-
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v0
40671+
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
40672+
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v2
4068240673
; GFX11TRUE16-NEXT: ; return to shader part epilog
4068340674
;
4068440675
; GFX11FAKE16-LABEL: s_vselect_v4bf16:

0 commit comments

Comments
 (0)