Skip to content

Commit 25ba839

Browse files
committed
16bit sgpr folding
1 parent 2edf534 commit 25ba839

26 files changed

+702
-932
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 85 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,11 @@
1212
#include "AMDGPU.h"
1313
#include "GCNSubtarget.h"
1414
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
15+
#include "SIInstrInfo.h"
1516
#include "SIMachineFunctionInfo.h"
17+
#include "SIRegisterInfo.h"
1618
#include "llvm/ADT/DepthFirstIterator.h"
19+
#include "llvm/CodeGen/MachineFunction.h"
1720
#include "llvm/CodeGen/MachineFunctionPass.h"
1821
#include "llvm/CodeGen/MachineOperand.h"
1922

@@ -576,6 +579,11 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
576579
}
577580

578581
MachineOperand *New = Fold.OpToFold;
582+
// TODO: Temporarily allow folding from SGPRs to 16-bit VGPRs.
583+
// Rework once the VS_16 register class is updated to include proper
584+
// 16-bit SGPRs instead of 32-bit ones.
585+
if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
586+
Old.setSubReg(AMDGPU::NoSubRegister);
579587
Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
580588
Old.setIsUndef(New->isUndef());
581589
return true;
@@ -947,9 +955,15 @@ void SIFoldOperandsImpl::foldOperand(
947955
return;
948956

949957
// FIXME: Fold operands with subregs.
950-
if (UseOp->isReg() && OpToFold.isReg() &&
951-
(UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
952-
return;
958+
if (UseOp->isReg() && OpToFold.isReg()) {
959+
if (UseOp->isImplicit())
960+
return;
961+
// Allow folding from SGPRs to 16-bit VGPRs.
962+
if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
963+
(UseOp->getSubReg() != AMDGPU::lo16 ||
964+
!TRI->isSGPRReg(*MRI, OpToFold.getReg())))
965+
return;
966+
}
953967

954968
// Special case for REG_SEQUENCE: We can't fold literals into
955969
// REG_SEQUENCE instructions, so we have to fold them into the
@@ -1040,6 +1054,14 @@ void SIFoldOperandsImpl::foldOperand(
10401054
}
10411055
}
10421056

1057+
// Allow immediates copied into sgpr_lo16 to be folded further, while
1058+
// keeping the result legal if no further folding takes place.
1059+
if (DestRC == &AMDGPU::SGPR_LO16RegClass) {
1060+
assert(ST->useRealTrue16Insts());
1061+
MRI->setRegClass(DestReg, &AMDGPU::SGPR_32RegClass);
1062+
DestRC = &AMDGPU::SGPR_32RegClass;
1063+
}
1064+
10431065
// In order to fold immediates into copies, we need to change the
10441066
// copy to a MOV.
10451067

@@ -1073,9 +1095,43 @@ void SIFoldOperandsImpl::foldOperand(
10731095
UseMI->getOperand(0).getReg().isVirtual() &&
10741096
!UseMI->getOperand(1).getSubReg()) {
10751097
LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
1098+
unsigned Size = TII->getOpSize(*UseMI, 1);
10761099
Register UseReg = OpToFold.getReg();
10771100
UseMI->getOperand(1).setReg(UseReg);
1078-
UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
1101+
unsigned SubRegIdx = OpToFold.getSubReg();
1102+
// Hack to allow 32-bit SGPRs to be folded into True16 instructions.
1103+
// Remove this if 16-bit SGPRs (i.e. SGPR_LO16) are added to the
1104+
// VS_16RegClass
1105+
//
1106+
// Excerpt from AMDGPUGenRegisterInfo.inc
1107+
// NoSubRegister, //0
1108+
// hi16, // 1
1109+
// lo16, // 2
1110+
// sub0, // 3
1111+
// ...
1112+
// sub1, // 11
1113+
// sub1_hi16, // 12
1114+
// sub1_lo16, // 13
1115+
static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");
1116+
if (Size == 2 && TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
1117+
TRI->isSGPRReg(*MRI, UseReg)) {
1118+
// Produce the 32-bit subregister index to which the 16-bit subregister
1119+
// is aligned.
1120+
if (SubRegIdx > AMDGPU::sub1) {
1121+
LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
1122+
M |= M.getLane(M.getHighestLane() - 1);
1123+
SmallVector<unsigned, 4> Indexes;
1124+
TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
1125+
Indexes);
1126+
assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
1127+
SubRegIdx = Indexes[0];
1128+
// 32-bit registers do not have a sub0 index
1129+
} else if (TII->getOpSize(*UseMI, 1) == 4)
1130+
SubRegIdx = 0;
1131+
else
1132+
SubRegIdx = AMDGPU::sub0;
1133+
}
1134+
UseMI->getOperand(1).setSubReg(SubRegIdx);
10791135
UseMI->getOperand(1).setIsKill(false);
10801136
CopiesToReplace.push_back(UseMI);
10811137
OpToFold.setIsKill(false);
@@ -1713,6 +1769,31 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
17131769
if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
17141770
return false;
17151771

1772+
// True16: Fix malformed 16-bit SGPR COPY produced by peephole-opt.
1773+
// This code can be removed once proper 16-bit SGPRs are implemented.
1774+
// Example: Pre-peephole-opt
1775+
// %29:sgpr_lo16 = COPY %16.lo16:sreg_32
1776+
// %32:sreg_32 = COPY %29:sgpr_lo16
1777+
// %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
1778+
// Post-peephole-opt and DCE
1779+
// %32:sreg_32 = COPY %16.lo16:sreg_32
1780+
// %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
1781+
// After this transform
1782+
// %32:sreg_32 = COPY %16:sreg_32
1783+
// %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
1784+
// After the fold operands pass
1785+
// %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32
1786+
if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
1787+
OpToFold.getSubReg()) {
1788+
const TargetRegisterClass *DstRC =
1789+
MRI->getRegClass(MI.getOperand(0).getReg());
1790+
if (DstRC == &AMDGPU::SReg_32RegClass &&
1791+
DstRC == MRI->getRegClass(OpToFold.getReg())) {
1792+
assert(OpToFold.getSubReg() == AMDGPU::lo16);
1793+
OpToFold.setSubReg(0);
1794+
}
1795+
}
1796+
17161797
// Prevent folding operands backwards in the function. For example,
17171798
// the COPY opcode must not be replaced by 1 in this example:
17181799
//

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -776,6 +776,7 @@ let SubtargetPredicate = isGFX11Plus in {
776776
// Restrict src0 to be VGPR
777777
def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS,
778778
[], /*VOP1Only=*/ 1>;
779+
let isAsCheapAsAMove = 1 in
779780
defm V_MOV_B16 : VOP1Inst_t16<"v_mov_b16", VOP_I16_I16>;
780781
defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>;
781782
defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>;

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 29 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -38259,16 +38259,14 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
3825938259
; GFX11TRUE16-LABEL: s_select_v2bf16:
3826038260
; GFX11TRUE16: ; %bb.0:
3826138261
; GFX11TRUE16-NEXT: s_lshr_b32 s2, s0, 16
38262-
; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16
3826338262
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
38264-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
38265-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s2
38266-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1
38267-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0
38268-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
38269-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, vcc_lo
38270-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, vcc_lo
38271-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
38263+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s2
38264+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
38265+
; GFX11TRUE16-NEXT: s_lshr_b32 s0, s1, 16
38266+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
38267+
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, s0, v1.l, vcc_lo
38268+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
38269+
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v0.l, vcc_lo
3827238270
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
3827338271
; GFX11TRUE16-NEXT: ; return to shader part epilog
3827438272
;
@@ -38376,19 +38374,17 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
3837638374
;
3837738375
; GFX11TRUE16-LABEL: s_vselect_v2bf16:
3837838376
; GFX11TRUE16: ; %bb.0:
38379-
; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16
38380-
; GFX11TRUE16-NEXT: s_lshr_b32 s4, s0, 16
38377+
; GFX11TRUE16-NEXT: s_lshr_b32 s3, s0, 16
3838138378
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
3838238379
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v1
3838338380
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
38384-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s4
38385-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1
38386-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0
38387-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
38388-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2
38389-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, vcc_lo
38390-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
38391-
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
38381+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0
38382+
; GFX11TRUE16-NEXT: s_lshr_b32 s0, s1, 16
38383+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
38384+
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, s0, v0.l, s2
38385+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
38386+
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, s1, v0.h, vcc_lo
38387+
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v1
3839238388
; GFX11TRUE16-NEXT: ; return to shader part epilog
3839338389
;
3839438390
; GFX11FAKE16-LABEL: s_vselect_v2bf16:
@@ -40095,30 +40091,25 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
4009540091
;
4009640092
; GFX11TRUE16-LABEL: s_vselect_v4bf16:
4009740093
; GFX11TRUE16: ; %bb.0:
40098-
; GFX11TRUE16-NEXT: s_lshr_b32 s7, s3, 16
40094+
; GFX11TRUE16-NEXT: s_lshr_b32 s7, s1, 16
40095+
; GFX11TRUE16-NEXT: s_lshr_b32 s9, s0, 16
4009940096
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
4010040097
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 0, v1
40101-
; GFX11TRUE16-NEXT: s_lshr_b32 s8, s1, 16
40102-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7
40103-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s3
40104-
; GFX11TRUE16-NEXT: s_lshr_b32 s3, s2, 16
40105-
; GFX11TRUE16-NEXT: s_lshr_b32 s7, s0, 16
4010640098
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 0, v2
4010740099
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 0, v3
40108-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s8
40109-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s3
40110-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s7
40111-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, s2
40112-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s0
40113-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, s1
40114-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s6
40115-
; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v1.h, v2.l, s4
40116-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
40117-
; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v2.h, v3.l, vcc_lo
40118-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v3.h, s5
40100+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7
40101+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s9
40102+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s0
40103+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s1
40104+
; GFX11TRUE16-NEXT: s_lshr_b32 s8, s3, 16
40105+
; GFX11TRUE16-NEXT: s_lshr_b32 s0, s2, 16
40106+
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, s8, v0.l, s6
40107+
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, s0, v0.h, s4
40108+
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s2, v1.l, vcc_lo
40109+
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, s3, v1.h, s5
4011940110
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
40120-
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v4
40121-
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v0
40111+
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
40112+
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v2
4012240113
; GFX11TRUE16-NEXT: ; return to shader part epilog
4012340114
;
4012440115
; GFX11FAKE16-LABEL: s_vselect_v4bf16:

0 commit comments

Comments
 (0)