Skip to content

Commit bf388f8

Browse files
authored
[AMDGPU][True16][CodeGen] legalize operands when moving 16-bit SALU to VALU (#133985)
This is a follow-up PR to #132089. When a V2S copy and its useMI are lowered to VALU, this patch checks whether the newly generated VALU instruction is a true16 instruction and, if so, adds a subreg access on all operands where necessary. An example MIR looks like: ``` %1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0 ... %2:sreg_32 = COPY %1:vgpr_32 %3:sreg_32 = S_FLOOR_F16 %2:sreg_32, ... ``` It is currently lowered to: ``` %1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0 ... %2:vgpr_16 = V_FLOOR_F16_t16_e64 0, %1:vgpr_32, 0, 0, 0 ... ``` After this patch it is lowered to: ``` %1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0 ... %2:vgpr_16 = V_FLOOR_F16_t16_e64 0, %1.lo16:vgpr_32, 0, 0, 0 ... ```
1 parent bec5cfd commit bf388f8

File tree

4 files changed

+72
-15
lines changed

4 files changed

+72
-15
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7228,6 +7228,29 @@ bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
72287228
return DeferredList.contains(MI);
72297229
}
72307230

7231+
// 16-bit SALU insts use sgpr32. If a 16-bit SALU inst is lowered to VALU in
7232+
// true16 mode, the sgpr32 is replaced by a vgpr32, which is illegal in a t16
7233+
// inst, so add the lo16 subreg access. Remove once sgpr16 is in place.
7234+
void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &Inst,
7235+
MachineRegisterInfo &MRI) const {
7236+
unsigned Opcode = Inst.getOpcode();
7237+
if (!AMDGPU::isTrue16Inst(Opcode) || !ST.useRealTrue16Insts())
7238+
return; // Only true16 opcodes on a real-true16 subtarget are rewritten.
7239+
7240+
for (MachineOperand &Op : Inst.explicit_operands()) {
7241+
unsigned OpIdx = Op.getOperandNo();
7242+
if (!OpIdx)
7243+
continue; // Operand 0 is never rewritten (presumably the def -- confirm).
7244+
if (Op.isReg() && RI.isVGPR(MRI, Op.getReg())) { // NOTE(review): the width of the register held by Op is not checked -- confirm a 16-bit VGPR cannot reach here.
7245+
unsigned RCID = get(Opcode).operands()[OpIdx].RegClass; // Register class the opcode declares for this operand slot.
7246+
const TargetRegisterClass *RC = RI.getRegClass(RCID);
7247+
if (RI.getRegSizeInBits(*RC) == 16) {
7248+
Op.setSubReg(AMDGPU::lo16); // 16-bit operand slot: access the VGPR through its lo16 subreg.
7249+
}
7250+
}
7251+
}
7252+
}
7253+
72317254
void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
72327255
MachineDominatorTree *MDT) const {
72337256

@@ -7613,6 +7636,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
76137636
.add(Inst.getOperand(0))
76147637
.add(Inst.getOperand(1));
76157638
}
7639+
legalizeOperandsVALUt16(*NewInstr, MRI);
76167640
legalizeOperands(*NewInstr, MDT);
76177641
int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
76187642
MachineOperand SCCOp = Inst.getOperand(SCCIdx);
@@ -7682,6 +7706,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
76827706
.addImm(0) // omod
76837707
.addImm(0); // opsel0
76847708
MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7709+
legalizeOperandsVALUt16(*NewInstr, MRI);
76857710
legalizeOperands(*NewInstr, MDT);
76867711
addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
76877712
Inst.eraseFromParent();
@@ -7747,6 +7772,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
77477772

77487773
// If this is a v2s copy src from vgpr16 to sgpr32,
77497774
// replace vgpr copy to subreg_to_reg
7775+
// This can be removed after we have sgpr16 in place
77507776
if (ST.useRealTrue16Insts() && Inst.isCopy() &&
77517777
Inst.getOperand(1).getReg().isVirtual() &&
77527778
RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
@@ -7785,11 +7811,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
77857811
NewInstr.addImm(0);
77867812
if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
77877813
MachineOperand Src = Inst.getOperand(1);
7788-
if (AMDGPU::isTrue16Inst(NewOpcode) && ST.useRealTrue16Insts() &&
7789-
Src.isReg() && RI.isVGPR(MRI, Src.getReg()))
7790-
NewInstr.addReg(Src.getReg(), 0, AMDGPU::lo16);
7791-
else
7792-
NewInstr->addOperand(Src);
7814+
NewInstr->addOperand(Src);
77937815
}
77947816

77957817
if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
@@ -7863,6 +7885,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
78637885

78647886
// Check useMI of NewInstr. If used by a true16 instruction,
78657887
// add a lo16 subreg access if size mismatched
7888+
// This can be removed after we have sgpr16 in place
78667889
if (ST.useRealTrue16Insts() && NewDstRC == &AMDGPU::VGPR_32RegClass) {
78677890
for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
78687891
E = MRI.use_end();
@@ -7878,6 +7901,9 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
78787901
}
78797902
}
78807903
fixImplicitOperands(*NewInstr);
7904+
7905+
legalizeOperandsVALUt16(*NewInstr, MRI);
7906+
78817907
// Legalize the operands
78827908
legalizeOperands(*NewInstr, MDT);
78837909
if (NewDstReg)

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1279,6 +1279,10 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
12791279
/// was moved to VGPR. \returns true if succeeded.
12801280
bool moveFlatAddrToVGPR(MachineInstr &Inst) const;
12811281

1282+
/// Fix the operands of \p Inst after a 16-bit SALU instruction has been
/// lowered to VALU. Does nothing unless \p Inst is a true16 instruction and
/// the subtarget uses real true16 instructions. Otherwise, every explicit
/// operand other than operand 0 that is a VGPR register, and whose operand
/// slot is declared with a 16-bit register class, gets a lo16 subreg access.
1283+
void legalizeOperandsVALUt16(MachineInstr &Inst,
1284+
MachineRegisterInfo &MRI) const;
1285+
12821286
/// Replace the instructions opcode with the equivalent VALU
12831287
/// opcode. This function will also move the users of MachineInstruntions
12841288
/// in the \p WorkList to the VALU if necessary. If present, \p MDT is

llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,26 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
22
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN %s
33

4+
---
5+
name: cmp_f16
6+
body: |
7+
bb.0.entry:
8+
; GCN-LABEL: name: cmp_f16
9+
; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
10+
; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
11+
; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
12+
; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16
13+
; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, killed [[SUBREG_TO_REG]].lo16, 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec
14+
; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_t16_e64_]], implicit $exec
15+
%0:vgpr_16 = IMPLICIT_DEF
16+
%1:sreg_32 = IMPLICIT_DEF
17+
%2:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
18+
%3:sreg_32 = COPY %2:vgpr_16
19+
nofpexcept S_CMP_LT_F16 killed %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $mode
20+
%4:sreg_32_xm0_xexec = COPY $scc
21+
%5:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %4, implicit $exec
22+
...
23+
424
---
525
name: cvt_hi_f32_f16
626
body: |

llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,26 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
2-
# FIXME-TRUE16. reenable after fix-sgpr-copies is fixed for true16 flow
3-
# XUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s
4-
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,FAKE16 %s
2+
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=REAL16 %s
3+
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=FAKE16 %s
54

65
---
76
name: fmac_f16
87
body: |
98
bb.0:
10-
; GCN-LABEL: name: fmac_f16
11-
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
12-
; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
13-
; GCN-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
14-
; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
15-
; GCN-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
16-
; GCN-NEXT: [[V_FMAC_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMAC_F16_fake16_e64 0, killed [[DEF1]], 0, [[DEF2]], 0, [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec
9+
; REAL16-LABEL: name: fmac_f16
10+
; REAL16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
11+
; REAL16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
12+
; REAL16-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
13+
; REAL16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
14+
; REAL16-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
15+
; REAL16-NEXT: [[V_FMAC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FMAC_F16_t16_e64 0, killed [[DEF1]], 0, [[DEF2]], 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec
16+
;
17+
; FAKE16-LABEL: name: fmac_f16
18+
; FAKE16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
19+
; FAKE16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
20+
; FAKE16-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
21+
; FAKE16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
22+
; FAKE16-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
23+
; FAKE16-NEXT: [[V_FMAC_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMAC_F16_fake16_e64 0, killed [[DEF1]], 0, [[DEF2]], 0, [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec
1724
%0:vgpr_32 = IMPLICIT_DEF
1825
%1:sreg_32 = IMPLICIT_DEF
1926
%2:sreg_32 = IMPLICIT_DEF

0 commit comments

Comments
 (0)