Skip to content

Commit 1121b02

Browse files
committed
[AMDGPU][True16][CodeGen] true16 codegen pattern for fma (llvm#122950)
Add the true16 codegen pattern for f16 fma. Also create shrink-mad-fma-gfx10.mir as a duplicate of shrink-mad-fma to separate the pre-GFX11 and GFX11 MIR tests.
1 parent 0afe2bd commit 1121b02

File tree

10 files changed

+871
-244
lines changed

10 files changed

+871
-244
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,8 @@ static unsigned macToMad(unsigned Opc) {
203203
return AMDGPU::V_FMA_F32_e64;
204204
case AMDGPU::V_FMAC_F16_e64:
205205
return AMDGPU::V_FMA_F16_gfx9_e64;
206+
case AMDGPU::V_FMAC_F16_t16_e64:
207+
return AMDGPU::V_FMA_F16_gfx9_t16_e64;
206208
case AMDGPU::V_FMAC_F16_fake16_e64:
207209
return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
208210
case AMDGPU::V_FMAC_LEGACY_F32_e64:

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 49 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -3533,6 +3533,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
35333533
Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
35343534
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
35353535
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3536+
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
35363537
Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
35373538
// Don't fold if we are using source or output modifiers. The new VOP2
35383539
// instructions don't have them.
@@ -3555,6 +3556,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
35553556
bool IsFMA =
35563557
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
35573558
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3559+
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
35583560
Opc == AMDGPU::V_FMAC_F16_fake16_e64;
35593561
MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
35603562
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
@@ -3588,16 +3590,19 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
35883590

35893591
unsigned NewOpc =
35903592
IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
3591-
: ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
3593+
: ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3594+
? AMDGPU::V_FMAMK_F16_t16
3595+
: AMDGPU::V_FMAMK_F16_fake16
35923596
: AMDGPU::V_FMAMK_F16)
35933597
: (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
35943598
if (pseudoToMCOpcode(NewOpc) == -1)
35953599
return false;
35963600

3597-
// V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3598-
// would also require restricting their register classes. For now
3599-
// just bail out.
3600-
if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3601+
// V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
3602+
// takes VGPR_32_Lo128 operands, so the rewrite would also require
3603+
// restricting their register classes. For now just bail out.
3604+
if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3605+
NewOpc == AMDGPU::V_FMAMK_F16_fake16)
36013606
return false;
36023607

36033608
const std::optional<int64_t> SubRegImm = extractSubregFromImm(
@@ -3613,7 +3618,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
36133618
Src0->setIsKill(RegSrc->isKill());
36143619

36153620
if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3616-
Opc == AMDGPU::V_FMAC_F32_e64 ||
3621+
Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
36173622
Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
36183623
UseMI.untieRegOperand(
36193624
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3668,23 +3673,26 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
36683673

36693674
unsigned NewOpc =
36703675
IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
3671-
: ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
3676+
: ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3677+
? AMDGPU::V_FMAAK_F16_t16
3678+
: AMDGPU::V_FMAAK_F16_fake16
36723679
: AMDGPU::V_FMAAK_F16)
36733680
: (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
36743681
if (pseudoToMCOpcode(NewOpc) == -1)
36753682
return false;
36763683

3677-
// V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3678-
// would also require restricting their register classes. For now
3679-
// just bail out.
3680-
if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3684+
// V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
3685+
// takes VGPR_32_Lo128 operands, so the rewrite would also require
3686+
// restricting their register classes. For now just bail out.
3687+
if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3688+
NewOpc == AMDGPU::V_FMAAK_F16_fake16)
36813689
return false;
36823690

36833691
// FIXME: This would be a lot easier if we could return a new instruction
36843692
// instead of having to modify in place.
36853693

36863694
if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3687-
Opc == AMDGPU::V_FMAC_F32_e64 ||
3695+
Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
36883696
Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
36893697
UseMI.untieRegOperand(
36903698
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3874,8 +3882,11 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
38743882
return AMDGPU::V_FMA_LEGACY_F32_e64;
38753883
case AMDGPU::V_FMAC_F16_e32:
38763884
case AMDGPU::V_FMAC_F16_e64:
3885+
case AMDGPU::V_FMAC_F16_t16_e64:
38773886
case AMDGPU::V_FMAC_F16_fake16_e64:
3878-
return ST.hasTrue16BitInsts() ? AMDGPU::V_FMA_F16_gfx9_fake16_e64
3887+
return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3888+
? AMDGPU::V_FMA_F16_gfx9_t16_e64
3889+
: AMDGPU::V_FMA_F16_gfx9_fake16_e64
38793890
: AMDGPU::V_FMA_F16_gfx9_e64;
38803891
case AMDGPU::V_FMAC_F32_e32:
38813892
case AMDGPU::V_FMAC_F32_e64:
@@ -3941,19 +3952,22 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
39413952
return MIB;
39423953
}
39433954

3944-
assert(
3945-
Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
3946-
"V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
3947-
"pre-RA");
3955+
assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
3956+
Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
3957+
"V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
3958+
"present "
3959+
"pre-RA");
39483960

39493961
// Handle MAC/FMAC.
39503962
bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
39513963
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3964+
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
39523965
Opc == AMDGPU::V_FMAC_F16_fake16_e64;
39533966
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
39543967
Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
39553968
Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
39563969
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3970+
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
39573971
Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
39583972
Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
39593973
bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
@@ -3968,6 +3982,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
39683982
return nullptr;
39693983
case AMDGPU::V_MAC_F16_e64:
39703984
case AMDGPU::V_FMAC_F16_e64:
3985+
case AMDGPU::V_FMAC_F16_t16_e64:
39713986
case AMDGPU::V_FMAC_F16_fake16_e64:
39723987
case AMDGPU::V_MAC_F32_e64:
39733988
case AMDGPU::V_MAC_LEGACY_F32_e64:
@@ -4053,8 +4068,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
40534068
int64_t Imm;
40544069
if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
40554070
unsigned NewOpc =
4056-
IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
4057-
: AMDGPU::V_FMAAK_F16)
4071+
IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts()
4072+
? ST.useRealTrue16Insts()
4073+
? AMDGPU::V_FMAAK_F16_t16
4074+
: AMDGPU::V_FMAAK_F16_fake16
4075+
: AMDGPU::V_FMAAK_F16)
40584076
: AMDGPU::V_FMAAK_F32)
40594077
: (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
40604078
if (pseudoToMCOpcode(NewOpc) != -1) {
@@ -4071,11 +4089,14 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
40714089
return MIB;
40724090
}
40734091
}
4074-
unsigned NewOpc =
4075-
IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
4076-
: AMDGPU::V_FMAMK_F16)
4077-
: AMDGPU::V_FMAMK_F32)
4078-
: (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
4092+
unsigned NewOpc = IsFMA
4093+
? (IsF16 ? (ST.hasTrue16BitInsts()
4094+
? ST.useRealTrue16Insts()
4095+
? AMDGPU::V_FMAMK_F16_t16
4096+
: AMDGPU::V_FMAMK_F16_fake16
4097+
: AMDGPU::V_FMAMK_F16)
4098+
: AMDGPU::V_FMAMK_F32)
4099+
: (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
40794100
if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
40804101
if (pseudoToMCOpcode(NewOpc) != -1) {
40814102
MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
@@ -4513,6 +4534,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
45134534
case AMDGPU::V_MAC_F32_e64:
45144535
case AMDGPU::V_MAC_LEGACY_F32_e64:
45154536
case AMDGPU::V_FMAC_F16_e64:
4537+
case AMDGPU::V_FMAC_F16_t16_e64:
45164538
case AMDGPU::V_FMAC_F16_fake16_e64:
45174539
case AMDGPU::V_FMAC_F32_e64:
45184540
case AMDGPU::V_FMAC_F64_e64:
@@ -5569,7 +5591,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
55695591
case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
55705592
case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
55715593
case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5572-
case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
5594+
case AMDGPU::S_FMAC_F16:
5595+
return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
5596+
: AMDGPU::V_FMAC_F16_fake16_e64;
55735597
case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
55745598
case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
55755599
case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3287,6 +3287,14 @@ def : GCNPat <
32873287
(V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
32883288
SRCMODS.NONE, $src2)
32893289
>;
3290+
let True16Predicate = UseRealTrue16Insts in
3291+
def : GCNPat <
3292+
(fma (f16 (VOP3NoMods f16:$src0)),
3293+
(f16 (VOP3NoMods f16:$src1)),
3294+
(f16 (VOP3NoMods f16:$src2))),
3295+
(V_FMAC_F16_t16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
3296+
SRCMODS.NONE, $src2)
3297+
>;
32903298
let True16Predicate = UseFakeTrue16Insts in
32913299
def : GCNPat <
32923300
(fma (f16 (VOP3NoMods f16:$src0)),

llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -455,9 +455,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
455455
break;
456456
case AMDGPU::V_FMA_F16_e64:
457457
case AMDGPU::V_FMA_F16_gfx9_e64:
458+
NewOpcode = AMDGPU::V_FMAAK_F16;
459+
break;
460+
case AMDGPU::V_FMA_F16_gfx9_t16_e64:
461+
NewOpcode = AMDGPU::V_FMAAK_F16_t16;
462+
break;
458463
case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
459-
NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
460-
: AMDGPU::V_FMAAK_F16;
464+
NewOpcode = AMDGPU::V_FMAAK_F16_fake16;
461465
break;
462466
}
463467
}
@@ -485,9 +489,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
485489
break;
486490
case AMDGPU::V_FMA_F16_e64:
487491
case AMDGPU::V_FMA_F16_gfx9_e64:
492+
NewOpcode = AMDGPU::V_FMAMK_F16;
493+
break;
494+
case AMDGPU::V_FMA_F16_gfx9_t16_e64:
495+
NewOpcode = AMDGPU::V_FMAMK_F16_t16;
496+
break;
488497
case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
489-
NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
490-
: AMDGPU::V_FMAMK_F16;
498+
NewOpcode = AMDGPU::V_FMAMK_F16_fake16;
491499
break;
492500
}
493501
}
@@ -959,6 +967,7 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
959967
MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
960968
MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
961969
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 ||
970+
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_t16_e64 ||
962971
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64) {
963972
shrinkMadFma(MI);
964973
continue;

llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll

Lines changed: 47 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
44
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
55
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
6-
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s
6+
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
7+
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
78

89
define float @v_fma_f32(float %x, float %y, float %z) {
910
; GFX6-LABEL: v_fma_f32:
@@ -107,11 +108,18 @@ define half @v_fma_f16(half %x, half %y, half %z) {
107108
; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2
108109
; GFX10-NEXT: s_setpc_b64 s[30:31]
109110
;
110-
; GFX11-LABEL: v_fma_f16:
111-
; GFX11: ; %bb.0:
112-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
113-
; GFX11-NEXT: v_fma_f16 v0, v0, v1, v2
114-
; GFX11-NEXT: s_setpc_b64 s[30:31]
111+
; GFX11-TRUE16-LABEL: v_fma_f16:
112+
; GFX11-TRUE16: ; %bb.0:
113+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
114+
; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l
115+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
116+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
117+
;
118+
; GFX11-FAKE16-LABEL: v_fma_f16:
119+
; GFX11-FAKE16: ; %bb.0:
120+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
121+
; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2
122+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
115123
%fma = call half @llvm.fma.f16(half %x, half %y, half %z)
116124
ret half %fma
117125
}
@@ -145,11 +153,17 @@ define half @v_fma_f16_fneg_lhs(half %x, half %y, half %z) {
145153
; GFX10-NEXT: v_fma_f16 v0, -v0, v1, v2
146154
; GFX10-NEXT: s_setpc_b64 s[30:31]
147155
;
148-
; GFX11-LABEL: v_fma_f16_fneg_lhs:
149-
; GFX11: ; %bb.0:
150-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
151-
; GFX11-NEXT: v_fma_f16 v0, -v0, v1, v2
152-
; GFX11-NEXT: s_setpc_b64 s[30:31]
156+
; GFX11-TRUE16-LABEL: v_fma_f16_fneg_lhs:
157+
; GFX11-TRUE16: ; %bb.0:
158+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
159+
; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v1.l, v2.l
160+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
161+
;
162+
; GFX11-FAKE16-LABEL: v_fma_f16_fneg_lhs:
163+
; GFX11-FAKE16: ; %bb.0:
164+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
165+
; GFX11-FAKE16-NEXT: v_fma_f16 v0, -v0, v1, v2
166+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
153167
%neg.x = fneg half %x
154168
%fma = call half @llvm.fma.f16(half %neg.x, half %y, half %z)
155169
ret half %fma
@@ -184,11 +198,17 @@ define half @v_fma_f16_fneg_rhs(half %x, half %y, half %z) {
184198
; GFX10-NEXT: v_fma_f16 v0, v0, -v1, v2
185199
; GFX10-NEXT: s_setpc_b64 s[30:31]
186200
;
187-
; GFX11-LABEL: v_fma_f16_fneg_rhs:
188-
; GFX11: ; %bb.0:
189-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
190-
; GFX11-NEXT: v_fma_f16 v0, v0, -v1, v2
191-
; GFX11-NEXT: s_setpc_b64 s[30:31]
201+
; GFX11-TRUE16-LABEL: v_fma_f16_fneg_rhs:
202+
; GFX11-TRUE16: ; %bb.0:
203+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
204+
; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, -v1.l, v2.l
205+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
206+
;
207+
; GFX11-FAKE16-LABEL: v_fma_f16_fneg_rhs:
208+
; GFX11-FAKE16: ; %bb.0:
209+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
210+
; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, -v1, v2
211+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
192212
%neg.y = fneg half %y
193213
%fma = call half @llvm.fma.f16(half %x, half %neg.y, half %z)
194214
ret half %fma
@@ -223,11 +243,17 @@ define half @v_fma_f16_fneg_add(half %x, half %y, half %z) {
223243
; GFX10-NEXT: v_fma_f16 v0, v0, v1, -v2
224244
; GFX10-NEXT: s_setpc_b64 s[30:31]
225245
;
226-
; GFX11-LABEL: v_fma_f16_fneg_add:
227-
; GFX11: ; %bb.0:
228-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
229-
; GFX11-NEXT: v_fma_f16 v0, v0, v1, -v2
230-
; GFX11-NEXT: s_setpc_b64 s[30:31]
246+
; GFX11-TRUE16-LABEL: v_fma_f16_fneg_add:
247+
; GFX11-TRUE16: ; %bb.0:
248+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
249+
; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v1.l, -v2.l
250+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
251+
;
252+
; GFX11-FAKE16-LABEL: v_fma_f16_fneg_add:
253+
; GFX11-FAKE16: ; %bb.0:
254+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
255+
; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, -v2
256+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
231257
%neg.z = fneg half %z
232258
%fma = call half @llvm.fma.f16(half %x, half %y, half %neg.z)
233259
ret half %fma

llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
2-
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s
2+
# FIXME-TRUE16. reenable after fix-sgpr-copies is fixed for true16 flow
3+
# XUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s
34
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,FAKE16 %s
45

56
---

0 commit comments

Comments
 (0)