Skip to content

Commit 9545c24

Browse files
mariusz-sikora-at-amd, matejaMarjanovic, mbrkusanin
authored and committed
[AMDGPU][GFX12] VOP encoding and codegen - add support for v_cvt fp8/bf8 instructions (llvm#78414)
Add VOP1, VOP1_DPP8, VOP1_DPP16, VOP3, VOP3_DPP8, VOP3_DPP16 instructions that were supported on GFX940 (MI300):
- V_CVT_F32_FP8
- V_CVT_F32_BF8
- V_CVT_PK_F32_FP8
- V_CVT_PK_F32_BF8
- V_CVT_PK_FP8_F32
- V_CVT_PK_BF8_F32
- V_CVT_SR_FP8_F32
- V_CVT_SR_BF8_F32
---------
Co-authored-by: Mateja Marjanovic <[email protected]>
Co-authored-by: Mirko Brkušanin <[email protected]>
Change-Id: I62e37982868d9f5b400bf794b82c59ae530080ed
1 parent f75c933 commit 9545c24

37 files changed

+1821
-202
lines changed

clang/test/CodeGenOpenCL/amdgpu-features.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,8 @@
100100
// GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
101101
// GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
102102
// GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
103-
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
104-
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
103+
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
104+
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
105105

106106
// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"
107107

clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,60 @@
11
// REQUIRES: amdgpu-registered-target
2-
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940
2+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck %s
3+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s
34

45
typedef float v2f __attribute__((ext_vector_type(2)));
56

6-
// CHECK-GFX940-LABEL: @test_cvt_f32_bf8
7-
// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0)
7+
// CHECK-LABEL: @test_cvt_f32_bf8
8+
// CHECK: call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0)
89
void test_cvt_f32_bf8(global int* out, int a)
910
{
1011
*out = __builtin_amdgcn_cvt_f32_bf8(a, 0);
1112
}
1213

13-
// CHECK-GFX940-LABEL: @test_cvt_f32_fp8
14-
// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
14+
// CHECK-LABEL: @test_cvt_f32_fp8
15+
// CHECK: call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
1516
void test_cvt_f32_fp8(global int* out, int a)
1617
{
1718
*out = __builtin_amdgcn_cvt_f32_fp8(a, 1);
1819
}
1920

20-
// CHECK-GFX940-LABEL: @test_cvt_pk_f32_bf8
21-
// CHECK-GFX940: call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false)
21+
// CHECK-LABEL: @test_cvt_pk_f32_bf8
22+
// CHECK: call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false)
2223
void test_cvt_pk_f32_bf8(global v2f* out, int a)
2324
{
2425
*out = __builtin_amdgcn_cvt_pk_f32_bf8(a, false);
2526
}
2627

27-
// CHECK-GFX940-LABEL: @test_cvt_pk_f32_fp8
28-
// CHECK-GFX940: call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true)
28+
// CHECK-LABEL: @test_cvt_pk_f32_fp8
29+
// CHECK: call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true)
2930
void test_cvt_pk_f32_fp8(global v2f* out, int a)
3031
{
3132
*out = __builtin_amdgcn_cvt_pk_f32_fp8(a, true);
3233
}
3334

34-
// CHECK-GFX940-LABEL: @test_cvt_pk_bf8_f32
35-
// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %a, float %b, i32 %old, i1 false)
35+
// CHECK-LABEL: @test_cvt_pk_bf8_f32
36+
// CHECK: call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %a, float %b, i32 %old, i1 false)
3637
void test_cvt_pk_bf8_f32(global int* out, int old, float a, float b)
3738
{
3839
*out = __builtin_amdgcn_cvt_pk_bf8_f32(a, b, old, false);
3940
}
4041

41-
// CHECK-GFX940-LABEL: @test_cvt_pk_fp8_f32
42-
// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %a, float %b, i32 %old, i1 true)
42+
// CHECK-LABEL: @test_cvt_pk_fp8_f32
43+
// CHECK: call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %a, float %b, i32 %old, i1 true)
4344
void test_cvt_pk_fp8_f32(global int* out, int old, float a, float b)
4445
{
4546
*out = __builtin_amdgcn_cvt_pk_fp8_f32(a, b, old, true);
4647
}
4748

48-
// CHECK-GFX940-LABEL: @test_cvt_sr_bf8_f32
49-
// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %a, i32 %b, i32 %old, i32 2)
49+
// CHECK-LABEL: @test_cvt_sr_bf8_f32
50+
// CHECK: call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %a, i32 %b, i32 %old, i32 2)
5051
void test_cvt_sr_bf8_f32(global int* out, int old, float a, int b)
5152
{
5253
*out = __builtin_amdgcn_cvt_sr_bf8_f32(a, b, old, 2);
5354
}
5455

55-
// CHECK-GFX940-LABEL: @test_cvt_sr_fp8_f32
56-
// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %a, i32 %b, i32 %old, i32 3)
56+
// CHECK-LABEL: @test_cvt_sr_fp8_f32
57+
// CHECK: call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %a, i32 %b, i32 %old, i32 3)
5758
void test_cvt_sr_fp8_f32(global int* out, int old, float a, int b)
5859
{
5960
*out = __builtin_amdgcn_cvt_sr_fp8_f32(a, b, old, 3);

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1507,6 +1507,7 @@ def FeatureISAVersion12 : FeatureSet<
15071507
FeatureFlatAtomicFaddF32Inst,
15081508
FeatureImageInsts,
15091509
FeatureExtendedImageInsts,
1510+
FeatureFP8ConversionInsts,
15101511
FeaturePackedTID,
15111512
FeatureVcmpxPermlaneHazard,
15121513
FeatureSALUFloatInsts,

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3530,6 +3530,9 @@ bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) {
35303530
return !isInlineConstant(Inst, OpIdx);
35313531
} else if (MO.isReg()) {
35323532
auto Reg = MO.getReg();
3533+
if (!Reg) {
3534+
return false;
3535+
}
35333536
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
35343537
auto PReg = mc2PseudoReg(Reg);
35353538
return isSGPR(PReg, TRI) && PReg != SGPR_NULL;
@@ -8427,12 +8430,20 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
84278430
const bool IsPacked = (Desc.TSFlags & SIInstrFlags::IsPacked) != 0;
84288431

84298432
if (Opc == AMDGPU::V_CVT_SR_BF8_F32_vi ||
8430-
Opc == AMDGPU::V_CVT_SR_FP8_F32_vi) {
8433+
Opc == AMDGPU::V_CVT_SR_FP8_F32_vi ||
8434+
Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_gfx12 ||
8435+
Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_gfx12) {
84318436
Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods
84328437
Inst.addOperand(Inst.getOperand(0));
84338438
}
84348439

8435-
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in)) {
8440+
// Adding vdst_in operand is already covered for these DPP instructions in
8441+
// cvtVOP3DPP.
8442+
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in) &&
8443+
!(Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp_gfx12 ||
8444+
Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp_gfx12 ||
8445+
Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp8_gfx12 ||
8446+
Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12)) {
84368447
assert(!IsPacked);
84378448
Inst.addOperand(Inst.getOperand(0));
84388449
}
@@ -8933,6 +8944,22 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
89338944
}
89348945
}
89358946

8947+
int VdstInIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
8948+
if (VdstInIdx == static_cast<int>(Inst.getNumOperands())) {
8949+
Inst.addOperand(Inst.getOperand(0));
8950+
}
8951+
8952+
bool IsVOP3CvtSrDpp = Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
8953+
Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12 ||
8954+
Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 ||
8955+
Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12;
8956+
if (IsVOP3CvtSrDpp) {
8957+
if (Src2ModIdx == static_cast<int>(Inst.getNumOperands())) {
8958+
Inst.addOperand(MCOperand::createImm(0));
8959+
Inst.addOperand(MCOperand::createReg(0));
8960+
}
8961+
}
8962+
89368963
auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(),
89378964
MCOI::TIED_TO);
89388965
if (TiedTo != -1) {

llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -720,6 +720,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
720720
AMDGPU::OpName::src2_modifiers);
721721
}
722722

723+
if (Res && (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp ||
724+
MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp)) {
725+
// Insert dummy unused src2_modifiers.
726+
insertNamedMCOperand(MI, MCOperand::createImm(0),
727+
AMDGPU::OpName::src2_modifiers);
728+
}
729+
723730
if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) &&
724731
!AMDGPU::hasGDS(STI)) {
725732
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::gds);
@@ -950,6 +957,7 @@ void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const {
950957
// first add optional MI operands to check FI
951958
DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
952959
unsigned Opc = MI.getOpcode();
960+
953961
if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) {
954962
convertVOP3PDPPInst(MI);
955963
} else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) ||
@@ -959,6 +967,15 @@ DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
959967
if (isMacDPP(MI))
960968
convertMacDPPInst(MI);
961969

970+
int VDstInIdx =
971+
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
972+
if (VDstInIdx != -1)
973+
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
974+
975+
if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
976+
MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12)
977+
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
978+
962979
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
963980
if (MI.getNumOperands() < DescNumOps &&
964981
AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
@@ -985,6 +1002,15 @@ DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
9851002
if (isMacDPP(MI))
9861003
convertMacDPPInst(MI);
9871004

1005+
int VDstInIdx =
1006+
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
1007+
if (VDstInIdx != -1)
1008+
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
1009+
1010+
if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 ||
1011+
MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12)
1012+
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
1013+
9881014
unsigned Opc = MI.getOpcode();
9891015
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
9901016
if (MI.getNumOperands() < DescNumOps &&

llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1322,6 +1322,16 @@ void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned,
13221322
const MCSubtargetInfo &STI,
13231323
raw_ostream &O) {
13241324
unsigned Opc = MI->getOpcode();
1325+
if (isCvt_F32_Fp8_Bf8_e64(Opc)) {
1326+
auto SrcMod =
1327+
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
1328+
unsigned Mod = MI->getOperand(SrcMod).getImm();
1329+
unsigned Index0 = !!(Mod & SISrcMods::OP_SEL_0);
1330+
unsigned Index1 = !!(Mod & SISrcMods::OP_SEL_1);
1331+
if (Index0 || Index1)
1332+
O << " op_sel:[" << Index0 << ',' << Index1 << ']';
1333+
return;
1334+
}
13251335
if (isPermlane16(Opc)) {
13261336
auto FIN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
13271337
auto BCN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1696,8 +1696,9 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
16961696
!if(HasOMod,
16971697
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
16981698
clampmod0:$clamp, omod0:$omod),
1699-
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
1700-
clampmod0:$clamp))
1699+
!if (HasClamp,
1700+
(ins Src0Mod:$src0_modifiers, Src0RC:$src0, clampmod0:$clamp),
1701+
(ins Src0Mod:$src0_modifiers, Src0RC:$src0)))
17011702
/* else */,
17021703
// VOP1 without modifiers
17031704
!if (HasClamp,
@@ -2293,6 +2294,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
22932294
field bit IsWMMA = 0;
22942295
field bit IsSWMMAC = 0;
22952296

2297+
field bit IsFP8 = 0;
2298+
22962299
field bit HasDst = !ne(DstVT.Value, untyped.Value);
22972300
field bit HasDst32 = HasDst;
22982301
field bit EmitDst = HasDst; // force dst encoding, see v_movreld_b32 special case

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,17 @@ bool isPermlane16(unsigned Opc) {
541541
Opc == AMDGPU::V_PERMLANEX16_VAR_B32_e64_gfx12;
542542
}
543543

544+
bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc) { // True iff Opc is one of the eight GFX12 e64 opcodes compared against below.
545+
return Opc == AMDGPU::V_CVT_F32_BF8_e64_gfx12 || // V_CVT_F32_{BF8,FP8}: plain e64 forms
546+
Opc == AMDGPU::V_CVT_F32_FP8_e64_gfx12 ||
547+
Opc == AMDGPU::V_CVT_F32_BF8_e64_dpp_gfx12 || // V_CVT_F32_{BF8,FP8}: e64 DPP forms
548+
Opc == AMDGPU::V_CVT_F32_FP8_e64_dpp_gfx12 ||
549+
Opc == AMDGPU::V_CVT_F32_BF8_e64_dpp8_gfx12 || // V_CVT_F32_{BF8,FP8}: e64 DPP8 forms
550+
Opc == AMDGPU::V_CVT_F32_FP8_e64_dpp8_gfx12 ||
551+
Opc == AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12 || // V_CVT_PK_F32_{BF8,FP8}: only the plain e64 forms are matched here
552+
Opc == AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12;
553+
}
554+
544555
bool isGenericAtomic(unsigned Opc) {
545556
return Opc == AMDGPU::G_AMDGPU_ATOMIC_FMIN ||
546557
Opc == AMDGPU::G_AMDGPU_ATOMIC_FMAX ||

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -542,6 +542,9 @@ bool isPermlane16(unsigned Opc);
542542
LLVM_READNONE
543543
bool isGenericAtomic(unsigned Opc);
544544

545+
LLVM_READNONE
546+
bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc);
547+
545548
namespace VOPD {
546549

547550
enum Component : unsigned {

0 commit comments

Comments
 (0)