Skip to content

Commit 8ed3b05

Browse files
authored
[AMDGPU][True16][MC] Implement V_CVT_PK_F32_FP8/BF8 (#116106)
The existing fake16 versions of these instructions do not support op_sel on the _e32 encoding, which leaves a hole in disassembler support. Implement the true16 versions of the instructions in the MC layer.
1 parent 3a20a5f commit 8ed3b05

File tree

5 files changed

+73
-32
lines changed

5 files changed

+73
-32
lines changed

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -588,8 +588,10 @@ bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc) {
588588
Opc == AMDGPU::V_CVT_F32_FP8_e64_dpp_gfx12 ||
589589
Opc == AMDGPU::V_CVT_F32_BF8_e64_dpp8_gfx12 ||
590590
Opc == AMDGPU::V_CVT_F32_FP8_e64_dpp8_gfx12 ||
591-
Opc == AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12 ||
592-
Opc == AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12;
591+
Opc == AMDGPU::V_CVT_PK_F32_BF8_fake16_e64_gfx12 ||
592+
Opc == AMDGPU::V_CVT_PK_F32_FP8_fake16_e64_gfx12 ||
593+
Opc == AMDGPU::V_CVT_PK_F32_BF8_t16_e64_gfx12 ||
594+
Opc == AMDGPU::V_CVT_PK_F32_FP8_t16_e64_gfx12;
593595
}
594596

595597
bool isGenericAtomic(unsigned Opc) {

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 29 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -634,17 +634,16 @@ let SubtargetPredicate = HasFP8ConversionInsts, OtherPredicates = [HasSDWA] in {
634634
}
635635
}
636636

637-
638-
// Similar to VOPProfile_Base_CVT_F32_F8, but for VOP3 instructions.
639-
def VOPProfile_Base_CVT_PK_F32_F8_OpSel : VOPProfile<[v2f32, i32, untyped, untyped]> {
640-
let HasOpSel = 1;
641-
let HasClamp = 0;
642-
let HasOMod = 0;
643-
let HasExtDPP = 0;
644-
let HasExtVOP3DPP = 0;
645-
let AsmVOP3Base = getAsmVOP3Base<NumSrcArgs, HasDst, HasClamp,
646-
HasOpSel, HasOMod, IsVOP3P, 0 /*HasModifiers*/, 0/*Src0HasMods*/, 0/*Src1HasMods*/,
647-
0/*Src2HasMods*/, DstVT>.ret;
637+
let HasClamp = 0, HasOMod = 0, HasExtDPP = 0, HasExtVOP3DPP = 0,
638+
HasOpSel = 1 in {
639+
// Input modifiers are not supported
640+
// NB: fake16 VOP1 does not support op_sel.
641+
def VOPProfile_Base_CVT_PK_F32_F8_fake16 : VOPProfile_Fake16<VOPProfile<[v2f32, f16, untyped, untyped]>> {
642+
let Src0Mod = IntT16InputMods<1/*IsFake16*/>;
643+
}
644+
def VOPProfile_Base_CVT_PK_F32_F8_t16 : VOPProfile_True16<VOPProfile<[v2f32, f16, untyped, untyped]>> {
645+
let Src0Mod = IntT16InputMods<0/*IsFake16*/>;
646+
}
648647
}
649648

650649
class VOPProfile_Base_CVT_F_F8_ByteSel<ValueType DstVT> : VOPProfile<[DstVT, i32, untyped, untyped]> {
@@ -673,8 +672,15 @@ let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts],
673672
mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in {
674673
defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>;
675674
defm V_CVT_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_f32_bf8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>;
676-
defm V_CVT_PK_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_fp8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>;
677-
defm V_CVT_PK_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_bf8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>;
675+
676+
let True16Predicate = UseFakeTrue16Insts in {
677+
defm V_CVT_PK_F32_FP8_fake16 : VOP1Inst<"v_cvt_pk_f32_fp8_fake16", VOPProfile_Base_CVT_PK_F32_F8_fake16>;
678+
defm V_CVT_PK_F32_BF8_fake16 : VOP1Inst<"v_cvt_pk_f32_bf8_fake16", VOPProfile_Base_CVT_PK_F32_F8_fake16>;
679+
}
680+
let True16Predicate = UseRealTrue16Insts in {
681+
defm V_CVT_PK_F32_FP8_t16 : VOP1Inst<"v_cvt_pk_f32_fp8_t16", VOPProfile_Base_CVT_PK_F32_F8_t16>;
682+
defm V_CVT_PK_F32_BF8_t16 : VOP1Inst<"v_cvt_pk_f32_bf8_t16", VOPProfile_Base_CVT_PK_F32_F8_t16>;
683+
}
678684
}
679685

680686
class Cvt_F_F8_Pat_ByteSel<SDPatternOperator node, VOP3_Pseudo inst> : GCNPat<
@@ -698,9 +704,9 @@ class Cvt_PK_F32_F8_Pat_OpSel<SDPatternOperator node, int index,
698704
let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts] in {
699705
foreach Index = [0, -1] in {
700706
def : Cvt_PK_F32_F8_Pat_OpSel<int_amdgcn_cvt_pk_f32_fp8, Index,
701-
V_CVT_PK_F32_FP8_e32, V_CVT_PK_F32_FP8_OP_SEL_e64>;
707+
V_CVT_PK_F32_FP8_fake16_e32, V_CVT_PK_F32_FP8_fake16_e64>;
702708
def : Cvt_PK_F32_F8_Pat_OpSel<int_amdgcn_cvt_pk_f32_bf8, Index,
703-
V_CVT_PK_F32_BF8_e32, V_CVT_PK_F32_BF8_OP_SEL_e64>;
709+
V_CVT_PK_F32_BF8_fake16_e32, V_CVT_PK_F32_BF8_fake16_e64>;
704710
}
705711
}
706712

@@ -954,13 +960,14 @@ multiclass VOP1_Real_NO_DPP_OP_SEL_with_name<GFXGen Gen, bits<9> op,
954960
defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX12Gen, 0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">;
955961
defm V_CVT_F32_BF8 : VOP1_Real_FULL_with_name<GFX12Gen, 0x06d, "V_CVT_F32_BF8_OP_SEL", "v_cvt_f32_bf8">;
956962

957-
// Define VOP1 instructions using the pseudo instruction with its old profile and
958-
// VOP3 using the OpSel profile for the pseudo instruction.
959-
defm V_CVT_PK_F32_FP8 : VOP1_Real_e32_with_name<GFX12Gen, 0x06e, "V_CVT_PK_F32_FP8", "v_cvt_pk_f32_fp8">;
960-
defm V_CVT_PK_F32_FP8 : VOP3_Real_with_name<GFX12Gen, 0x1ee, "V_CVT_PK_F32_FP8_OP_SEL", "v_cvt_pk_f32_fp8">;
961-
962-
defm V_CVT_PK_F32_BF8 : VOP1_Real_e32_with_name<GFX12Gen, 0x06f, "V_CVT_PK_F32_BF8", "v_cvt_pk_f32_bf8">;
963-
defm V_CVT_PK_F32_BF8 : VOP3_Real_with_name<GFX12Gen, 0x1ef, "V_CVT_PK_F32_BF8_OP_SEL", "v_cvt_pk_f32_bf8">;
963+
defm V_CVT_PK_F32_FP8_fake16 : VOP1_Real_e32_with_name<GFX12Gen, 0x06e, "V_CVT_PK_F32_FP8_fake16", "v_cvt_pk_f32_fp8">;
964+
defm V_CVT_PK_F32_FP8_t16 : VOP1_Real_e32_with_name<GFX12Gen, 0x06e, "V_CVT_PK_F32_FP8_t16", "v_cvt_pk_f32_fp8">;
965+
defm V_CVT_PK_F32_FP8_fake16 : VOP3_Real_with_name<GFX12Gen, 0x1ee, "V_CVT_PK_F32_FP8_fake16", "v_cvt_pk_f32_fp8">;
966+
defm V_CVT_PK_F32_FP8_t16 : VOP3_Real_with_name<GFX12Gen, 0x1ee, "V_CVT_PK_F32_FP8_t16", "v_cvt_pk_f32_fp8">;
967+
defm V_CVT_PK_F32_BF8_fake16 : VOP1_Real_e32_with_name<GFX12Gen, 0x06f, "V_CVT_PK_F32_BF8_fake16", "v_cvt_pk_f32_bf8">;
968+
defm V_CVT_PK_F32_BF8_t16 : VOP1_Real_e32_with_name<GFX12Gen, 0x06f, "V_CVT_PK_F32_BF8_t16", "v_cvt_pk_f32_bf8">;
969+
defm V_CVT_PK_F32_BF8_fake16 : VOP3_Real_with_name<GFX12Gen, 0x1ef, "V_CVT_PK_F32_BF8_fake16", "v_cvt_pk_f32_bf8">;
970+
defm V_CVT_PK_F32_BF8_t16 : VOP3_Real_with_name<GFX12Gen, 0x1ef, "V_CVT_PK_F32_BF8_t16", "v_cvt_pk_f32_bf8">;
964971

965972
defm V_CVT_NEAREST_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00c,
966973
"V_CVT_RPI_I32_F32", "v_cvt_nearest_i32_f32">;

llvm/test/MC/AMDGPU/gfx12_asm_vop1.s

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -429,20 +429,26 @@ v_cvt_pk_f32_bf8_e32 v[2:3], 3
429429
v_cvt_pk_f32_bf8_e32 v[3:4], 3
430430
// GFX12: v_cvt_pk_f32_bf8_e32 v[3:4], 3 ; encoding: [0x83,0xde,0x06,0x7e]
431431

432-
v_cvt_pk_f32_bf8_e32 v[2:3], v3
433-
// GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], v3 ; encoding: [0x03,0xdf,0x04,0x7e]
432+
v_cvt_pk_f32_bf8_e32 v[2:3], v3.l
433+
// GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], v3.l ; encoding: [0x03,0xdf,0x04,0x7e]
434434

435-
v_cvt_pk_f32_bf8_e32 v[3:4], v3
436-
// GFX12: v_cvt_pk_f32_bf8_e32 v[3:4], v3 ; encoding: [0x03,0xdf,0x06,0x7e]
435+
v_cvt_pk_f32_bf8_e32 v[3:4], v3.l
436+
// GFX12: v_cvt_pk_f32_bf8_e32 v[3:4], v3.l ; encoding: [0x03,0xdf,0x06,0x7e]
437+
438+
v_cvt_pk_f32_bf8_e32 v[3:4], v3.h
439+
// GFX12: v_cvt_pk_f32_bf8_e32 v[3:4], v3.h ; encoding: [0x83,0xdf,0x06,0x7e]
437440

438441
v_cvt_pk_f32_fp8_e32 v[2:3], s3
439442
// GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], s3 ; encoding: [0x03,0xdc,0x04,0x7e]
440443

441444
v_cvt_pk_f32_fp8_e32 v[2:3], 3
442445
// GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], 3 ; encoding: [0x83,0xdc,0x04,0x7e]
443446

444-
v_cvt_pk_f32_fp8_e32 v[2:3], v3
445-
// GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], v3 ; encoding: [0x03,0xdd,0x04,0x7e]
447+
v_cvt_pk_f32_fp8_e32 v[2:3], v3.l
448+
// GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], v3.l ; encoding: [0x03,0xdd,0x04,0x7e]
449+
450+
v_cvt_pk_f32_fp8_e32 v[2:3], v3.h
451+
// GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], v3.h ; encoding: [0x83,0xdd,0x04,0x7e]
446452

447453
v_cvt_f16_f32 v5.l, v1
448454
// GFX12: v_cvt_f16_f32_e32 v5.l, v1 ; encoding: [0x01,0x15,0x0a,0x7e]

llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -486,6 +486,12 @@ v_cvt_pk_f32_bf8_e64 v[2:3], v3
486486
v_cvt_pk_f32_bf8_e64 v[2:3], v3 op_sel:[1,0]
487487
// GFX12: encoding: [0x02,0x08,0xef,0xd5,0x03,0x01,0x00,0x00]
488488

489+
v_cvt_pk_f32_bf8_e64 v[2:3], v3.h
490+
// GFX12: encoding: [0x02,0x08,0xef,0xd5,0x03,0x01,0x00,0x00]
491+
492+
v_cvt_pk_f32_bf8_e64 v[2:3], v255.h
493+
// GFX12: encoding: [0x02,0x08,0xef,0xd5,0xff,0x01,0x00,0x00]
494+
489495
v_cvt_pk_f32_fp8_e64 v[2:3], s3
490496
// GFX12: encoding: [0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00]
491497

@@ -534,6 +540,12 @@ v_cvt_pk_f32_fp8_e64 v[3:4], v3
534540
v_cvt_pk_f32_fp8_e64 v[3:4], v3 op_sel:[1,0]
535541
// GFX12: encoding: [0x03,0x08,0xee,0xd5,0x03,0x01,0x00,0x00]
536542

543+
v_cvt_pk_f32_fp8_e64 v[3:4], v3.h
544+
// GFX12: encoding: [0x03,0x08,0xee,0xd5,0x03,0x01,0x00,0x00]
545+
546+
v_cvt_pk_f32_fp8_e64 v[3:4], v255.h
547+
// GFX12: encoding: [0x03,0x08,0xee,0xd5,0xff,0x01,0x00,0x00]
548+
537549
v_cvt_f16_f32_e64 v5.l, v1
538550
// GFX12: encoding: [0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00]
539551

llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -436,18 +436,32 @@
436436
# GFX12: v_cvt_pk_f32_bf8_e64 v[2:3], 3 ; encoding: [0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00]
437437
0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00
438438

439-
# GFX12: v_cvt_pk_f32_bf8_e64 v[2:3], v3 ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00]
439+
# GFX12-REAL16: v_cvt_pk_f32_bf8_e64 v[2:3], v3.l ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00]
440+
# GFX12-FAKE16: v_cvt_pk_f32_bf8_e64 v[2:3], v3 ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00]
440441
0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00
441442

443+
# GFX12-REAL16: v_cvt_pk_f32_bf8_e64 v[2:3], v3.h op_sel:[1,0] ; encoding: [0x02,0x08,0xef,0xd5,0x03,0x01,0x00,0x00]
444+
0x02,0x08,0xef,0xd5,0x03,0x01,0x00,0x00
445+
446+
# GFX12-REAL16: v_cvt_pk_f32_bf8_e64 v[2:3], v255.h op_sel:[1,0] ; encoding: [0x02,0x08,0xef,0xd5,0xff,0x01,0x00,0x00]
447+
0x02,0x08,0xef,0xd5,0xff,0x01,0x00,0x00
448+
442449
# GFX12: v_cvt_pk_f32_fp8_e64 v[2:3], s3 ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00]
443450
0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00
444451

445452
# GFX12: v_cvt_pk_f32_fp8_e64 v[2:3], 3 ; encoding: [0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00]
446453
0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00
447454

448-
# GFX12: v_cvt_pk_f32_fp8_e64 v[2:3], v3 ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00]
455+
# GFX12-REAL16: v_cvt_pk_f32_fp8_e64 v[2:3], v3.l ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00]
456+
# GFX12-FAKE16: v_cvt_pk_f32_fp8_e64 v[2:3], v3 ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00]
449457
0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00
450458

459+
# GFX12-REAL16: v_cvt_pk_f32_fp8_e64 v[2:3], v3.h op_sel:[1,0] ; encoding: [0x02,0x08,0xee,0xd5,0x03,0x01,0x00,0x00]
460+
0x02,0x08,0xee,0xd5,0x03,0x01,0x00,0x00
461+
462+
# GFX12-REAL16: v_cvt_pk_f32_fp8_e64 v[2:3], v255.h op_sel:[1,0] ; encoding: [0x02,0x08,0xee,0xd5,0xff,0x01,0x00,0x00]
463+
0x02,0x08,0xee,0xd5,0xff,0x01,0x00,0x00
464+
451465
# GFX12-REAL16: v_cvt_f16_f32_e64 v5.l, v1 ; encoding: [0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00]
452466
# GFX12-FAKE16: v_cvt_f16_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00]
453467
0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00

0 commit comments

Comments
 (0)