Skip to content

Commit a99c964

Browse files
shiltian and rampitec authored
[AMDGPU] Add support for v_cvt_f16_fp8 on gfx1250 (#146302)
Co-authored-by: Mekhanoshin, Stanislav <[email protected]>
1 parent e810c63 commit a99c964

26 files changed

+611
-22
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -656,6 +656,7 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8bf16, "V8yV8y*3", "nc", "gfx
656656
TARGET_BUILTIN(__builtin_amdgcn_s_setprio_inc_wg, "vIs", "n", "setprio-inc-wg-inst")
657657
TARGET_BUILTIN(__builtin_amdgcn_s_monitor_sleep, "vIs", "n", "gfx1250-insts")
658658

659+
TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts")
659660
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts")
660661
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_bf8, "V2hs", "nc", "gfx1250-insts")
661662

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl

Lines changed: 38 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,44 @@ void test_s_monitor_sleep() {
2424
__builtin_amdgcn_s_monitor_sleep(10);
2525
}
2626

27+
// CHECK-LABEL: @test_cvt_f16_fp8(
28+
// CHECK-NEXT: entry:
29+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
30+
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
31+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
32+
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
33+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
34+
// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
35+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
36+
// CHECK-NEXT: [[TMP1:%.*]] = call half @llvm.amdgcn.cvt.f16.fp8(i32 [[TMP0]], i32 0)
37+
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
38+
// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP2]], i64 0
39+
// CHECK-NEXT: store half [[TMP1]], ptr addrspace(1) [[ARRAYIDX]], align 2
40+
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
41+
// CHECK-NEXT: [[TMP4:%.*]] = call half @llvm.amdgcn.cvt.f16.fp8(i32 [[TMP3]], i32 1)
42+
// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
43+
// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP5]], i64 1
44+
// CHECK-NEXT: store half [[TMP4]], ptr addrspace(1) [[ARRAYIDX1]], align 2
45+
// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
46+
// CHECK-NEXT: [[TMP7:%.*]] = call half @llvm.amdgcn.cvt.f16.fp8(i32 [[TMP6]], i32 2)
47+
// CHECK-NEXT: [[TMP8:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
48+
// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP8]], i64 2
49+
// CHECK-NEXT: store half [[TMP7]], ptr addrspace(1) [[ARRAYIDX2]], align 2
50+
// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
51+
// CHECK-NEXT: [[TMP10:%.*]] = call half @llvm.amdgcn.cvt.f16.fp8(i32 [[TMP9]], i32 3)
52+
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
53+
// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP11]], i64 3
54+
// CHECK-NEXT: store half [[TMP10]], ptr addrspace(1) [[ARRAYIDX3]], align 2
55+
// CHECK-NEXT: ret void
56+
//
57+
void test_cvt_f16_fp8(global half* out, int a)
58+
{
59+
out[0] = __builtin_amdgcn_cvt_f16_fp8(a, 0);
60+
out[1] = __builtin_amdgcn_cvt_f16_fp8(a, 1);
61+
out[2] = __builtin_amdgcn_cvt_f16_fp8(a, 2);
62+
out[3] = __builtin_amdgcn_cvt_f16_fp8(a, 3);
63+
}
64+
2765
// CHECK-LABEL: @test_cvt_pk_f16_fp8(
2866
// CHECK-NEXT: entry:
2967
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,3 +8,7 @@ void test_setprio_inc_wg(short a) {
88
void test_s_monitor_sleep(short a) {
99
__builtin_amdgcn_s_monitor_sleep(a); // expected-error {{'__builtin_amdgcn_s_monitor_sleep' must be a constant integer}}
1010
}
11+
12+
// Negative test: both arguments are runtime values. The builtin's prototype
// marks its second operand as a required integer constant, so passing 'b' must
// produce the "must be a constant integer" diagnostic matched by the
// expected-error directive below.
void test__builtin_amdgcn_cvt_f16_fp8(int a, int b) {
13+
__builtin_amdgcn_cvt_f16_fp8(a, b); // expected-error {{'__builtin_amdgcn_cvt_f16_fp8' must be a constant integer}}
14+
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3509,6 +3509,12 @@ def int_amdgcn_s_monitor_sleep :
35093509
DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
35103510
IntrHasSideEffects]>;
35113511

3512+
// llvm.amdgcn.cvt.f16.fp8 half vdst, int srcA, imm byte_sel [0..3]
// Produces one half from two i32 operands and does not access memory
// (IntrNoMem). Operand 1 (byte_sel) is an ImmArg, so it must be a compile-time
// constant. Exposed to Clang as __builtin_amdgcn_cvt_f16_fp8.
// NOTE(review): byte_sel presumably picks which byte of srcA holds the fp8
// value to convert -- confirm against the ISA documentation.
3513+
def int_amdgcn_cvt_f16_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_f16_fp8">,
3514+
DefaultAttrsIntrinsic<[llvm_half_ty],
3515+
[llvm_i32_ty, llvm_i32_ty],
3516+
[IntrNoMem, ImmArg<ArgIndex<1>>]>;
3517+
35123518
//===----------------------------------------------------------------------===//
35133519
// Special Intrinsics for backend internal use only. No frontend
35143520
// should emit calls to these.

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4596,6 +4596,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
45964596
case Intrinsic::amdgcn_cvt_sr_bf8_f32:
45974597
case Intrinsic::amdgcn_cvt_sr_bf16_f32:
45984598
case Intrinsic::amdgcn_cvt_sr_f16_f32:
4599+
case Intrinsic::amdgcn_cvt_f16_fp8:
45994600
case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_f16:
46004601
case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_f16:
46014602
case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_bf16:

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 19 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -687,6 +687,12 @@ class VOPProfile_Base_CVT_F_F8_ByteSel<ValueType DstVT> : VOPProfile<[DstVT, i32
687687
let HasModifiers = 0;
688688
}
689689

690+
// Operand profiles for the f16-result FP8 conversion: a base profile
// instantiated from VOPProfile_Base_CVT_F_F8_ByteSel<f16>, plus True16 and
// Fake16 wrappers derived from that base. The enclosing 'let' applies
// IsSingle = 0, HasOpSel = 1 and HasModifiers = 1 to all three records.
let IsSingle = 0, HasOpSel = 1, HasModifiers = 1 in {
691+
def V_CVT_F16_F8_Profile : VOPProfile_Base_CVT_F_F8_ByteSel<f16>;
692+
def V_CVT_F16_F8_True16_Profile : VOP3_Profile_True16<V_CVT_F16_F8_Profile>;
693+
def V_CVT_F16_F8_Fake16_Profile : VOP3_Profile_Fake16<V_CVT_F16_F8_Profile>;
694+
}
695+
690696
let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts],
691697
mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in {
692698
defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>;
@@ -702,9 +708,10 @@ let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts],
702708
}
703709
}
704710

705-
class Cvt_F_F8_Pat_ByteSel<SDPatternOperator node, VOP3_Pseudo inst> : GCNPat<
711+
class Cvt_F_F8_Pat_ByteSel<SDPatternOperator node, VOP3_Pseudo inst, bit HasOpSel = 0> : GCNPat<
706712
(node i32:$src0, timm:$byte_sel),
707-
(inst $src0, (as_i32timm $byte_sel))
713+
!if(HasOpSel, (inst 0, $src0, (as_i32timm $byte_sel)),
714+
(inst $src0, (as_i32timm $byte_sel)))
708715
>;
709716

710717
let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts] in {
@@ -738,13 +745,22 @@ def VOPProfile_CVT_PK_F16_F8_fake16 : VOP3_Profile_Fake16<VOPProfile_CVT_PK_F16_
738745

739746
let SubtargetPredicate = isGFX1250Plus in {
740747
let mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in {
748+
defm V_CVT_F16_FP8 : VOP1Inst_t16_with_profiles<"v_cvt_f16_fp8",
749+
V_CVT_F16_F8_Profile, V_CVT_F16_F8_True16_Profile, V_CVT_F16_F8_Fake16_Profile>;
741750
defm V_CVT_PK_F16_FP8 : VOP1Inst_t16_with_profiles<"v_cvt_pk_f16_fp8",
742751
VOPProfile_CVT_PK_F16_F8, VOPProfile_CVT_PK_F16_F8_true16, VOPProfile_CVT_PK_F16_F8_fake16,
743752
int_amdgcn_cvt_pk_f16_fp8>;
744753
defm V_CVT_PK_F16_BF8 : VOP1Inst_t16_with_profiles<"v_cvt_pk_f16_bf8",
745754
VOPProfile_CVT_PK_F16_F8, VOPProfile_CVT_PK_F16_F8_true16, VOPProfile_CVT_PK_F16_F8_fake16,
746755
int_amdgcn_cvt_pk_f16_bf8>;
747756
}
757+
758+
let True16Predicate = UseRealTrue16Insts in {
759+
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f16_fp8, V_CVT_F16_FP8_t16_e64, 1>;
760+
}
761+
let True16Predicate = UseFakeTrue16Insts in {
762+
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f16_fp8, V_CVT_F16_FP8_fake16_e64, 1>;
763+
}
748764
} // End SubtargetPredicate = isGFX1250Plus
749765

750766
let SubtargetPredicate = isGFX10Plus in {
@@ -1082,6 +1098,7 @@ defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00b>;
10821098
defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">;
10831099
defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>;
10841100
defm V_CVT_PK_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x076>;
1101+
defm V_CVT_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x077>;
10851102

10861103
//===----------------------------------------------------------------------===//
10871104
// GFX10.

llvm/lib/Target/AMDGPU/VOPInstructions.td

Lines changed: 11 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -348,7 +348,8 @@ class VOP3OpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3OpSel_gfx10<op, p>;
348348
class VOP3FP8OpSel_src_bytesel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
349349
bits<2> byte_sel;
350350
let Inst{11-12} = byte_sel; // NB: bit order is intentionally reversed!
351-
let Inst{14-13} = 0; // op_sel2/3
351+
let Inst{13} = !if(!and(p.HasOpSel, p.HasSrc2), src2_modifiers{2}, 0);
352+
let Inst{14} = !if(!and(p.HasOpSel, p.HasDst), src0_modifiers{3}, 0);
352353
}
353354

354355
class VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
@@ -526,7 +527,7 @@ class VOP3PXe <bits<7> op, VOPProfile MFMAPfl, bit acc_cd = 0> : Enc128, VOP3Pe_
526527
bits<9> scale_src1;
527528

528529
//MFMALdScaleModifierOp transforms 2 bit opsel input to 4 bit value
529-
//where opsel and opselHi are in 3rd and 4th bit.
530+
//where opsel and opselHi are in 3rd and 4th bit.
530531
bits<4> src0_modifiers;
531532
bits<4> src1_modifiers;
532533

@@ -869,14 +870,14 @@ class VOP3_DPPe_Common_Base<bits<10> op, VOPProfile P> : Enc96 {
869870
let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0);
870871
let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0);
871872
// 16-bit select fields which can be interpreted as OpSel or hi/lo suffix
872-
let Inst{11} = !if(P.HasOpSel, !if(P.HasSrc0Mods, src0_modifiers{2}, 0),
873-
!if(P.HasFP8SrcByteSel, byte_sel{1}, ?));
874-
let Inst{12} = !if(P.HasOpSel, !if(P.HasSrc1Mods, src1_modifiers{2}, 0),
875-
!if(P.HasFP8SrcByteSel, byte_sel{0}, ?));
876-
let Inst{13} = !if(P.HasOpSel, !if(P.HasSrc2Mods, src2_modifiers{2}, 0),
877-
!if(P.HasFP8DstByteSel, byte_sel{0}, ?));
878-
let Inst{14} = !if(P.HasOpSel, !if(P.HasSrc0Mods, src0_modifiers{3}, 0),
879-
!if(P.HasFP8DstByteSel, byte_sel{1}, ?));
873+
let Inst{11} = !if(P.HasFP8SrcByteSel, byte_sel{1},
874+
!if(P.HasOpSel, !if(P.HasSrc0Mods, src0_modifiers{2}, 0), ?));
875+
let Inst{12} = !if(P.HasFP8SrcByteSel, byte_sel{0},
876+
!if(P.HasOpSel, !if(P.HasSrc1Mods, src1_modifiers{2}, 0), ?));
877+
let Inst{13} = !if(P.HasFP8DstByteSel, byte_sel{0},
878+
!if(P.HasOpSel, !if(P.HasSrc2Mods, src2_modifiers{2}, 0), ?));
879+
let Inst{14} = !if(P.HasFP8DstByteSel, byte_sel{1},
880+
!if(P.HasOpSel, !if(P.HasSrc0Mods, src0_modifiers{3}, 0), ?));
880881
let Inst{15} = !if(P.HasClamp, clamp, 0);
881882
let Inst{25-16} = op;
882883
let Inst{31-26} = 0x35;

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll

Lines changed: 171 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,177 @@
44
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL-REAL16 %s
55
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL-FAKE16 %s
66

7+
declare half @llvm.amdgcn.cvt.f16.bf8(i32, i32)
8+
declare half @llvm.amdgcn.cvt.f16.fp8(i32, i32)
9+
declare <2 x half> @llvm.amdgcn.cvt.pk.f16.bf8(i16)
10+
declare <2 x half> @llvm.amdgcn.cvt.pk.f16.fp8(i16)
11+
12+
define amdgpu_ps float @test_cvt_f16_fp8_byte0(i32 %a) {
13+
; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_fp8_byte0:
14+
; GFX1250-SDAG-REAL16: ; %bb.0:
15+
; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_fp8_e32 v0.l, v0
16+
; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
17+
; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
18+
; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog
19+
;
20+
; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_fp8_byte0:
21+
; GFX1250-SDAG-FAKE16: ; %bb.0:
22+
; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_fp8_e32 v0, v0
23+
; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
24+
; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
25+
; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog
26+
;
27+
; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_fp8_byte0:
28+
; GFX1250-GISEL-REAL16: ; %bb.0:
29+
; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_fp8_e32 v0.l, v0
30+
; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
31+
; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
32+
; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog
33+
;
34+
; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_fp8_byte0:
35+
; GFX1250-GISEL-FAKE16: ; %bb.0:
36+
; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_fp8_e32 v0, v0
37+
; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
38+
; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
39+
; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog
40+
%cvt = tail call half @llvm.amdgcn.cvt.f16.fp8(i32 %a, i32 0)
41+
%ret = fpext half %cvt to float
42+
ret float %ret
43+
}
44+
45+
define amdgpu_ps float @test_cvt_f16_fp8_byte1(i32 %a) {
46+
; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_fp8_byte1:
47+
; GFX1250-SDAG-REAL16: ; %bb.0:
48+
; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:1
49+
; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
50+
; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
51+
; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog
52+
;
53+
; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_fp8_byte1:
54+
; GFX1250-SDAG-FAKE16: ; %bb.0:
55+
; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_fp8_e64 v0, v0 byte_sel:1
56+
; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
57+
; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
58+
; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog
59+
;
60+
; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_fp8_byte1:
61+
; GFX1250-GISEL-REAL16: ; %bb.0:
62+
; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:1
63+
; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
64+
; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
65+
; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog
66+
;
67+
; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_fp8_byte1:
68+
; GFX1250-GISEL-FAKE16: ; %bb.0:
69+
; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_fp8_e64 v0, v0 byte_sel:1
70+
; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
71+
; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
72+
; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog
73+
%cvt = tail call half @llvm.amdgcn.cvt.f16.fp8(i32 %a, i32 1)
74+
%ret = fpext half %cvt to float
75+
ret float %ret
76+
}
77+
78+
define amdgpu_ps float @test_cvt_f16_fp8_byte2(i32 %a) {
79+
; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_fp8_byte2:
80+
; GFX1250-SDAG-REAL16: ; %bb.0:
81+
; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:2
82+
; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
83+
; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
84+
; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog
85+
;
86+
; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_fp8_byte2:
87+
; GFX1250-SDAG-FAKE16: ; %bb.0:
88+
; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_fp8_e64 v0, v0 byte_sel:2
89+
; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
90+
; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
91+
; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog
92+
;
93+
; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_fp8_byte2:
94+
; GFX1250-GISEL-REAL16: ; %bb.0:
95+
; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:2
96+
; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
97+
; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
98+
; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog
99+
;
100+
; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_fp8_byte2:
101+
; GFX1250-GISEL-FAKE16: ; %bb.0:
102+
; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_fp8_e64 v0, v0 byte_sel:2
103+
; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
104+
; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
105+
; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog
106+
%cvt = tail call half @llvm.amdgcn.cvt.f16.fp8(i32 %a, i32 2)
107+
%ret = fpext half %cvt to float
108+
ret float %ret
109+
}
110+
111+
define amdgpu_ps float @test_cvt_f16_fp8_byte3(i32 %a) {
112+
; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_fp8_byte3:
113+
; GFX1250-SDAG-REAL16: ; %bb.0:
114+
; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:3
115+
; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
116+
; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
117+
; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog
118+
;
119+
; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_fp8_byte3:
120+
; GFX1250-SDAG-FAKE16: ; %bb.0:
121+
; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_fp8_e64 v0, v0 byte_sel:3
122+
; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
123+
; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
124+
; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog
125+
;
126+
; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_fp8_byte3:
127+
; GFX1250-GISEL-REAL16: ; %bb.0:
128+
; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:3
129+
; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
130+
; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
131+
; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog
132+
;
133+
; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_fp8_byte3:
134+
; GFX1250-GISEL-FAKE16: ; %bb.0:
135+
; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_fp8_e64 v0, v0 byte_sel:3
136+
; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
137+
; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
138+
; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog
139+
%cvt = tail call half @llvm.amdgcn.cvt.f16.fp8(i32 %a, i32 3)
140+
%ret = fpext half %cvt to float
141+
ret float %ret
142+
}
143+
144+
; Calls the conversion with byte_sel 3 and inserts the result into element 1 of
; a <2 x half> whose element 0 is 0.0, then returns the vector bitcast to float.
; The SDAG real-true16 checks expect the conversion to write v0.h directly.
; The vector is seeded with poison rather than the deprecated undef; both
; elements are overwritten before use, so the seed never reaches the result.
define amdgpu_ps float @test_cvt_f16_fp8_byte3_hi(i32 %a) {
145+
; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_fp8_byte3_hi:
146+
; GFX1250-SDAG-REAL16: ; %bb.0:
147+
; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_fp8_e64 v0.h, v0 byte_sel:3
148+
; GFX1250-SDAG-REAL16-NEXT: v_mov_b16_e32 v0.l, 0
149+
; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog
150+
;
151+
; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_fp8_byte3_hi:
152+
; GFX1250-SDAG-FAKE16: ; %bb.0:
153+
; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_fp8_e64 v0, v0 byte_sel:3
154+
; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
155+
; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, 0, 0x5040100
156+
; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog
157+
;
158+
; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_fp8_byte3_hi:
159+
; GFX1250-GISEL-REAL16: ; %bb.0:
160+
; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:3
161+
; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
162+
; GFX1250-GISEL-REAL16-NEXT: v_lshl_or_b32 v0, v0, 16, 0
163+
; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog
164+
;
165+
; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_fp8_byte3_hi:
166+
; GFX1250-GISEL-FAKE16: ; %bb.0:
167+
; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_fp8_e64 v0, v0 byte_sel:3
168+
; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
169+
; GFX1250-GISEL-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, 0
170+
; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog
171+
%cvt = tail call half @llvm.amdgcn.cvt.f16.fp8(i32 %a, i32 3)
172+
%ins.0 = insertelement <2 x half> poison, half 0.0, i32 0
173+
%ins.1 = insertelement <2 x half> %ins.0, half %cvt, i32 1
174+
%ret = bitcast <2 x half> %ins.1 to float
175+
ret float %ret
176+
}
177+
7178
define amdgpu_ps float @test_cvt_pk_f16_bf8_v(i16 %a) {
8179
; GFX1250-SDAG-REAL16-LABEL: test_cvt_pk_f16_bf8_v:
9180
; GFX1250-SDAG-REAL16: ; %bb.0:

llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -46,6 +46,18 @@ v_cvt_f32_bf16 v5, src_scc
4646
v_cvt_f32_bf16 v127, 0x8000
4747
// GFX1250: v_cvt_f32_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xe4,0xfe,0x7e,0x00,0x80,0x00,0x00]
4848

49+
v_cvt_f16_fp8 v1, v2
50+
// GFX1250: v_cvt_f16_fp8_e32 v1, v2 ; encoding: [0x02,0xef,0x02,0x7e]
51+
52+
v_cvt_f16_fp8 v1, s2
53+
// GFX1250: v_cvt_f16_fp8_e32 v1, s2 ; encoding: [0x02,0xee,0x02,0x7e]
54+
55+
v_cvt_f16_fp8 v1, 2
56+
// GFX1250: v_cvt_f16_fp8_e32 v1, 2 ; encoding: [0x82,0xee,0x02,0x7e]
57+
58+
v_cvt_f16_fp8 v1, 0x1234
59+
// GFX1250: v_cvt_f16_fp8_e32 v1, 0x1234 ; encoding: [0xff,0xee,0x02,0x7e,0x34,0x12,0x00,0x00]
60+
4961
v_cvt_pk_f16_bf8 v1, v2
5062
// GFX1250: v_cvt_pk_f16_bf8 v1, v2 ; encoding: [0x02,0xed,0x02,0x7e]
5163

0 commit comments

Comments
 (0)