Skip to content

Commit b1d8c60

Browse files
[AArch64] Implement FP8 SVE Intrinsics for narrowing conversions (#118124)
This patch adds the following instrinsics: * Half-precision and BFloat16 convert, narrow, and interleave to 8-bit floating-point. // Variant is also available for: _bf16_x2 svmfloat8_t svcvtn_mf8[_f16_x2]_fpm(svfloat16x2_t zn, fpm_t fpm); * Single-precision convert, narrow, and interleave to 8-bit floating-point (top and bottom). svmfloat8_t svcvtnt_mf8[_f32_x2]_fpm(svmfloat8_t zd, svfloat32x2_t zn, fpm_t fpm); svmfloat8_t svcvtnb_mf8[_f32_x2]_fpm(svfloat32x2_t zn, fpm_t fpm);
1 parent 5e007af commit b1d8c60

File tree

7 files changed

+222
-6
lines changed

7 files changed

+222
-6
lines changed

clang/include/clang/Basic/arm_sve.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2468,4 +2468,11 @@ let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in {
24682468
// 8-bit floating-point convert to BFloat16/Float16 (top)
24692469
def SVF1CVTLT : SInst<"svcvtlt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt1", [VerifyRuntimeMode, SetsFPMR]>;
24702470
def SVF2CVTLT : SInst<"svcvtlt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt2", [VerifyRuntimeMode, SetsFPMR]>;
2471+
2472+
// BFloat16/Float16 convert, narrow and interleave to 8-bit floating-point
2473+
def SVFCVTN : SInst<"svcvtn_mf8[_{d}_x2]_fpm", "~2>", "bh", MergeNone, "aarch64_sve_fp8_cvtn", [VerifyRuntimeMode, SetsFPMR]>;
2474+
2475+
// Single-precision convert, narrow and interleave to 8-bit floating-point (top and bottom)
2476+
def SVFCVTNB : SInst<"svcvtnb_mf8[_f32_x2]_fpm", "~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnb", [VerifyRuntimeMode, SetsFPMR]>;
2477+
def SVFCVTNT : SInst<"svcvtnt_mf8[_f32_x2]_fpm", "~~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnt", [VerifyRuntimeMode, SetsFPMR]>;
24712478
}
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
2+
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
3+
// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX
4+
5+
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
6+
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -x c++ -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX
7+
8+
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
9+
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
10+
11+
// REQUIRES: aarch64-registered-target
12+
13+
#ifdef __ARM_FEATURE_SME
14+
#include <arm_sme.h>
15+
#else
16+
#include <arm_sve.h>
17+
#endif
18+
19+
#ifdef SVE_OVERLOADED_FORMS
20+
#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
21+
#else
22+
#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
23+
#endif
24+
25+
#ifdef __ARM_FEATURE_SME
26+
#define STREAMING __arm_streaming
27+
#else
28+
#define STREAMING
29+
#endif
30+
31+
// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svcvtn_f8_bf16(
32+
// CHECK-SAME: <vscale x 8 x bfloat> [[ZN_ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZN_ZM_COERCE1:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
33+
// CHECK-NEXT: [[ENTRY:.*:]]
34+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
35+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtn.nxv8bf16(<vscale x 8 x bfloat> [[ZN_ZM_COERCE0]], <vscale x 8 x bfloat> [[ZN_ZM_COERCE1]])
36+
// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
37+
//
38+
// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z19test_svcvtn_f8_bf1614svbfloat16x2_tm(
39+
// CHECK-CXX-SAME: <vscale x 8 x bfloat> [[ZN_ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZN_ZM_COERCE1:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
40+
// CHECK-CXX-NEXT: [[ENTRY:.*:]]
41+
// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
42+
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtn.nxv8bf16(<vscale x 8 x bfloat> [[ZN_ZM_COERCE0]], <vscale x 8 x bfloat> [[ZN_ZM_COERCE1]])
43+
// CHECK-CXX-NEXT: ret <vscale x 16 x i8> [[TMP0]]
44+
//
45+
svmfloat8_t test_svcvtn_f8_bf16(svbfloat16x2_t zn_zm, fpm_t fpm) STREAMING {
46+
return SVE_ACLE_FUNC(svcvtn_mf8,_bf16_x2,_fpm)(zn_zm, fpm);
47+
}
48+
49+
// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svcvtn_f8_f16(
50+
// CHECK-SAME: <vscale x 8 x half> [[ZN_ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZN_ZM_COERCE1:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
51+
// CHECK-NEXT: [[ENTRY:.*:]]
52+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
53+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtn.nxv8f16(<vscale x 8 x half> [[ZN_ZM_COERCE0]], <vscale x 8 x half> [[ZN_ZM_COERCE1]])
54+
// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
55+
//
56+
// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z18test_svcvtn_f8_f1613svfloat16x2_tm(
57+
// CHECK-CXX-SAME: <vscale x 8 x half> [[ZN_ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZN_ZM_COERCE1:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
58+
// CHECK-CXX-NEXT: [[ENTRY:.*:]]
59+
// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
60+
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtn.nxv8f16(<vscale x 8 x half> [[ZN_ZM_COERCE0]], <vscale x 8 x half> [[ZN_ZM_COERCE1]])
61+
// CHECK-CXX-NEXT: ret <vscale x 16 x i8> [[TMP0]]
62+
//
63+
svmfloat8_t test_svcvtn_f8_f16(svfloat16x2_t zn_zm, fpm_t fpm) STREAMING {
64+
return SVE_ACLE_FUNC(svcvtn_mf8,_f16_x2,_fpm)(zn_zm, fpm);
65+
}
66+
67+
// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svcvtnb_f8_f32(
68+
// CHECK-SAME: <vscale x 4 x float> [[ZN_ZM_COERCE0:%.*]], <vscale x 4 x float> [[ZN_ZM_COERCE1:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
69+
// CHECK-NEXT: [[ENTRY:.*:]]
70+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
71+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtnb.nxv4f32(<vscale x 4 x float> [[ZN_ZM_COERCE0]], <vscale x 4 x float> [[ZN_ZM_COERCE1]])
72+
// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
73+
//
74+
// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z19test_svcvtnb_f8_f3213svfloat32x2_tm(
75+
// CHECK-CXX-SAME: <vscale x 4 x float> [[ZN_ZM_COERCE0:%.*]], <vscale x 4 x float> [[ZN_ZM_COERCE1:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
76+
// CHECK-CXX-NEXT: [[ENTRY:.*:]]
77+
// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
78+
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtnb.nxv4f32(<vscale x 4 x float> [[ZN_ZM_COERCE0]], <vscale x 4 x float> [[ZN_ZM_COERCE1]])
79+
// CHECK-CXX-NEXT: ret <vscale x 16 x i8> [[TMP0]]
80+
//
81+
svmfloat8_t test_svcvtnb_f8_f32(svfloat32x2_t zn_zm, fpm_t fpm) STREAMING {
82+
return SVE_ACLE_FUNC(svcvtnb_mf8,_f32_x2,_fpm)(zn_zm, fpm);
83+
}
84+
85+
// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svcvtnt_f8_f32(
86+
// CHECK-SAME: <vscale x 16 x i8> [[ZD:%.*]], <vscale x 4 x float> [[ZN_ZM_COERCE0:%.*]], <vscale x 4 x float> [[ZN_ZM_COERCE1:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
87+
// CHECK-NEXT: [[ENTRY:.*:]]
88+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
89+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtnt.nxv4f32(<vscale x 16 x i8> [[ZD]], <vscale x 4 x float> [[ZN_ZM_COERCE0]], <vscale x 4 x float> [[ZN_ZM_COERCE1]])
90+
// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
91+
//
92+
// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z19test_svcvtnt_f8_f32u13__SVMfloat8_t13svfloat32x2_tm(
93+
// CHECK-CXX-SAME: <vscale x 16 x i8> [[ZD:%.*]], <vscale x 4 x float> [[ZN_ZM_COERCE0:%.*]], <vscale x 4 x float> [[ZN_ZM_COERCE1:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
94+
// CHECK-CXX-NEXT: [[ENTRY:.*:]]
95+
// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
96+
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtnt.nxv4f32(<vscale x 16 x i8> [[ZD]], <vscale x 4 x float> [[ZN_ZM_COERCE0]], <vscale x 4 x float> [[ZN_ZM_COERCE1]])
97+
// CHECK-CXX-NEXT: ret <vscale x 16 x i8> [[TMP0]]
98+
//
99+
svmfloat8_t test_svcvtnt_f8_f32(svmfloat8_t zd, svfloat32x2_t zn_zm, fpm_t fpm) STREAMING {
100+
return SVE_ACLE_FUNC(svcvtnt_mf8,_f32_x2,_fpm)(zd, zn_zm, fpm);
101+
}

clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
// REQUIRES: aarch64-registered-target
22

3-
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -verify -emit-llvm -o - %s
3+
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -verify -emit-llvm -o - %s
44

55
#include <arm_sve.h>
66

@@ -21,4 +21,13 @@ void test_features(svmfloat8_t zn, fpm_t fpm) {
2121
// expected-error@-1 {{'svcvtlt1_f16_mf8_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
2222
svcvtlt2_f16_mf8_fpm(zn, fpm);
2323
// expected-error@-1 {{'svcvtlt2_f16_mf8_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
24+
25+
svcvtn_mf8_bf16_x2_fpm(svcreate2(svundef_bf16(), svundef_bf16()), fpm);
26+
// expected-error@-1 {{'svcvtn_mf8_bf16_x2_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
27+
svcvtn_mf8_f16_x2_fpm(svcreate2(svundef_f16(), svundef_f16()), fpm);
28+
// expected-error@-1 {{'svcvtn_mf8_f16_x2_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
29+
svcvtnb_mf8_f32_x2_fpm(svcreate2(svundef_f32(), svundef_f32()), fpm);
30+
// expected-error@-1 {{'svcvtnb_mf8_f32_x2_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
31+
svcvtnt_mf8_f32_x2_fpm(zn, svcreate2(svundef_f32(), svundef_f32()), fpm);
32+
// expected-error@-1 {{'svcvtnt_mf8_f32_x2_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
2433
}

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3872,6 +3872,20 @@ let TargetPrefix = "aarch64" in {
38723872
def int_aarch64_sve_fp8_cvtlt1 : SVE2_FP8_Cvt;
38733873
def int_aarch64_sve_fp8_cvtlt2 : SVE2_FP8_Cvt;
38743874

3875+
// SVE Narrowing Conversions
3876+
class SVE2_FP8_Narrow_Cvt
3877+
: DefaultAttrsIntrinsic<[llvm_nxv16i8_ty],
3878+
[llvm_anyvector_ty, LLVMMatchType<0>],
3879+
[IntrReadMem, IntrInaccessibleMemOnly]>;
3880+
3881+
def int_aarch64_sve_fp8_cvtn : SVE2_FP8_Narrow_Cvt;
3882+
def int_aarch64_sve_fp8_cvtnb : SVE2_FP8_Narrow_Cvt;
3883+
3884+
def int_aarch64_sve_fp8_cvtnt
3885+
: DefaultAttrsIntrinsic<[llvm_nxv16i8_ty],
3886+
[llvm_nxv16i8_ty, llvm_anyvector_ty, LLVMMatchType<0>],
3887+
[IntrReadMem, IntrInaccessibleMemOnly]>;
3888+
38753889
class SME2_FP8_CVT_X2_Single_Intrinsic
38763890
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
38773891
[llvm_nxv16i8_ty],

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4379,10 +4379,11 @@ defm BF1CVTLT_ZZ : sve2_fp8_cvt_single<0b1, 0b10, "bf1cvtlt", nxv8bf16, int_aar
43794379
defm BF2CVTLT_ZZ : sve2_fp8_cvt_single<0b1, 0b11, "bf2cvtlt", nxv8bf16, int_aarch64_sve_fp8_cvtlt2>;
43804380

43814381
// FP8 downconvert
4382-
defm FCVTN_Z2Z_HtoB : sve2_fp8_down_cvt_single<0b00, "fcvtn", ZZ_h_mul_r>;
4383-
defm FCVTNB_Z2Z_StoB : sve2_fp8_down_cvt_single<0b01, "fcvtnb", ZZ_s_mul_r>;
4384-
defm BFCVTN_Z2Z_HtoB : sve2_fp8_down_cvt_single<0b10, "bfcvtn", ZZ_h_mul_r>;
4385-
defm FCVTNT_Z2Z_StoB : sve2_fp8_down_cvt_single<0b11, "fcvtnt", ZZ_s_mul_r>;
4382+
defm FCVTN_Z2Z_HtoB : sve2_fp8_down_cvt_single<0b00, "fcvtn", ZZ_h_mul_r, nxv8f16, int_aarch64_sve_fp8_cvtn>;
4383+
defm FCVTNB_Z2Z_StoB : sve2_fp8_down_cvt_single<0b01, "fcvtnb", ZZ_s_mul_r, nxv4f32, int_aarch64_sve_fp8_cvtnb>;
4384+
defm BFCVTN_Z2Z_HtoB : sve2_fp8_down_cvt_single<0b10, "bfcvtn", ZZ_h_mul_r, nxv8bf16, int_aarch64_sve_fp8_cvtn>;
4385+
4386+
defm FCVTNT_Z2Z_StoB : sve2_fp8_down_cvt_single_top<0b11, "fcvtnt", ZZ_s_mul_r, nxv4f32, int_aarch64_sve_fp8_cvtnt>;
43864387
} // End HasSVE2orSME2, HasFP8
43874388

43884389
let Predicates = [HasSVE2orSME2, HasFAMINMAX] in {

llvm/lib/Target/AArch64/SVEInstrFormats.td

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10794,10 +10794,45 @@ class sve2_fp8_down_cvt_single<bits<2> opc, string mnemonic,
1079410794
let Inst{5} = 0b0;
1079510795
let Inst{4-0} = Zd;
1079610796
let Uses = [FPMR, FPCR];
10797+
10798+
let mayLoad = 1;
10799+
let mayStore = 0;
1079710800
}
1079810801

10799-
multiclass sve2_fp8_down_cvt_single<bits<2> opc, string mnemonic, RegisterOperand src> {
10802+
multiclass sve2_fp8_down_cvt_single<bits<2> opc, string mnemonic, RegisterOperand src,
10803+
ValueType ty, SDPatternOperator op> {
1080010804
def NAME : sve2_fp8_down_cvt_single<opc, mnemonic, ZPR8, src>;
10805+
10806+
def : Pat<(nxv16i8 (op ty:$Zn1, ty:$Zn2)),
10807+
(!cast<Instruction>(NAME) (REG_SEQUENCE ZPR2Mul2, $Zn1, zsub0, $Zn2, zsub1))>;
10808+
}
10809+
10810+
class sve2_fp8_down_cvt_single_top<bits<2> opc, string mnemonic, RegisterOperand src_ty>
10811+
: I<(outs ZPR8:$Zd), (ins ZPR8:$_Zd, src_ty:$Zn), mnemonic, "\t$Zd, $Zn","", []>, Sched<[]> {
10812+
bits<5> Zd;
10813+
bits<4> Zn;
10814+
10815+
let Inst{31-12} = 0b01100101000010100011;
10816+
let Inst{11-10} = opc;
10817+
let Inst{9-6} = Zn;
10818+
let Inst{5} = 0b0;
10819+
let Inst{4-0} = Zd;
10820+
10821+
let Constraints = "$Zd = $_Zd";
10822+
let DestructiveInstType = DestructiveOther;
10823+
let ElementSize = ZPR8.ElementSize;
10824+
10825+
let Uses = [FPMR, FPCR];
10826+
let mayLoad = 1;
10827+
let mayStore = 0;
10828+
}
10829+
10830+
multiclass sve2_fp8_down_cvt_single_top<bits<2> opc, string mnemonic, RegisterOperand src_ty,
10831+
ValueType ty, SDPatternOperator op> {
10832+
def NAME : sve2_fp8_down_cvt_single_top<opc, mnemonic, src_ty>;
10833+
10834+
def : Pat<(nxv16i8 (op nxv16i8:$Zd, ty:$Zn1, ty:$Zn2)),
10835+
(!cast<Instruction>(NAME) $Zd, (REG_SEQUENCE ZPR2Mul2, $Zn1, zsub0, $Zn2, zsub1))>;
1080110836
}
1080210837

1080310838
// FP8 Widening Multiply-Add Long - Indexed Group
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mattr=+sve2,+fp8 < %s | FileCheck %s
3+
; RUN: llc -mattr=+sme2,+fp8 --force-streaming < %s | FileCheck %s
4+
5+
target triple = "aarch64-linux"
6+
7+
define <vscale x 16 x i8> @cvtn_bf16(<vscale x 8 x bfloat> %s1, <vscale x 8 x bfloat> %s2) {
8+
; CHECK-LABEL: cvtn_bf16:
9+
; CHECK: // %bb.0:
10+
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
11+
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
12+
; CHECK-NEXT: bfcvtn z0.b, { z0.h, z1.h }
13+
; CHECK-NEXT: ret
14+
%r = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtn.nxv8bf16(<vscale x 8 x bfloat> %s1, <vscale x 8 x bfloat> %s2)
15+
ret <vscale x 16 x i8> %r
16+
}
17+
18+
define <vscale x 16 x i8> @cvtn_f16(<vscale x 8 x half> %s1, <vscale x 8 x half> %s2) {
19+
; CHECK-LABEL: cvtn_f16:
20+
; CHECK: // %bb.0:
21+
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
22+
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
23+
; CHECK-NEXT: fcvtn z0.b, { z0.h, z1.h }
24+
; CHECK-NEXT: ret
25+
%r = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtn.nxv8f16(<vscale x 8 x half> %s1, <vscale x 8 x half> %s2)
26+
ret <vscale x 16 x i8> %r
27+
}
28+
29+
define <vscale x 16 x i8> @cvtnb_f32(<vscale x 4 x float> %s1, <vscale x 4 x float> %s2) {
30+
; CHECK-LABEL: cvtnb_f32:
31+
; CHECK: // %bb.0:
32+
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
33+
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
34+
; CHECK-NEXT: fcvtnb z0.b, { z0.s, z1.s }
35+
; CHECK-NEXT: ret
36+
%r = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtnb.nxv4f32(<vscale x 4 x float> %s1, <vscale x 4 x float> %s2)
37+
ret <vscale x 16 x i8> %r
38+
}
39+
40+
define <vscale x 16 x i8> @cvtnt_f32(<vscale x 16 x i8> %d, <vscale x 4 x float> %s1, <vscale x 4 x float> %s2) {
41+
; CHECK-LABEL: cvtnt_f32:
42+
; CHECK: // %bb.0:
43+
; CHECK-NEXT: mov z3.d, z2.d
44+
; CHECK-NEXT: mov z2.d, z1.d
45+
; CHECK-NEXT: fcvtnt z0.b, { z2.s, z3.s }
46+
; CHECK-NEXT: ret
47+
%r = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtnt.nxv4f32(<vscale x 16 x i8> %d, <vscale x 4 x float> %s1, <vscale x 4 x float> %s2)
48+
ret <vscale x 16 x i8> %r
49+
}

0 commit comments

Comments
 (0)