Skip to content

Commit cc1a2ea

Browse files
[AArch64] Implement FP8 SVE intrinsics for widening conversions (#118123)
This patch adds the following intrinsics: * 8-bit floating-point convert to half-precision and BFloat16. // Variants are also available for: _bf16 svfloat16_t svcvt1_f16[_mf8]_fpm(svmfloat8_t zn, fpm_t fpm); svfloat16_t svcvt2_f16[_mf8]_fpm(svmfloat8_t zn, fpm_t fpm); * 8-bit floating-point convert to half-precision and BFloat16 (top). // Variants are also available for: _bf16 svfloat16_t svcvtlt1_f16[_mf8]_fpm(svmfloat8_t zn, fpm_t fpm); svfloat16_t svcvtlt2_f16[_mf8]_fpm(svmfloat8_t zn, fpm_t fpm);
1 parent f6289f1 commit cc1a2ea

File tree

7 files changed

+317
-14
lines changed

7 files changed

+317
-14
lines changed

clang/include/clang/Basic/arm_sve.td

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2430,12 +2430,12 @@ let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2,fp8" in {
24302430
def FSCALE_X4 : Inst<"svscale[_{d}_x4]", "444.x", "fhd", MergeNone, "aarch64_sme_fp8_scale_x4", [IsStreaming],[]>;
24312431

24322432
// Convert from FP8 to half-precision/BFloat16 multi-vector
2433-
def SVF1CVT : Inst<"svcvt1_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvt1_x2", [IsStreaming, SetsFPMR], []>;
2434-
def SVF2CVT : Inst<"svcvt2_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvt2_x2", [IsStreaming, SetsFPMR], []>;
2433+
def SVF1CVT_X2 : Inst<"svcvt1_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvt1_x2", [IsStreaming, SetsFPMR], []>;
2434+
def SVF2CVT_X2 : Inst<"svcvt2_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvt2_x2", [IsStreaming, SetsFPMR], []>;
24352435

24362436
// Convert from FP8 to deinterleaved half-precision/BFloat16 multi-vector
2437-
def SVF1CVTL : Inst<"svcvtl1_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl1_x2", [IsStreaming, SetsFPMR], []>;
2438-
def SVF2CVTL : Inst<"svcvtl2_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl2_x2", [IsStreaming, SetsFPMR], []>;
2437+
def SVF1CVTL_X2 : Inst<"svcvtl1_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl1_x2", [IsStreaming, SetsFPMR], []>;
2438+
def SVF2CVTL_X2 : Inst<"svcvtl2_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl2_x2", [IsStreaming, SetsFPMR], []>;
24392439
}
24402440

24412441
let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in {
@@ -2451,3 +2451,15 @@ let SVETargetGuard = "sve2,faminmax", SMETargetGuard = "sme2,faminmax" in {
24512451
defm SVAMIN : SInstZPZZ<"svamin", "hfd", "aarch64_sve_famin", "aarch64_sve_famin_u">;
24522452
defm SVAMAX : SInstZPZZ<"svamax", "hfd", "aarch64_sve_famax", "aarch64_sve_famax_u">;
24532453
}
2454+
2455+
let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in {
2456+
// SVE FP8 widening conversions
2457+
2458+
// 8-bit floating-point convert to BFloat16/Float16
2459+
def SVF1CVT : SInst<"svcvt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt1", [VerifyRuntimeMode, SetsFPMR]>;
2460+
def SVF2CVT : SInst<"svcvt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt2", [VerifyRuntimeMode, SetsFPMR]>;
2461+
2462+
// 8-bit floating-point convert to BFloat16/Float16 (top)
2463+
def SVF1CVTLT : SInst<"svcvtlt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt1", [VerifyRuntimeMode, SetsFPMR]>;
2464+
def SVF2CVTLT : SInst<"svcvtlt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt2", [VerifyRuntimeMode, SetsFPMR]>;
2465+
}
Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
2+
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
3+
// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX
4+
5+
// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
6+
// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX
7+
8+
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
9+
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
10+
11+
// REQUIRES: aarch64-registered-target
12+
13+
#ifdef __ARM_FEATURE_SME
14+
#include <arm_sme.h>
15+
#else
16+
#include <arm_sve.h>
17+
#endif
18+
19+
#ifdef SVE_OVERLOADED_FORMS
20+
#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
21+
#else
22+
#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
23+
#endif
24+
25+
#ifdef __ARM_FEATURE_SME
26+
#define STREAMING __arm_streaming
27+
#else
28+
#define STREAMING
29+
#endif
30+
31+
// CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @test_svcvt1_bf16_mf8(
32+
// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
33+
// CHECK-NEXT: [[ENTRY:.*:]]
34+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
35+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fp8.cvt1.nxv8bf16(<vscale x 16 x i8> [[ZN]])
36+
// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
37+
//
38+
// CHECK-CXX-LABEL: define dso_local <vscale x 8 x bfloat> @_Z20test_svcvt1_bf16_mf8u13__SVMfloat8_tm(
39+
// CHECK-CXX-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
40+
// CHECK-CXX-NEXT: [[ENTRY:.*:]]
41+
// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
42+
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fp8.cvt1.nxv8bf16(<vscale x 16 x i8> [[ZN]])
43+
// CHECK-CXX-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
44+
//
45+
svbfloat16_t test_svcvt1_bf16_mf8(svmfloat8_t zn, fpm_t fpm) STREAMING {
46+
return SVE_ACLE_FUNC(svcvt1_bf16,_mf8,_fpm)(zn, fpm);
47+
}
48+
49+
// CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @test_svcvt2_bf16_mf8(
50+
// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
51+
// CHECK-NEXT: [[ENTRY:.*:]]
52+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
53+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fp8.cvt2.nxv8bf16(<vscale x 16 x i8> [[ZN]])
54+
// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
55+
//
56+
// CHECK-CXX-LABEL: define dso_local <vscale x 8 x bfloat> @_Z20test_svcvt2_bf16_mf8u13__SVMfloat8_tm(
57+
// CHECK-CXX-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
58+
// CHECK-CXX-NEXT: [[ENTRY:.*:]]
59+
// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
60+
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fp8.cvt2.nxv8bf16(<vscale x 16 x i8> [[ZN]])
61+
// CHECK-CXX-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
62+
//
63+
svbfloat16_t test_svcvt2_bf16_mf8(svmfloat8_t zn, fpm_t fpm) STREAMING {
64+
return SVE_ACLE_FUNC(svcvt2_bf16,_mf8,_fpm)(zn, fpm);
65+
}
66+
67+
// CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @test_svcvtlt1_bf16_mf8(
68+
// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
69+
// CHECK-NEXT: [[ENTRY:.*:]]
70+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
71+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fp8.cvtlt1.nxv8bf16(<vscale x 16 x i8> [[ZN]])
72+
// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
73+
//
74+
// CHECK-CXX-LABEL: define dso_local <vscale x 8 x bfloat> @_Z22test_svcvtlt1_bf16_mf8u13__SVMfloat8_tm(
75+
// CHECK-CXX-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
76+
// CHECK-CXX-NEXT: [[ENTRY:.*:]]
77+
// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
78+
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fp8.cvtlt1.nxv8bf16(<vscale x 16 x i8> [[ZN]])
79+
// CHECK-CXX-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
80+
//
81+
svbfloat16_t test_svcvtlt1_bf16_mf8(svmfloat8_t zn, fpm_t fpm) STREAMING {
82+
return SVE_ACLE_FUNC(svcvtlt1_bf16,_mf8,_fpm)(zn, fpm);
83+
}
84+
85+
// CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @test_svcvtlt2_bf16_mf8(
86+
// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
87+
// CHECK-NEXT: [[ENTRY:.*:]]
88+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
89+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fp8.cvtlt2.nxv8bf16(<vscale x 16 x i8> [[ZN]])
90+
// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
91+
//
92+
// CHECK-CXX-LABEL: define dso_local <vscale x 8 x bfloat> @_Z22test_svcvtlt2_bf16_mf8u13__SVMfloat8_tm(
93+
// CHECK-CXX-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
94+
// CHECK-CXX-NEXT: [[ENTRY:.*:]]
95+
// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
96+
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fp8.cvtlt2.nxv8bf16(<vscale x 16 x i8> [[ZN]])
97+
// CHECK-CXX-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
98+
//
99+
svbfloat16_t test_svcvtlt2_bf16_mf8(svmfloat8_t zn, fpm_t fpm) STREAMING {
100+
return SVE_ACLE_FUNC(svcvtlt2_bf16,_mf8,_fpm)(zn, fpm);
101+
}
102+
103+
// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svcvt1_f16_mf8(
104+
// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
105+
// CHECK-NEXT: [[ENTRY:.*:]]
106+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
107+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.cvt1.nxv8f16(<vscale x 16 x i8> [[ZN]])
108+
// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
109+
//
110+
// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z19test_svcvt1_f16_mf8u13__SVMfloat8_tm(
111+
// CHECK-CXX-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
112+
// CHECK-CXX-NEXT: [[ENTRY:.*:]]
113+
// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
114+
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.cvt1.nxv8f16(<vscale x 16 x i8> [[ZN]])
115+
// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP0]]
116+
//
117+
svfloat16_t test_svcvt1_f16_mf8(svmfloat8_t zn, fpm_t fpm) STREAMING {
118+
return SVE_ACLE_FUNC(svcvt1_f16,_mf8,_fpm)(zn, fpm);
119+
}
120+
121+
// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svcvt2_f16_mf8(
122+
// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
123+
// CHECK-NEXT: [[ENTRY:.*:]]
124+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
125+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.cvt2.nxv8f16(<vscale x 16 x i8> [[ZN]])
126+
// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
127+
//
128+
// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z19test_svcvt2_f16_mf8u13__SVMfloat8_tm(
129+
// CHECK-CXX-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
130+
// CHECK-CXX-NEXT: [[ENTRY:.*:]]
131+
// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
132+
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.cvt2.nxv8f16(<vscale x 16 x i8> [[ZN]])
133+
// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP0]]
134+
//
135+
svfloat16_t test_svcvt2_f16_mf8(svmfloat8_t zn, fpm_t fpm) STREAMING {
136+
return SVE_ACLE_FUNC(svcvt2_f16,_mf8,_fpm)(zn, fpm);
137+
}
138+
139+
// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svcvtlt1_f16_mf8(
140+
// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
141+
// CHECK-NEXT: [[ENTRY:.*:]]
142+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
143+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.cvtlt1.nxv8f16(<vscale x 16 x i8> [[ZN]])
144+
// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
145+
//
146+
// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z21test_svcvtlt1_f16_mf8u13__SVMfloat8_tm(
147+
// CHECK-CXX-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
148+
// CHECK-CXX-NEXT: [[ENTRY:.*:]]
149+
// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
150+
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.cvtlt1.nxv8f16(<vscale x 16 x i8> [[ZN]])
151+
// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP0]]
152+
//
153+
svfloat16_t test_svcvtlt1_f16_mf8(svmfloat8_t zn, fpm_t fpm) STREAMING {
154+
return SVE_ACLE_FUNC(svcvtlt1_f16,_mf8,_fpm)(zn, fpm);
155+
}
156+
157+
// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svcvtlt2_f16_mf8(
158+
// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
159+
// CHECK-NEXT: [[ENTRY:.*:]]
160+
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
161+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.cvtlt2.nxv8f16(<vscale x 16 x i8> [[ZN]])
162+
// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
163+
//
164+
// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z21test_svcvtlt2_f16_mf8u13__SVMfloat8_tm(
165+
// CHECK-CXX-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
166+
// CHECK-CXX-NEXT: [[ENTRY:.*:]]
167+
// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
168+
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.cvtlt2.nxv8f16(<vscale x 16 x i8> [[ZN]])
169+
// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP0]]
170+
//
171+
svfloat16_t test_svcvtlt2_f16_mf8(svmfloat8_t zn, fpm_t fpm) STREAMING {
172+
return SVE_ACLE_FUNC(svcvtlt2_f16,_mf8,_fpm)(zn, fpm);
173+
}
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
// REQUIRES: aarch64-registered-target
2+
3+
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -verify -emit-llvm %s
4+
5+
#include <arm_sve.h>
6+
7+
void test_features(svmfloat8_t zn, fpm_t fpm) {
8+
svcvt1_bf16_mf8_fpm(zn, fpm);
9+
// expected-error@-1 {{'svcvt1_bf16_mf8_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
10+
svcvt2_bf16_mf8_fpm(zn, fpm);
11+
// expected-error@-1 {{'svcvt2_bf16_mf8_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
12+
svcvtlt1_bf16_mf8_fpm(zn, fpm);
13+
// expected-error@-1 {{'svcvtlt1_bf16_mf8_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
14+
svcvtlt2_bf16_mf8_fpm(zn, fpm);
15+
// expected-error@-1 {{'svcvtlt2_bf16_mf8_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
16+
svcvt1_f16_mf8_fpm(zn, fpm);
17+
// expected-error@-1 {{'svcvt1_f16_mf8_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
18+
svcvt2_f16_mf8_fpm(zn, fpm);
19+
// expected-error@-1 {{'svcvt2_f16_mf8_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
20+
svcvtlt1_f16_mf8_fpm(zn, fpm);
21+
// expected-error@-1 {{'svcvtlt1_f16_mf8_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
22+
svcvtlt2_f16_mf8_fpm(zn, fpm);
23+
// expected-error@-1 {{'svcvtlt2_f16_mf8_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
24+
}

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3860,6 +3860,17 @@ def int_aarch64_neon_famin : AdvSIMD_2VectorArg_Intrinsic;
38603860
//
38613861
let TargetPrefix = "aarch64" in {
38623862

3863+
// SVE Widening Conversions
3864+
class SVE2_FP8_Cvt
3865+
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
3866+
[llvm_nxv16i8_ty],
3867+
[IntrReadMem, IntrInaccessibleMemOnly]>;
3868+
3869+
def int_aarch64_sve_fp8_cvt1 : SVE2_FP8_Cvt;
3870+
def int_aarch64_sve_fp8_cvt2 : SVE2_FP8_Cvt;
3871+
def int_aarch64_sve_fp8_cvtlt1 : SVE2_FP8_Cvt;
3872+
def int_aarch64_sve_fp8_cvtlt2 : SVE2_FP8_Cvt;
3873+
38633874
class SME2_FP8_CVT_X2_Single_Intrinsic
38643875
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
38653876
[llvm_nxv16i8_ty],
@@ -3886,4 +3897,4 @@ let TargetPrefix = "aarch64" in {
38863897
// FP8 outer product
38873898
def int_aarch64_sme_fp8_fmopa_za16 : SME_FP8_OuterProduct_Intrinsic;
38883899
def int_aarch64_sme_fp8_fmopa_za32 : SME_FP8_OuterProduct_Intrinsic;
3889-
}
3900+
}

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4369,14 +4369,14 @@ let Predicates = [HasNonStreamingSVE2p2orSME2p2] in {
43694369
//===----------------------------------------------------------------------===//
43704370
let Predicates = [HasSVE2orSME2, HasFP8] in {
43714371
// FP8 upconvert
4372-
defm F1CVT_ZZ : sve2_fp8_cvt_single<0b0, 0b00, "f1cvt">;
4373-
defm F2CVT_ZZ : sve2_fp8_cvt_single<0b0, 0b01, "f2cvt">;
4374-
defm BF1CVT_ZZ : sve2_fp8_cvt_single<0b0, 0b10, "bf1cvt">;
4375-
defm BF2CVT_ZZ : sve2_fp8_cvt_single<0b0, 0b11, "bf2cvt">;
4376-
defm F1CVTLT_ZZ : sve2_fp8_cvt_single<0b1, 0b00, "f1cvtlt">;
4377-
defm F2CVTLT_ZZ : sve2_fp8_cvt_single<0b1, 0b01, "f2cvtlt">;
4378-
defm BF1CVTLT_ZZ : sve2_fp8_cvt_single<0b1, 0b10, "bf1cvtlt">;
4379-
defm BF2CVTLT_ZZ : sve2_fp8_cvt_single<0b1, 0b11, "bf2cvtlt">;
4372+
defm F1CVT_ZZ : sve2_fp8_cvt_single<0b0, 0b00, "f1cvt", nxv8f16, int_aarch64_sve_fp8_cvt1>;
4373+
defm F2CVT_ZZ : sve2_fp8_cvt_single<0b0, 0b01, "f2cvt", nxv8f16, int_aarch64_sve_fp8_cvt2>;
4374+
defm BF1CVT_ZZ : sve2_fp8_cvt_single<0b0, 0b10, "bf1cvt", nxv8bf16, int_aarch64_sve_fp8_cvt1>;
4375+
defm BF2CVT_ZZ : sve2_fp8_cvt_single<0b0, 0b11, "bf2cvt", nxv8bf16, int_aarch64_sve_fp8_cvt2>;
4376+
defm F1CVTLT_ZZ : sve2_fp8_cvt_single<0b1, 0b00, "f1cvtlt", nxv8f16, int_aarch64_sve_fp8_cvtlt1>;
4377+
defm F2CVTLT_ZZ : sve2_fp8_cvt_single<0b1, 0b01, "f2cvtlt", nxv8f16, int_aarch64_sve_fp8_cvtlt2>;
4378+
defm BF1CVTLT_ZZ : sve2_fp8_cvt_single<0b1, 0b10, "bf1cvtlt", nxv8bf16, int_aarch64_sve_fp8_cvtlt1>;
4379+
defm BF2CVTLT_ZZ : sve2_fp8_cvt_single<0b1, 0b11, "bf2cvtlt", nxv8bf16, int_aarch64_sve_fp8_cvtlt2>;
43804380

43814381
// FP8 downconvert
43824382
defm FCVTN_Z2Z_HtoB : sve2_fp8_down_cvt_single<0b00, "fcvtn", ZZ_h_mul_r>;

llvm/lib/Target/AArch64/SVEInstrFormats.td

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10769,10 +10769,15 @@ class sve2_fp8_cvt_single<bit L, bits<2> opc, string mnemonic,
1076910769
let Inst{9-5} = Zn;
1077010770
let Inst{4-0} = Zd;
1077110771
let Uses = [FPMR, FPCR];
10772+
10773+
let mayLoad = 1;
10774+
let mayStore = 0;
1077210775
}
1077310776

10774-
multiclass sve2_fp8_cvt_single<bit L, bits<2> opc, string mnemonic> {
10777+
multiclass sve2_fp8_cvt_single<bit L, bits<2> opc, string mnemonic, ValueType vtd, SDPatternOperator op> {
1077510778
def _BtoH : sve2_fp8_cvt_single<L, opc, mnemonic, ZPR16, ZPR8>;
10779+
10780+
def : SVE_1_Op_Pat<vtd, op, nxv16i8, !cast<Instruction>(NAME # _BtoH)>;
1077610781
}
1077710782

1077810783
// FP8 downconvert
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mattr=+sve2,+fp8 < %s | FileCheck %s
3+
; RUN: llc -mattr=+sme2,+fp8 --force-streaming < %s | FileCheck %s
4+
5+
target triple = "aarch64-linux"
6+
7+
define <vscale x 8 x bfloat> @cvt1_bf16(<vscale x 16 x i8> %s) {
8+
; CHECK-LABEL: cvt1_bf16:
9+
; CHECK: // %bb.0:
10+
; CHECK-NEXT: bf1cvt z0.h, z0.b
11+
; CHECK-NEXT: ret
12+
%r = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fp8.cvt1.nxv8bf16(<vscale x 16 x i8> %s)
13+
ret <vscale x 8 x bfloat> %r
14+
}
15+
16+
define <vscale x 8 x bfloat> @cvt2_bf16(<vscale x 16 x i8> %s) {
17+
; CHECK-LABEL: cvt2_bf16:
18+
; CHECK: // %bb.0:
19+
; CHECK-NEXT: bf2cvt z0.h, z0.b
20+
; CHECK-NEXT: ret
21+
%r = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fp8.cvt2.nxv8bf16(<vscale x 16 x i8> %s)
22+
ret <vscale x 8 x bfloat> %r
23+
}
24+
25+
define <vscale x 8 x bfloat> @cvtlt1_bf16(<vscale x 16 x i8> %s) {
26+
; CHECK-LABEL: cvtlt1_bf16:
27+
; CHECK: // %bb.0:
28+
; CHECK-NEXT: bf1cvtlt z0.h, z0.b
29+
; CHECK-NEXT: ret
30+
%r = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fp8.cvtlt1.nxv8bf16(<vscale x 16 x i8> %s)
31+
ret <vscale x 8 x bfloat> %r
32+
}
33+
34+
define <vscale x 8 x bfloat> @cvtlt2_bf16(<vscale x 16 x i8> %s) {
35+
; CHECK-LABEL: cvtlt2_bf16:
36+
; CHECK: // %bb.0:
37+
; CHECK-NEXT: bf2cvtlt z0.h, z0.b
38+
; CHECK-NEXT: ret
39+
%r = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fp8.cvtlt2.nxv8bf16(<vscale x 16 x i8> %s)
40+
ret <vscale x 8 x bfloat> %r
41+
}
42+
43+
define <vscale x 8 x half> @cvt1_f16(<vscale x 16 x i8> %s) {
44+
; CHECK-LABEL: cvt1_f16:
45+
; CHECK: // %bb.0:
46+
; CHECK-NEXT: f1cvt z0.h, z0.b
47+
; CHECK-NEXT: ret
48+
%r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.cvt1.nxv8f16(<vscale x 16 x i8> %s)
49+
ret <vscale x 8 x half> %r
50+
}
51+
52+
define <vscale x 8 x half> @cvt2_f16(<vscale x 16 x i8> %s) {
53+
; CHECK-LABEL: cvt2_f16:
54+
; CHECK: // %bb.0:
55+
; CHECK-NEXT: f2cvt z0.h, z0.b
56+
; CHECK-NEXT: ret
57+
%r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.cvt2.nxv8f16(<vscale x 16 x i8> %s)
58+
ret <vscale x 8 x half> %r
59+
}
60+
61+
62+
define <vscale x 8 x half> @cvtlt1_f16(<vscale x 16 x i8> %s) {
63+
; CHECK-LABEL: cvtlt1_f16:
64+
; CHECK: // %bb.0:
65+
; CHECK-NEXT: f1cvtlt z0.h, z0.b
66+
; CHECK-NEXT: ret
67+
%r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.cvtlt1.nxv8f16(<vscale x 16 x i8> %s)
68+
ret <vscale x 8 x half> %r
69+
}
70+
71+
define <vscale x 8 x half> @cvtlt2_f16(<vscale x 16 x i8> %s) {
72+
; CHECK-LABEL: cvtlt2_f16:
73+
; CHECK: // %bb.0:
74+
; CHECK-NEXT: f2cvtlt z0.h, z0.b
75+
; CHECK-NEXT: ret
76+
%r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.cvtlt2.nxv8f16(<vscale x 16 x i8> %s)
77+
ret <vscale x 8 x half> %r
78+
}

0 commit comments

Comments
 (0)