Skip to content

Commit 1227171

Browse files
authored
[AArch64][SME] Add intrinsics for multi-vector BFCLAMP (#88251)
According to the specification in ARM-software/acle#309 this adds the intrinsics ``` svbfloat16x2_t svclamp[_single_bf16_x2](svbfloat16x2_t zd, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming; svbfloat16x4_t svclamp[_single_bf16_x4](svbfloat16x4_t zd, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming; ``` These are available only if __ARM_FEATURE_SME_B16B16 is enabled.
1 parent 39848d0 commit 1227171

File tree

6 files changed

+115
-1
lines changed

6 files changed

+115
-1
lines changed

clang/include/clang/Basic/arm_sve.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2151,6 +2151,11 @@ let TargetGuard = "sme2" in {
21512151
def SVFCLAMP_X4 : SInst<"svclamp[_single_{d}_x4]", "44dd", "hfd", MergeNone, "aarch64_sve_fclamp_single_x4", [IsStreaming], []>;
21522152
}
21532153

2154+
let TargetGuard = "sme2,b16b16"in {
2155+
def SVBFCLAMP_X2 : SInst<"svclamp[_single_{d}_x2]", "22dd", "b", MergeNone, "aarch64_sve_bfclamp_single_x2", [IsStreaming], []>;
2156+
def SVBFCLAMP_X4 : SInst<"svclamp[_single_{d}_x4]", "44dd", "b", MergeNone, "aarch64_sve_bfclamp_single_x4", [IsStreaming], []>;
2157+
}
2158+
21542159
let TargetGuard = "sme2" in {
21552160
// == ADD (vectors) ==
21562161
def SVADD_SINGLE_X2 : SInst<"svadd[_single_{d}_x2]", "22d", "cUcsUsiUilUl", MergeNone, "aarch64_sve_add_single_x2", [IsStreaming], []>;

clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -745,3 +745,67 @@ svfloat32x4_t test_svclamp_single_f32_x4(svfloat32x4_t op1, svfloat32_t op2, svf
745745
svfloat64x4_t test_svclamp_single_f64_x4(svfloat64x4_t op1, svfloat64_t op2, svfloat64_t op3) __arm_streaming {
746746
return SVE_ACLE_FUNC(svclamp, _single_f64_x4, , )(op1, op2, op3);
747747
}
748+
749+
// CHECK-LABEL: @test_svclamp_single_bf16_x2(
750+
// CHECK-NEXT: entry:
751+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[OP1:%.*]], i64 0)
752+
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[OP1]], i64 8)
753+
// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x2.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
754+
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 0
755+
// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP3]], i64 0)
756+
// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 1
757+
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 8)
758+
// CHECK-NEXT: ret <vscale x 16 x bfloat> [[TMP6]]
759+
//
760+
// CPP-CHECK-LABEL: @_Z27test_svclamp_single_bf16_x214svbfloat16x2_tu14__SVBfloat16_tS0_(
761+
// CPP-CHECK-NEXT: entry:
762+
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[OP1:%.*]], i64 0)
763+
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[OP1]], i64 8)
764+
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x2.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
765+
// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 0
766+
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP3]], i64 0)
767+
// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 1
768+
// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 8)
769+
// CPP-CHECK-NEXT: ret <vscale x 16 x bfloat> [[TMP6]]
770+
//
771+
svbfloat16x2_t test_svclamp_single_bf16_x2(svbfloat16x2_t op1, svbfloat16_t op2, svbfloat16_t op3) __arm_streaming {
772+
return SVE_ACLE_FUNC(svclamp, _single_bf16_x2, , )(op1, op2, op3);
773+
}
774+
775+
// CHECK-LABEL: @test_svclamp_single_bf16_x4(
776+
// CHECK-NEXT: entry:
777+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1:%.*]], i64 0)
778+
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 8)
779+
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 16)
780+
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 24)
781+
// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x4.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
782+
// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 0
783+
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP5]], i64 0)
784+
// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 1
785+
// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 8)
786+
// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 2
787+
// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP8]], <vscale x 8 x bfloat> [[TMP9]], i64 16)
788+
// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 3
789+
// CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP10]], <vscale x 8 x bfloat> [[TMP11]], i64 24)
790+
// CHECK-NEXT: ret <vscale x 32 x bfloat> [[TMP12]]
791+
//
792+
// CPP-CHECK-LABEL: @_Z27test_svclamp_single_bf16_x414svbfloat16x4_tu14__SVBfloat16_tS0_(
793+
// CPP-CHECK-NEXT: entry:
794+
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1:%.*]], i64 0)
795+
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 8)
796+
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 16)
797+
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 24)
798+
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x4.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
799+
// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 0
800+
// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP5]], i64 0)
801+
// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 1
802+
// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 8)
803+
// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 2
804+
// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP8]], <vscale x 8 x bfloat> [[TMP9]], i64 16)
805+
// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 3
806+
// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP10]], <vscale x 8 x bfloat> [[TMP11]], i64 24)
807+
// CPP-CHECK-NEXT: ret <vscale x 32 x bfloat> [[TMP12]]
808+
//
809+
svbfloat16x4_t test_svclamp_single_bf16_x4(svbfloat16x4_t op1, svbfloat16_t op2, svbfloat16_t op3) __arm_streaming {
810+
return SVE_ACLE_FUNC(svclamp, _single_bf16_x4, , )(op1, op2, op3);
811+
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
// REQUIRES: aarch64-registered-target
2+
3+
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -fsyntax-only -verify -verify-ignore-unexpected=error,note -emit-llvm -o - %s
4+
5+
#include <arm_sme.h>
6+
7+
void test_b16b16(svbool_t pg, uint64_t u64, int64_t i64, const bfloat16_t *const_bf16_ptr, bfloat16_t *bf16_ptr, svbfloat16_t bf16, svbfloat16x2_t bf16x2, svbfloat16x3_t bf16x3, svbfloat16x4_t bf16x4) __arm_streaming
8+
{
9+
// expected-error@+1 {{'svclamp_single_bf16_x2' needs target feature sme2,b16b16}}
10+
svclamp_single_bf16_x2(bf16x2, bf16, bf16);
11+
// expected-error@+1 {{'svclamp_single_bf16_x4' needs target feature sme2,b16b16}}
12+
svclamp_single_bf16_x4(bf16x4, bf16, bf16);
13+
}

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3472,10 +3472,12 @@ let TargetPrefix = "aarch64" in {
34723472
def int_aarch64_sve_sclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
34733473
def int_aarch64_sve_uclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
34743474
def int_aarch64_sve_fclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
3475+
def int_aarch64_sve_bfclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
34753476

34763477
def int_aarch64_sve_sclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
34773478
def int_aarch64_sve_uclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
34783479
def int_aarch64_sve_fclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
3480+
def int_aarch64_sve_bfclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
34793481

34803482
//
34813483
// Multi-vector add/sub and accumulate into ZA

llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5738,6 +5738,9 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
57385738
AArch64::FCLAMP_VG2_2Z2Z_D}))
57395739
SelectClamp(Node, 2, Op);
57405740
return;
5741+
case Intrinsic::aarch64_sve_bfclamp_single_x2:
5742+
SelectClamp(Node, 2, AArch64::BFCLAMP_VG2_2ZZZ_H);
5743+
return;
57415744
case Intrinsic::aarch64_sve_sclamp_single_x4:
57425745
if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
57435746
Node->getValueType(0),
@@ -5759,6 +5762,9 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
57595762
AArch64::FCLAMP_VG4_4Z4Z_D}))
57605763
SelectClamp(Node, 4, Op);
57615764
return;
5765+
case Intrinsic::aarch64_sve_bfclamp_single_x4:
5766+
SelectClamp(Node, 4, AArch64::BFCLAMP_VG4_4ZZZ_H);
5767+
return;
57625768
case Intrinsic::aarch64_sve_add_single_x2:
57635769
if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
57645770
Node->getValueType(0),

llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s
2+
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+sme2 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s
33

44
define <vscale x 8 x bfloat> @bfclamp(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
55
; CHECK-LABEL: bfclamp:
@@ -11,3 +11,27 @@ define <vscale x 8 x bfloat> @bfclamp(<vscale x 8 x bfloat> %a, <vscale x 8 x bf
1111
}
1212

1313
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fclamp.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
14+
15+
define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_bfclamp_single_x2_f16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x bfloat> %d){
16+
; CHECK-LABEL: test_bfclamp_single_x2_f16:
17+
; CHECK: // %bb.0:
18+
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
19+
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
20+
; CHECK-NEXT: bfclamp { z0.h, z1.h }, z2.h, z3.h
21+
; CHECK-NEXT: ret
22+
%res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x2.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x bfloat> %d)
23+
ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
24+
}
25+
26+
define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_bfclamp_single_x4_f16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x bfloat> %d, <vscale x 8 x bfloat> %e, <vscale x 8 x bfloat> %f){
27+
; CHECK-LABEL: test_bfclamp_single_x4_f16:
28+
; CHECK: // %bb.0:
29+
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
30+
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
31+
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
32+
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
33+
; CHECK-NEXT: bfclamp { z0.h - z3.h }, z4.h, z5.h
34+
; CHECK-NEXT: ret
35+
%res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x4.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x bfloat> %d, <vscale x 8 x bfloat> %e, <vscale x 8 x bfloat> %f)
36+
ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
37+
}

0 commit comments

Comments
 (0)