[Clang][SVE2.1] Add builtins and intrinsics for SVBFMLSLB/T

CarolineConcatto · CarolineConcatto · commit 200a92520c25 · 2023-10-19T16:44:39.000Z
As described in: ARM-software/acle#257 Patch by: Kerry McLaughlin <kerry.mclaughlin@arm.com> Reviewed By: david-arm Differential Revision: https://reviews.llvm.org/D151461
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
@@ -1959,6 +1959,12 @@ def SVDOT_X2_F : SInst<"svdot[_{d}_{2}_{3}]", "ddhh", "f",  MergeNone, "aarch64_
 def SVDOT_LANE_X2_S : SInst<"svdot_lane[_{d}_{2}_{3}]", "ddhhi", "i",  MergeNone, "aarch64_sve_sdot_lane_x2", [], [ImmCheck<3, ImmCheck0_3>]>;
 def SVDOT_LANE_X2_U : SInst<"svdot_lane[_{d}_{2}_{3}]", "ddhhi", "Ui", MergeNone, "aarch64_sve_udot_lane_x2", [], [ImmCheck<3, ImmCheck0_3>]>;
 def SVDOT_LANE_X2_F : SInst<"svdot_lane[_{d}_{2}_{3}]", "ddhhi", "f",  MergeNone, "aarch64_sve_fdot_lane_x2", [], [ImmCheck<3, ImmCheck0_3>]>;
+
+def SVBFMLSLB : SInst<"svbfmlslb[_{d}]", "dd$$", "f", MergeNone, "aarch64_sve_bfmlslb", [IsOverloadNone], []>;
+def SVBFMLSLT : SInst<"svbfmlslt[_{d}]", "dd$$", "f", MergeNone, "aarch64_sve_bfmlslt", [IsOverloadNone], []>;
+
+def SVBFMLSLB_LANE : SInst<"svbfmlslb_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslb_lane", [IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
+def SVBFMLSLT_LANE : SInst<"svbfmlslt_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslt_lane", [IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
 }
 
 let TargetGuard = "sve2p1" in {
diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td
@@ -99,6 +99,7 @@
 // O: svfloat16_t
 // M: svfloat32_t
 // N: svfloat64_t
+// $: svbfloat16_t
 
 // J: Prefetch type (sv_prfop)
 
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmlsl.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_bfmlsl.c
@@ -0,0 +1,85 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+// BFMLSLB
+
+
+// CHECK-LABEL: @test_bfmlslb(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslb(<vscale x 4 x float> [[ZDA:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z12test_bfmlslbu13__SVFloat32_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslb(<vscale x 4 x float> [[ZDA:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+svfloat32_t test_bfmlslb(svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm)
+{
+  return SVE_ACLE_FUNC(svbfmlslb,_f32,,)(zda, zn, zm);
+}
+
+
+// CHECK-LABEL: @test_bfmlslb_lane(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslb.lane(<vscale x 4 x float> [[ZDA:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z17test_bfmlslb_laneu13__SVFloat32_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslb.lane(<vscale x 4 x float> [[ZDA:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+svfloat32_t test_bfmlslb_lane(svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm)
+{
+  return SVE_ACLE_FUNC(svbfmlslb_lane,_f32,,)(zda, zn, zm, 7);
+}
+
+// BFMLSLT
+
+
+// CHECK-LABEL: @test_bfmlslt(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslt(<vscale x 4 x float> [[ZDA:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z12test_bfmlsltu13__SVFloat32_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslt(<vscale x 4 x float> [[ZDA:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+svfloat32_t test_bfmlslt(svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm)
+{
+  return SVE_ACLE_FUNC(svbfmlslt,_f32,,)(zda, zn, zm);
+}
+
+
+// CHECK-LABEL: @test_bfmlslt_lane(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslt.lane(<vscale x 4 x float> [[ZDA:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z17test_bfmlslt_laneu13__SVFloat32_tu14__SVBFloat16_tu14__SVBFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslt.lane(<vscale x 4 x float> [[ZDA:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+svfloat32_t test_bfmlslt_lane(svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm)
+{
+  return SVE_ACLE_FUNC(svbfmlslt_lane,_f32,,)(zda, zn, zm, 7);
+}
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
@@ -852,6 +852,13 @@ void SVEType::applyModifier(char Mod) {
     NumVectors = 0;
     Signed = false;
     break;
+  case '$':
+    Predicate = false;
+    Svcount = false;
+    Float = false;
+    BFloat = true;
+    ElementBitwidth = 16;
+    break;
   case '}':
     Predicate = false;
     Signed = true;
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3011,6 +3011,16 @@ let TargetPrefix = "aarch64" in {
                             [llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
                             [IntrNoMem]>;
 
+  class SME2_BFMLS_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty],
+                            [llvm_nxv4f32_ty, llvm_nxv8bf16_ty, llvm_nxv8bf16_ty],
+                            [IntrNoMem]>;
+
+  class SME2_BFMLS_Lane_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty],
+                            [llvm_nxv4f32_ty, llvm_nxv8bf16_ty, llvm_nxv8bf16_ty, llvm_i32_ty],
+                            [IntrNoMem, ImmArg<ArgIndex<3>>]>;
+
   class SME2_ZA_ArrayVector_Read_VG2_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
                 [llvm_i32_ty],
@@ -3214,6 +3224,12 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sme_usmla_za32_lane_vg4x2 : SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic;
   def int_aarch64_sme_usmla_za32_lane_vg4x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
 
+  def int_aarch64_sve_bfmlslb : SME2_BFMLS_Intrinsic;
+  def int_aarch64_sve_bfmlslb_lane : SME2_BFMLS_Lane_Intrinsic;
+
+  def int_aarch64_sve_bfmlslt : SME2_BFMLS_Intrinsic;
+  def int_aarch64_sve_bfmlslt_lane : SME2_BFMLS_Lane_Intrinsic;
+
   // Multi-vector signed saturating doubling multiply high
 
   def int_aarch64_sve_sqdmulh_single_vgx2 : SME2_VG2_Multi_Single_Intrinsic;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3753,12 +3753,14 @@ defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel", int_aarch64_sve_psel>;
 
 let Predicates = [HasSVE2p1_or_HasSME2] in {
 defm FCLAMP_ZZZ : sve2p1_fclamp<"fclamp", int_aarch64_sve_fclamp>;
+
 defm FDOT_ZZZ_S  : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, int_aarch64_sve_fdot_x2>;
 defm FDOT_ZZZI_S : sve_float_dot_indexed<0b0, 0b00, ZPR16, ZPR3b16, "fdot", nxv8f16, int_aarch64_sve_fdot_lane_x2>;
-def BFMLSLB_ZZZ_S : sve2_fp_mla_long<0b110, "bfmlslb">;
-def BFMLSLT_ZZZ_S : sve2_fp_mla_long<0b111, "bfmlslt">;
-def BFMLSLB_ZZZI_S : sve2_fp_mla_long_by_indexed_elem<0b110, "bfmlslb">;
-def BFMLSLT_ZZZI_S : sve2_fp_mla_long_by_indexed_elem<0b111, "bfmlslt">;
+
+defm BFMLSLB_ZZZ_S : sve2_fp_mla_long<0b110, "bfmlslb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlslb>;
+defm BFMLSLT_ZZZ_S : sve2_fp_mla_long<0b111, "bfmlslt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlslt>;
+defm BFMLSLB_ZZZI_S : sve2_fp_mla_long_by_indexed_elem<0b110, "bfmlslb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlslb_lane>;
+defm BFMLSLT_ZZZI_S : sve2_fp_mla_long_by_indexed_elem<0b111, "bfmlslt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlslt_lane>;
 
 defm SDOT_ZZZ_HtoS  : sve2p1_two_way_dot_vv<"sdot", 0b0, int_aarch64_sve_sdot_x2>;
 defm UDOT_ZZZ_HtoS  : sve2p1_two_way_dot_vv<"udot", 0b1, int_aarch64_sve_udot_x2>;
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s
+
+define <vscale x 4 x float> @bfmlslb_f32(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: bfmlslb_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmlslb z0.s, z1.h, z2.h
+; CHECK-NEXT:    ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslb(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlslt_f32(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: bfmlslt_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmlslt z0.s, z1.h, z2.h
+; CHECK-NEXT:    ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslt(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlslb_lane_f32(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: bfmlslb_lane_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmlslb z0.s, z1.h, z2.h[7]
+; CHECK-NEXT:    ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslb.lane(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 7)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlslt_lane_f32(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
+; CHECK-LABEL: bfmlslt_lane_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfmlslt z0.s, z1.h, z2.h[7]
+; CHECK-NEXT:    ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlslt.lane(<vscale x 4 x float> %zda, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 7)
+  ret <vscale x 4 x float> %out
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlslb(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlslt(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlslb.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlslt.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)