[Clang][SME2] Add builtins for moving multi-vectors to/from ZA #71191

kmclaughlin-arm · 2023-11-03T15:32:28Z

Adds the following SME2 builtins:

svread_hor/ver,
svwrite_hor/ver,
svread_za64,
svwrite_za64

See ARM-software/acle#217

llvmbot · 2023-11-03T15:32:45Z

@llvm/pr-subscribers-backend-aarch64
@llvm/pr-subscribers-clang-codegen

@llvm/pr-subscribers-clang

Author: Kerry McLaughlin (kmclaughlin-arm)

Changes

Adds the following SME2 builtins:

svread_hor/ver,
svwrite_hor/ver,
svread_za64,
svwrite_za64

See ARM-software/acle#217

Patch is 230.58 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/71191.diff

5 Files Affected:

(modified) clang/include/clang/Basic/arm_sme.td (+46)
(modified) clang/include/clang/Basic/arm_sve_sme_incl.td (+1-1)
(added) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c (+1422)
(added) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c (+1097)
(added) clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp (+57)

diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 8d85327a86b1aaf..d81859ba39dfd7c 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -263,3 +263,49 @@ multiclass ZAFPOuterProd<string n_suffix> {
 
 defm SVMOPA : ZAFPOuterProd<"mopa">;
 defm SVMOPS : ZAFPOuterProd<"mops">;
+
+////////////////////////////////////////////////////////////////////////////////
+// SME2 - MOVA
+
+//
+// 2 and 4 vector-group read/write intrinsics.
+//
+
+multiclass WriteHV_VG<string n, string t, string i, list<ImmCheck> checks> {
+  let TargetGuard = "sme2" in {
+    def NAME # _VG2_H : Inst<"svwrite_hor_" # n # "_vg2", "vim2", t, MergeNone, i # "_hor_vg2", [IsSharedZA, IsStreaming], checks>;
+    def NAME # _VG2_V : Inst<"svwrite_ver_" # n # "_vg2", "vim2", t, MergeNone, i # "_ver_vg2", [IsSharedZA, IsStreaming], checks>;
+    def NAME # _VG4_H : Inst<"svwrite_hor_" # n # "_vg4", "vim4", t, MergeNone, i # "_hor_vg4", [IsSharedZA, IsStreaming], checks>;
+    def NAME # _VG4_V : Inst<"svwrite_ver_" # n # "_vg4", "vim4", t, MergeNone, i # "_ver_vg4", [IsSharedZA, IsStreaming], checks>;
+  }
+}
+
+defm SVWRITE_ZA8  : WriteHV_VG<"za8[_{d}]",  "cUc",   "aarch64_sme_write", [ImmCheck<0, ImmCheck0_0>]>;
+defm SVWRITE_ZA16 : WriteHV_VG<"za16[_{d}]", "sUshb", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_1>]>;
+defm SVWRITE_ZA32 : WriteHV_VG<"za32[_{d}]", "iUif",  "aarch64_sme_write", [ImmCheck<0, ImmCheck0_3>]>;
+defm SVWRITE_ZA64 : WriteHV_VG<"za64[_{d}]", "lUld",  "aarch64_sme_write", [ImmCheck<0, ImmCheck0_7>]>;
+
+multiclass ReadHV_VG<string n, string t, string i, list<ImmCheck> checks> {
+  let TargetGuard = "sme2" in {
+    def NAME # _VG2_H : Inst<"svread_hor_" # n # "_vg2", "2im", t, MergeNone, i # "_hor_vg2", [IsSharedZA, IsPreservesZA, IsStreaming], checks>;
+    def NAME # _VG2_V : Inst<"svread_ver_" # n # "_vg2", "2im", t, MergeNone, i # "_ver_vg2", [IsSharedZA, IsPreservesZA, IsStreaming], checks>;
+    def NAME # _VG4_H : Inst<"svread_hor_" # n # "_vg4", "4im", t, MergeNone, i # "_hor_vg4", [IsSharedZA, IsPreservesZA, IsStreaming], checks>;
+    def NAME # _VG4_V : Inst<"svread_ver_" # n # "_vg4", "4im", t, MergeNone, i # "_ver_vg4", [IsSharedZA, IsPreservesZA, IsStreaming], checks>;
+  }
+}
+
+defm SVREAD_ZA8  : ReadHV_VG<"za8_{d}",  "cUc",   "aarch64_sme_read", [ImmCheck<0, ImmCheck0_0>]>;
+defm SVREAD_ZA16 : ReadHV_VG<"za16_{d}", "sUshb", "aarch64_sme_read", [ImmCheck<0, ImmCheck0_1>]>;
+defm SVREAD_ZA32 : ReadHV_VG<"za32_{d}", "iUif",  "aarch64_sme_read", [ImmCheck<0, ImmCheck0_3>]>;
+defm SVREAD_ZA64 : ReadHV_VG<"za64_{d}", "lUld",  "aarch64_sme_read", [ImmCheck<0, ImmCheck0_7>]>;
+
+//
+// Single vector-group read/write intrinsics.
+//
+
+let TargetGuard = "sme2" in {
+  def SVWRITE_ZA64_VG1x2 : Inst<"svwrite_za64[_{d}]_vg1x2", "vm2", "lUld", MergeNone, "aarch64_sme_write_vg1x2", [IsSharedZA, IsStreaming], []>;
+  def SVWRITE_ZA64_VG1x4 : Inst<"svwrite_za64[_{d}]_vg1x4", "vm4", "lUld", MergeNone, "aarch64_sme_write_vg1x4", [IsSharedZA, IsStreaming], []>;
+  def SVREAD_ZA64_VG1x2  : Inst<"svread_za64_{d}_vg1x2",    "2m",  "lUld", MergeNone, "aarch64_sme_read_vg1x2",  [IsSharedZA, IsPreservesZA, IsStreaming], []>;
+  def SVREAD_ZA64_VG1x4  : Inst<"svread_za64_{d}_vg1x4",    "4m",  "lUld", MergeNone, "aarch64_sme_read_vg1x4",  [IsSharedZA, IsPreservesZA, IsStreaming], []>;
+}
diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td
index 3a7a5b51b25801e..22a2a3c5434d657 100644
--- a/clang/include/clang/Basic/arm_sve_sme_incl.td
+++ b/clang/include/clang/Basic/arm_sve_sme_incl.td
@@ -257,7 +257,7 @@ class ImmCheck<int arg, ImmCheckType kind, int eltSizeArg = -1> {
 }
 
 class Inst<string n, string p, string t, MergeType mt, string i,
-           list<FlagType> ft, list<ImmCheck> ch, MemEltType met> {
+           list<FlagType> ft, list<ImmCheck> ch, MemEltType met = MemEltTyDefault> {
   string Name = n;
   string Prototype = p;
   string Types = t;
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c
new file mode 100644
index 000000000000000..706403e5180ad4b
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c
@@ -0,0 +1,1422 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+// CHECK-LABEL: @test_svread_ver_za8_u8_vg2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 [[BASE:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z26test_svread_ver_za8_u8_vg2j(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 [[BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
+//
+svuint8x2_t test_svread_ver_za8_u8_vg2(uint32_t base) {
+  return svread_ver_za8_u8_vg2(0, base);
+}
+
+// CHECK-LABEL: @test_svread_ver_za8_s8_vg2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 [[BASE:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z26test_svread_ver_za8_s8_vg2j(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 [[BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
+//
+svint8x2_t test_svread_ver_za8_s8_vg2(uint32_t base) {
+  return svread_ver_za8_s8_vg2(0, base);
+}
+
+// CHECK-LABEL: @test_svread_hor_za8_u8_vg2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 [[BASE:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z26test_svread_hor_za8_u8_vg2j(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 [[BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
+//
+svuint8x2_t test_svread_hor_za8_u8_vg2(uint32_t base) {
+  return svread_hor_za8_u8_vg2(0, base);
+}
+
+// CHECK-LABEL: @test_svread_hor_za8_s8_vg2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 [[BASE:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z26test_svread_hor_za8_s8_vg2j(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 [[BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP4]]
+//
+svint8x2_t test_svread_hor_za8_s8_vg2(uint32_t base) {
+  return svread_hor_za8_s8_vg2(0, base);
+}
+
+// CHECK-LABEL: @test_svread_hor_za8_u8_vg4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 [[BASE:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
+// CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z26test_svread_hor_za8_u8_vg4j(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 [[BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
+// CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP8]]
+//
+svuint8x4_t test_svread_hor_za8_u8_vg4(uint32_t base) {
+  return svread_hor_za8_u8_vg4(0, base);
+}
+
+// CHECK-LABEL: @test_svread_hor_za8_s8_vg4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 [[BASE:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
+// CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP8]]
+//
+// CPP-CHECK-LABEL: @_Z26test_svread_hor_za8_s8_vg4j(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 [[BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
+// CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP8]]
+//
+svint8x4_t test_svread_hor_za8_s8_vg4(uint32_t base) {
+  return svread_hor_za8_s8_vg4(0, base);
+}
+
+// CHECK-LABEL: @test_svread_ver_za8_u8_vg4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 [[BASE:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vs...
[truncated]

SamTebbs33

LGTM

MDevereau · 2023-11-15T16:47:23Z

clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp

@@ -0,0 +1,57 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \
+// RUN:    -target-feature +sve2 -target-feature +sme2 -target-feature +sve -fsyntax-only -verify %s


Can -target-feature +sve2 be deleted?

MDevereau · 2023-11-15T16:49:35Z

clang/include/clang/Basic/arm_sme.td

+// SME2 - MOVA
+
+//
+// 2 and 4 vector-group read/write intrinsics.


Opinion: Given you've defined write and read separately you should have separate comments for read and write

MDevereau · 2023-11-15T16:53:05Z

clang/include/clang/Basic/arm_sme.td

+let TargetGuard = "sme2" in {
+  def SVWRITE_ZA64_VG1x2 : Inst<"svwrite_za64[_{d}]_vg1x2", "vm2", "lUld", MergeNone, "aarch64_sme_write_vg1x2", [IsSharedZA, IsStreaming], []>;
+  def SVWRITE_ZA64_VG1x4 : Inst<"svwrite_za64[_{d}]_vg1x4", "vm4", "lUld", MergeNone, "aarch64_sme_write_vg1x4", [IsSharedZA, IsStreaming], []>;
+  def SVREAD_ZA64_VG1x2  : Inst<"svread_za64_{d}_vg1x2",    "2m",  "lUld", MergeNone, "aarch64_sme_read_vg1x2",  [IsSharedZA, IsPreservesZA, IsStreaming], []>;
+  def SVREAD_ZA64_VG1x4  : Inst<"svread_za64_{d}_vg1x4",    "4m",  "lUld", MergeNone, "aarch64_sme_read_vg1x4",  [IsSharedZA, IsPreservesZA, IsStreaming], []>;
+}


Nit: Since you defined write and read separately for normal the _ZA builtins it might make sense to do the same here even if its just 4 lines.

Adds the following SME2 builtins: - svread_hor/ver, - svwrite_hor/ver, - svread_za64, - svwrite_za64 See ARM-software/acle#217

(See ARM-software/acle#278)

clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c

…cle_sme2_read.c

david-arm

LGTM! I had one minor comment, but I won't hold up the patch for it.

clang/include/clang/Basic/arm_sme.td

kmclaughlin-arm added clang Clang issues not falling into any other category clang:codegen IR generation bugs: mangling, exceptions, etc. labels Nov 3, 2023

kmclaughlin-arm requested review from SamTebbs33, sdesmalen-arm, CarolineConcatto and MDevereau November 3, 2023 15:32

llvmbot added the clang:frontend Language frontend issues, e.g. anything involving "Sema" label Nov 3, 2023

kmclaughlin-arm force-pushed the sme2-builtins-mova branch from 61cea40 to 7d6c006 Compare November 13, 2023 10:45

SamTebbs33 approved these changes Nov 14, 2023

View reviewed changes

MDevereau reviewed Nov 15, 2023

View reviewed changes

kmclaughlin-arm added 3 commits December 5, 2023 10:52

[Clang][SVE2] Add builtins for moving multi-vectors to/from ZA

04b18c1

Adds the following SME2 builtins: - svread_hor/ver, - svwrite_hor/ver, - svread_za64, - svwrite_za64 See ARM-software/acle#217

- Add arm_streaming arm_shared_za & arm_preserves_za attributes to tests

c369e64

- Added overloaded types for svread & svwrite.

884f9e9

(See ARM-software/acle#278)

kmclaughlin-arm force-pushed the sme2-builtins-mova branch from 7d6c006 to 884f9e9 Compare December 5, 2023 14:14

llvmbot added the backend:AArch64 label Dec 5, 2023

MDevereau reviewed Dec 5, 2023

View reviewed changes

clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c Outdated Show resolved Hide resolved

clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c Outdated Show resolved Hide resolved

- Remove unused SVE_ACLE_FUNC & SVE_OVERLOADED_FORMS run lines from a…

dfe59ed

…cle_sme2_read.c

kmclaughlin-arm changed the title ~~[Clang][SVE2] Add builtins for moving multi-vectors to/from ZA~~ [Clang][SME2] Add builtins for moving multi-vectors to/from ZA Dec 5, 2023

david-arm approved these changes Dec 14, 2023

View reviewed changes

clang/include/clang/Basic/arm_sme.td Outdated Show resolved Hide resolved

- Moved more of the builtin name into the read & write multiclasses

b4d97a0

kmclaughlin-arm merged commit e9af57d into llvm:main Dec 19, 2023

kmclaughlin-arm deleted the sme2-builtins-mova branch January 4, 2024 11:16

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[Clang][SME2] Add builtins for moving multi-vectors to/from ZA #71191

[Clang][SME2] Add builtins for moving multi-vectors to/from ZA #71191

Uh oh!

kmclaughlin-arm commented Nov 3, 2023

Uh oh!

llvmbot commented Nov 3, 2023 •

edited

Loading

Uh oh!

SamTebbs33 left a comment

Uh oh!

MDevereau Nov 15, 2023

Uh oh!

MDevereau Nov 15, 2023

Uh oh!

MDevereau Nov 15, 2023

Uh oh!

Uh oh!

Uh oh!

david-arm left a comment

Uh oh!

Uh oh!

Uh oh!

		@@ -0,0 +1,57 @@
		// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \
		// RUN: -target-feature +sve2 -target-feature +sme2 -target-feature +sve -fsyntax-only -verify %s

[Clang][SME2] Add builtins for moving multi-vectors to/from ZA #71191

[Clang][SME2] Add builtins for moving multi-vectors to/from ZA #71191

Uh oh!

Conversation

kmclaughlin-arm commented Nov 3, 2023

Uh oh!

llvmbot commented Nov 3, 2023 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

SamTebbs33 left a comment

Choose a reason for hiding this comment

Uh oh!

MDevereau Nov 15, 2023

Choose a reason for hiding this comment

Uh oh!

MDevereau Nov 15, 2023

Choose a reason for hiding this comment

Uh oh!

MDevereau Nov 15, 2023

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

david-arm left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

llvmbot commented Nov 3, 2023 •

edited

Loading