[AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset #70474
@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-clang

Author: Momchil Velikov (momchil-velikov)

Changes

ACLE spec: ARM-software/acle#257

Co-authored-by: Caroline Concatto <[email protected]>

Patch is 579.40 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/70474.diff

18 Files Affected:
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index b5baafedd139602..ff9a4062a35912c 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -298,6 +298,29 @@ let TargetGuard = "sve,bf16" in {
def SVBFMLALT_LANE : SInst<"svbfmlalt_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfmlalt_lane_v2", [IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
}
+let TargetGuard = "sve2p1" in {
+ // Contiguous zero-extending load to quadword (single vector).
+ def SVLD1UWQ : MInst<"svld1uwq[_{d}]", "dPc", "iUif", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1uwq">;
+ def SVLD1UWQ_VNUM : MInst<"svld1uwq_vnum[_{d}]", "dPcl", "iUif", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1uwq">;
+
+ def SVLD1UDQ : MInst<"svld1udq[_{d}]", "dPc", "lUld", [IsLoad], MemEltTyInt64, "aarch64_sve_ld1udq">;
+ def SVLD1UDQ_VNUM : MInst<"svld1udq_vnum[_{d}]", "dPcl", "lUld", [IsLoad], MemEltTyInt64, "aarch64_sve_ld1udq">;
+
+ // Load one vector (vector base + scalar offset)
+ def SVLD1Q_GATHER_U64BASE_OFFSET : MInst<"svld1q_gather[_{2}base]_offset_{d}", "dPgl", "cUcsUsiUilUlfhdb", [IsGatherLoad, IsByteIndexed], MemEltTyDefault, "aarch64_sve_ld1q_gather_scalar_offset">;
+ def SVLD1Q_GATHER_U64BASE : MInst<"svld1q_gather[_{2}base]_{d}", "dPg", "cUcsUsiUilUlfhdb", [IsGatherLoad, IsByteIndexed], MemEltTyDefault, "aarch64_sve_ld1q_gather_scalar_offset">;
+
+ // Load N-element structure into N vectors (scalar base)
+ defm SVLD2Q : StructLoad<"svld2q[_{2}]", "2Pc", "aarch64_sve_ld2q_sret">;
+ defm SVLD3Q : StructLoad<"svld3q[_{2}]", "3Pc", "aarch64_sve_ld3q_sret">;
+ defm SVLD4Q : StructLoad<"svld4q[_{2}]", "4Pc", "aarch64_sve_ld4q_sret">;
+
+ // Load N-element structure into N vectors (scalar base, VL displacement)
+ defm SVLD2Q_VNUM : StructLoad<"svld2q_vnum[_{2}]", "2Pcl", "aarch64_sve_ld2q_sret">;
+ defm SVLD3Q_VNUM : StructLoad<"svld3q_vnum[_{2}]", "3Pcl", "aarch64_sve_ld3q_sret">;
+ defm SVLD4Q_VNUM : StructLoad<"svld4q_vnum[_{2}]", "4Pcl", "aarch64_sve_ld4q_sret">;
+}
+
////////////////////////////////////////////////////////////////////////////////
// Stores
@@ -420,6 +443,29 @@ let TargetGuard = "sve,bf16" in {
def SVSTNT1_VNUM_BF : MInst<"svstnt1_vnum[_{d}]", "vPpld", "b", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">;
}
+let TargetGuard = "sve2p1" in {
+ // Contiguous truncating store from quadword (single vector).
+ def SVST1UWQ : MInst<"svst1uwq[_{d}]", "vPcd", "iUif", [IsStore], MemEltTyInt32, "aarch64_sve_st1uwq">;
+ def SVST1UWQ_VNUM : MInst<"svst1uwq_vnum[_{d}]", "vPcld", "iUif", [IsStore], MemEltTyInt32, "aarch64_sve_st1uwq">;
+
+ def SVST1UDQ : MInst<"svst1udq[_{d}]", "vPcd", "lUld", [IsStore], MemEltTyInt64, "aarch64_sve_st1udq">;
+ def SVST1UDQ_VNUM : MInst<"svst1udq_vnum[_{d}]", "vPcld", "lUld", [IsStore], MemEltTyInt64, "aarch64_sve_st1udq">;
+
+ // Store one vector (vector base + scalar offset)
+ def SVST1Q_SCATTER_U64BASE_OFFSET : MInst<"svst1q_scatter[_{2}base]_offset[_{d}]", "vPgld", "cUcsUsiUilUlfhdb", [IsScatterStore, IsByteIndexed], MemEltTyDefault, "aarch64_sve_st1q_scatter_scalar_offset">;
+ def SVST1Q_SCATTER_U64BASE : MInst<"svst1q_scatter[_{2}base][_{d}]", "vPgd", "cUcsUsiUilUlfhdb", [IsScatterStore, IsByteIndexed], MemEltTyDefault, "aarch64_sve_st1q_scatter_scalar_offset">;
+
+ // Store N vectors into N-element structure (scalar base)
+ defm SVST2Q : StructStore<"svst2q[_{d}]", "vPc2", "aarch64_sve_st2q">;
+ defm SVST3Q : StructStore<"svst3q[_{d}]", "vPc3", "aarch64_sve_st3q">;
+ defm SVST4Q : StructStore<"svst4q[_{d}]", "vPc4", "aarch64_sve_st4q">;
+
+ // Store N vectors into N-element structure (scalar base, VL displacement)
+ defm SVST2Q_VNUM : StructStore<"svst2q_vnum[_{d}]", "vPcl2", "aarch64_sve_st2q">;
+ defm SVST3Q_VNUM : StructStore<"svst3q_vnum[_{d}]", "vPcl3", "aarch64_sve_st3q">;
+ defm SVST4Q_VNUM : StructStore<"svst4q_vnum[_{d}]", "vPcl4", "aarch64_sve_st4q">;
+}
+
////////////////////////////////////////////////////////////////////////////////
// Prefetches
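A rough sketch of how the new arm_sve.td entries above surface in user code (illustrative only, not part of the patch; it assumes an SVE2.1-enabled compiler, and the function name is invented):

```c
#include <arm_sve.h>

// Sketch only: copy one vector's worth of 32-bit data using the new
// contiguous quadword load/store. As the tests below show, codegen
// narrows the svbool_t predicate to a <vscale x 1 x i1> value, i.e.
// one governing bit per quadword.
svuint32_t copy_u32_quadwords(svbool_t pg, const uint32_t *src,
                              uint32_t *dst) {
  svuint32_t v = svld1uwq_u32(pg, src); // SVLD1UWQ from the table above
  svst1uwq_u32(pg, dst, v);             // SVST1UWQ from the table above
  return v;
}
```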
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index dce5ee5888c458e..60bc84cb3602de9 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -9529,14 +9529,17 @@ Value *CodeGenFunction::EmitSVEStructLoad(const SVETypeFlags &TypeFlags,
case Intrinsic::aarch64_sve_ld2_sret:
case Intrinsic::aarch64_sve_ld1_pn_x2:
case Intrinsic::aarch64_sve_ldnt1_pn_x2:
+ case Intrinsic::aarch64_sve_ld2q_sret:
N = 2;
break;
case Intrinsic::aarch64_sve_ld3_sret:
+ case Intrinsic::aarch64_sve_ld3q_sret:
N = 3;
break;
case Intrinsic::aarch64_sve_ld4_sret:
case Intrinsic::aarch64_sve_ld1_pn_x4:
case Intrinsic::aarch64_sve_ldnt1_pn_x4:
+ case Intrinsic::aarch64_sve_ld4q_sret:
N = 4;
break;
default:
@@ -9574,14 +9577,17 @@ Value *CodeGenFunction::EmitSVEStructStore(const SVETypeFlags &TypeFlags,
case Intrinsic::aarch64_sve_st2:
case Intrinsic::aarch64_sve_st1_pn_x2:
case Intrinsic::aarch64_sve_stnt1_pn_x2:
+ case Intrinsic::aarch64_sve_st2q:
N = 2;
break;
case Intrinsic::aarch64_sve_st3:
+ case Intrinsic::aarch64_sve_st3q:
N = 3;
break;
case Intrinsic::aarch64_sve_st4:
case Intrinsic::aarch64_sve_st1_pn_x4:
case Intrinsic::aarch64_sve_stnt1_pn_x4:
+ case Intrinsic::aarch64_sve_st4q:
N = 4;
break;
default:
@@ -9662,7 +9668,7 @@ Value *CodeGenFunction::EmitSVEPrefetchLoad(const SVETypeFlags &TypeFlags,
Value *CodeGenFunction::EmitSVEMaskedLoad(const CallExpr *E,
llvm::Type *ReturnTy,
SmallVectorImpl<Value *> &Ops,
- unsigned BuiltinID,
+ unsigned IntrinsicID,
bool IsZExtReturn) {
QualType LangPTy = E->getArg(1)->getType();
llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
@@ -9671,28 +9677,47 @@ Value *CodeGenFunction::EmitSVEMaskedLoad(const CallExpr *E,
// The vector type that is returned may be different from the
// eventual type loaded from memory.
auto VectorTy = cast<llvm::ScalableVectorType>(ReturnTy);
- auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
+ llvm::ScalableVectorType *MemoryTy = nullptr;
+ llvm::ScalableVectorType *PredTy = nullptr;
+ bool IsExtendingLoad = true;
+ switch (IntrinsicID) {
+ case Intrinsic::aarch64_sve_ld1uwq:
+ case Intrinsic::aarch64_sve_ld1udq:
+ MemoryTy = llvm::ScalableVectorType::get(MemEltTy, 1);
+ PredTy =
+ llvm::ScalableVectorType::get(IntegerType::get(getLLVMContext(), 1), 1);
+ IsExtendingLoad = false;
+ break;
+ default:
+ MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
+ PredTy = MemoryTy;
+ break;
+ }
- Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy);
+ Value *Predicate = EmitSVEPredicateCast(Ops[0], PredTy);
Value *BasePtr = Ops[1];
// Does the load have an offset?
if (Ops.size() > 2)
BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]);
- Function *F = CGM.getIntrinsic(BuiltinID, MemoryTy);
+ Function *F =
+ CGM.getIntrinsic(IntrinsicID, IsExtendingLoad ? MemoryTy : VectorTy);
auto *Load =
cast<llvm::Instruction>(Builder.CreateCall(F, {Predicate, BasePtr}));
auto TBAAInfo = CGM.getTBAAAccessInfo(LangPTy->getPointeeType());
CGM.DecorateInstructionWithTBAA(Load, TBAAInfo);
+ if (!IsExtendingLoad)
+ return Load;
+
return IsZExtReturn ? Builder.CreateZExt(Load, VectorTy)
- : Builder.CreateSExt(Load, VectorTy);
+ : Builder.CreateSExt(Load, VectorTy);
}
Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E,
SmallVectorImpl<Value *> &Ops,
- unsigned BuiltinID) {
+ unsigned IntrinsicID) {
QualType LangPTy = E->getArg(1)->getType();
llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
LangPTy->castAs<PointerType>()->getPointeeType());
@@ -9702,17 +9727,34 @@ Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E,
auto VectorTy = cast<llvm::ScalableVectorType>(Ops.back()->getType());
auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
- Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy);
+ auto PredTy = MemoryTy;
+ auto AddrMemoryTy = MemoryTy;
+ bool IsTruncatingStore = true;
+ switch (IntrinsicID) {
+ case Intrinsic::aarch64_sve_st1uwq:
+ case Intrinsic::aarch64_sve_st1udq:
+ AddrMemoryTy = llvm::ScalableVectorType::get(MemEltTy, 1);
+ PredTy =
+ llvm::ScalableVectorType::get(IntegerType::get(getLLVMContext(), 1), 1);
+ IsTruncatingStore = false;
+ break;
+ default:
+ break;
+ }
+ Value *Predicate = EmitSVEPredicateCast(Ops[0], PredTy);
Value *BasePtr = Ops[1];
// Does the store have an offset?
if (Ops.size() == 4)
- BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]);
+ BasePtr = Builder.CreateGEP(AddrMemoryTy, BasePtr, Ops[2]);
// Last value is always the data
- llvm::Value *Val = Builder.CreateTrunc(Ops.back(), MemoryTy);
+ Value *Val = IsTruncatingStore ? Builder.CreateTrunc(Ops.back(), MemoryTy)
+ : Ops.back();
- Function *F = CGM.getIntrinsic(BuiltinID, MemoryTy);
+ Function *F =
+ CGM.getIntrinsic(IntrinsicID, IsTruncatingStore ? MemoryTy : VectorTy);
auto *Store =
cast<llvm::Instruction>(Builder.CreateCall(F, {Val, Predicate, BasePtr}));
auto TBAAInfo = CGM.getTBAAAccessInfo(LangPTy->getPointeeType());
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1_single.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1_single.c
new file mode 100644
index 000000000000000..16361ecc987d3c5
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1_single.c
@@ -0,0 +1,255 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
+// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
+// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
+// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
+// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 \
+// RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+// LD1W
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i32> @test_svld1uwq_u32
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i32> @_Z17test_svld1uwq_u32u10__SVBool_tPKj
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+//
+svuint32_t test_svld1uwq_u32(svbool_t pred, uint32_t const * base) {
+ return SVE_ACLE_FUNC(svld1uwq, _u32, , )(pred, base);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i32> @test_svld1uwq_vnum_u32
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x i32>, ptr [[BASE]], i64 -8
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i32> @_Z22test_svld1uwq_vnum_u32u10__SVBool_tPKj
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x i32>, ptr [[BASE]], i64 -8
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
+//
+svuint32_t test_svld1uwq_vnum_u32(svbool_t pred, uint32_t const * base) {
+ return SVE_ACLE_FUNC(svld1uwq_vnum, _u32, , )(pred, base, -8);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i32> @test_svld1uwq_s32
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i32> @_Z17test_svld1uwq_s32u10__SVBool_tPKi
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+//
+svint32_t test_svld1uwq_s32(svbool_t pred, int32_t const * base) {
+ return SVE_ACLE_FUNC(svld1uwq, _s32, , )(pred, base);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i32> @test_svld1uwq_vnum_s32
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x i32>, ptr [[BASE]], i64 7
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i32> @_Z22test_svld1uwq_vnum_s32u10__SVBool_tPKi
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x i32>, ptr [[BASE]], i64 7
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
+//
+svint32_t test_svld1uwq_vnum_s32(svbool_t pred, int32_t const * base) {
+ return SVE_ACLE_FUNC(svld1uwq_vnum, _s32, , )(pred, base, 7);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svld1uwq_f32
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1uwq.nxv4f32(<vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z17test_svld1uwq_f32u10__SVBool_tPKf
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1uwq.nxv4f32(<vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
+//
+svfloat32_t test_svld1uwq_f32(svbool_t pred, float32_t const * base) {
+ return SVE_ACLE_FUNC(svld1uwq, _f32, , )(pred, base);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svld1uwq_vnum_f32
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x float>, ptr [[BASE]], i64 -8
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1uwq.nxv4f32(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP2]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z22test_svld1uwq_vnum_f32u10__SVBool_tPKf
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 1 x float>, ptr [[BASE]], i64 -8
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1uwq.nxv4f32(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP2]]
+//
+svfloat32_t test_svld1uwq_vnum_f32(svbool_t pred, float32_t const * base) {
+ return SVE_ACLE_FUNC(svld1uwq_vnum, _f32, , )(pred, base, -8);
+}
+
+
+// LD1D
+
+// CHECK-LABEL: define dso_local <vscale x 2 x i64> @test_svld1udq_u64
+// CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 2 x i64> @_Z17test_svld1udq_u64u10__SVBool_tPKm
+// CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[BASE]])
+//...
[truncated]
Thanks for this! I've not done an exhaustive review, but I'll leave the comments I have so far.
Force-pushed from 1c47d21 to a3b7e13.
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed from a3b7e13 to 134bccf.
Wow, this is a huge patch. :) It took me a few hours to work through all the tests, and it's quite possible I've missed something. However, overall it looks fine and I can't see any major issues. I think there is one missing test, but once that's fixed I'll approve it!
- Rename a couple of inaccurate variable names
- Add memory access flags to some intrinsic types
The `LD1Q` and `ST1Q` instructions use a single predicate bit for each pair of 64-bit offsets. In other words, the number of bits changes with `vscale` and is completely unrelated to the interpretation of the loaded data by the ACLE intrinsics or the user program. Therefore the most semantically correct type to use for the predicate parameter for LLVM IR intrinsics which end up as `LD1Q` or `ST1Q` is `<vscale x 1 x i1>`.
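To make that concrete, here is a small sketch (not from the patch; the function name is invented). At the C level the predicate remains an `svbool_t`; the narrowing to `<vscale x 1 x i1>` happens during codegen, as the autogenerated checks in the tests above illustrate:

```c
#include <arm_sve.h>

// Illustrative only. For this call, codegen emits a predicate
// conversion along the lines of (paraphrasing the test checks):
//   %0 = tail call <vscale x 1 x i1>
//        @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> %pg)
svuint64_t gather_quadwords(svbool_t pg, svuint64_t bases) {
  return svld1q_gather_u64base_u64(pg, bases); // lowers to LD1Q
}
```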
Force-pushed from 134bccf to 3154066.
LGTM. I think I would have preferred the patch to be split up into 3 - one for contiguous extending loads/truncating stores, one for structured loads/stores, and one for the gathers. That's why it took me so long to review this patch as I was constantly trying to keep all the information about each builtin/instruction in my head whilst reviewing the tests for correctness!
This patch adds a set of SVE2.1 quadword load/store intrinsics:

Contiguous zero-extending load to quadword (single vector)
sv<t>_t svld1uwq[_<t>](svbool_t, const <t>_t *ptr);
sv<t>_t svld1uwq_vnum[_<t>](svbool_t, const <t>_t *ptr, int64_t vnum);
sv<t>_t svld1udq[_<t>](svbool_t, const <t>_t *ptr);
sv<t>_t svld1udq_vnum[_<t>](svbool_t, const <t>_t *ptr, int64_t vnum);

Contiguous truncating store of single vector operand
void svst1uwq[_<t>](svbool_t, const <t>_t *ptr, sv<t>_t data);
void svst1uwq_vnum[_<t>](svbool_t, const <t>_t *ptr, int64_t vnum, sv<t>_t data);
void svst1udq[_<t>](svbool_t, const <t>_t *ptr, sv<t>_t data);
void svst1udq_vnum[_<t>](svbool_t, const <t>_t *ptr, int64_t vnum, sv<t>_t data);

Gather load quadword
sv<t>_t svld1q_gather[_u64base]_<t>(svbool_t pg, svuint64_t zn);
sv<t>_t svld1q_gather[_u64base]_offset_<t>(svbool_t pg, svuint64_t zn, int64_t offset);

Scatter store quadword
void svst1q_scatter[_u64base][_<t>](svbool_t pg, svuint64_t zn, sv<t>_t data);
void svst1q_scatter[_u64base]_offset[_<t>](svbool_t pg, svuint64_t zn, int64_t offset, sv<t>_t data);

Contiguous load two, three or four quadword structures.
sv<t>x2_t svld2q[_<t>](svbool_t pg, const <t>_t *rn);
sv<t>x2_t svld2q_vnum[_<t>](svbool_t pg, const <t>_t *rn, uint64_t vnum);
sv<t>x3_t svld3q[_<t>](svbool_t pg, const <t>_t *rn);
sv<t>x3_t svld3q_vnum[_<t>](svbool_t pg, const <t>_t *rn, uint64_t vnum);
sv<t>x4_t svld4q[_<t>](svbool_t pg, const <t>_t *rn);
sv<t>x4_t svld4q_vnum[_<t>](svbool_t pg, const <t>_t *rn, uint64_t vnum);

Contiguous store two, three or four quadword structures.
void svst2q[_<t>](svbool_t pg, <t>_t *rn, sv<t>x2_t zt);
void svst2q_vnum[_<t>](svbool_t pg, <t>_t *rn, int64_t vnum, sv<t>x2_t zt);
void svst3q[_<t>](svbool_t pg, <t>_t *rn, sv<t>x3_t zt);
void svst3q_vnum[_<t>](svbool_t pg, <t>_t *rn, int64_t vnum, sv<t>x3_t zt);
void svst4q[_<t>](svbool_t pg, <t>_t *rn, sv<t>x4_t zt);
void svst4q_vnum[_<t>](svbool_t pg, <t>_t *rn, int64_t vnum, sv<t>x4_t zt);
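A minimal sketch of the structured forms (illustrative only; it assumes an SVE2.1-enabled toolchain, and the function name is invented):

```c
#include <arm_sve.h>

// Sketch: round-trip two vectors through the new structured
// quadword load/store pair.
void roundtrip_x2(svbool_t pg, const int64_t *src, int64_t *dst) {
  svint64x2_t v = svld2q_s64(pg, src); // svld2q[_s64]
  svst2q_s64(pg, dst, v);              // svst2q[_s64]
}
```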
ACLE spec: ARM-software/acle#257
Co-authored-by: Caroline Concatto [email protected]
Co-authored-by: Hassnaa Hamdi [email protected]