[Clang][SME2] Use tuple result of SME builtins directly. #109423
I missed a codepath during PR108008, so SME2/SVE2p1 builtins are converting their struct return type into a large vector, which is causing unnecessary casting via memory.
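For illustration, this is the shape of an affected call, adapted from the acle_sme2_clamp.c test updated below (the wrapper name here is invented for the example; the intrinsic, tuple types, and __arm_streaming attribute are as used in the test):

#include <arm_sme.h>

// Before this patch, clang lowered the intrinsic's
// { <vscale x 16 x i8>, <vscale x 16 x i8> } result by inserting both
// elements into one wide <vscale x 32 x i8> vector, then storing and
// reloading it through a stack alloca to recover the struct return value.
// With this patch the intrinsic's struct result is returned directly.
svint8x2_t clamp_pair(svint8x2_t op1, svint8_t op2, svint8_t op3)
    __arm_streaming {
  return svclamp_single_s8_x2(op1, op2, op3);
}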
@llvm/pr-subscribers-clang

Author: Paul Walker (paulwalker-arm)

Changes

I missed a codepath during PR108008, so SME2/SVE2p1 builtins are converting their struct return type into a large vector, which is causing unnecessary casting via memory.

Patch is 2.20 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/109423.diff

30 Files Affected:
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 3d1138b7773853..85c6065b960b8b 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -9845,6 +9845,22 @@ Value *CodeGenFunction::EmitSVEPredicateCast(Value *Pred,
return C;
}
+Value *CodeGenFunction::EmitSVEPredicateTupleCast(Value *PredTuple,
+ llvm::StructType *Ty) {
+ if (PredTuple->getType() == Ty)
+ return PredTuple;
+
+ Value *Ret = llvm::PoisonValue::get(Ty);
+ for (unsigned I = 0; I < Ty->getNumElements(); ++I) {
+ Value *Pred = Builder.CreateExtractValue(PredTuple, I);
+ Pred = EmitSVEPredicateCast(
+ Pred, cast<llvm::ScalableVectorType>(Ty->getTypeAtIndex(I)));
+ Ret = Builder.CreateInsertValue(Ret, Pred, I);
+ }
+
+ return Ret;
+}
+
Value *CodeGenFunction::EmitSVEGatherLoad(const SVETypeFlags &TypeFlags,
SmallVectorImpl<Value *> &Ops,
unsigned IntID) {
@@ -10351,41 +10367,6 @@ Value *CodeGenFunction::EmitSVETupleCreate(const SVETypeFlags &TypeFlags,
return Tuple;
}
-Value *CodeGenFunction::FormSVEBuiltinResult(Value *Call) {
- // Multi-vector results should be broken up into a single (wide) result
- // vector.
- auto *StructTy = dyn_cast<StructType>(Call->getType());
- if (!StructTy)
- return Call;
-
- auto *VTy = dyn_cast<ScalableVectorType>(StructTy->getTypeAtIndex(0U));
- if (!VTy)
- return Call;
- unsigned N = StructTy->getNumElements();
-
- // We may need to emit a cast to a svbool_t
- bool IsPredTy = VTy->getElementType()->isIntegerTy(1);
- unsigned MinElts = IsPredTy ? 16 : VTy->getMinNumElements();
-
- ScalableVectorType *WideVTy =
- ScalableVectorType::get(VTy->getElementType(), MinElts * N);
- Value *Ret = llvm::PoisonValue::get(WideVTy);
- for (unsigned I = 0; I < N; ++I) {
- Value *SRet = Builder.CreateExtractValue(Call, I);
- assert(SRet->getType() == VTy && "Unexpected type for result value");
- Value *Idx = ConstantInt::get(CGM.Int64Ty, I * MinElts);
-
- if (IsPredTy)
- SRet = EmitSVEPredicateCast(
- SRet, ScalableVectorType::get(Builder.getInt1Ty(), 16));
-
- Ret = Builder.CreateInsertVector(WideVTy, Ret, SRet, Idx);
- }
- Call = Ret;
-
- return Call;
-}
-
void CodeGenFunction::GetAArch64SVEProcessedOperands(
unsigned BuiltinID, const CallExpr *E, SmallVectorImpl<Value *> &Ops,
SVETypeFlags TypeFlags) {
@@ -10516,12 +10497,16 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
getSVEOverloadTypes(TypeFlags, Ty, Ops));
Value *Call = Builder.CreateCall(F, Ops);
+ if (Call->getType() == Ty)
+ return Call;
+
// Predicate results must be converted to svbool_t.
- if (auto PredTy = dyn_cast<llvm::VectorType>(Call->getType()))
- if (PredTy->getScalarType()->isIntegerTy(1))
- Call = EmitSVEPredicateCast(Call, cast<llvm::ScalableVectorType>(Ty));
+ if (auto PredTy = dyn_cast<llvm::ScalableVectorType>(Ty))
+ return EmitSVEPredicateCast(Call, PredTy);
+ if (auto PredTupleTy = dyn_cast<llvm::StructType>(Ty))
+ return EmitSVEPredicateTupleCast(Call, PredTupleTy);
- return FormSVEBuiltinResult(Call);
+ llvm_unreachable("unsupported element count!");
}
switch (BuiltinID) {
@@ -10853,9 +10838,8 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
TypeFlags.isOverloadNone()
? CGM.getIntrinsic(Builtin->LLVMIntrinsic)
: CGM.getIntrinsic(Builtin->LLVMIntrinsic, {getSVEType(TypeFlags)});
- Value *Call = Builder.CreateCall(F, Ops);
- return FormSVEBuiltinResult(Call);
+ return Builder.CreateCall(F, Ops);
}
Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 2df17e83bae2ee..c55ee4a28a6a99 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4645,6 +4645,8 @@ class CodeGenFunction : public CodeGenTypeCache {
unsigned BuiltinID);
llvm::Value *EmitSVEPredicateCast(llvm::Value *Pred,
llvm::ScalableVectorType *VTy);
+ llvm::Value *EmitSVEPredicateTupleCast(llvm::Value *PredTuple,
+ llvm::StructType *Ty);
llvm::Value *EmitSVEGatherLoad(const SVETypeFlags &TypeFlags,
llvm::SmallVectorImpl<llvm::Value *> &Ops,
unsigned IntID);
@@ -4669,12 +4671,6 @@ class CodeGenFunction : public CodeGenTypeCache {
llvm::Value *EmitSVEStructStore(const SVETypeFlags &TypeFlags,
SmallVectorImpl<llvm::Value *> &Ops,
unsigned IntID);
- /// FormSVEBuiltinResult - Returns the struct of scalable vectors as a wider
- /// vector. It extracts the scalable vector from the struct and inserts into
- /// the wider vector. This avoids the error when allocating space in llvm
- /// for struct of scalable vectors if a function returns struct.
- llvm::Value *FormSVEBuiltinResult(llvm::Value *Call);
-
llvm::Value *EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
llvm::Value *EmitSMELd1St1(const SVETypeFlags &TypeFlags,
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c
index 9c639984305d1b..1297185c4b50e0 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c
@@ -24,27 +24,13 @@
// CHECK-LABEL: @test_svclamp_single_s8_x2(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.sclamp.single.x2.nxv16i8(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], <vscale x 16 x i8> [[OP3:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CHECK-NEXT: store <vscale x 32 x i8> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
//
// CPP-CHECK-LABEL: @_Z25test_svclamp_single_s8_x210svint8x2_tu10__SVInt8_tS0_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.sclamp.single.x2.nxv16i8(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP2:%.*]], <vscale x 16 x i8> [[OP3:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CPP-CHECK-NEXT: store <vscale x 32 x i8> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
//
svint8x2_t test_svclamp_single_s8_x2(svint8x2_t op1, svint8_t op2, svint8_t op3) __arm_streaming {
return SVE_ACLE_FUNC(svclamp, _single_s8_x2, , )(op1, op2, op3);
@@ -52,27 +38,13 @@ svint8x2_t test_svclamp_single_s8_x2(svint8x2_t op1, svint8_t op2, svint8_t op3)
// CHECK-LABEL: @test_svclamp_single_s16_x2(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 8 x i16>, <vscale x 8 x i16> }, align 16
// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.sclamp.single.x2.nxv8i16(<vscale x 8 x i16> [[OP1_COERCE0:%.*]], <vscale x 8 x i16> [[OP1_COERCE1:%.*]], <vscale x 8 x i16> [[OP2:%.*]], <vscale x 8 x i16> [[OP3:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
-// CHECK-NEXT: store <vscale x 16 x i16> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 8 x i16>, <vscale x 8 x i16> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP5]]
+// CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
//
// CPP-CHECK-LABEL: @_Z26test_svclamp_single_s16_x211svint16x2_tu11__SVInt16_tS0_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 8 x i16>, <vscale x 8 x i16> }, align 16
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.sclamp.single.x2.nxv8i16(<vscale x 8 x i16> [[OP1_COERCE0:%.*]], <vscale x 8 x i16> [[OP1_COERCE1:%.*]], <vscale x 8 x i16> [[OP2:%.*]], <vscale x 8 x i16> [[OP3:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
-// CPP-CHECK-NEXT: store <vscale x 16 x i16> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 8 x i16>, <vscale x 8 x i16> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP5]]
+// CPP-CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]]
//
svint16x2_t test_svclamp_single_s16_x2(svint16x2_t op1, svint16_t op2, svint16_t op3) __arm_streaming {
return SVE_ACLE_FUNC(svclamp, _single_s16_x2, , )(op1, op2, op3);
@@ -80,27 +52,13 @@ svint16x2_t test_svclamp_single_s16_x2(svint16x2_t op1, svint16_t op2, svint16_t
// CHECK-LABEL: @test_svclamp_single_s32_x2(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 4 x i32>, <vscale x 4 x i32> }, align 16
// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.sclamp.single.x2.nxv4i32(<vscale x 4 x i32> [[OP1_COERCE0:%.*]], <vscale x 4 x i32> [[OP1_COERCE1:%.*]], <vscale x 4 x i32> [[OP2:%.*]], <vscale x 4 x i32> [[OP3:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
-// CHECK-NEXT: store <vscale x 8 x i32> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 4 x i32>, <vscale x 4 x i32> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP5]]
+// CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]]
//
// CPP-CHECK-LABEL: @_Z26test_svclamp_single_s32_x211svint32x2_tu11__SVInt32_tS0_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 4 x i32>, <vscale x 4 x i32> }, align 16
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.sclamp.single.x2.nxv4i32(<vscale x 4 x i32> [[OP1_COERCE0:%.*]], <vscale x 4 x i32> [[OP1_COERCE1:%.*]], <vscale x 4 x i32> [[OP2:%.*]], <vscale x 4 x i32> [[OP3:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
-// CPP-CHECK-NEXT: store <vscale x 8 x i32> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 4 x i32>, <vscale x 4 x i32> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP5]]
+// CPP-CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]]
//
svint32x2_t test_svclamp_single_s32_x2(svint32x2_t op1, svint32_t op2, svint32_t op3) __arm_streaming {
return SVE_ACLE_FUNC(svclamp, _single_s32_x2, , )(op1, op2, op3);
@@ -108,27 +66,13 @@ svint32x2_t test_svclamp_single_s32_x2(svint32x2_t op1, svint32_t op2, svint32_t
// CHECK-LABEL: @test_svclamp_single_s64_x2(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 2 x i64>, <vscale x 2 x i64> }, align 16
// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.sclamp.single.x2.nxv2i64(<vscale x 2 x i64> [[OP1_COERCE0:%.*]], <vscale x 2 x i64> [[OP1_COERCE1:%.*]], <vscale x 2 x i64> [[OP2:%.*]], <vscale x 2 x i64> [[OP3:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
-// CHECK-NEXT: store <vscale x 4 x i64> [[TMP4]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 2 x i64>, <vscale x 2 x i64> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP5]]
+// CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
//
// CPP-CHECK-LABEL: @_Z26test_svclamp_single_s64_x211svint64x2_tu11__SVInt64_tS0_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 2 x i64>, <vscale x 2 x i64> }, align 16
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.sclamp.single.x2.nxv2i64(<vscale x 2 x i64> [[OP1_COERCE0:%.*]], <vscale x 2 x i64> [[OP1_COERCE1:%.*]], <vscale x 2 x i64> [[OP2:%.*]], <vscale x 2 x i64> [[OP3:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
-// CPP-CHECK-NEXT: store <vscale x 4 x i64> [[TMP4]], ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: [[TMP5:%.*]] = load { <vscale x 2 x i64>, <vscale x 2 x i64> }, ptr [[RETVAL]], align 16
-// CPP-CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP5]]
+// CPP-CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]]
//
svint64x2_t test_svclamp_single_s64_x2(svint64x2_t op1, svint64_t op2, svint64_t op3) __arm_streaming {
return SVE_ACLE_FUNC(svclamp, _single_s64_x2, , )(op1, op2, op3);
@@ -139,35 +83,13 @@ svint64x2_t test_svclamp_single_s64_x2(svint64x2_t op1, svint64_t op2, svint64_t
// CHECK-LABEL: @test_svclamp_single_s8_x4(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.sclamp.single.x4.nxv16i8(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE1:%.*]], <vscale x 16 x i8> [[OP1_COERCE2:%.*]], <vscale x 16 x i8> [[OP1_COERCE3:%.*]], <vscale x 16 x i8> [[OP2:%.*]], <vscale x 16 x i8> [[OP3:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0)
-// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
-// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2
-// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32)
-// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3
-// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
-// CHECK-NEXT: store <vscale x 64 x i8> [[TMP8]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: [[TMP9:%.*]] = load { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }, ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP9]]
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
//
// CPP-CHECK-LABEL: @_Z25test_svclamp_single_s8_x410svint8x4_tu10__SVInt8_tS0_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[RETVAL:%.*]] = alloca { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }, align 16
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.sclamp.single.x4.nxv16i8(<vscale x 16 x i8> [[OP1_COERCE0:%.*]], <vscale x 16 x i8> [[OP1_COERCE...
[truncated]
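As a concrete case for the new EmitSVEPredicateTupleCast path, consider a predicate-pair builtin (a minimal sketch, assuming SVE2p1 is available and using the non-overloaded ACLE name; the wrapper name is invented). The underlying LLVM intrinsic for the svwhilelt_b16 pair form returns { <vscale x 8 x i1>, <vscale x 8 x i1> }, while the ACLE svboolx2_t result is a pair of <vscale x 16 x i1> vectors, so each tuple element must be widened individually via EmitSVEPredicateCast:

#include <arm_sve.h>

// Each 8 x i1 element of the intrinsic's result is widened to the
// 16 x i1 svbool_t layout and reinserted into the two-element struct,
// rather than being round-tripped through a wide vector in memory.
svboolx2_t whilelt_pair(uint64_t base, uint64_t limit) {
  return svwhilelt_b16_u64_x2(base, limit);
}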
@llvm/pr-subscribers-clang-codegen

Author: Paul Walker (paulwalker-arm)

Changes

I missed a codepath during PR108008, so SME2/SVE2p1 builtins are converting their struct return type into a large vector, which is causing unnecessary casting via memory.

Patch is 2.20 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/109423.diff

30 Files Affected: [same truncated patch as the comment above]