-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[LV] Refactor vector function variant selection to prepare for uniform args #68879
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[LV] Refactor vector function variant selection to prepare for uniform args #68879
Conversation
Parameters marked as uniform take a scalar value, assuming the value is invariant in the scalar loop.
@llvm/pr-subscribers-llvm-transforms Author: Graham Hunter (huntergr-arm) Changes: Parameters marked as uniform take a scalar value, assuming the value is invariant in the scalar loop. Patch is 20.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/68879.diff 4 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 88f064b6d57cebc..01c1b4d41d8a8ca 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7009,39 +7009,60 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
// Find the cost of vectorizing the call, if we can find a suitable
// vector variant of the function.
- InstructionCost MaskCost = 0;
- VFShape Shape = VFShape::get(*CI, VF, MaskRequired);
- bool UsesMask = MaskRequired;
- Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
- // If we want an unmasked vector function but can't find one matching the
- // VF, maybe we can find vector function that does use a mask and
- // synthesize an all-true mask.
- if (!VecFunc && !MaskRequired) {
- Shape = VFShape::get(*CI, VF, /*HasGlobalPred=*/true);
- VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
- // If we found one, add in the cost of creating a mask
- if (VecFunc) {
- UsesMask = true;
- MaskCost = TTI.getShuffleCost(
- TargetTransformInfo::SK_Broadcast,
- VectorType::get(IntegerType::getInt1Ty(
- VecFunc->getFunctionType()->getContext()),
- VF));
- }
- }
+ bool UsesMask = false;
+ VFInfo FuncInfo;
+ Function *VecFunc = nullptr;
+ // Search through any available variants for one we can use at this VF.
+ for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
+ // Must match requested VF.
+ if (Info.Shape.VF != VF)
+ continue;
- std::optional<unsigned> MaskPos = std::nullopt;
- if (VecFunc && UsesMask) {
- for (const VFInfo &Info : VFDatabase::getMappings(*CI))
- if (Info.Shape == Shape) {
- assert(Info.isMasked() && "Vector function info shape mismatch");
- MaskPos = Info.getParamIndexForOptionalMask().value();
+ // Must take a mask argument if one is required
+ if (MaskRequired && !Info.isMasked())
+ continue;
+
+ // Check that all parameter kinds are supported
+ bool ParamsOk = true;
+ for (VFParameter Param : Info.Shape.Parameters) {
+ switch (Param.ParamKind) {
+ case VFParamKind::Vector:
+ break;
+ case VFParamKind::OMP_Uniform: {
+ Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
+ // Make sure the scalar parameter in the loop is invariant.
+ if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
+ TheLoop))
+ ParamsOk = false;
+ break;
+ }
+ case VFParamKind::GlobalPredicate:
+ UsesMask = true;
+ break;
+ default:
+ ParamsOk = false;
break;
}
+ }
+
+ if (!ParamsOk)
+ continue;
- assert(MaskPos.has_value() && "Unable to find mask parameter index");
+ // Found a suitable candidate, stop here.
+ VecFunc = CI->getModule()->getFunction(Info.VectorName);
+ FuncInfo = Info;
+ break;
}
+ // Add in the cost of synthesizing a mask if one wasn't required.
+ InstructionCost MaskCost = 0;
+ if (VecFunc && UsesMask && !MaskRequired)
+ MaskCost = TTI.getShuffleCost(
+ TargetTransformInfo::SK_Broadcast,
+ VectorType::get(IntegerType::getInt1Ty(
+ VecFunc->getFunctionType()->getContext()),
+ VF));
+
if (TLI && VecFunc && !CI->isNoBuiltin())
VectorCost =
TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
@@ -7065,7 +7086,8 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
Decision = CM_IntrinsicCall;
}
- setCallWideningDecision(CI, VF, Decision, VecFunc, IID, MaskPos, Cost);
+ setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
+ FuncInfo.getParamIndexForOptionalMask(), Cost);
}
}
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2a1213a98095907..5a640ecde2abc24 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -503,6 +503,9 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
"DbgInfoIntrinsic should have been dropped during VPlan construction");
State.setDebugLocFrom(CI.getDebugLoc());
+ FunctionType *VFTy = nullptr;
+ if (Variant)
+ VFTy = Variant->getFunctionType();
for (unsigned Part = 0; Part < State.UF; ++Part) {
SmallVector<Type *, 2> TysForDecl;
// Add return type if intrinsic is overloaded on it.
@@ -514,12 +517,15 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
for (const auto &I : enumerate(operands())) {
// Some intrinsics have a scalar argument - don't replace it with a
// vector.
+ // Some vectorized function variants may also take a scalar argument,
+ // e.g. linear parameters for pointers.
Value *Arg;
- if (VectorIntrinsicID == Intrinsic::not_intrinsic ||
- !isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
- Arg = State.get(I.value(), Part);
- else
+ if ((VFTy && !VFTy->getParamType(I.index())->isVectorTy()) ||
+ (VectorIntrinsicID != Intrinsic::not_intrinsic &&
+ isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())))
Arg = State.get(I.value(), VPIteration(0, 0));
+ else
+ Arg = State.get(I.value(), Part);
if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
TysForDecl.push_back(Arg->getType());
Args.push_back(Arg);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
new file mode 100644
index 000000000000000..da9a680943e3c6b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
@@ -0,0 +1,117 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -passes=loop-vectorize,instcombine -force-vector-interleave=1 -S | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; A call whose argument can remain a scalar for a vectorized function variant
+; with a uniform argument because it's loop invariant
+define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i64 %n) #0 {
+; CHECK-LABEL: define void @test_uniform
+; CHECK-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[UNIFORM:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 1
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x double> @foo_uniform(<vscale x 2 x double> [[WIDE_LOAD]], i64 [[UNIFORM]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT: store <vscale x 2 x double> [[TMP5]], ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[GEPSRC:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[DATA:%.*]] = load double, ptr [[GEPSRC]], align 8
+; CHECK-NEXT: [[CALL:%.*]] = call double @foo(double [[DATA]], i64 [[UNIFORM]]) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT: [[GEPDST:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store double [[CALL]], ptr [[GEPDST]], align 8
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %gepsrc = getelementptr double, ptr %src, i64 %indvars.iv
+ %data = load double, ptr %gepsrc, align 8
+ %call = call double @foo(double %data, i64 %uniform) #1
+ %gepdst = getelementptr inbounds double, ptr %dst, i64 %indvars.iv
+ store double %call, ptr %gepdst
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %n
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
+; If the parameter is not uniform, then we can't use the vector variant.
+define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64 %n) #0 {
+; CHECK-LABEL: define void @test_uniform_not_invariant
+; CHECK-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[GEPSRC:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[DATA:%.*]] = load double, ptr [[GEPSRC]], align 8
+; CHECK-NEXT: [[CALL:%.*]] = call double @foo(double [[DATA]], i64 [[INDVARS_IV]]) #[[ATTR2]]
+; CHECK-NEXT: [[GEPDST:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store double [[CALL]], ptr [[GEPDST]], align 8
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %gepsrc = getelementptr double, ptr %src, i64 %indvars.iv
+ %data = load double, ptr %gepsrc, align 8
+ %call = call double @foo(double %data, i64 %indvars.iv) #1
+ %gepdst = getelementptr inbounds double, ptr %dst, i64 %indvars.iv
+ store double %call, ptr %gepdst
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %n
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
+; Scalar functions
+declare double @foo(double, i64)
+
+; Vector variants
+declare <vscale x 2 x double> @foo_uniform(<vscale x 2 x double>, i64, <vscale x 2 x i1>)
+
+attributes #0 = { "target-features"="+sve" }
+
+; Mappings
+attributes #1 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_Mxvu_foo(foo_uniform)" }
diff --git a/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll b/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll
new file mode 100644
index 000000000000000..9c12b9c2dcfca49
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll
@@ -0,0 +1,133 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -passes=loop-vectorize,instcombine -force-vector-width=2 -force-vector-interleave=1 -S | FileCheck %s
+
+; A call whose argument can remain a scalar for a vectorized function variant
+; with a uniform argument because it's loop invariant
+define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i64 %n) {
+; CHECK-LABEL: define void @test_uniform
+; CHECK-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[UNIFORM:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -2
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @foo_uniform(<2 x double> [[WIDE_LOAD]], i64 [[UNIFORM]])
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[TMP2]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[GEPSRC:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[DATA:%.*]] = load double, ptr [[GEPSRC]], align 8
+; CHECK-NEXT: [[CALL:%.*]] = call double @foo(double [[DATA]], i64 [[UNIFORM]]) #[[ATTR0:[0-9]+]]
+; CHECK-NEXT: [[GEPDST:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store double [[CALL]], ptr [[GEPDST]], align 8
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %gepsrc = getelementptr double, ptr %src, i64 %indvars.iv
+ %data = load double, ptr %gepsrc, align 8
+ %call = call double @foo(double %data, i64 %uniform) #0
+ %gepdst = getelementptr inbounds double, ptr %dst, i64 %indvars.iv
+ store double %call, ptr %gepdst
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %n
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
+; If the parameter is not uniform, then we can't use the vector variant and
+; must fall back to scalarization.
+define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64 %n) {
+; CHECK-LABEL: define void @test_uniform_not_invariant
+; CHECK-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -2
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[WIDE_LOAD]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = call double @foo(double [[TMP2]], i64 [[INDEX]]) #[[ATTR0]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[WIDE_LOAD]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = call double @foo(double [[TMP4]], i64 [[TMP0]]) #[[ATTR0]]
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP5]], i64 1
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[TMP8]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[GEPSRC:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[DATA:%.*]] = load double, ptr [[GEPSRC]], align 8
+; CHECK-NEXT: [[CALL:%.*]] = call double @foo(double [[DATA]], i64 [[INDVARS_IV]]) #[[ATTR0]]
+; CHECK-NEXT: [[GEPDST:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store double [[CALL]], ptr [[GEPDST]], align 8
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %gepsrc = getelementptr double, ptr %src, i64 %indvars.iv
+ %data = load double, ptr %gepsrc, align 8
+ %call = call double @foo(double %data, i64 %indvars.iv) #0
+ %gepdst = getelementptr inbounds double, ptr %dst, i64 %indvars.iv
+ store double %call, ptr %gepdst
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %n
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
+; Scalar functions
+declare double @foo(double, i64)
+
+; Vector variant...
[truncated]
|
I have a followup patch to support linear arguments once this has been merged. |
|
||
; A call whose argument can remain a scalar for a vectorized function variant | ||
; with a uniform argument because it's loop invariant | ||
define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i64 %n) #0 { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi @huntergr-arm,
I think it would be good to add more tests. I am thinking of an extra test where the mask is required, for example when tail-folding is enabled.
But I am also thinking of a test where the scalar uniform type does not match the size of the vector element type, for example when the available mapping is:
@foo_uniform(<vscale x 2 x double>, i32, <vscale x 2 x i1>)
Such mappings seem to crash the compiler at the moment with an assertion failure. I think we could do a better job of handling these cases; what do you think?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#72260 adds support for mixing element types.
I'll add some masking tests.
TheLoop)) | ||
ParamsOk = false; | ||
break; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To me it looks like the piece of code handling "case VFParamKind::OMP_Uniform:" is the actual new functionality, so you could split this patch into an NFC refactoring change and a separate change with the new functionality plus tests demonstrating that it works. What do you think?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, the reason for putting it all together was the lack of dependent commits for review chains. That's now been resolved, though I don't think I can auto convert this PR to a branch on the main repo. Will raise a new PR for the uniform bit.
} | ||
|
||
// Add in the cost of synthesizing a mask if one wasn't required. | ||
InstructionCost MaskCost = 0; | ||
if (VecFunc && UsesMask && !MaskRequired) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This isn't functionality related to support for uniform parameters, and in my opinion it should go in as a separate patch with extra tests checking the increased cost.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's just refactoring of existing code, not new functionality.
I've removed the uniform argument case and the tests from this PR, so it should just be NFC refactoring in preparation now. I'll precommit the tests then raise a new PR for the uniform case. Ok? |
Thanks for extracting the NFC refactoring, it LGTM |
@@ -514,12 +517,15 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { | |||
for (const auto &I : enumerate(operands())) { | |||
// Some intrinsics have a scalar argument - don't replace it with a | |||
// vector. | |||
// Some vectorized function variants may take also take a scalar argument, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: typo in the comment; a single occurrence of "take" is enough.
…m args (llvm#68879) Parameters marked as uniform take a scalar value, assuming the value is invariant in the scalar loop. In order to support this, we need to stop asking for a vector function variant with a default shape assuming that all arguments will become vector arguments, and instead consider all available variants and their parameter types.
Parameters marked as uniform take a scalar value, assuming the value is
invariant in the scalar loop.