[LV] Refactor vector function variant selection to prepare for uniform args #68879

huntergr-arm · 2023-10-12T12:05:30Z

Parameters marked as uniform take a scalar value, assuming the value is
invariant in the scalar loop.

Parameters marked as uniform take a scalar value, assuming the value is invariant in the scalar loop.

llvmbot · 2023-10-12T12:06:36Z

@llvm/pr-subscribers-llvm-transforms

Author: Graham Hunter (huntergr-arm)

Changes

Parameters marked as uniform take a scalar value, assuming the value is
invariant in the scalar loop.

Patch is 20.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/68879.diff

4 Files Affected:

(modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+50-28)
(modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+10-4)
(added) llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll (+117)
(added) llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll (+133)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 88f064b6d57cebc..01c1b4d41d8a8ca 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7009,39 +7009,60 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
 
       // Find the cost of vectorizing the call, if we can find a suitable
       // vector variant of the function.
-      InstructionCost MaskCost = 0;
-      VFShape Shape = VFShape::get(*CI, VF, MaskRequired);
-      bool UsesMask = MaskRequired;
-      Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
-      // If we want an unmasked vector function but can't find one matching the
-      // VF, maybe we can find vector function that does use a mask and
-      // synthesize an all-true mask.
-      if (!VecFunc && !MaskRequired) {
-        Shape = VFShape::get(*CI, VF, /*HasGlobalPred=*/true);
-        VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
-        // If we found one, add in the cost of creating a mask
-        if (VecFunc) {
-          UsesMask = true;
-          MaskCost = TTI.getShuffleCost(
-              TargetTransformInfo::SK_Broadcast,
-              VectorType::get(IntegerType::getInt1Ty(
-                                  VecFunc->getFunctionType()->getContext()),
-                              VF));
-        }
-      }
+      bool UsesMask = false;
+      VFInfo FuncInfo;
+      Function *VecFunc = nullptr;
+      // Search through any available variants for one we can use at this VF.
+      for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
+        // Must match requested VF.
+        if (Info.Shape.VF != VF)
+          continue;
 
-      std::optional<unsigned> MaskPos = std::nullopt;
-      if (VecFunc && UsesMask) {
-        for (const VFInfo &Info : VFDatabase::getMappings(*CI))
-          if (Info.Shape == Shape) {
-            assert(Info.isMasked() && "Vector function info shape mismatch");
-            MaskPos = Info.getParamIndexForOptionalMask().value();
+        // Must take a mask argument if one is required
+        if (MaskRequired && !Info.isMasked())
+          continue;
+
+        // Check that all parameter kinds are supported
+        bool ParamsOk = true;
+        for (VFParameter Param : Info.Shape.Parameters) {
+          switch (Param.ParamKind) {
+          case VFParamKind::Vector:
+            break;
+          case VFParamKind::OMP_Uniform: {
+            Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
+            // Make sure the scalar parameter in the loop is invariant.
+            if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
+                                              TheLoop))
+              ParamsOk = false;
+            break;
+          }
+          case VFParamKind::GlobalPredicate:
+            UsesMask = true;
+            break;
+          default:
+            ParamsOk = false;
             break;
           }
+        }
+
+        if (!ParamsOk)
+          continue;
 
-        assert(MaskPos.has_value() && "Unable to find mask parameter index");
+        // Found a suitable candidate, stop here.
+        VecFunc = CI->getModule()->getFunction(Info.VectorName);
+        FuncInfo = Info;
+        break;
       }
 
+      // Add in the cost of synthesizing a mask if one wasn't required.
+      InstructionCost MaskCost = 0;
+      if (VecFunc && UsesMask && !MaskRequired)
+        MaskCost = TTI.getShuffleCost(
+            TargetTransformInfo::SK_Broadcast,
+            VectorType::get(IntegerType::getInt1Ty(
+                                VecFunc->getFunctionType()->getContext()),
+                            VF));
+
       if (TLI && VecFunc && !CI->isNoBuiltin())
         VectorCost =
             TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
@@ -7065,7 +7086,8 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
         Decision = CM_IntrinsicCall;
       }
 
-      setCallWideningDecision(CI, VF, Decision, VecFunc, IID, MaskPos, Cost);
+      setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
+                              FuncInfo.getParamIndexForOptionalMask(), Cost);
     }
   }
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2a1213a98095907..5a640ecde2abc24 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -503,6 +503,9 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
          "DbgInfoIntrinsic should have been dropped during VPlan construction");
   State.setDebugLocFrom(CI.getDebugLoc());
 
+  FunctionType *VFTy = nullptr;
+  if (Variant)
+    VFTy = Variant->getFunctionType();
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     SmallVector<Type *, 2> TysForDecl;
     // Add return type if intrinsic is overloaded on it.
@@ -514,12 +517,15 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
     for (const auto &I : enumerate(operands())) {
       // Some intrinsics have a scalar argument - don't replace it with a
       // vector.
+      // Some vectorized function variants may take also take a scalar argument,
+      // e.g. linear parameters for pointers.
       Value *Arg;
-      if (VectorIntrinsicID == Intrinsic::not_intrinsic ||
-          !isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
-        Arg = State.get(I.value(), Part);
-      else
+      if ((VFTy && !VFTy->getParamType(I.index())->isVectorTy()) ||
+          (VectorIntrinsicID != Intrinsic::not_intrinsic &&
+           isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())))
         Arg = State.get(I.value(), VPIteration(0, 0));
+      else
+        Arg = State.get(I.value(), Part);
       if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
         TysForDecl.push_back(Arg->getType());
       Args.push_back(Arg);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
new file mode 100644
index 000000000000000..da9a680943e3c6b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
@@ -0,0 +1,117 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -passes=loop-vectorize,instcombine -force-vector-interleave=1 -S | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; A call whose argument can remain a scalar for a vectorized function variant
+; with a uniform argument because it's loop invariant
+define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i64 %n) #0 {
+; CHECK-LABEL: define void @test_uniform
+; CHECK-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[UNIFORM:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 1
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x double> @foo_uniform(<vscale x 2 x double> [[WIDE_LOAD]], i64 [[UNIFORM]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT:    store <vscale x 2 x double> [[TMP5]], ptr [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[GEPSRC:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[DATA:%.*]] = load double, ptr [[GEPSRC]], align 8
+; CHECK-NEXT:    [[CALL:%.*]] = call double @foo(double [[DATA]], i64 [[UNIFORM]]) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT:    [[GEPDST:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store double [[CALL]], ptr [[GEPDST]], align 8
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %gepsrc = getelementptr double, ptr %src, i64 %indvars.iv
+  %data = load double, ptr %gepsrc, align 8
+  %call = call double @foo(double %data, i64 %uniform) #1
+  %gepdst = getelementptr inbounds double, ptr %dst, i64 %indvars.iv
+  store double %call, ptr %gepdst
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+; If the parameter is not uniform, then we can't use the vector variant.
+define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64 %n) #0 {
+; CHECK-LABEL: define void @test_uniform_not_invariant
+; CHECK-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[GEPSRC:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[DATA:%.*]] = load double, ptr [[GEPSRC]], align 8
+; CHECK-NEXT:    [[CALL:%.*]] = call double @foo(double [[DATA]], i64 [[INDVARS_IV]]) #[[ATTR2]]
+; CHECK-NEXT:    [[GEPDST:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store double [[CALL]], ptr [[GEPDST]], align 8
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %gepsrc = getelementptr double, ptr %src, i64 %indvars.iv
+  %data = load double, ptr %gepsrc, align 8
+  %call = call double @foo(double %data, i64 %indvars.iv) #1
+  %gepdst = getelementptr inbounds double, ptr %dst, i64 %indvars.iv
+  store double %call, ptr %gepdst
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+; Scalar functions
+declare double @foo(double, i64)
+
+; Vector variants
+declare <vscale x 2 x double> @foo_uniform(<vscale x 2 x double>, i64, <vscale x 2 x i1>)
+
+attributes #0 = { "target-features"="+sve" }
+
+; Mappings
+attributes #1 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_Mxvu_foo(foo_uniform)" }
diff --git a/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll b/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll
new file mode 100644
index 000000000000000..9c12b9c2dcfca49
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll
@@ -0,0 +1,133 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -passes=loop-vectorize,instcombine -force-vector-width=2 -force-vector-interleave=1 -S | FileCheck %s
+
+; A call whose argument can remain a scalar for a vectorized function variant
+; with a uniform argument because it's loop invariant
+define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i64 %n) {
+; CHECK-LABEL: define void @test_uniform
+; CHECK-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[UNIFORM:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -2
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @foo_uniform(<2 x double> [[WIDE_LOAD]], i64 [[UNIFORM]])
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT:    store <2 x double> [[TMP1]], ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[GEPSRC:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[DATA:%.*]] = load double, ptr [[GEPSRC]], align 8
+; CHECK-NEXT:    [[CALL:%.*]] = call double @foo(double [[DATA]], i64 [[UNIFORM]]) #[[ATTR0:[0-9]+]]
+; CHECK-NEXT:    [[GEPDST:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store double [[CALL]], ptr [[GEPDST]], align 8
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %gepsrc = getelementptr double, ptr %src, i64 %indvars.iv
+  %data = load double, ptr %gepsrc, align 8
+  %call = call double @foo(double %data, i64 %uniform) #0
+  %gepdst = getelementptr inbounds double, ptr %dst, i64 %indvars.iv
+  store double %call, ptr %gepdst
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+; If the parameter is not uniform, then we can't use the vector variant and
+; must fall back to scalarization.
+define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64 %n) {
+; CHECK-LABEL: define void @test_uniform_not_invariant
+; CHECK-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -2
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[WIDE_LOAD]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = call double @foo(double [[TMP2]], i64 [[INDEX]]) #[[ATTR0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[WIDE_LOAD]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = call double @foo(double [[TMP4]], i64 [[TMP0]]) #[[ATTR0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TMP3]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP5]], i64 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT:    store <2 x double> [[TMP7]], ptr [[TMP8]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[GEPSRC:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[DATA:%.*]] = load double, ptr [[GEPSRC]], align 8
+; CHECK-NEXT:    [[CALL:%.*]] = call double @foo(double [[DATA]], i64 [[INDVARS_IV]]) #[[ATTR0]]
+; CHECK-NEXT:    [[GEPDST:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store double [[CALL]], ptr [[GEPDST]], align 8
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %gepsrc = getelementptr double, ptr %src, i64 %indvars.iv
+  %data = load double, ptr %gepsrc, align 8
+  %call = call double @foo(double %data, i64 %indvars.iv) #0
+  %gepdst = getelementptr inbounds double, ptr %dst, i64 %indvars.iv
+  store double %call, ptr %gepdst
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+; Scalar functions
+declare double @foo(double, i64)
+
+; Vector variant...
[truncated]

huntergr-arm · 2023-10-12T12:07:05Z

I have a followup patch to support linear arguments once this has been merged.

mgabka · 2023-11-15T08:08:39Z

llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll

+
+; A call whose argument can remain a scalar for a vectorized function variant
+; with a uniform argument because it's loop invariant
+define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i64 %n) #0 {


Hi @huntergr-arm,
I think it would be good to add more tests. I am thinking of an extra test where the mask is required, for example when tail-folding is enabled.

But I am also thinking about a test where the scalar uniform type does not match the size of the vector element type, for example when the available mapping is:

@foo_uniform(<vscale x 2 x double>, i32, <vscale x 2 x i1>)

Such mappings seem to crash the compiler at the moment with an assertion failure. I think we could do a better job of handling these cases; what do you think?

#72260 adds support for mixing element types.

I'll add some masking tests.

mgabka · 2023-11-15T09:22:39Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+                                              TheLoop))
+              ParamsOk = false;
+            break;
+          }


To me it looks like the piece of code handling "case VFParamKind::OMP_Uniform: " is the actual new functionality, so you could split this patch into an NFC change/refactoring plus the new functionality with tests demonstrating that it works. What do you think?

Yeah, the reason for putting it all together was the lack of dependent commits for review chains. That's now been resolved, though I don't think I can auto convert this PR to a branch on the main repo. Will raise a new PR for the uniform bit.

mgabka · 2023-11-15T09:34:35Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

      }

+      // Add in the cost of synthesizing a mask if one wasn't required.
+      InstructionCost MaskCost = 0;
+      if (VecFunc && UsesMask && !MaskRequired)


This isn't functionality related to the support of uniform parameters, and in my opinion it should go in as a separate patch with extra tests checking the increased cost.

That's just refactoring of existing code, not new functionality.

…variant lookup.

huntergr-arm · 2023-11-16T13:00:54Z

I've removed the uniform argument case and the tests from this PR, so it should just be NFC refactoring in preparation now. I'll precommit the tests then raise a new PR for the uniform case. Ok?

mgabka · 2023-11-16T16:27:55Z

I've removed the uniform argument case and the tests from this PR, so it should just be NFC refactoring in preparation now. I'll precommit the tests then raise a new PR for the uniform case. Ok?

Thanks for extracting the NFC refactoring, it LGTM

mgabka · 2023-11-16T13:51:47Z

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

@@ -514,12 +517,15 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
    for (const auto &I : enumerate(operands())) {
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
+      // Some vectorized function variants may take also take a scalar argument,


nit: typo in the comment; a single occurrence of "take" is enough.

See #68879

…m args (llvm#68879) Parameters marked as uniform take a scalar value, assuming the value is invariant in the scalar loop. In order to support this, we need to stop asking for a vector function variant with a default shape assuming that all arguments will become vector arguments, and instead consider all available variants and their parameter types.

See llvm#68879

[LV] Add support for uniform parameters on vectorized function variants

f69c83a

Parameters marked as uniform take a scalar value, assuming the value is invariant in the scalar loop.

huntergr-arm requested review from fhahn and mgabka October 12, 2023 12:05

llvmbot added vectorizers llvm:transforms labels Oct 12, 2023

mgabka reviewed Nov 15, 2023

View reviewed changes

Separate uniform argument functionality from refactoring of function …

d092e48

…variant lookup.

mgabka approved these changes Nov 16, 2023

View reviewed changes

Comment typo fix

bd71094

huntergr-arm changed the title ~~[LV] Add support for uniform parameters on vectorized function variants~~ [LV] Refactor vector function variant selection to prepare for uniform args Nov 20, 2023

huntergr-arm merged commit 4d64a2b into llvm:main Nov 20, 2023

huntergr-arm deleted the uniform-args-for-vector-variants branch November 20, 2023 13:30

huntergr-arm added a commit that referenced this pull request Nov 20, 2023

[LV] Precommit tests for uniform arguments for vector function variants

84ebe5b

See #68879

zahiraam pushed a commit to zahiraam/llvm-project that referenced this pull request Nov 20, 2023

[LV] Precommit tests for uniform arguments for vector function variants

a646126

See llvm#68879

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[LV] Refactor vector function variant selection to prepare for uniform args #68879

[LV] Refactor vector function variant selection to prepare for uniform args #68879

Uh oh!

huntergr-arm commented Oct 12, 2023

Uh oh!

llvmbot commented Oct 12, 2023

Uh oh!

huntergr-arm commented Oct 12, 2023

Uh oh!

mgabka Nov 15, 2023

Uh oh!

huntergr-arm Nov 15, 2023

Uh oh!

mgabka Nov 15, 2023

Uh oh!

huntergr-arm Nov 15, 2023

Uh oh!

mgabka Nov 15, 2023

Uh oh!

huntergr-arm Nov 15, 2023

Uh oh!

huntergr-arm commented Nov 16, 2023

Uh oh!

mgabka commented Nov 16, 2023

Uh oh!

mgabka Nov 16, 2023

Uh oh!

Uh oh!

[LV] Refactor vector function variant selection to prepare for uniform args #68879

[LV] Refactor vector function variant selection to prepare for uniform args #68879

Uh oh!

Conversation

huntergr-arm commented Oct 12, 2023

Uh oh!

llvmbot commented Oct 12, 2023

Uh oh!

huntergr-arm commented Oct 12, 2023

Uh oh!

mgabka Nov 15, 2023

Choose a reason for hiding this comment

Uh oh!

huntergr-arm Nov 15, 2023

Choose a reason for hiding this comment

Uh oh!

mgabka Nov 15, 2023

Choose a reason for hiding this comment

Uh oh!

huntergr-arm Nov 15, 2023

Choose a reason for hiding this comment

Uh oh!

mgabka Nov 15, 2023

Choose a reason for hiding this comment

Uh oh!

huntergr-arm Nov 15, 2023

Choose a reason for hiding this comment

Uh oh!

huntergr-arm commented Nov 16, 2023

Uh oh!

mgabka commented Nov 16, 2023

Uh oh!

mgabka Nov 16, 2023

Choose a reason for hiding this comment

Uh oh!

Uh oh!