[LV] Optimize VPWidenIntOrFpInductionRecipe for known TC #118828
Conversation
@llvm/pr-subscribers-backend-systemz @llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-vectorizers

Author: Hari Limaye (hazzlim)

Changes: Optimize the IR generated for a VPWidenIntOrFpInductionRecipe to use the …

Patch is 109.82 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/118828.diff

11 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3c7c044a042719..3e6774ff14ac1f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7657,6 +7657,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
OrigLoop->getHeader()->getContext());
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
+ auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
+ VPlanTransforms::optimizeForTCAndVF(BestVPlan, TC, BestVF);
+
LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
<< ", UF=" << BestUF << '\n');
BestVPlan.setName("Final VPlan");
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e1d828f038f9a2..809efc5988930b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2150,6 +2150,9 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
VPValue *getStepValue() { return getOperand(1); }
const VPValue *getStepValue() const { return getOperand(1); }
+ /// Update the start value of the recipe.
+ void setStepValue(VPValue *V) { setOperand(1, V); }
+
VPValue *getVFValue() { return getOperand(2); }
const VPValue *getVFValue() const { return getOperand(2); }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index cee83d1015b536..6e27b1b5055446 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -661,6 +661,60 @@ static void recursivelyDeleteDeadRecipes(VPValue *V) {
}
}
+void VPlanTransforms::optimizeForTCAndVF(VPlan &Plan, unsigned TC,
+ ElementCount BestVF) {
+ assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
+ if (!TC || !BestVF.isFixed())
+ return;
+
+ // Calculate the widest type required for known TC and VF.
+ uint64_t Width = BestVF.getKnownMinValue();
+ uint64_t MaxVal = alignTo(TC, Width) - 1;
+ unsigned MaxActiveBits = Log2_64_Ceil(MaxVal);
+ unsigned NewBitWidth = std::max<unsigned>(PowerOf2Ceil(MaxActiveBits), 8);
+ LLVMContext &Ctx = Plan.getCanonicalIV()->getScalarType()->getContext();
+ auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
+
+ bool MadeChange = false;
+
+ VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+ for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
+ auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
+ if (!WideIV || !WideIV->isCanonical())
+ continue;
+
+ if (WideIV->hasMoreThanOneUniqueUser())
+ continue;
+
+ // Currently only handle cases where the single user is a header-mask
+ // comparison with the backedge-taken-count.
+ VPValue *Bound;
+ using namespace VPlanPatternMatch;
+ auto *Cmp = dyn_cast<VPInstruction>(*WideIV->user_begin());
+ if (!Cmp ||
+ !match(Cmp, m_Binary<Instruction::ICmp>(m_Specific(WideIV),
+ m_VPValue(Bound))) ||
+ Bound != Plan.getOrCreateBackedgeTakenCount())
+ continue;
+
+ if (NewIVTy == WideIV->getScalarType())
+ continue;
+
+ // Update IV operands and comparison bound to use new narrower type.
+ auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0));
+ WideIV->setStartValue(NewStart);
+ auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1));
+ WideIV->setStepValue(NewStep);
+ auto *NewBound = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, TC - 1));
+ Cmp->setOperand(1, NewBound);
+
+ MadeChange = true;
+ }
+
+ if (MadeChange)
+ Plan.setVF(BestVF);
+}
+
void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
unsigned BestUF,
PredicatedScalarEvolution &PSE) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 1491e0a8df04d5..ba772cf385d560 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -15,6 +15,7 @@
#include "VPlan.h"
#include "llvm/ADT/STLFunctionalExtras.h"
+#include "llvm/Support/TypeSize.h"
namespace llvm {
@@ -58,6 +59,9 @@ struct VPlanTransforms {
unsigned BestUF,
PredicatedScalarEvolution &PSE);
+ /// Optimize \p Plan based on \p TC and \p BestVF.
+ static void optimizeForTCAndVF(VPlan &Plan, unsigned TC, ElementCount BestVF);
+
/// Apply VPlan-to-VPlan optimizations to \p Plan, including induction recipe
/// optimizations, dead recipe removal, replicate region optimizations and
/// block merging.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index 69560305706364..bec30b3dcb5289 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -389,8 +389,8 @@ define void @latch_branch_cost(ptr %dst) {
; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
; PRED: vector.body:
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
-; PRED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 99)
+; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
+; PRED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i8> [[VEC_IND]], splat (i8 99)
; PRED-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
; PRED-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; PRED: pred.store.if:
@@ -456,7 +456,7 @@ define void @latch_branch_cost(ptr %dst) {
; PRED-NEXT: br label [[PRED_STORE_CONTINUE6]]
; PRED: pred.store.continue14:
; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
; PRED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 104
; PRED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; PRED: middle.block:
@@ -903,9 +903,9 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
; DEFAULT: vector.body:
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE14:%.*]] ]
-; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE14]] ]
+; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <8 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE14]] ]
; DEFAULT-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i8
-; DEFAULT-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 6)
+; DEFAULT-NEXT: [[TMP1:%.*]] = icmp ule <8 x i8> [[VEC_IND]], splat (i8 6)
; DEFAULT-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
; DEFAULT-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; DEFAULT: pred.store.if:
@@ -978,7 +978,7 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
; DEFAULT-NEXT: store i8 [[TMP33]], ptr [[TMP32]], align 1
; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE14]]
; DEFAULT: pred.store.continue14:
-; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; DEFAULT-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; DEFAULT: middle.block:
@@ -1005,9 +1005,9 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
; PRED: vector.body:
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE14:%.*]] ]
-; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE14]] ]
+; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE14]] ]
; PRED-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i8
-; PRED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 6)
+; PRED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i8> [[VEC_IND]], splat (i8 6)
; PRED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
; PRED-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; PRED: pred.store.if:
@@ -1080,7 +1080,7 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
; PRED-NEXT: store i8 [[TMP33]], ptr [[TMP32]], align 1
; PRED-NEXT: br label [[PRED_STORE_CONTINUE14]]
; PRED: pred.store.continue14:
-; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; PRED-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; PRED: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
index c301ef3c5319a2..b207cca03c90f2 100644
--- a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
@@ -18,8 +18,8 @@ define void @pr45679(ptr %A) optsize {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i32> [[VEC_IND]], splat (i32 13)
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i8> [[VEC_IND]], splat (i8 13)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; CHECK: pred.store.if:
@@ -53,7 +53,7 @@ define void @pr45679(ptr %A) optsize {
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
; CHECK: pred.store.continue6:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
@@ -213,8 +213,8 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], splat (i64 13)
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i8> [[VEC_IND]], splat (i8 13)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; CHECK: pred.store.if:
@@ -252,7 +252,7 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) {
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
; CHECK: pred.store.continue6:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
index 8e132ed8399cd6..c76057a18bf3cd 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
@@ -11,9 +11,9 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], splat (i64 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -57,7 +57,7 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) {
; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP24]])
; CHECK-NEXT: [[TMP26]] = add i32 [[TMP25]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
@@ -97,10 +97,10 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE8:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE8]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE8]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[PRED_LOAD_CONTINUE8]] ]
; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[PRED_LOAD_CONTINUE8]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], splat (i64 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -166,7 +166,7 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-NEXT: [[TMP47:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP46]])
; CHECK-NEXT: [[TMP48]] = add i32 [[TMP47]], [[TMP45]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <4 x i32> [[VEC_IND1]], splat (i32 4)
; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP49]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -212,9 +212,9 @@ define i32 @reduction_sum_const(ptr noalias nocapture %A) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], splat (i64 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -261,7 +261,7 @@ define i32 @reduction_sum_const(ptr noalias nocapture %A) {
; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP27]])
; CHECK-NEXT: [[TMP29]] = add i32 [[TMP28]], [[TMP26]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
@@ -302,10 +302,10 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE8:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE8]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE8]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[PRED_LOAD_CONTINUE8]] ]
; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[PRED_LOAD_CONTINUE8]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], splat (i64 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -371,7 +371,7 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-NEXT: [[TMP47:%.*]] = call i32 @llvm.vector.red...
[truncated]
I wasn't sure if this was the best/right way to approach this, so please let me know if there's a better approach to take!
@@ -2150,6 +2150,9 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
  VPValue *getStepValue() { return getOperand(1); }
  const VPValue *getStepValue() const { return getOperand(1); }

  /// Update the start value of the recipe.
Should the comment be /// Update the step value of the recipe?
Ah yep good point - fixed.
    return;

  // Calculate the widest type required for known TC and VF.
  uint64_t Width = BestVF.getKnownMinValue();
I think you also need to multiply by the interleave count here as well.
Ah yes - I have updated to multiply by BestUF here as well.
  /// Optimize \p Plan based on \p TC and \p BestVF.
  static void optimizeForTCAndVF(VPlan &Plan, unsigned TC, ElementCount BestVF);
  /// Optimize \p Plan based on \p TC, \p BestVF and \p BestUF.
  static void optimizeForTCAndVFAndUF(VPlan &Plan, unsigned TC,
nit: Perhaps just call it optimizeForTCAndVFUF
  // Calculate the widest type required for known TC and VF.
  uint64_t Width = BestVF.getKnownMinValue();
  // Calculate the widest type required for known TC, VF and UF.
  uint64_t Width = BestVF.getKnownMinValue() * BestUF;
After making this change I didn't see any corresponding change in the tests, which looks like we have a missing test case. Is it possible to create a very simple test that is affected by this change? For example, if you have a known trip count of 50 and use -force-vector-width=64 -force-vector-interleave=8, then the max value will be 511 and will require an i16 type. That's still better than an i64 type though!
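To make the arithmetic concrete, here is a minimal sketch of the width computation the patch performs, using the MathExtras helpers it already calls; the free-standing function name is illustrative only:

```cpp
#include <algorithm>
#include <cstdint>
#include "llvm/Support/MathExtras.h"

// Narrowest power-of-2 IV bit width (>= 8) for a known trip count, VF and UF.
unsigned narrowIVBitWidth(uint64_t TC, uint64_t VF, uint64_t UF) {
  uint64_t Width = VF * UF;                       // elements per vector iteration
  uint64_t MaxVal = llvm::alignTo(TC, Width) - 1; // largest lane value the IV reaches
  unsigned MaxActiveBits = llvm::Log2_64_Ceil(MaxVal);
  return std::max<unsigned>(llvm::PowerOf2Ceil(MaxActiveBits), 8);
}
// TC=50, VF=64, UF=8: alignTo(50, 512) - 1 = 511 -> 9 active bits -> i16.
// TC=100, VF=8, UF=1: alignTo(100, 8) - 1 = 103 -> 7 active bits -> i8,
// matching the <8 x i8> inductions in the updated tests above.
```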
Ah good point. I've added a test - but this PR will still not trigger on the case when we are interleaving. This is because when unrolling the induction by UF, we create new recipes for the induction steps for part 1 .. UF, which are users of the original IndVar. As this PR limits to only a single use (Cmp) of the IndVar, this case will not be optimized.
llvm-project/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
Lines 186 to 218 in fbc18b8
  // Now create recipes to compute the induction steps for part 1 .. UF. Part 0
  // remains the header phi. Parts > 0 are computed by adding Step to the
  // previous part. The header phi recipe will get 2 new operands: the step
  // value for a single part and the last part, used to compute the backedge
  // value during VPWidenIntOrFpInductionRecipe::execute. %Part.0 =
  // VPWidenIntOrFpInductionRecipe %Start, %ScalarStep, %VectorStep, %Part.3
  // %Part.1 = %Part.0 + %VectorStep
  // %Part.2 = %Part.1 + %VectorStep
  // %Part.3 = %Part.2 + %VectorStep
  //
  // The newly added recipes are added to ToSkip to avoid interleaving them
  // again.
  VPValue *Prev = IV;
  Builder.setInsertPoint(IV->getParent(), InsertPtForPhi);
  unsigned AddOpc =
      IVTy->isFloatingPointTy() ? ID.getInductionOpcode() : Instruction::Add;
  for (unsigned Part = 1; Part != UF; ++Part) {
    std::string Name =
        Part > 1 ? "step.add." + std::to_string(Part) : "step.add";
    VPInstruction *Add = Builder.createNaryOp(AddOpc,
                                              {
                                                  Prev,
                                                  VectorStep,
                                              },
                                              FMFs, IV->getDebugLoc(), Name);
    ToSkip.insert(Add);
    addRecipeForPart(IV, Add, Part);
    Prev = Add;
  }
  IV->addOperand(VectorStep);
  IV->addOperand(Prev);
}
I am thinking we could leave extending this to a follow-up PR, if you agree?
        Bound != Plan.getOrCreateBackedgeTakenCount())
      continue;

    if (NewIVTy == WideIV->getScalarType())
merge with other continues above
Thanks - done.
    if (!WideIV || !WideIV->isCanonical())
      continue;

    if (WideIV->hasMoreThanOneUniqueUser())
merge with other continue above
Thanks - done.
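For reference, the merged early-continue that these two nits point towards would look roughly like this (a sketch of the combined checks, not necessarily the exact final code):

```cpp
    auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    // Bail out unless this is the canonical IV, it has a single unique user,
    // and narrowing would actually change its type.
    if (!WideIV || !WideIV->isCanonical() ||
        WideIV->hasMoreThanOneUniqueUser() ||
        NewIVTy == WideIV->getScalarType())
      continue;
```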
    // comparison with the backedge-taken-count.
    VPValue *Bound;
    using namespace VPlanPatternMatch;
    auto *Cmp = dyn_cast<VPInstruction>(*WideIV->user_begin());
No need to dyn_cast + check when using the matcher
Thanks - done.
@@ -7657,6 +7657,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
      OrigLoop->getHeader()->getContext());
  VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);

  auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
  VPlanTransforms::optimizeForTCAndVF(BestVPlan, TC, BestVF);
Can we instead use BestVPlan.getTripCount() and move this in optimizeForVFAndUF, as this also optimizes the plan for a given UF/VF?
I don't think it would interfere with that; in general I assume it might make sense to apply this one for all potential VFs, as narrower IVs might tip the balance eventually (at the moment, the cost of inductions isn't estimated correctly yet though).
I've moved this into optimizeForVFAndUF, splitting out the existing code and this new code into static functions, as it seemed sensible to make the logic simpler/things easier to read - if you agree?
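The resulting shape of optimizeForVFAndUF would be roughly as below; this is a sketch assuming the helper names discussed in this thread (the narrowing helper's name in particular is an assumption):

```cpp
void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                         unsigned BestUF,
                                         PredicatedScalarEvolution &PSE) {
  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
  assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");

  // Existing branch-condition simplification, split out into a static helper.
  bool MadeChange =
      simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
  // New: narrow wide canonical IVs based on the known TC, VF and UF.
  MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);

  if (MadeChange)
    Plan.setVF(BestVF);
}
```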
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "aarch64-unknown-linux-gnu"

define dso_local void @canonical_small_tc_i8(ptr nocapture noundef writeonly %p) {
-define dso_local void @canonical_small_tc_i8(ptr nocapture noundef writeonly %p) {
+define void @canonical_small_tc_i8(ptr nocapture noundef writeonly %p) {
Thanks - done.
; RUN: opt -p loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -S %s | FileCheck %s

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "aarch64-unknown-linux-gnu"
Does the test depend on AArch64? If not, it needs -force-vector-width and dropping the triple.
Good point - done and removed the target datalayout as well, as this seems unnecessary also.
    return false;

  // Calculate the widest type required for known TC, VF and UF.
  uint64_t TCVal = TC->getZExtValue();
This assumes the trip count fits into 64 bits, which may not be the case I think?
Ah yep good point - fixed.
    if (!match(*WideIV->user_begin(),
               m_Binary<Instruction::ICmp>(m_Specific(WideIV),
                                           m_VPValue(Bound))) ||
        Bound != Plan.getOrCreateBackedgeTakenCount())
Can we also use m_Specific for the bound?
Thanks, that's much neater - Done.
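Using m_Specific for the bound folds the separate equality check into the pattern itself; a sketch of the resulting match (not necessarily the exact final code):

```cpp
    // The single-user check above guarantees user_begin() is the only user.
    using namespace VPlanPatternMatch;
    if (!match(*WideIV->user_begin(),
               m_Binary<Instruction::ICmp>(
                   m_Specific(WideIV),
                   m_Specific(Plan.getOrCreateBackedgeTakenCount()))))
      continue;
```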
  auto TCVal = TC->getValue().tryZExtValue();
  if (!TCVal)
    return false;
Can we instead do the math on APInt to not require this restriction?
Could you also add a test with TC > 64 bits?
Sure - I've updated to do the majority of the calculations on APInt to avoid issues with bitwidth, and I've added a test with TC > 64 bits :)
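A possible way to do the same computation on APInt, so trip counts wider than 64 bits are handled without overflow; the helper name and exact structure are assumptions for illustration, not the patch's final code:

```cpp
#include <algorithm>
#include "llvm/ADT/APInt.h"
#include "llvm/Support/MathExtras.h"

// Assumes a non-zero trip count; the transform bails out earlier otherwise.
unsigned narrowIVBitWidth(llvm::APInt TC, uint64_t VF, uint64_t UF) {
  unsigned BW = TC.getBitWidth() + 64; // head-room so rounding up cannot wrap
  TC = TC.zext(BW);
  llvm::APInt Step(BW, VF * UF);       // elements per vector iteration
  // MaxVal = alignTo(TC, Step) - 1, the largest lane value the widened IV reaches.
  llvm::APInt MaxVal =
      llvm::APIntOps::RoundingUDiv(TC, Step, llvm::APInt::Rounding::UP) * Step - 1;
  unsigned MaxActiveBits = MaxVal.getActiveBits();
  return std::max<unsigned>(llvm::PowerOf2Ceil(MaxActiveBits), 8);
}
```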
@fhahn polite ping on this - the PR had gone a bit out of sync with recent updates to …
I think this looks almost ready to go! I just had two more minor comments ...
One more thing I am wondering if we could end up introducing illegal vector types for a target and if we have to check the new type is legal?
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
Can you add a comment saying why this is restricted to canonical IVs with a single user?
Done.
I think that it should be fine, given that if the larger type was legal a smaller type with a power-of-2 size should be as well, because it can always be promoted to a larger type if necessary. It's not obvious that we check the legality of using a vector of i64 (the default) as the induction variable in the first place?
Yeah it should be fine I think.
One more thing to double check: the interleave count doesn't have to be a power of 2 - could you add some tests where the IC isn't a power of 2 (e.g. 3, 5) to make sure the rounding works as expected even if IC * VF isn't a power of 2?
Hmm - it's actually quite hard to get this to trigger in a test. For one thing, as discussed above this patch will not trigger when we are interleaving #118828 (comment) so there's currently not much to test when interleaving. As well as this, I think that the main use-case for this optimization is when we are tail folding and the Vector IV of interest is the mask - however if IC * VF isn't a power of 2 in that case we will hit the assert here: llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp Lines 2470 to 2471 in 72768d9
Do you think it would be ok to merge as-is and leave supporting interleaving as a follow-up?
I see, that sounds good. Left a few more comments.
    return false;

  auto *TC = dyn_cast_if_present<ConstantInt>(
      Plan.getTripCount()->getUnderlyingValue());
The underlying value should only be used if TripCount is a live-in (currently guaranteed I think); can you update this to use getLiveInIRValue, which will assert if it isn't a live-in?
So if I refactor to use getLiveInIRValue(), we get quite a few lit test crashes where TripCount is not a live-in, so I'm not sure this is guaranteed?
I've added a check above to make sure that the TC is a live-in, so that we are not calling getUnderlyingValue() on non live-ins.
Ah I see, the trip count is one of those special live-ins.
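The guarded access described in this thread would look something like the following sketch (the surrounding names follow the excerpt above; the isLiveIn check is the added guard):

```cpp
  VPValue *TripCount = Plan.getTripCount();
  // The trip count is one of the "special" live-ins, so check explicitly
  // before reading its underlying IR value.
  if (!TripCount->isLiveIn())
    return false;
  auto *TC = dyn_cast_if_present<ConstantInt>(TripCount->getUnderlyingValue());
  if (!TC)
    return false;
```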
  // Currently only handle canonical IVs as it is trivial to replace the start
  // and stop values, and we only perform the optimisation when the IV is only
  // used by the comparison controlling loop control-flow.
Is the part about the comparison controlling loop control-flow accurate? We only check if it is used in a compare against the backedge-taken count, but this may not be used to control the loop control flow (which IIUC would mean exiting the loop?)
Good point - I've simplified the comment to just say that we currently only consider IVs with a single use (the details of this being a comparison with BTC are documented in the comment below).
  auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);

  // Currently only handle canonical IVs as it is trivial to replace the start
  // and stop values, and we only perform the optimisation when the IV is only
-  // and stop values, and we only perform the optimisation when the IV is only
+  // and stop values, and we only perform the optimization when the IV is only
to be consistent with function comment & name?
Done.
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
Can we also support VPWidenCanonicalIVPHIRecipe?
I think we could, but the logic for changing the type that gets produced when the recipe is executed is slightly different from VPWidenIntOrFpInductionRecipe, so I was thinking to leave this to a follow-up patch?
Thanks for the latest updates.
Could you update the PR to latest main to make sure all tests are passing?
  /// Update the step value of the recipe.
  void setStepValue(VPValue *V) { setOperand(1, V); }
can you move this to VPWidenInductionRecipe, after `getStepValue`?
Done.
  /// Try to simplify the branch condition of \p Plan. This may restrict the
  /// resulting plan to \p BestVF and \p BestUF.
  static bool simplifyBranchConditionForVFAndUF(VPlan &Plan,
                                                ElementCount BestVF,
                                                unsigned BestUF,
                                                PredicatedScalarEvolution &PSE);
This is only used in optimizeForVFAndUF, so it shouldn't need exposing here IIUC? It could be a static function in VPlanTransforms.cpp.
Ah good point - done.
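A minimal sketch of that change (the exact placement in VPlanTransforms.cpp is assumed, not taken from the patch): the helper is dropped from the VPlanTransforms interface and declared as a file-local static ahead of its only caller.

  // In VPlanTransforms.cpp, ahead of optimizeForVFAndUF(), its only caller:
  static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                                unsigned BestUF,
                                                PredicatedScalarEvolution &PSE);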
Thanks for the review - I have addressed the latest comments and updated to latest main; all tests seem to be passing with a minor update.
LGTM, thanks
The pattern match looks fine to me. Spotted a few more remaining small things after another pass.
LGTM with those addressed if possible
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]}
; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META2]], [[META1]]}
;.
Can generate with -check-globals=none to avoid generating all of those.
Done
  if (!TC || !BestVF.isFixed())
    return false;

  // Calculate the widest type required for known TC, VF and UF.
-  // Calculate the widest type required for known TC, VF and UF.
+  // Calculate the minimum power-of-2 bit width that can fit the known TC, VF and UF. Returns at least 8.
Ah thanks, that's a much clearer comment - done.
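As a sanity check on what that comment describes, here is a standalone sketch of the bit-width computation (the TC and VF values are made up and this is not the patch's code; it just uses LLVM's MathExtras helpers):

  #include "llvm/Support/MathExtras.h"
  #include <algorithm>
  #include <cstdint>
  #include <cstdio>

  int main() {
    uint64_t TC = 100, VF = 4;
    // Largest value any lane of the widened IV can take for this TC and VF.
    uint64_t MaxVal = llvm::alignTo(TC, VF) - 1;
    // Round the required bit count up to a power of 2, with a floor of 8.
    unsigned NewBitWidth =
        std::max<unsigned>(llvm::PowerOf2Ceil(llvm::Log2_64_Ceil(MaxVal)), 8);
    std::printf("i%u\n", NewBitWidth); // prints "i8" for TC=100, VF=4
    return 0;
  }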
@@ -1063,7 +1138,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
   VPBlockUtils::connectBlocks(Preheader, Header);
   VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
-  simplifyRecipes(Plan, *CanIVTy);
+  VPlanTransforms::simplifyRecipes(Plan, *CanIVTy);
unrelated?
This is actually necessary - because this logic has been moved into a (static) non-member function, we now need the class scope when calling simplifyRecipes().
@@ -0,0 +1,1006 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -p loop-vectorize -force-vector-width=8 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S %s | FileCheck %s
Do we need VF=8 here or could we also choose 2, to get more compact IR?
Ah good point - done.
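For context, a C-level sketch of the kind of loop the test is after (illustrative only - the function name, body, and trip count are made up, not taken from the test): a small constant trip count that is not a multiple of the vector width, so with predicate-dont-vectorize the tail is folded and the header mask is driven by the widened IV.

  // With -force-vector-width=2 and tail folding, the widened IV only needs to
  // count up to 15, so a narrow (i8) induction is wide enough.
  void store_ones(char *dst) {
    for (int i = 0; i < 15; ++i)
      dst[i] = 1;
  }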
Optimize the IR generated for a VPWidenIntOrFpInductionRecipe to use the narrowest type necessary, when the trip-count of a loop is known to be constant and the only use of the recipe is the condition used by the vector loop's backedge branch.
- Use cast rather than dyn_cast
- Remove unrolling test
- Be explicit about APInt type.
- Add comment
- Move `setStepValue` to `VPWidenInductionRecipe`
- Make `simplifyBranchConditionForVFAndUF` a static, non-member function
- Update a lit test
Following Commit 870f753 a broadcast is materialized for the BTC, so update our pattern-matching logic to look for a comparison with a broadcast of the BTC rather than the BTC itself.
- Improve comment according to suggestion
- Generate test with `--check-globals=none`
- Use VF=2 for more compact IR in test
LGTM, thanks!
✅ With the latest revision this PR passed the C/C++ code formatter.
Thanks so much @fhahn and @david-arm for reviewing - I will wait for the buildbots to be happy after fixing the conflict with upstream/main, and then I will land this.
Optimize the IR generated for a VPWidenIntOrFpInductionRecipe to use the
narrowest type necessary, when the trip-count of a loop is known to be
constant and the only use of the recipe is the condition used by the
vector loop's backedge branch.