llvm
diff --git a/‎llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Lines changed: 37 additions & 22 deletions b/‎llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Lines changed: 37 additions & 22 deletions
diff --git a/‎llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
Lines changed: 38 additions & 0 deletions b/‎llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
Lines changed: 38 additions & 0 deletions
@@ -2652,6 +2652,33 @@ static Value *getExpandedStep(const InductionDescriptor &ID,
   return I->second;
 }
 
+/// Knowing that loop \p L would be fully unrolled after vectorisation, add
+/// instructions that will get simplified and thus should not have any cost to
+/// \p InstsToIgnore
+static void AddFullyUnrolledInstructionsToIgnore(
+    Loop *L, const LoopVectorizationLegality::InductionList &IL,
+    SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
+  auto *Cmp = L->getLatchCmpInst();
+  if (!Cmp)
+    return;
+  InstsToIgnore.insert(Cmp);
+  for (const auto &[IV, IndDesc] : IL) {
+    // Get next iteration value of the induction variable
+    Instruction *IVInst =
+        cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
+    bool IsSimplifiedAway = true;
+    // Check that this value used only to exit the loop
+    for (auto *UIV : IVInst->users()) {
+      if (UIV != IV && UIV != Cmp) {
+        IsSimplifiedAway = false;
+        break;
+      }
+    }
+    if (IsSimplifiedAway)
+      InstsToIgnore.insert(IVInst);
+  }
+}
+
 void InnerLoopVectorizer::createInductionResumeValues(
     const SCEV2ValueTy &ExpandedSCEVs,
     std::pair<BasicBlock *, Value *> AdditionalBypass) {
@@ -5557,19 +5584,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
 InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
   InstructionCost Cost;
 
-  // If with the given VF loop gets fully unrolled, ignore the costs of
-  // comparison and induction instructions, as they'll get simplified away
-  SmallPtrSet<const Value *, 16> ValuesToIgnoreForVF;
+  // If with the given fixed width VF loop gets fully unrolled, ignore the costs
+  // of comparison and induction instructions, as they'll get simplified away
+  SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
   auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
-  auto *Cmp = TheLoop->getLatchCmpInst();
-  if (Cmp && TC == VF.getKnownMinValue()) {
-    ValuesToIgnoreForVF.insert(Cmp);
-    for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
-      Instruction *IVInc = cast<Instruction>(
-          IV->getIncomingValueForBlock(TheLoop->getLoopLatch()));
-      ValuesToIgnoreForVF.insert(IVInc);
-    }
-  }
+  if (VF.isFixed() && TC == VF.getFixedValue())
+    AddFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(),
+                                         ValuesToIgnoreForVF);
 
   // For each block.
   for (BasicBlock *BB : TheLoop->blocks()) {
@@ -7263,16 +7284,10 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
 
     // If with the given VF loop gets fully unrolled, ignore the costs of
     // comparison and induction instructions, as they'll get simplified away
-    auto TC = CM.PSE.getSE()->getSmallConstantTripCount(OrigLoop);
-    auto *Cmp = OrigLoop->getLatchCmpInst();
-    if (Cmp && TC == VF.getKnownMinValue()) {
-      CostCtx.SkipCostComputation.insert(Cmp);
-      for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
-        Instruction *IVInc = cast<Instruction>(
-            IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
-        CostCtx.SkipCostComputation.insert(IVInc);
-      }
-    }
+    auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
+    if (VF.isFixed() && TC == VF.getFixedValue())
+      AddFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
+                                           CostCtx.SkipCostComputation);
 
     for (Instruction *IVInst : IVInsts) {
       if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
 
@@ -0,0 +1,38 @@
+; REQUIRES: asserts
+; RUN: opt < %s -mcpu=neoverse-v2 -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S 2>&1 | FileCheck %s
+
+target triple="aarch64--linux-gnu"
+
+define i64 @test(ptr %a, ptr %b) #0 {
+; CHECK: LV: Checking a loop in 'test'
+; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction:   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction:   %exitcond.not = icmp eq i64 %indvars.iv.next, 16
+; CHECK: LV: Vector loop of width 8 costs: 3.
+; CHECK-NOT: LV: Found an estimated cost of 1 for VF 16 For instruction:   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-NOT: LV: Found an estimated cost of 1 for VF 16 For instruction:   %exitcond.not = icmp eq i64 %indvars.iv.next, 16
+; CHECK: LV: Vector loop of width 16 costs: 3.
+; CHECK: LV: Selecting VF: 16
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  %add.lcssa = phi i64 [ %add, %for.body ]
+  ret i64 %add.lcssa
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %sum.09 = phi i64 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i64
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx2, align 1
+  %conv3 = zext i8 %1 to i64
+  %mul = mul nuw nsw i64 %conv3, %conv
+  %add = add i64 %mul, %sum.09
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 16
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { vscale_range(1, 16) "target-features"="+sve" }