[LoopVectorize][CostModel] Choose smaller VFs for in-loop reductions without loads/stores

RosieSumpter · RosieSumpter · commit 961f51fdf04f · 2022-01-04T10:12:57.000Z
For loops that contain in-loop reductions but no loads or stores, large VFs are chosen because LoopVectorizationCostModel::getSmallestAndWidestTypes has no element types to check through and so returns the default widths (-1U for the smallest and 8 for the widest). This results in the widest VF being chosen for the following example, float s = 0; for (int i = 0; i < N; ++i) s += (float) i*i; which, for more computationally intensive loops, leads to large loop sizes when the operations end up being scalarized. In this patch, for the case where ElementTypesInLoop is empty, the widest type is determined by finding the smallest type used by recurrences in the loop instead of falling back to a default value of 8 bits. This results in the cost model choosing a more sensible VF for loops like the one above. Differential Revision: https://reviews.llvm.org/D113973
diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -77,10 +77,12 @@ class RecurrenceDescriptor {
   RecurrenceDescriptor(Value *Start, Instruction *Exit, RecurKind K,
                        FastMathFlags FMF, Instruction *ExactFP, Type *RT,
                        bool Signed, bool Ordered,
-                       SmallPtrSetImpl<Instruction *> &CI)
+                       SmallPtrSetImpl<Instruction *> &CI,
+                       unsigned MinWidthCastToRecurTy)
       : StartValue(Start), LoopExitInstr(Exit), Kind(K), FMF(FMF),
         ExactFPMathInst(ExactFP), RecurrenceType(RT), IsSigned(Signed),
-        IsOrdered(Ordered) {
+        IsOrdered(Ordered),
+        MinWidthCastToRecurrenceType(MinWidthCastToRecurTy) {
     CastInsts.insert(CI.begin(), CI.end());
   }
 
@@ -251,6 +253,11 @@ class RecurrenceDescriptor {
   /// recurrence.
   const SmallPtrSet<Instruction *, 8> &getCastInsts() const { return CastInsts; }
 
+  /// Returns the minimum width used by the recurrence in bits.
+  unsigned getMinWidthCastToRecurrenceTypeInBits() const {
+    return MinWidthCastToRecurrenceType;
+  }
+
   /// Returns true if all source operands of the recurrence are SExtInsts.
   bool isSigned() const { return IsSigned; }
 
@@ -291,6 +298,8 @@ class RecurrenceDescriptor {
   bool IsOrdered = false;
   // Instructions used for type-promoting the recurrence.
   SmallPtrSet<Instruction *, 8> CastInsts;
+  // The minimum width used by the recurrence.
+  unsigned MinWidthCastToRecurrenceType;
 };
 
 /// A struct for saving information about induction variables.
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -161,27 +161,39 @@ static std::pair<Type *, bool> computeRecurrenceType(Instruction *Exit,
 
 /// Collect cast instructions that can be ignored in the vectorizer's cost
 /// model, given a reduction exit value and the minimal type in which the
-/// reduction can be represented.
-static void collectCastsToIgnore(Loop *TheLoop, Instruction *Exit,
-                                 Type *RecurrenceType,
-                                 SmallPtrSetImpl<Instruction *> &Casts) {
+// reduction can be represented. Also search casts to the recurrence type
+// to find the minimum width used by the recurrence.
+static void collectCastInstrs(Loop *TheLoop, Instruction *Exit,
+                              Type *RecurrenceType,
+                              SmallPtrSetImpl<Instruction *> &Casts,
+                              unsigned &MinWidthCastToRecurTy) {
 
   SmallVector<Instruction *, 8> Worklist;
   SmallPtrSet<Instruction *, 8> Visited;
   Worklist.push_back(Exit);
+  MinWidthCastToRecurTy = -1U;
 
   while (!Worklist.empty()) {
     Instruction *Val = Worklist.pop_back_val();
     Visited.insert(Val);
-    if (auto *Cast = dyn_cast<CastInst>(Val))
+    if (auto *Cast = dyn_cast<CastInst>(Val)) {
       if (Cast->getSrcTy() == RecurrenceType) {
         // If the source type of a cast instruction is equal to the recurrence
         // type, it will be eliminated, and should be ignored in the vectorizer
         // cost model.
         Casts.insert(Cast);
         continue;
       }
-
+      if (Cast->getDestTy() == RecurrenceType) {
+        // The minimum width used by the recurrence is found by checking for
+        // casts on its operands. The minimum width is used by the vectorizer
+        // when finding the widest type for in-loop reductions without any
+        // loads/stores.
+        MinWidthCastToRecurTy = std::min<unsigned>(
+            MinWidthCastToRecurTy, Cast->getSrcTy()->getScalarSizeInBits());
+        continue;
+      }
+    }
     // Add all operands to the work list if they are loop-varying values that
     // we haven't yet visited.
     for (Value *O : cast<User>(Val)->operands())
@@ -265,6 +277,7 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
   // Data used for determining if the recurrence has been type-promoted.
   Type *RecurrenceType = Phi->getType();
   SmallPtrSet<Instruction *, 4> CastInsts;
+  unsigned MinWidthCastToRecurrenceType;
   Instruction *Start = Phi;
   bool IsSigned = false;
 
@@ -500,21 +513,24 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
         computeRecurrenceType(ExitInstruction, DB, AC, DT);
     if (ComputedType != RecurrenceType)
       return false;
-
-    // The recurrence expression will be represented in a narrower type. If
-    // there are any cast instructions that will be unnecessary, collect them
-    // in CastInsts. Note that the 'and' instruction was already included in
-    // this list.
-    //
-    // TODO: A better way to represent this may be to tag in some way all the
-    //       instructions that are a part of the reduction. The vectorizer cost
-    //       model could then apply the recurrence type to these instructions,
-    //       without needing a white list of instructions to ignore.
-    //       This may also be useful for the inloop reductions, if it can be
-    //       kept simple enough.
-    collectCastsToIgnore(TheLoop, ExitInstruction, RecurrenceType, CastInsts);
   }
 
+  // Collect cast instructions and the minimum width used by the recurrence.
+  // If the starting value is not the same as the phi node and the computed
+  // recurrence type is equal to the recurrence type, the recurrence expression
+  // will be represented in a narrower or wider type. If there are any cast
+  // instructions that will be unnecessary, collect them in CastsFromRecurTy.
+  // Note that the 'and' instruction was already included in this list.
+  //
+  // TODO: A better way to represent this may be to tag in some way all the
+  //       instructions that are a part of the reduction. The vectorizer cost
+  //       model could then apply the recurrence type to these instructions,
+  //       without needing a white list of instructions to ignore.
+  //       This may also be useful for the inloop reductions, if it can be
+  //       kept simple enough.
+  collectCastInstrs(TheLoop, ExitInstruction, RecurrenceType, CastInsts,
+                    MinWidthCastToRecurrenceType);
+
   // We found a reduction var if we have reached the original phi node and we
   // only have a single instruction with out-of-loop users.
 
@@ -524,7 +540,8 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
   // Save the description of this reduction variable.
   RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF,
                           ReduxDesc.getExactFPMathInst(), RecurrenceType,
-                          IsSigned, IsOrdered, CastInsts);
+                          IsSigned, IsOrdered, CastInsts,
+                          MinWidthCastToRecurrenceType);
   RedDes = RD;
 
   return true;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5960,11 +5960,29 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
   unsigned MinWidth = -1U;
   unsigned MaxWidth = 8;
   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
-  for (Type *T : ElementTypesInLoop) {
-    MinWidth = std::min<unsigned>(
-        MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
-    MaxWidth = std::max<unsigned>(
-        MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+  // For in-loop reductions, no element types are added to ElementTypesInLoop
+  // if there are no loads/stores in the loop. In this case, check through the
+  // reduction variables to determine the maximum width.
+  if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
+    // Reset MaxWidth so that we can find the smallest type used by recurrences
+    // in the loop.
+    MaxWidth = -1U;
+    for (auto &PhiDescriptorPair : Legal->getReductionVars()) {
+      const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
+      // When finding the min width used by the recurrence we need to account
+      // for casts on the input operands of the recurrence.
+      MaxWidth = std::min<unsigned>(
+          MaxWidth, std::min<unsigned>(
+                        RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
+                        RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
+    }
+  } else {
+    for (Type *T : ElementTypesInLoop) {
+      MinWidth = std::min<unsigned>(
+          MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+      MaxWidth = std::max<unsigned>(
+          MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+    }
   }
   return {MinWidth, MaxWidth};
 }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: opt < %s -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-target-instruction-cost=1 -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
@@ -31,3 +31,74 @@ for.body:
 for.end:
   ret void
 }
+
+; For in-loop reductions with no loads or stores in the loop the widest type is
+; determined by looking through the recurrences, which allows a sensible VF to be
+; chosen. The following 3 cases check different combinations of widths.
+
+; CHECK-LABEL: Checking a loop in "no_loads_stores_32"
+; CHECK: The Smallest and Widest types: 4294967295 / 32 bits
+; CHECK: Selecting VF: 4
+
+define double @no_loads_stores_32(i32 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %s.09 = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %conv = sitofp i32 %i.08 to float
+  %conv1 = fpext float %conv to double
+  %add = fadd double %s.09, %conv1
+  %inc = add nuw i32 %i.08, 1
+  %exitcond.not = icmp eq i32 %inc, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  %.lcssa = phi double [ %add, %for.body ]
+  ret double %.lcssa
+}
+
+; CHECK-LABEL: Checking a loop in "no_loads_stores_16"
+; CHECK: The Smallest and Widest types: 4294967295 / 16 bits
+; CHECK: Selecting VF: 8
+
+define double @no_loads_stores_16() {
+entry:
+  br label %for.body
+
+for.body:
+  %s.09 = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %i.08 = phi i16 [ 0, %entry ], [ %inc, %for.body ]
+  %conv = sitofp i16 %i.08 to double
+  %add = fadd double %s.09, %conv
+  %inc = add nuw nsw i16 %i.08, 1
+  %exitcond.not = icmp eq i16 %inc, 12345
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  %.lcssa = phi double [ %add, %for.body ]
+  ret double %.lcssa
+}
+
+; CHECK-LABEL: Checking a loop in "no_loads_stores_8"
+; CHECK: The Smallest and Widest types: 4294967295 / 8 bits
+; CHECK: Selecting VF: 16
+
+define float @no_loads_stores_8() {
+entry:
+  br label %for.body
+
+for.body:
+  %s.09 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %i.08 = phi i8 [ 0, %entry ], [ %inc, %for.body ]
+  %conv = sitofp i8 %i.08 to float
+  %add = fadd float %s.09, %conv
+  %inc = add nuw nsw i8 %i.08, 1
+  %exitcond.not = icmp eq i8 %inc, 12345
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  %.lcssa = phi float [ %add, %for.body ]
+  ret float %.lcssa
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/funclet.ll b/llvm/test/Transforms/LoopVectorize/X86/funclet.ll
@@ -33,7 +33,7 @@ unreachable:                                      ; preds = %entry
 
 ; CHECK-LABEL: define void @test1(
 ; CHECK: %[[cpad:.*]] = catchpad within {{.*}} [i8* null, i32 64, i8* null]
-; CHECK: call <16 x double> @llvm.floor.v16f64(<16 x double> {{.*}}) [ "funclet"(token %[[cpad]]) ]
+; CHECK: call <8 x double> @llvm.floor.v8f64(<8 x double> {{.*}}) [ "funclet"(token %[[cpad]]) ]
 
 declare x86_stdcallcc void @_CxxThrowException(i8*, i8*)