[SLP] allow forming 2-way reduction patterns

rotateright · rotateright · commit 7ff57705ba19 · 2019-11-07T06:08:42.000-05:00
We have a vector compare reduction problem seen in PR39665 comment 2: https://bugs.llvm.org/show_bug.cgi?id=39665#c2 Or slightly reduced here: define i1 @Cmp2(<2 x double> %a0) { %a = fcmp ogt <2 x double> %a0, <double 1.0, double 1.0> %b = extractelement <2 x i1> %a, i32 0 %c = extractelement <2 x i1> %a, i32 1 %d = and i1 %b, %c ret i1 %d } SLP would not attempt to turn this into a vector reduction because there is an artificial lower limit on that transform. We can not completely remove that limit without inducing regressions though, so this patch just hacks an extra attempt at creating a 2-way reduction to the end of the analysis. As shown in the test file, we are still not getting some of the motivating cases, so follow-on patches will be needed to solve those cases. Differential Revision: https://reviews.llvm.org/D59710
diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -113,9 +113,12 @@ struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
 
   /// Try to find horizontal reduction or otherwise vectorize a chain of binary
   /// operators.
+  /// \p Try2WayRdx specializes the analysis to only attempt a 2-element
+  /// reduction.
   bool vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB,
                                 slpvectorizer::BoUpSLP &R,
-                                TargetTransformInfo *TTI);
+                                TargetTransformInfo *TTI,
+                                bool Try2WayRdx = false);
 
   /// Try to vectorize trees that start at insertvalue instructions.
   bool vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6397,19 +6397,22 @@ class HorizontalReduction {
 
   /// Attempt to vectorize the tree found by
   /// matchAssociativeReduction.
-  bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
+  bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI, bool Try2WayRdx) {
     if (ReducedVals.empty())
       return false;
 
     // If there is a sufficient number of reduction values, reduce
     // to a nearby power-of-2. Can safely generate oversized
     // vectors and rely on the backend to split them to legal sizes.
     unsigned NumReducedVals = ReducedVals.size();
-    if (NumReducedVals < 4)
+    if (Try2WayRdx && NumReducedVals != 2)
+      return false;
+    unsigned MinRdxVals = Try2WayRdx ? 2 : 4;
+    if (NumReducedVals < MinRdxVals)
       return false;
 
     unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
-
+    unsigned MinRdxWidth = Log2_32(MinRdxVals);
     Value *VectorizedTree = nullptr;
 
     // FIXME: Fast-math-flags should be set based on the instructions in the
@@ -6433,7 +6436,7 @@ class HorizontalReduction {
     SmallVector<Value *, 16> IgnoreList;
     for (auto &V : ReductionOps)
       IgnoreList.append(V.begin(), V.end());
-    while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
+    while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > MinRdxWidth) {
       auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
       V.buildTree(VL, ExternallyUsedValues, IgnoreList);
       Optional<ArrayRef<unsigned>> Order = V.bestOrder();
@@ -6759,7 +6762,7 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
 /// performed.
 static bool tryToVectorizeHorReductionOrInstOperands(
     PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
-    TargetTransformInfo *TTI,
+    TargetTransformInfo *TTI, bool Try2WayRdx,
     const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
   if (!ShouldVectorizeHor)
     return false;
@@ -6790,7 +6793,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
     if (BI || SI) {
       HorizontalReduction HorRdx;
       if (HorRdx.matchAssociativeReduction(P, Inst)) {
-        if (HorRdx.tryToReduce(R, TTI)) {
+        if (HorRdx.tryToReduce(R, TTI, Try2WayRdx)) {
           Res = true;
           // Set P to nullptr to avoid re-analysis of phi node in
           // matchAssociativeReduction function unless this is the root node.
@@ -6833,7 +6836,8 @@ static bool tryToVectorizeHorReductionOrInstOperands(
 
 bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
                                                  BasicBlock *BB, BoUpSLP &R,
-                                                 TargetTransformInfo *TTI) {
+                                                 TargetTransformInfo *TTI,
+                                                 bool Try2WayRdx) {
   if (!V)
     return false;
   auto *I = dyn_cast<Instruction>(V);
@@ -6846,7 +6850,7 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
   auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
     return tryToVectorize(I, R);
   };
-  return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
+  return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, Try2WayRdx,
                                                   ExtraVectorization);
 }
 
@@ -7042,6 +7046,23 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       PostProcessInstructions.push_back(&*it);
   }
 
+  // Make a final attempt to match a 2-way reduction if nothing else worked.
+  // We do not try this above because it may interfere with other vectorization
+  // attempts.
+  // TODO: The constraints are copied from the above call to
+  //       vectorizeRootInstruction(), but that might be too restrictive?
+  BasicBlock::iterator LastInst = --BB->end();
+  if (!Changed && LastInst->use_empty() &&
+      (LastInst->getType()->isVoidTy() || isa<CallInst>(LastInst) ||
+       isa<InvokeInst>(LastInst))) {
+    if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(LastInst)) {
+      for (auto *V : LastInst->operand_values()) {
+        Changed |= vectorizeRootInstruction(nullptr, V, BB, R, TTI,
+                                            /* Try2WayRdx */ true);
+      }
+    }
+  }
+
   return Changed;
 }
 
diff --git a/llvm/test/Feature/weak_constant.ll b/llvm/test/Feature/weak_constant.ll
@@ -1,5 +1,5 @@
 ; RUN: opt < %s -O3 -S > %t
-; RUN:   grep undef %t | count 1
+; RUN:   grep undef %t | count 2
 ; RUN:   grep 5 %t | count 1
 ; RUN:   grep 7 %t | count 1
 ; RUN:   grep 9 %t | count 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll
@@ -54,10 +54,10 @@ define double @foo(double* nocapture %D) {
 define i1 @two_wide_fcmp_reduction(<2 x double> %a0) {
 ; CHECK-LABEL: @two_wide_fcmp_reduction(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp ogt <2 x double> [[A0:%.*]], <double 1.000000e+00, double 1.000000e+00>
-; CHECK-NEXT:    [[B:%.*]] = extractelement <2 x i1> [[A]], i32 0
-; CHECK-NEXT:    [[C:%.*]] = extractelement <2 x i1> [[A]], i32 1
-; CHECK-NEXT:    [[D:%.*]] = and i1 [[B]], [[C]]
-; CHECK-NEXT:    ret i1 [[D]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <2 x i1> [[A]], <2 x i1> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = and <2 x i1> [[A]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i1> [[BIN_RDX]], i32 0
+; CHECK-NEXT:    ret i1 [[TMP1]]
 ;
   %a = fcmp ogt <2 x double> %a0, <double 1.0, double 1.0>
   %b = extractelement <2 x i1> %a, i32 0
@@ -96,12 +96,11 @@ define i1 @fcmp_lt_gt(double %a, double %b, double %c) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> undef, double [[MUL]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[MUL]], i32 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt double [[TMP8]], 0x3EB0C6F7A0B5ED8D
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
-; CHECK-NEXT:    [[CMP4:%.*]] = fcmp olt double [[TMP9]], 0x3EB0C6F7A0B5ED8D
-; CHECK-NEXT:    [[OR_COND:%.*]] = and i1 [[CMP]], [[CMP4]]
-; CHECK-NEXT:    br i1 [[OR_COND]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP7]], <double 0x3EB0C6F7A0B5ED8D, double 0x3EB0C6F7A0B5ED8D>
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = and <2 x i1> [[TMP8]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[BIN_RDX]], i32 0
+; CHECK-NEXT:    br i1 [[TMP9]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]]
 ; CHECK:       lor.lhs.false:
 ; CHECK-NEXT:    [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP7]], <double 1.000000e+00, double 1.000000e+00>
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0