[LV] Extend InstWidening with CM_Widen_Recursive

Hal Finkel · Hal Finkel · commit 3f92210a791a · 2017-12-16T02:55:24.000Z
Changes to the original scalar loop during LV code gen cause the return value of Legal->isConsecutivePtr() to be inconsistent with the return value during legal/cost phases (further analysis and information of the bug is in D39346). This patch is an alternative fix to PR34965 following the CM_Widen approach proposed by Ayal and Gil in D39346. It extends InstWidening enum with CM_Widen_Reverse to properly record the widening decision for consecutive reverse memory accesses and, consequently, get rid of the Legal->isConsetuviePtr() call in LV code gen. I think this is a simpler/cleaner solution to PR34965 than the one in D39346. Fixes PR34965. Patch by Diego Caballero, thanks! Differential Revision: https://reviews.llvm.org/D40742 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@320913 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1612,6 +1612,8 @@ class LoopVectorizationLegality {
   /// 0 - Stride is unknown or non-consecutive.
   /// 1 - Address is consecutive.
   /// -1 - Address is consecutive, and decreasing.
+  /// NOTE: This method must only be used before modifying the original scalar
+  /// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965).
   int isConsecutivePtr(Value *Ptr);
 
   /// Returns true if the value V is uniform within the loop.
@@ -1966,7 +1968,8 @@ class LoopVectorizationCostModel {
   /// Decision that was taken during cost calculation for memory instruction.
   enum InstWidening {
     CM_Unknown,
-    CM_Widen,
+    CM_Widen,         // For consecutive accesses with stride +1.
+    CM_Widen_Reverse, // For consecutive accesses with stride -1.
     CM_Interleave,
     CM_GatherScatter,
     CM_Scalarize
@@ -3204,8 +3207,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
 
   // Determine if the pointer operand of the access is either consecutive or
   // reverse consecutive.
-  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
-  bool Reverse = ConsecutiveStride < 0;
+  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
+  bool ConsecutiveStride =
+      Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
   bool CreateGatherScatter =
       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
 
@@ -5711,6 +5715,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
            "Widening decision should be ready at this moment");
 
     return (WideningDecision == CM_Widen ||
+            WideningDecision == CM_Widen_Reverse ||
             WideningDecision == CM_Interleave);
   };
   // Iterate over the instructions in the loop, and collect all
@@ -7180,7 +7185,12 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
       // We assume that widening is the best solution when possible.
       if (Legal->memoryInstructionCanBeWidened(&I, VF)) {
         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
-        setWideningDecision(&I, VF, CM_Widen, Cost);
+        int ConsecutiveStride = Legal->isConsecutivePtr(getPointerOperand(&I));
+        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
+               "Expected consecutive stride.");
+        InstWidening Decision =
+            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
+        setWideningDecision(&I, VF, Decision, Cost);
         continue;
       }
 
@@ -7270,7 +7280,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
       // by cost functions, but since this involves the task of finding out
       // if the loaded register is involved in an address computation, it is
       // instead changed here when we know this is the case.
-      if (getWideningDecision(I, VF) == CM_Widen)
+      InstWidening Decision = getWideningDecision(I, VF);
+      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
         // Scalarize a widened load of address.
         setWideningDecision(I, VF, CM_Scalarize,
                             (VF * getMemoryInstructionCost(I, 1)));
diff --git a/test/Transforms/LoopVectorize/consecutive-ptr-cg-bug.ll b/test/Transforms/LoopVectorize/consecutive-ptr-cg-bug.ll
@@ -0,0 +1,68 @@
+; RUN: opt -loop-vectorize -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target triple = "x86_64-unknown-linux-gnu"
+
+; PR34965/D39346
+
+; LV retains the original scalar loop intact as remainder loop. However,
+; after this transformation, analysis information concerning the remainder
+; loop may differ from the original scalar loop. This test is an example of
+; that behaviour, where values inside the remainder loop which SCEV could
+; originally analyze now require flow-sensitive analysis currently not
+; supported in SCEV. In particular, during LV code generation, after turning
+; the original scalar loop into the remainder loop, LV expected
+; Legal->isConsecutivePtr() to be consistent and return the same output as
+; during legal/cost model phases (original scalar loop). Unfortunately, that
+; condition was not satisfied because of the aforementioned SCEV limitation.
+; After D39346, LV code generation doesn't rely on Legal->isConsecutivePtr(),
+; i.e., SCEV. This test verifies that LV is able to handle the described cases.
+;
+; TODO: The SCEV limitation described before may affect plans to further
+; optimize the remainder loop of this particular test case. One tentative
+; solution is to detect the problematic IVs in LV (%7 and %8) and perform an
+; in-place IV optimization by replacing:
+;   %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ] with
+; with
+;   %8 = sub i32 %7, 1.
+
+
+; Verify that store is vectorized as stride-1 memory access.
+
+; CHECK: vector.body:
+; CHECK: store <4 x i32>
+
+; Function Attrs: uwtable
+define void @test() {
+  br label %.outer
+
+; <label>:1:                                      ; preds = %2
+  ret void
+
+; <label>:2:                                      ; preds = %._crit_edge.loopexit
+  %3 = add nsw i32 %.ph, -2
+  br i1 undef, label %1, label %.outer
+
+.outer:                                           ; preds = %2, %0
+  %.ph = phi i32 [ %3, %2 ], [ 336, %0 ]
+  %.ph2 = phi i32 [ 62, %2 ], [ 110, %0 ]
+  %4 = and i32 %.ph, 30
+  %5 = add i32 %.ph2, 1
+  br label %6
+
+; <label>:6:                                      ; preds = %6, %.outer
+  %7 = phi i32 [ %5, %.outer ], [ %13, %6 ]
+  %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ]
+  %9 = add i32 %8, 2
+  %10 = zext i32 %9 to i64
+  %11 = getelementptr inbounds i32, i32 addrspace(1)* undef, i64 %10
+  %12 = ashr i32 undef, %4
+  store i32 %12, i32 addrspace(1)* %11, align 4
+  %13 = add i32 %7, 1
+  %14 = icmp sgt i32 %13, 61
+  br i1 %14, label %._crit_edge.loopexit, label %6
+
+._crit_edge.loopexit:                             ; preds = %._crit_edge.loopexit, %6
+  br i1 undef, label %2, label %._crit_edge.loopexit
+}
+