Skip to content
This repository was archived by the owner on Mar 28, 2020. It is now read-only.

Commit 3f92210

Browse files
author
Hal Finkel
committed
[LV] Extend InstWidening with CM_Widen_Recursive
Changes to the original scalar loop during LV code gen cause the return value of Legal->isConsecutivePtr() to be inconsistent with the return value during legal/cost phases (further analysis and information of the bug is in D39346). This patch is an alternative fix to PR34965 following the CM_Widen approach proposed by Ayal and Gil in D39346. It extends InstWidening enum with CM_Widen_Reverse to properly record the widening decision for consecutive reverse memory accesses and, consequently, get rid of the Legal->isConsetuviePtr() call in LV code gen. I think this is a simpler/cleaner solution to PR34965 than the one in D39346. Fixes PR34965. Patch by Diego Caballero, thanks! Differential Revision: https://reviews.llvm.org/D40742 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@320913 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent cd27af2 commit 3f92210

File tree

2 files changed

+84
-5
lines changed

2 files changed

+84
-5
lines changed

lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1612,6 +1612,8 @@ class LoopVectorizationLegality {
16121612
/// 0 - Stride is unknown or non-consecutive.
16131613
/// 1 - Address is consecutive.
16141614
/// -1 - Address is consecutive, and decreasing.
1615+
/// NOTE: This method must only be used before modifying the original scalar
1616+
/// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965).
16151617
int isConsecutivePtr(Value *Ptr);
16161618

16171619
/// Returns true if the value V is uniform within the loop.
@@ -1966,7 +1968,8 @@ class LoopVectorizationCostModel {
19661968
/// Decision that was taken during cost calculation for memory instruction.
19671969
enum InstWidening {
19681970
CM_Unknown,
1969-
CM_Widen,
1971+
CM_Widen, // For consecutive accesses with stride +1.
1972+
CM_Widen_Reverse, // For consecutive accesses with stride -1.
19701973
CM_Interleave,
19711974
CM_GatherScatter,
19721975
CM_Scalarize
@@ -3204,8 +3207,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
32043207

32053208
// Determine if the pointer operand of the access is either consecutive or
32063209
// reverse consecutive.
3207-
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
3208-
bool Reverse = ConsecutiveStride < 0;
3210+
bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
3211+
bool ConsecutiveStride =
3212+
Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
32093213
bool CreateGatherScatter =
32103214
(Decision == LoopVectorizationCostModel::CM_GatherScatter);
32113215

@@ -5711,6 +5715,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
57115715
"Widening decision should be ready at this moment");
57125716

57135717
return (WideningDecision == CM_Widen ||
5718+
WideningDecision == CM_Widen_Reverse ||
57145719
WideningDecision == CM_Interleave);
57155720
};
57165721
// Iterate over the instructions in the loop, and collect all
@@ -7180,7 +7185,12 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
71807185
// We assume that widening is the best solution when possible.
71817186
if (Legal->memoryInstructionCanBeWidened(&I, VF)) {
71827187
unsigned Cost = getConsecutiveMemOpCost(&I, VF);
7183-
setWideningDecision(&I, VF, CM_Widen, Cost);
7188+
int ConsecutiveStride = Legal->isConsecutivePtr(getPointerOperand(&I));
7189+
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7190+
"Expected consecutive stride.");
7191+
InstWidening Decision =
7192+
ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
7193+
setWideningDecision(&I, VF, Decision, Cost);
71847194
continue;
71857195
}
71867196

@@ -7270,7 +7280,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
72707280
// by cost functions, but since this involves the task of finding out
72717281
// if the loaded register is involved in an address computation, it is
72727282
// instead changed here when we know this is the case.
7273-
if (getWideningDecision(I, VF) == CM_Widen)
7283+
InstWidening Decision = getWideningDecision(I, VF);
7284+
if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
72747285
// Scalarize a widened load of address.
72757286
setWideningDecision(I, VF, CM_Scalarize,
72767287
(VF * getMemoryInstructionCost(I, 1)));
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
; RUN: opt -loop-vectorize -S < %s | FileCheck %s
2+
3+
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
4+
target triple = "x86_64-unknown-linux-gnu"
5+
6+
; PR34965/D39346
7+
8+
; LV retains the original scalar loop intact as remainder loop. However,
9+
; after this transformation, analysis information concerning the remainder
10+
; loop may differ from the original scalar loop. This test is an example of
11+
; that behaviour, where values inside the remainder loop which SCEV could
12+
; originally analyze now require flow-sensitive analysis currently not
13+
; supported in SCEV. In particular, during LV code generation, after turning
14+
; the original scalar loop into the remainder loop, LV expected
15+
; Legal->isConsecutivePtr() to be consistent and return the same output as
16+
; during legal/cost model phases (original scalar loop). Unfortunately, that
17+
; condition was not satisfied because of the aforementioned SCEV limitation.
18+
; After D39346, LV code generation doesn't rely on Legal->isConsecutivePtr(),
19+
; i.e., SCEV. This test verifies that LV is able to handle the described cases.
20+
;
21+
; TODO: The SCEV limitation described before may affect plans to further
22+
; optimize the remainder loop of this particular test case. One tentative
23+
; solution is to detect the problematic IVs in LV (%7 and %8) and perform an
24+
; in-place IV optimization by replacing:
25+
; %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ] with
26+
; with
27+
; %8 = sub i32 %7, 1.
28+
29+
30+
; Verify that store is vectorized as stride-1 memory access.
31+
32+
; CHECK: vector.body:
33+
; CHECK: store <4 x i32>
34+
35+
; Function Attrs: uwtable
36+
define void @test() {
37+
br label %.outer
38+
39+
; <label>:1: ; preds = %2
40+
ret void
41+
42+
; <label>:2: ; preds = %._crit_edge.loopexit
43+
%3 = add nsw i32 %.ph, -2
44+
br i1 undef, label %1, label %.outer
45+
46+
.outer: ; preds = %2, %0
47+
%.ph = phi i32 [ %3, %2 ], [ 336, %0 ]
48+
%.ph2 = phi i32 [ 62, %2 ], [ 110, %0 ]
49+
%4 = and i32 %.ph, 30
50+
%5 = add i32 %.ph2, 1
51+
br label %6
52+
53+
; <label>:6: ; preds = %6, %.outer
54+
%7 = phi i32 [ %5, %.outer ], [ %13, %6 ]
55+
%8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ]
56+
%9 = add i32 %8, 2
57+
%10 = zext i32 %9 to i64
58+
%11 = getelementptr inbounds i32, i32 addrspace(1)* undef, i64 %10
59+
%12 = ashr i32 undef, %4
60+
store i32 %12, i32 addrspace(1)* %11, align 4
61+
%13 = add i32 %7, 1
62+
%14 = icmp sgt i32 %13, 61
63+
br i1 %14, label %._crit_edge.loopexit, label %6
64+
65+
._crit_edge.loopexit: ; preds = %._crit_edge.loopexit, %6
66+
br i1 undef, label %2, label %._crit_edge.loopexit
67+
}
68+

0 commit comments

Comments
 (0)