Skip to content
This repository was archived by the owner on Feb 5, 2019. It is now read-only.

Commit 751d6dd

Browse files
author
Charlie Turner
committed
[SLP] Try a bit harder to find reduction PHIs
Summary: Currently, when the SLP vectorizer considers whether a phi is part of a reduction, it dismisses phis whose incoming blocks are not the same as the block containing the phi. For the patterns I'm looking at, extending this rule to allow phis whose incoming block is a containing loop latch allows me to vectorize certain workloads. There is no significant compile-time impact, and combined with D13949, no performance improvement measured in ARM/AArch64 in any of SPEC2000, SPEC2006 or LNT. Reviewers: jmolloy, mcrosier, nadav Subscribers: mssimpso, nadav, aemerson, llvm-commits Differential Revision: http://reviews.llvm.org/D14063 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@251425 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent b13834e commit 751d6dd

File tree

2 files changed

+117
-5
lines changed

2 files changed

+117
-5
lines changed

lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3933,6 +3933,46 @@ static bool PhiTypeSorterFunc(Value *V, Value *V2) {
39333933
return V->getType() < V2->getType();
39343934
}
39353935

3936+
/// \brief Try and get a reduction value from a phi node.
3937+
///
3938+
/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
3939+
/// if they come from either \p ParentBB or a containing loop latch.
3940+
///
3941+
/// \returns A candidate reduction value if possible, or \code nullptr \endcode
3942+
/// if not possible.
3943+
static Value *getReductionValue(PHINode *P, BasicBlock *ParentBB,
3944+
LoopInfo *LI) {
3945+
Value *Rdx = nullptr;
3946+
3947+
// Return the incoming value if it comes from the same BB as the phi node.
3948+
if (P->getIncomingBlock(0) == ParentBB) {
3949+
Rdx = P->getIncomingValue(0);
3950+
} else if (P->getIncomingBlock(1) == ParentBB) {
3951+
Rdx = P->getIncomingValue(1);
3952+
}
3953+
3954+
if (Rdx)
3955+
return Rdx;
3956+
3957+
// Otherwise, check whether we have a loop latch to look at.
3958+
Loop *BBL = LI->getLoopFor(ParentBB);
3959+
if (!BBL)
3960+
return Rdx;
3961+
BasicBlock *BBLatch = BBL->getLoopLatch();
3962+
if (!BBLatch)
3963+
return Rdx;
3964+
3965+
// There is a loop latch, return the incoming value if it comes from
3966+
// that. This reduction pattern occasionally turns up.
3967+
if (P->getIncomingBlock(0) == BBLatch) {
3968+
Rdx = P->getIncomingValue(0);
3969+
} else if (P->getIncomingBlock(1) == BBLatch) {
3970+
Rdx = P->getIncomingValue(1);
3971+
}
3972+
3973+
return Rdx;
3974+
}
3975+
39363976
bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
39373977
bool Changed = false;
39383978
SmallVector<Value *, 4> Incoming;
@@ -4000,11 +4040,9 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
40004040
// Check that the PHI is a reduction PHI.
40014041
if (P->getNumIncomingValues() != 2)
40024042
return Changed;
4003-
Value *Rdx =
4004-
(P->getIncomingBlock(0) == BB
4005-
? (P->getIncomingValue(0))
4006-
: (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1)
4007-
: nullptr));
4043+
4044+
Value *Rdx = getReductionValue(P, BB, LI);
4045+
40084046
// Check if this is a Binary Operator.
40094047
BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
40104048
if (!BI)

test/Transforms/SLPVectorizer/AArch64/horizontal.ll

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,3 +71,77 @@ for.end: ; preds = %for.end.loopexit, %
7171
%s.0.lcssa = phi i32 [ 0, %entry ], [ %add27, %for.end.loopexit ]
7272
ret i32 %s.0.lcssa
7373
}
74+
75+
;; Check whether SLP can find a reduction phi whose incoming blocks are not
76+
;; the same as the block containing the phi.
77+
;;
78+
;; Came from code like,
79+
;;
80+
;; int s = 0;
81+
;; for (int j = 0; j < h; j++) {
82+
;; s += p1[0] * p2[0];
83+
;; s += p1[1] * p2[1];
84+
;; s += p1[2] * p2[2];
85+
;; s += p1[3] * p2[3];
86+
;; if (s >= lim)
87+
;; break;
88+
;; p1 += lx;
89+
;; p2 += lx;
90+
;; }
91+
define i32 @reduction_with_br(i32* noalias nocapture readonly %blk1, i32* noalias nocapture readonly %blk2, i32 %lx, i32 %h, i32 %lim) {
92+
; CHECK-LABEL: reduction_with_br
93+
; CHECK: load <4 x i32>
94+
; CHECK: load <4 x i32>
95+
; CHECK: mul nsw <4 x i32>
96+
entry:
97+
%cmp.16 = icmp sgt i32 %h, 0
98+
br i1 %cmp.16, label %for.body.lr.ph, label %for.end
99+
100+
for.body.lr.ph: ; preds = %entry
101+
%idx.ext = sext i32 %lx to i64
102+
br label %for.body
103+
104+
for.body: ; preds = %for.body.lr.ph, %if.end
105+
%s.020 = phi i32 [ 0, %for.body.lr.ph ], [ %add13, %if.end ]
106+
%j.019 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %if.end ]
107+
%p2.018 = phi i32* [ %blk2, %for.body.lr.ph ], [ %add.ptr16, %if.end ]
108+
%p1.017 = phi i32* [ %blk1, %for.body.lr.ph ], [ %add.ptr, %if.end ]
109+
%0 = load i32, i32* %p1.017, align 4
110+
%1 = load i32, i32* %p2.018, align 4
111+
%mul = mul nsw i32 %1, %0
112+
%add = add nsw i32 %mul, %s.020
113+
%arrayidx2 = getelementptr inbounds i32, i32* %p1.017, i64 1
114+
%2 = load i32, i32* %arrayidx2, align 4
115+
%arrayidx3 = getelementptr inbounds i32, i32* %p2.018, i64 1
116+
%3 = load i32, i32* %arrayidx3, align 4
117+
%mul4 = mul nsw i32 %3, %2
118+
%add5 = add nsw i32 %add, %mul4
119+
%arrayidx6 = getelementptr inbounds i32, i32* %p1.017, i64 2
120+
%4 = load i32, i32* %arrayidx6, align 4
121+
%arrayidx7 = getelementptr inbounds i32, i32* %p2.018, i64 2
122+
%5 = load i32, i32* %arrayidx7, align 4
123+
%mul8 = mul nsw i32 %5, %4
124+
%add9 = add nsw i32 %add5, %mul8
125+
%arrayidx10 = getelementptr inbounds i32, i32* %p1.017, i64 3
126+
%6 = load i32, i32* %arrayidx10, align 4
127+
%arrayidx11 = getelementptr inbounds i32, i32* %p2.018, i64 3
128+
%7 = load i32, i32* %arrayidx11, align 4
129+
%mul12 = mul nsw i32 %7, %6
130+
%add13 = add nsw i32 %add9, %mul12
131+
%cmp14 = icmp slt i32 %add13, %lim
132+
br i1 %cmp14, label %if.end, label %for.end.loopexit
133+
134+
if.end: ; preds = %for.body
135+
%add.ptr = getelementptr inbounds i32, i32* %p1.017, i64 %idx.ext
136+
%add.ptr16 = getelementptr inbounds i32, i32* %p2.018, i64 %idx.ext
137+
%inc = add nuw nsw i32 %j.019, 1
138+
%cmp = icmp slt i32 %inc, %h
139+
br i1 %cmp, label %for.body, label %for.end.loopexit
140+
141+
for.end.loopexit: ; preds = %for.body, %if.end
142+
br label %for.end
143+
144+
for.end: ; preds = %for.end.loopexit, %entry
145+
%s.1 = phi i32 [ 0, %entry ], [ %add13, %for.end.loopexit ]
146+
ret i32 %s.1
147+
}

0 commit comments

Comments
 (0)