Skip to content
This repository was archived by the owner on Feb 5, 2019. It is now read-only.

Commit 63fe164

Browse files
author
Charlie Turner
committed
[SLP] Be more aggressive about reduction width selection.
Summary: This change could be way off-piste, I'm looking for any feedback on whether it's an acceptable approach. It never seems to be a problem to gobble up as many reduction values as can be found, and then to attempt to reduce the resulting tree. Some of the workloads I'm looking at have been aggressively unrolled by hand, and by selecting reduction widths that are not constrained by a vector register size, it becomes possible to profitably vectorize. My test case shows such an unrolling which SLP was not vectorizing (on either ARM or X86) before this patch, but with it does vectorize. I measure no significant compile time impact of this change when combined with D13949 and D14063. There are also no significant performance regressions on ARM/AArch64 in SPEC or LNT. The more principled approach I thought of was to generate several candidate trees and use the cost model to pick the cheapest one. That seemed like quite a big design change (the algorithms seem very much one-shot), and would likely be a costly thing for compile time. This seemed to do the job at very little cost, but I'm worried I've misunderstood something! Reviewers: nadav, jmolloy Subscribers: mssimpso, llvm-commits, aemerson Differential Revision: http://reviews.llvm.org/D14116 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@251428 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 751d6dd commit 63fe164

File tree

2 files changed

+158
-12
lines changed

2 files changed

+158
-12
lines changed

lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3659,16 +3659,17 @@ class HorizontalReduction {
36593659
unsigned ReductionOpcode;
36603660
/// The opcode of the values we perform a reduction on.
36613661
unsigned ReducedValueOpcode;
3662-
/// The width of one full horizontal reduction operation.
3663-
unsigned ReduxWidth;
36643662
/// Should we model this reduction as a pairwise reduction tree or a tree that
36653663
/// splits the vector in halves and adds those halves.
36663664
bool IsPairwiseReduction;
36673665

36683666
public:
3667+
/// The width of one full horizontal reduction operation.
3668+
unsigned ReduxWidth;
3669+
36693670
HorizontalReduction()
36703671
: ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
3671-
ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
3672+
ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0) {}
36723673

36733674
/// \brief Try to find a reduction tree.
36743675
bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
@@ -3825,8 +3826,11 @@ class HorizontalReduction {
38253826
return VectorizedTree != nullptr;
38263827
}
38273828

3828-
private:
3829+
unsigned numReductionValues() const {
3830+
return ReducedVals.size();
3831+
}
38293832

3833+
private:
38303834
/// \brief Calculate the cost of a reduction.
38313835
int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
38323836
Type *ScalarTy = FirstReducedVal->getType();
@@ -3973,6 +3977,30 @@ static Value *getReductionValue(PHINode *P, BasicBlock *ParentBB,
39733977
return Rdx;
39743978
}
39753979

3980+
/// \brief Attempt to reduce a horizontal reduction.
3981+
/// If it is legal to match a horizontal reduction feeding
3982+
/// the phi node P with reduction operators BI, then check if it
3983+
/// can be done.
3984+
/// \returns true if a horizontal reduction was matched and reduced.
3985+
/// \returns false if a horizontal reduction was not matched.
3986+
static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
3987+
BoUpSLP &R, TargetTransformInfo *TTI) {
3988+
if (!ShouldVectorizeHor)
3989+
return false;
3990+
3991+
HorizontalReduction HorRdx;
3992+
if (!HorRdx.matchAssociativeReduction(P, BI))
3993+
return false;
3994+
3995+
// If there is a sufficient number of reduction values, reduce
3996+
// to a nearby power-of-2. Can safely generate oversized
3997+
// vectors and rely on the backend to split them to legal sizes.
3998+
HorRdx.ReduxWidth =
3999+
std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
4000+
4001+
return HorRdx.tryToReduce(R, TTI);
4002+
}
4003+
39764004
bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
39774005
bool Changed = false;
39784006
SmallVector<Value *, 4> Incoming;
@@ -4049,9 +4077,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
40494077
continue;
40504078

40514079
// Try to match and vectorize a horizontal reduction.
4052-
HorizontalReduction HorRdx;
4053-
if (ShouldVectorizeHor && HorRdx.matchAssociativeReduction(P, BI) &&
4054-
HorRdx.tryToReduce(R, TTI)) {
4080+
if (canMatchHorizontalReduction(P, BI, R, TTI)) {
40554081
Changed = true;
40564082
it = BB->begin();
40574083
e = BB->end();
@@ -4074,15 +4100,12 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
40744100
continue;
40754101
}
40764102

4077-
// Try to vectorize horizontal reductions feeding into a store.
40784103
if (ShouldStartVectorizeHorAtStore)
40794104
if (StoreInst *SI = dyn_cast<StoreInst>(it))
40804105
if (BinaryOperator *BinOp =
40814106
dyn_cast<BinaryOperator>(SI->getValueOperand())) {
4082-
HorizontalReduction HorRdx;
4083-
if (((HorRdx.matchAssociativeReduction(nullptr, BinOp) &&
4084-
HorRdx.tryToReduce(R, TTI)) ||
4085-
tryToVectorize(BinOp, R))) {
4107+
if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI) ||
4108+
tryToVectorize(BinOp, R)) {
40864109
Changed = true;
40874110
it = BB->begin();
40884111
e = BB->end();

test/Transforms/SLPVectorizer/AArch64/horizontal.ll

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,3 +145,126 @@ for.end: ; preds = %for.end.loopexit, %
145145
%s.1 = phi i32 [ 0, %entry ], [ %add13, %for.end.loopexit ]
146146
ret i32 %s.1
147147
}
148+
149+
; CHECK: test_unrolled_select
150+
; CHECK: load <8 x i8>
151+
; CHECK: load <8 x i8>
152+
; CHECK: select <8 x i1>
153+
define i32 @test_unrolled_select(i8* noalias nocapture readonly %blk1, i8* noalias nocapture readonly %blk2, i32 %lx, i32 %h, i32 %lim) #0 {
154+
entry:
155+
%cmp.43 = icmp sgt i32 %h, 0
156+
br i1 %cmp.43, label %for.body.lr.ph, label %for.end
157+
158+
for.body.lr.ph: ; preds = %entry
159+
%idx.ext = sext i32 %lx to i64
160+
br label %for.body
161+
162+
for.body: ; preds = %for.body.lr.ph, %if.end.86
163+
%s.047 = phi i32 [ 0, %for.body.lr.ph ], [ %add82, %if.end.86 ]
164+
%j.046 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %if.end.86 ]
165+
%p2.045 = phi i8* [ %blk2, %for.body.lr.ph ], [ %add.ptr88, %if.end.86 ]
166+
%p1.044 = phi i8* [ %blk1, %for.body.lr.ph ], [ %add.ptr, %if.end.86 ]
167+
%0 = load i8, i8* %p1.044, align 1
168+
%conv = zext i8 %0 to i32
169+
%1 = load i8, i8* %p2.045, align 1
170+
%conv2 = zext i8 %1 to i32
171+
%sub = sub nsw i32 %conv, %conv2
172+
%cmp3 = icmp slt i32 %sub, 0
173+
%sub5 = sub nsw i32 0, %sub
174+
%sub5.sub = select i1 %cmp3, i32 %sub5, i32 %sub
175+
%add = add nsw i32 %sub5.sub, %s.047
176+
%arrayidx6 = getelementptr inbounds i8, i8* %p1.044, i64 1
177+
%2 = load i8, i8* %arrayidx6, align 1
178+
%conv7 = zext i8 %2 to i32
179+
%arrayidx8 = getelementptr inbounds i8, i8* %p2.045, i64 1
180+
%3 = load i8, i8* %arrayidx8, align 1
181+
%conv9 = zext i8 %3 to i32
182+
%sub10 = sub nsw i32 %conv7, %conv9
183+
%cmp11 = icmp slt i32 %sub10, 0
184+
%sub14 = sub nsw i32 0, %sub10
185+
%v.1 = select i1 %cmp11, i32 %sub14, i32 %sub10
186+
%add16 = add nsw i32 %add, %v.1
187+
%arrayidx17 = getelementptr inbounds i8, i8* %p1.044, i64 2
188+
%4 = load i8, i8* %arrayidx17, align 1
189+
%conv18 = zext i8 %4 to i32
190+
%arrayidx19 = getelementptr inbounds i8, i8* %p2.045, i64 2
191+
%5 = load i8, i8* %arrayidx19, align 1
192+
%conv20 = zext i8 %5 to i32
193+
%sub21 = sub nsw i32 %conv18, %conv20
194+
%cmp22 = icmp slt i32 %sub21, 0
195+
%sub25 = sub nsw i32 0, %sub21
196+
%sub25.sub21 = select i1 %cmp22, i32 %sub25, i32 %sub21
197+
%add27 = add nsw i32 %add16, %sub25.sub21
198+
%arrayidx28 = getelementptr inbounds i8, i8* %p1.044, i64 3
199+
%6 = load i8, i8* %arrayidx28, align 1
200+
%conv29 = zext i8 %6 to i32
201+
%arrayidx30 = getelementptr inbounds i8, i8* %p2.045, i64 3
202+
%7 = load i8, i8* %arrayidx30, align 1
203+
%conv31 = zext i8 %7 to i32
204+
%sub32 = sub nsw i32 %conv29, %conv31
205+
%cmp33 = icmp slt i32 %sub32, 0
206+
%sub36 = sub nsw i32 0, %sub32
207+
%v.3 = select i1 %cmp33, i32 %sub36, i32 %sub32
208+
%add38 = add nsw i32 %add27, %v.3
209+
%arrayidx39 = getelementptr inbounds i8, i8* %p1.044, i64 4
210+
%8 = load i8, i8* %arrayidx39, align 1
211+
%conv40 = zext i8 %8 to i32
212+
%arrayidx41 = getelementptr inbounds i8, i8* %p2.045, i64 4
213+
%9 = load i8, i8* %arrayidx41, align 1
214+
%conv42 = zext i8 %9 to i32
215+
%sub43 = sub nsw i32 %conv40, %conv42
216+
%cmp44 = icmp slt i32 %sub43, 0
217+
%sub47 = sub nsw i32 0, %sub43
218+
%sub47.sub43 = select i1 %cmp44, i32 %sub47, i32 %sub43
219+
%add49 = add nsw i32 %add38, %sub47.sub43
220+
%arrayidx50 = getelementptr inbounds i8, i8* %p1.044, i64 5
221+
%10 = load i8, i8* %arrayidx50, align 1
222+
%conv51 = zext i8 %10 to i32
223+
%arrayidx52 = getelementptr inbounds i8, i8* %p2.045, i64 5
224+
%11 = load i8, i8* %arrayidx52, align 1
225+
%conv53 = zext i8 %11 to i32
226+
%sub54 = sub nsw i32 %conv51, %conv53
227+
%cmp55 = icmp slt i32 %sub54, 0
228+
%sub58 = sub nsw i32 0, %sub54
229+
%v.5 = select i1 %cmp55, i32 %sub58, i32 %sub54
230+
%add60 = add nsw i32 %add49, %v.5
231+
%arrayidx61 = getelementptr inbounds i8, i8* %p1.044, i64 6
232+
%12 = load i8, i8* %arrayidx61, align 1
233+
%conv62 = zext i8 %12 to i32
234+
%arrayidx63 = getelementptr inbounds i8, i8* %p2.045, i64 6
235+
%13 = load i8, i8* %arrayidx63, align 1
236+
%conv64 = zext i8 %13 to i32
237+
%sub65 = sub nsw i32 %conv62, %conv64
238+
%cmp66 = icmp slt i32 %sub65, 0
239+
%sub69 = sub nsw i32 0, %sub65
240+
%sub69.sub65 = select i1 %cmp66, i32 %sub69, i32 %sub65
241+
%add71 = add nsw i32 %add60, %sub69.sub65
242+
%arrayidx72 = getelementptr inbounds i8, i8* %p1.044, i64 7
243+
%14 = load i8, i8* %arrayidx72, align 1
244+
%conv73 = zext i8 %14 to i32
245+
%arrayidx74 = getelementptr inbounds i8, i8* %p2.045, i64 7
246+
%15 = load i8, i8* %arrayidx74, align 1
247+
%conv75 = zext i8 %15 to i32
248+
%sub76 = sub nsw i32 %conv73, %conv75
249+
%cmp77 = icmp slt i32 %sub76, 0
250+
%sub80 = sub nsw i32 0, %sub76
251+
%v.7 = select i1 %cmp77, i32 %sub80, i32 %sub76
252+
%add82 = add nsw i32 %add71, %v.7
253+
%cmp83 = icmp slt i32 %add82, %lim
254+
br i1 %cmp83, label %if.end.86, label %for.end.loopexit
255+
256+
if.end.86: ; preds = %for.body
257+
%add.ptr = getelementptr inbounds i8, i8* %p1.044, i64 %idx.ext
258+
%add.ptr88 = getelementptr inbounds i8, i8* %p2.045, i64 %idx.ext
259+
%inc = add nuw nsw i32 %j.046, 1
260+
%cmp = icmp slt i32 %inc, %h
261+
br i1 %cmp, label %for.body, label %for.end.loopexit
262+
263+
for.end.loopexit: ; preds = %for.body, %if.end.86
264+
br label %for.end
265+
266+
for.end: ; preds = %for.end.loopexit, %entry
267+
%s.1 = phi i32 [ 0, %entry ], [ %add82, %for.end.loopexit ]
268+
ret i32 %s.1
269+
}
270+

0 commit comments

Comments
 (0)