Skip to content

Commit a5dd6c6

Browse files
committed
[LoopVectorize] Don't interleave scalar ordered reductions for inner loops
Consider the following loop:

  void foo(float *dst, float *src, int N) {
    for (int i = 0; i < N; i++) {
      dst[i] = 0.0;
      for (int j = 0; j < N; j++) {
        dst[i] += src[(i * N) + j];
      }
    }
  }

When we are not building with -Ofast, we may attempt to vectorise the inner loop using ordered reductions instead. In addition, we also try to select an appropriate interleave count for the inner loop. However, when VF=1 is chosen the inner loop will be scalar, and there is existing code in selectInterleaveCount that limits the interleave count to 2 for reductions due to concerns about increasing the critical path. For ordered reductions this problem is even worse due to the additional data dependency, so I've added code to simply disable interleaving for scalar ordered reductions for now.

Test added here: Transforms/LoopVectorize/AArch64/strict-fadd-vf1.ll

Differential Revision: https://reviews.llvm.org/D106646
1 parent c78b954 commit a5dd6c6

File tree

2 files changed

+56
-2
lines changed

2 files changed

+56
-2
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6473,9 +6473,21 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
64736473

64746474
// If we have a scalar reduction (vector reductions are already dealt with
64756475
// by this point), we can increase the critical path length if the loop
6476-
// we're interleaving is inside another loop. Limit, by default to 2, so the
6477-
// critical path only gets increased by one reduction operation.
6476+
// we're interleaving is inside another loop. For tree-wise reductions
6477+
// set the limit to 2, and for ordered reductions it's best to disable
6478+
// interleaving entirely.
64786479
if (HasReductions && TheLoop->getLoopDepth() > 1) {
6480+
bool HasOrderedReductions =
6481+
any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
6482+
const RecurrenceDescriptor &RdxDesc = Reduction.second;
6483+
return RdxDesc.isOrdered();
6484+
});
6485+
if (HasOrderedReductions) {
6486+
LLVM_DEBUG(
6487+
dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
6488+
return 1;
6489+
}
6490+
64796491
unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
64806492
SmallIC = std::min(SmallIC, F);
64816493
StoresIC = std::min(StoresIC, F);
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
; Verifies that the loop vectorizer reports "Not interleaving scalar ordered
; reductions" when the vector width is forced to 1. The -debug output is only
; available in builds with assertions enabled, and it is written to the lit
; temporary file %t rather than to a fixed file name in the working directory.
; REQUIRES: asserts
2+
; RUN: opt -loop-vectorize -enable-strict-reductions=true -force-vector-width=1 -S < %s -debug 2>%t | FileCheck %s
3+
; RUN: cat %t | FileCheck %s --check-prefix=CHECK-DEBUG
4+
5+
target triple = "aarch64-unknown-linux-gnu"
6+
7+
; CHECK-DEBUG: LV: Not interleaving scalar ordered reductions.
8+
9+
; Two-level loop nest used as vectorizer input. The outer loop (index
; %i.023.us, leaves when the incremented index equals %M) runs an inner loop
; (index %j.021.us, leaves when the incremented index equals %N) that
; accumulates src[i * N + j] into a float sum; the sum is stored to dst[i]
; once the inner loop finishes.
define void @foo(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %M, i64 %N) {
10+
; CHECK-LABEL: @foo(
11+
; CHECK-NOT: vector.body
12+
13+
entry:
14+
br label %for.body.us
15+
16+
for.body.us: ; preds = %entry, %for.cond3
17+
%i.023.us = phi i64 [ %inc8.us, %for.cond3 ], [ 0, %entry ]
18+
%arrayidx.us = getelementptr inbounds float, float* %dst, i64 %i.023.us
19+
%mul.us = mul nsw i64 %i.023.us, %N ; row offset: i * N
20+
br label %for.body3.us
21+
22+
for.body3.us: ; preds = %for.body.us, %for.body3.us
23+
%0 = phi float [ 0.000000e+00, %for.body.us ], [ %add6.us, %for.body3.us ] ; running sum, restarts at 0.0 on each entry from the outer loop
24+
%j.021.us = phi i64 [ 0, %for.body.us ], [ %inc.us, %for.body3.us ]
25+
%add.us = add nsw i64 %j.021.us, %mul.us ; flat index: i * N + j
26+
%arrayidx4.us = getelementptr inbounds float, float* %src, i64 %add.us
27+
%1 = load float, float* %arrayidx4.us, align 4
28+
%add6.us = fadd float %1, %0 ; reduction step; this fadd carries no fast-math flags
29+
%inc.us = add nuw nsw i64 %j.021.us, 1
30+
%exitcond.not = icmp eq i64 %inc.us, %N
31+
br i1 %exitcond.not, label %for.cond3, label %for.body3.us
32+
33+
for.cond3: ; preds = %for.body3.us
34+
%add6.us.lcssa = phi float [ %add6.us, %for.body3.us ]
35+
store float %add6.us.lcssa, float* %arrayidx.us, align 4 ; dst[i] = final sum of the inner loop
36+
%inc8.us = add nuw nsw i64 %i.023.us, 1
37+
%exitcond26.not = icmp eq i64 %inc8.us, %M
38+
br i1 %exitcond26.not, label %exit, label %for.body.us
39+
40+
exit: ; preds = %for.cond3
41+
ret void
42+
}

0 commit comments

Comments
 (0)