Skip to content

Commit 72d5967

Browse files
[ARM] Reduce loop unroll when low overhead branching is available
For processors with low overhead branching (LOB), runtime unrolling the innermost loop is often detrimental to performance. In these cases the loop remainder gets unrolled into a series of compare-and-jump blocks, which in deeply nested loops get executed multiple times, negating the benefits of LOB. This is particularly noticeable when the loop trip count of the innermost loop varies within the outer loop, such as in the case of triangular matrix decompositions. In these cases we will prefer to not unroll the innermost loop, with the intention for it to be executed as a low overhead loop.
1 parent da439d3 commit 72d5967

File tree

2 files changed

+31
-10
lines changed

2 files changed

+31
-10
lines changed

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2592,11 +2592,23 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
25922592
return;
25932593
}
25942594

2595+
bool Runtime = true;
2596+
if (ST->hasLOB()) {
2597+
if (SE.hasLoopInvariantBackedgeTakenCount(L)) {
2598+
const auto *BETC = SE.getBackedgeTakenCount(L);
2599+
auto *Outer = L->getOutermostLoop();
2600+
if ((L != Outer && Outer != L->getParentLoop()) ||
2601+
(L != Outer && BETC && !SE.isLoopInvariant(BETC, Outer))) {
2602+
Runtime = false;
2603+
}
2604+
}
2605+
}
2606+
25952607
LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
25962608
LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
25972609

25982610
UP.Partial = true;
2599-
UP.Runtime = true;
2611+
UP.Runtime = Runtime;
26002612
UP.UnrollRemainder = true;
26012613
UP.DefaultUnrollRuntimeCount = UnrollCount;
26022614
UP.UnrollAndJam = true;

llvm/test/Transforms/LoopUnroll/ARM/lob-unroll.ll

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,23 @@
1+
; RUN: opt -mcpu=cortex-m7 -mtriple=thumbv8.1m.main -passes=loop-unroll -S %s -o - | FileCheck %s --check-prefix=NLOB
12
; RUN: opt -mcpu=cortex-m55 -mtriple=thumbv8.1m.main -passes=loop-unroll -S %s -o - | FileCheck %s --check-prefix=LOB
23

34
; This test checks behaviour of loop unrolling on processors with low overhead branching available
45

5-
; LOB-CHECK-LABEL: for.body{{.*}}.prol
6-
; LOB-COUNT-1: fmul fast float
7-
; LOB-CHECK-LABEL: for.body{{.*}}.prol.1
8-
; LOB-COUNT-1: fmul fast float
9-
; LOB-CHECK-LABEL: for.body{{.*}}.prol.2
10-
; LOB-COUNT-1: fmul fast float
11-
; LOB-CHECK-LABEL: for.body{{.*}}
12-
; LOB-COUNT-4: fmul fast float
6+
; NLOB-LABEL: for.body{{.*}}.prol:
7+
; NLOB-COUNT-1: fmul fast float
8+
; NLOB-LABEL: for.body{{.*}}.prol.1:
9+
; NLOB-COUNT-1: fmul fast float
10+
; NLOB-LABEL: for.body{{.*}}.prol.2:
11+
; NLOB-COUNT-1: fmul fast float
12+
; NLOB-LABEL: for.body{{.*}}:
13+
; NLOB-COUNT-4: fmul fast float
14+
; NLOB-NOT: fmul fast float
15+
16+
; LOB-LABEL: for.body{{.*}}:
17+
; LOB: fmul fast float
1318
; LOB-NOT: fmul fast float
1419

20+
1521
; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
1622
define dso_local void @test(i32 noundef %n, ptr nocapture noundef %pA) local_unnamed_addr #0 {
1723
entry:
@@ -20,7 +26,7 @@ entry:
2026

2127
for.cond.loopexit: ; preds = %for.cond6.for.cond.cleanup8_crit_edge.us, %for.body
2228
%exitcond49.not = icmp eq i32 %add, %n
23-
br i1 %exitcond49.not, label %for.cond.cleanup, label %for.body
29+
br i1 %exitcond49.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
2430

2531
for.cond.cleanup: ; preds = %for.cond.loopexit, %entry
2632
ret void
@@ -61,3 +67,6 @@ for.cond6.for.cond.cleanup8_crit_edge.us: ; preds = %for.body9.us
6167
br i1 %exitcond48.not, label %for.cond.loopexit, label %for.cond6.preheader.us
6268
}
6369

70+
!0 = distinct !{!0, !1, !2}
71+
!1 = !{!"llvm.loop.mustprogress"}
72+
!2 = !{!"llvm.loop.unroll.disable"}

0 commit comments

Comments
 (0)