Skip to content
This repository was archived by the owner on Mar 28, 2020. It is now read-only.

Commit 8b3a547

Browse files
committed
[AArch64][Falkor] Try to avoid exhausting HW prefetcher resources when unrolling.
Reviewers: t.p.northover, mcrosier
Subscribers: aemerson, rengolin, javed.absar, kristof.beyls, llvm-commits
Differential Revision: https://reviews.llvm.org/D34533
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@306584 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 31c22b7 commit 8b3a547

File tree

2 files changed

+228
-0
lines changed

2 files changed

+228
-0
lines changed

lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ using namespace llvm;
2020

2121
#define DEBUG_TYPE "aarch64tti"
2222

23+
// Hidden escape hatch: lets users turn off the Falkor-specific unroll
// limiting below. Enabled by default.
static cl::opt<bool>
    EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true),
                              cl::Hidden);
25+
2326
bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
2427
const Function *Callee) const {
2528
const TargetMachine &TM = getTLI()->getTargetMachine();
@@ -645,6 +648,58 @@ unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
645648
return ST->getMaxInterleaveFactor();
646649
}
647650

651+
// For Falkor, we want to avoid having too many strided loads in a loop since
652+
// that can exhaust the HW prefetcher resources. We adjust the unroller
653+
// MaxCount preference below to attempt to ensure unrolling doesn't create too
654+
// many strided loads.
655+
static void
656+
getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
657+
TargetTransformInfo::UnrollingPreferences &UP) {
658+
const int MaxStridedLoads = 7;
659+
auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
660+
int StridedLoads = 0;
661+
// FIXME? We could make this more precise by looking at the CFG and
662+
// e.g. not counting loads in each side of an if-then-else diamond.
663+
for (const auto BB : L->blocks()) {
664+
for (auto &I : *BB) {
665+
LoadInst *LMemI = dyn_cast<LoadInst>(&I);
666+
if (!LMemI)
667+
continue;
668+
669+
Value *PtrValue = LMemI->getPointerOperand();
670+
if (L->isLoopInvariant(PtrValue))
671+
continue;
672+
673+
const SCEV *LSCEV = SE.getSCEV(PtrValue);
674+
const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
675+
if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
676+
continue;
677+
678+
// FIXME? We could take pairing of unrolled load copies into account
679+
// by looking at the AddRec, but we would probably have to limit this
680+
// to loops with no stores or other memory optimization barriers.
681+
++StridedLoads;
682+
// We've seen enough strided loads that seeing more won't make a
683+
// difference.
684+
if (StridedLoads > MaxStridedLoads / 2)
685+
return StridedLoads;
686+
}
687+
}
688+
return StridedLoads;
689+
};
690+
691+
int StridedLoads = countStridedLoads(L, SE);
692+
DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
693+
<< " strided loads\n");
694+
// Pick the largest power of 2 unroll count that won't result in too many
695+
// strided loads.
696+
if (StridedLoads) {
697+
UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
698+
DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " << UP.MaxCount
699+
<< '\n');
700+
}
701+
}
702+
648703
void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
649704
TTI::UnrollingPreferences &UP) {
650705
// Enable partial unrolling and runtime unrolling.
@@ -658,6 +713,10 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
658713

659714
// Disable partial & runtime unrolling on -Os.
660715
UP.PartialOptSizeThreshold = 0;
716+
717+
if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
718+
EnableFalkorHWPFUnrollFix)
719+
getFalkorUnrollingPreferences(L, SE, UP);
661720
}
662721

663722
Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
; RUN: opt < %s -S -loop-unroll -mtriple aarch64 -mcpu=falkor | FileCheck %s
2+
; RUN: opt < %s -S -loop-unroll -mtriple aarch64 -mcpu=falkor -enable-falkor-hwpf-unroll-fix=0 | FileCheck %s --check-prefix=NOHWPF
3+
4+
; Check that loop unroller doesn't exhaust HW prefetcher resources.
5+
6+
; Partial unroll 2 times for this loop on falkor instead of 4.
7+
; NOHWPF-LABEL: @unroll1(
8+
; NOHWPF-LABEL: loop:
9+
; NOHWPF-NEXT: phi
10+
; NOHWPF-NEXT: getelementptr
11+
; NOHWPF-NEXT: load
12+
; NOHWPF-NEXT: getelementptr
13+
; NOHWPF-NEXT: load
14+
; NOHWPF-NEXT: add
15+
; NOHWPF-NEXT: getelementptr
16+
; NOHWPF-NEXT: load
17+
; NOHWPF-NEXT: getelementptr
18+
; NOHWPF-NEXT: load
19+
; NOHWPF-NEXT: add
20+
; NOHWPF-NEXT: getelementptr
21+
; NOHWPF-NEXT: load
22+
; NOHWPF-NEXT: getelementptr
23+
; NOHWPF-NEXT: load
24+
; NOHWPF-NEXT: add
25+
; NOHWPF-NEXT: getelementptr
26+
; NOHWPF-NEXT: load
27+
; NOHWPF-NEXT: getelementptr
28+
; NOHWPF-NEXT: load
29+
; NOHWPF-NEXT: add
30+
; NOHWPF-NEXT: icmp
31+
; NOHWPF-NEXT: br
32+
; NOHWPF-NEXT-LABEL: exit:
33+
;
34+
; CHECK-LABEL: @unroll1(
35+
; CHECK-LABEL: loop:
36+
; CHECK-NEXT: phi
37+
; CHECK-NEXT: getelementptr
38+
; CHECK-NEXT: load
39+
; CHECK-NEXT: getelementptr
40+
; CHECK-NEXT: load
41+
; CHECK-NEXT: add
42+
; CHECK-NEXT: getelementptr
43+
; CHECK-NEXT: load
44+
; CHECK-NEXT: getelementptr
45+
; CHECK-NEXT: load
46+
; CHECK-NEXT: add
47+
; CHECK-NEXT: icmp
48+
; CHECK-NEXT: br
49+
; CHECK-NEXT-LABEL: exit:
50+
; Two independent strided (volatile) loads per iteration; the Falkor unroll
; limiting should cap partial unrolling at 2x instead of 4x. Block labels and
; instruction order are load-bearing for the CHECK-NEXT lines above.
define void @unroll1(i32* %p, i32* %p2) {
entry:
  br label %loop

loop:
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]

  %addr1 = getelementptr inbounds i32, i32* %p, i32 %i
  %val1 = load volatile i32, i32* %addr1

  %addr2 = getelementptr inbounds i32, i32* %p2, i32 %i
  %val2 = load volatile i32, i32* %addr2

  %i.next = add i32 %i, 1
  %done = icmp uge i32 %i.next, 1024
  br i1 %done, label %exit, label %loop

exit:
  ret void
}
70+
71+
; Partial unroll 4 times for this loop on falkor instead of 8.
72+
; NOHWPF-LABEL: @unroll2(
73+
; NOHWPF-LABEL: loop2:
74+
; NOHWPF-NEXT: phi
75+
; NOHWPF-NEXT: phi
76+
; NOHWPF-NEXT: getelementptr
77+
; NOHWPF-NEXT: load
78+
; NOHWPF-NEXT: add
79+
; NOHWPF-NEXT: add
80+
; NOHWPF-NEXT: getelementptr
81+
; NOHWPF-NEXT: load
82+
; NOHWPF-NEXT: add
83+
; NOHWPF-NEXT: add
84+
; NOHWPF-NEXT: getelementptr
85+
; NOHWPF-NEXT: load
86+
; NOHWPF-NEXT: add
87+
; NOHWPF-NEXT: add
88+
; NOHWPF-NEXT: getelementptr
89+
; NOHWPF-NEXT: load
90+
; NOHWPF-NEXT: add
91+
; NOHWPF-NEXT: add
92+
; NOHWPF-NEXT: getelementptr
93+
; NOHWPF-NEXT: load
94+
; NOHWPF-NEXT: add
95+
; NOHWPF-NEXT: add
96+
; NOHWPF-NEXT: getelementptr
97+
; NOHWPF-NEXT: load
98+
; NOHWPF-NEXT: add
99+
; NOHWPF-NEXT: add
100+
; NOHWPF-NEXT: getelementptr
101+
; NOHWPF-NEXT: load
102+
; NOHWPF-NEXT: add
103+
; NOHWPF-NEXT: add
104+
; NOHWPF-NEXT: getelementptr
105+
; NOHWPF-NEXT: load
106+
; NOHWPF-NEXT: add
107+
; NOHWPF-NEXT: add
108+
; NOHWPF-NEXT: icmp
109+
; NOHWPF-NEXT: br
110+
; NOHWPF-NEXT-LABEL: exit2:
111+
;
112+
; CHECK-LABEL: @unroll2(
113+
; CHECK-LABEL: loop2:
114+
; CHECK-NEXT: phi
115+
; CHECK-NEXT: phi
116+
; CHECK-NEXT: getelementptr
117+
; CHECK-NEXT: load
118+
; CHECK-NEXT: add
119+
; CHECK-NEXT: add
120+
; CHECK-NEXT: getelementptr
121+
; CHECK-NEXT: load
122+
; CHECK-NEXT: add
123+
; CHECK-NEXT: add
124+
; CHECK-NEXT: getelementptr
125+
; CHECK-NEXT: load
126+
; CHECK-NEXT: add
127+
; CHECK-NEXT: add
128+
; CHECK-NEXT: getelementptr
129+
; CHECK-NEXT: load
130+
; CHECK-NEXT: add
131+
; CHECK-NEXT: add
132+
; CHECK-NEXT: icmp
133+
; CHECK-NEXT: br
134+
; CHECK-NEXT-LABEL: exit2:
135+
136+
; Nested loop: one strided load in the inner loop; Falkor limiting should cap
; inner-loop partial unrolling at 4x instead of 8x.
; Fix: the outer-loop latch previously branched on %exitcnd2 (the inner
; loop's exit condition, which is always true on this path) instead of the
; computed-but-unused %exitcnd1, so the outer loop could never iterate.
define void @unroll2(i32* %p) {
entry:
  br label %loop1

loop1:
  %iv1 = phi i32 [ 0, %entry ], [ %inc1, %loop1.latch ]
  %outer.sum = phi i32 [ 0, %entry ], [ %sum, %loop1.latch ]
  br label %loop2.header

loop2.header:
  br label %loop2

loop2:
  %iv2 = phi i32 [ 0, %loop2.header ], [ %inc2, %loop2 ]
  %sum = phi i32 [ %outer.sum, %loop2.header ], [ %sum.inc, %loop2 ]
  %gep = getelementptr inbounds i32, i32* %p, i32 %iv2
  %load = load i32, i32* %gep
  %sum.inc = add i32 %sum, %load
  %inc2 = add i32 %iv2, 1
  %exitcnd2 = icmp uge i32 %inc2, 1024
  br i1 %exitcnd2, label %exit2, label %loop2

exit2:
  br label %loop1.latch

loop1.latch:
  %inc1 = add i32 %iv1, 1
  %exitcnd1 = icmp uge i32 %inc1, 1024
  br i1 %exitcnd1, label %exit, label %loop1

exit:
  ret void
}
169+

0 commit comments

Comments
 (0)