Skip to content

Commit 303a783

Browse files
authored
[GreedyRA] Improve RA for nested loop induction variables (#72093)
Imagine a loop of the form:

```
preheader:
  %r = def
header:
  bcc latch, inner
inner1:
  ..
inner2:
  b latch
latch:
  %r = subs %r
  bcc header
```

It is possible for code to spend a decent amount of time in the header<->latch loop, without going into the inner part of the loop as much. The greedy register allocator can prefer to spill _around_ %r though, adding spills around the subs in the loop, which can be very detrimental to performance. (The case I am looking at is actually a very deeply nested set of loops that repeat the header<->latch pattern at multiple different levels.)

The greedy RA will apply a preference to spill to the IV, as it is live through the header block. This patch attempts to add a heuristic to prevent that in this case for variables that look like IVs, similar in spirit to the extra spill weight that gets added to variables that look like IVs, which are expensive to spill. That will mean spills are more likely to be pushed into the inner blocks, where they are less likely to be executed and not as expensive as spills around the IV.

This gives an 8% speedup in the exchange benchmark from spec2017 when compiled with flang-new, whilst importantly stabilising the scores to be less chaotic to other changes. Running ctmark showed no difference in the compile time. I've tried to run a range of benchmarking for performance, most of which was relatively flat, not showing many large differences. One matrix multiply case improved 21.3% due to removing a cascading chain of spills, and some other knock-on effects happen which usually cause small differences in the scores.
1 parent 56d0e8c commit 303a783

File tree

5 files changed

+62
-25
lines changed

5 files changed

+62
-25
lines changed

llvm/lib/CodeGen/RegAllocGreedy.cpp

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -767,10 +767,28 @@ bool RAGreedy::growRegion(GlobalSplitCandidate &Cand) {
767767
if (Cand.PhysReg) {
768768
if (!addThroughConstraints(Cand.Intf, NewBlocks))
769769
return false;
770-
} else
771-
// Provide a strong negative bias on through blocks to prevent unwanted
772-
// liveness on loop backedges.
773-
SpillPlacer->addPrefSpill(NewBlocks, /* Strong= */ true);
770+
} else {
771+
// Providing that the variable being spilled does not look like a loop
772+
// induction variable, which is expensive to spill around and better
773+
// pushed into a condition inside the loop if possible, provide a strong
774+
// negative bias on through blocks to prevent unwanted liveness on loop
775+
// backedges.
776+
bool PrefSpill = true;
777+
if (SA->looksLikeLoopIV() && NewBlocks.size() >= 2) {
778+
// Check that the current bundle is adding a Header + start+end of
779+
// loop-internal blocks. If the block is indeed a header, don't make
780+
// the NewBlocks as PrefSpill to allow the variable to be live in
781+
// Header<->Latch.
782+
MachineLoop *L = Loops->getLoopFor(MF->getBlockNumbered(NewBlocks[0]));
783+
if (L && L->getHeader()->getNumber() == (int)NewBlocks[0] &&
784+
all_of(NewBlocks.drop_front(), [&](unsigned Block) {
785+
return L == Loops->getLoopFor(MF->getBlockNumbered(Block));
786+
}))
787+
PrefSpill = false;
788+
}
789+
if (PrefSpill)
790+
SpillPlacer->addPrefSpill(NewBlocks, /* Strong= */ true);
791+
}
774792
AddedTo = ActiveBlocks.size();
775793

776794
// Perhaps iterating can enable more bundles?

llvm/lib/CodeGen/SplitKit.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,11 @@ using namespace llvm;
4545

4646
#define DEBUG_TYPE "regalloc"
4747

48+
static cl::opt<bool>
49+
EnableLoopIVHeuristic("enable-split-loopiv-heuristic",
50+
cl::desc("Enable loop iv regalloc heuristic"),
51+
cl::init(true));
52+
4853
STATISTIC(NumFinished, "Number of splits finished");
4954
STATISTIC(NumSimple, "Number of splits that were simple");
5055
STATISTIC(NumCopies, "Number of copies inserted for splitting");
@@ -293,6 +298,13 @@ void SplitAnalysis::calcLiveBlockInfo() {
293298
MFI = LIS.getMBBFromIndex(LVI->start)->getIterator();
294299
}
295300

301+
LooksLikeLoopIV = EnableLoopIVHeuristic && UseBlocks.size() == 2 &&
302+
any_of(UseBlocks, [this](BlockInfo &BI) {
303+
MachineLoop *L = Loops.getLoopFor(BI.MBB);
304+
return BI.LiveIn && BI.LiveOut && BI.FirstDef && L &&
305+
L->isLoopLatch(BI.MBB);
306+
});
307+
296308
assert(getNumLiveBlocks() == countLiveBlocks(CurLI) && "Bad block count");
297309
}
298310

llvm/lib/CodeGen/SplitKit.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,10 @@ class LLVM_LIBRARY_VISIBILITY SplitAnalysis {
159159
/// NumThroughBlocks - Number of live-through blocks.
160160
unsigned NumThroughBlocks = 0u;
161161

162+
/// LooksLikeLoopIV - The variable defines what looks like it could be a loop
163+
/// IV, where it defs a variable in the latch.
164+
bool LooksLikeLoopIV = false;
165+
162166
// Sumarize statistics by counting instructions using CurLI.
163167
void analyzeUses();
164168

@@ -209,6 +213,8 @@ class LLVM_LIBRARY_VISIBILITY SplitAnalysis {
209213
return getUseBlocks().size() - NumGapBlocks + getNumThroughBlocks();
210214
}
211215

216+
/// looksLikeLoopIV - Return the LooksLikeLoopIV flag, i.e. whether the
/// analyzed live range was judged to look like a loop induction variable.
bool looksLikeLoopIV() const { return LooksLikeLoopIV; }
217+
212218
/// countLiveBlocks - Return the number of blocks where li is live. This is
213219
/// guaranteed to return the same number as getNumLiveBlocks() after calling
214220
/// analyze(li).

llvm/test/CodeGen/AArch64/nested-iv-regalloc.mir

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
22
# RUN: llc -mtriple aarch64 --run-pass=greedy,virtregrewriter -verify-machineinstrs %s -o - | FileCheck %s
33

4+
# We should ideally not spill around any of the SUBSWri in the loop exit blocks (if.end and if.end27).
5+
46
--- |
57
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
68
target triple = "aarch64"
@@ -206,16 +208,15 @@ body: |
206208
; CHECK-NEXT: renamable $w8 = LDRWui renamable $x0, 0 :: (load (s32) from %ir.p)
207209
; CHECK-NEXT: renamable $w9 = LDRWui killed renamable $x0, 1 :: (load (s32) from %ir.arrayidx2)
208210
; CHECK-NEXT: dead $wzr = SUBSWri renamable $w8, 1, 0, implicit-def $nzcv
209-
; CHECK-NEXT: renamable $w8 = CSINCWr killed renamable $w8, $wzr, 12, implicit $nzcv
210-
; CHECK-NEXT: STRWui killed renamable $w8, %stack.2, 0 :: (store (s32) into %stack.2)
211+
; CHECK-NEXT: renamable $w11 = CSINCWr killed renamable $w8, $wzr, 12, implicit $nzcv
211212
; CHECK-NEXT: renamable $x8 = COPY killed renamable $x10
212213
; CHECK-NEXT: dead $wzr = SUBSWri renamable $w9, 1, 0, implicit-def $nzcv
213214
; CHECK-NEXT: renamable $w10 = CSINCWr killed renamable $w9, $wzr, 12, implicit $nzcv
214215
; CHECK-NEXT: STRXui renamable $x2, %stack.0, 0 :: (store (s64) into %stack.0)
215216
; CHECK-NEXT: {{ $}}
216217
; CHECK-NEXT: bb.1.do.body:
217218
; CHECK-NEXT: successors: %bb.3(0x50000000), %bb.2(0x30000000)
218-
; CHECK-NEXT: liveins: $w10, $x2, $x8
219+
; CHECK-NEXT: liveins: $w10, $w11, $x2, $x8
219220
; CHECK-NEXT: {{ $}}
220221
; CHECK-NEXT: STRXui renamable $x8, %stack.1, 0 :: (store (s64) into %stack.1)
221222
; CHECK-NEXT: renamable $w9 = MOVi32imm 36, implicit-def $x9
@@ -227,23 +228,24 @@ body: |
227228
; CHECK-NEXT: {{ $}}
228229
; CHECK-NEXT: bb.2:
229230
; CHECK-NEXT: successors: %bb.10(0x80000000)
230-
; CHECK-NEXT: liveins: $w10, $x2
231+
; CHECK-NEXT: liveins: $w10, $w11, $x2
231232
; CHECK-NEXT: {{ $}}
232233
; CHECK-NEXT: renamable $x8 = LDRXui %stack.1, 0 :: (load (s64) from %stack.1)
233234
; CHECK-NEXT: B %bb.10
234235
; CHECK-NEXT: {{ $}}
235236
; CHECK-NEXT: bb.3.do.body12.preheader:
236237
; CHECK-NEXT: successors: %bb.4(0x80000000)
237-
; CHECK-NEXT: liveins: $w10, $x2
238+
; CHECK-NEXT: liveins: $w10, $w11, $x2
238239
; CHECK-NEXT: {{ $}}
239-
; CHECK-NEXT: renamable $x11 = COPY $xzr
240+
; CHECK-NEXT: renamable $x12 = COPY $xzr
241+
; CHECK-NEXT: STRWui renamable $w11, %stack.2, 0 :: (store (s32) into %stack.2)
240242
; CHECK-NEXT: {{ $}}
241243
; CHECK-NEXT: bb.4.do.body12:
242244
; CHECK-NEXT: successors: %bb.5(0x50000000), %bb.8(0x30000000)
243-
; CHECK-NEXT: liveins: $w10, $x2, $x11
245+
; CHECK-NEXT: liveins: $w10, $w11, $x2, $x12
244246
; CHECK-NEXT: {{ $}}
245247
; CHECK-NEXT: renamable $w8 = MOVi32imm 36, implicit-def $x8
246-
; CHECK-NEXT: renamable $x8 = MADDXrrr renamable $x11, killed renamable $x8, $xzr
248+
; CHECK-NEXT: renamable $x8 = MADDXrrr renamable $x12, killed renamable $x8, $xzr
247249
; CHECK-NEXT: renamable $x9 = MOVaddr target-flags(aarch64-page) @g, target-flags(aarch64-pageoff, aarch64-nc) @g
248250
; CHECK-NEXT: renamable $w8 = LDRWroX killed renamable $x9, killed renamable $x8, 0, 0 :: (load (s32) from %ir.arrayidx14)
249251
; CHECK-NEXT: dead $wzr = SUBSWri killed renamable $w8, 1, 0, implicit-def $nzcv
@@ -252,11 +254,11 @@ body: |
252254
; CHECK-NEXT: {{ $}}
253255
; CHECK-NEXT: bb.5.if.then17:
254256
; CHECK-NEXT: successors: %bb.7(0x80000000)
255-
; CHECK-NEXT: liveins: $w10, $x2, $x11
257+
; CHECK-NEXT: liveins: $w10, $x2, $x12
256258
; CHECK-NEXT: {{ $}}
257259
; CHECK-NEXT: STRWui killed renamable $w10, %stack.3, 0 :: (store (s32) into %stack.3)
258-
; CHECK-NEXT: STRXui renamable $x11, %stack.4, 0 :: (store (s64) into %stack.4)
259-
; CHECK-NEXT: renamable $w20 = LDRWroX killed renamable $x2, killed renamable $x11, 0, 1 :: (volatile load (s32) from %ir.arrayidx19)
260+
; CHECK-NEXT: STRXui renamable $x12, %stack.4, 0 :: (store (s64) into %stack.4)
261+
; CHECK-NEXT: renamable $w20 = LDRWroX killed renamable $x2, killed renamable $x12, 0, 1 :: (volatile load (s32) from %ir.arrayidx19)
260262
; CHECK-NEXT: renamable $w19 = MOVi32imm 100
261263
; CHECK-NEXT: B %bb.7
262264
; CHECK-NEXT: {{ $}}
@@ -267,8 +269,9 @@ body: |
267269
; CHECK-NEXT: renamable $w1 = COPY killed renamable $w20
268270
; CHECK-NEXT: INLINEASM &"nop;nop", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $x0, 12 /* clobber */, implicit-def dead early-clobber $x2, 12 /* clobber */, implicit-def dead early-clobber $x3, 12 /* clobber */, implicit-def dead early-clobber $x4, 12 /* clobber */, implicit-def dead early-clobber $x5, 12 /* clobber */, implicit-def dead early-clobber $x6, 12 /* clobber */, implicit-def dead early-clobber $x7, 12 /* clobber */, implicit-def dead early-clobber $x8, 12 /* clobber */, implicit-def dead early-clobber $x9, 12 /* clobber */, implicit-def dead early-clobber $x10, 12 /* clobber */, implicit-def dead early-clobber $x11, 12 /* clobber */, implicit-def dead early-clobber $x12, 12 /* clobber */, implicit-def dead early-clobber $x13, 12 /* clobber */, implicit-def dead early-clobber $x14, 12 /* clobber */, implicit-def dead early-clobber $x15, 12 /* clobber */, implicit-def dead early-clobber $x16, 12 /* clobber */, implicit-def dead early-clobber $x17, 12 /* clobber */, implicit-def dead early-clobber $x18, 12 /* clobber */, implicit-def dead early-clobber $x19, 12 /* clobber */, implicit-def dead early-clobber $x20, 12 /* clobber */, implicit-def dead early-clobber $x21, 12 /* clobber */, implicit-def dead early-clobber $x22, 12 /* clobber */, implicit-def dead early-clobber $x23, 12 /* clobber */, implicit-def dead early-clobber $x24, 12 /* clobber */, implicit-def dead early-clobber $x25, 12 /* clobber */, implicit-def dead early-clobber $x26, 12 /* clobber */, implicit-def dead early-clobber $x27, 12 /* clobber */, implicit-def dead early-clobber $x28, 12 /* clobber */, implicit-def dead early-clobber $fp, 12 /* clobber */, implicit-def dead early-clobber $lr
269271
; CHECK-NEXT: renamable $x2 = LDRXui %stack.0, 0 :: (load (s64) from %stack.0)
270-
; CHECK-NEXT: renamable $x11 = LDRXui %stack.4, 0 :: (load (s64) from %stack.4)
271-
; CHECK-NEXT: STRWroX killed renamable $w1, renamable $x2, renamable $x11, 0, 1 :: (volatile store (s32) into %ir.sunkaddr1)
272+
; CHECK-NEXT: renamable $x12 = LDRXui %stack.4, 0 :: (load (s64) from %stack.4)
273+
; CHECK-NEXT: STRWroX killed renamable $w1, renamable $x2, renamable $x12, 0, 1 :: (volatile store (s32) into %ir.sunkaddr1)
274+
; CHECK-NEXT: renamable $w11 = LDRWui %stack.2, 0 :: (load (s32) from %stack.2)
272275
; CHECK-NEXT: renamable $w10 = LDRWui %stack.3, 0 :: (load (s32) from %stack.3)
273276
; CHECK-NEXT: B %bb.8
274277
; CHECK-NEXT: {{ $}}
@@ -286,28 +289,26 @@ body: |
286289
; CHECK-NEXT: {{ $}}
287290
; CHECK-NEXT: bb.8.if.end:
288291
; CHECK-NEXT: successors: %bb.9(0x04000000), %bb.4(0x7c000000)
289-
; CHECK-NEXT: liveins: $w10, $x2, $x11
292+
; CHECK-NEXT: liveins: $w10, $w11, $x2, $x12
290293
; CHECK-NEXT: {{ $}}
291294
; CHECK-NEXT: renamable $w10 = nsw SUBSWri killed renamable $w10, 1, 0, implicit-def $nzcv
292-
; CHECK-NEXT: renamable $x11 = nuw nsw ADDXri killed renamable $x11, 1, 0
295+
; CHECK-NEXT: renamable $x12 = nuw nsw ADDXri killed renamable $x12, 1, 0
293296
; CHECK-NEXT: Bcc 1, %bb.4, implicit $nzcv
294297
; CHECK-NEXT: B %bb.9
295298
; CHECK-NEXT: {{ $}}
296299
; CHECK-NEXT: bb.9.do.end:
297300
; CHECK-NEXT: successors: %bb.10(0x80000000)
298-
; CHECK-NEXT: liveins: $x2
301+
; CHECK-NEXT: liveins: $w11, $x2
299302
; CHECK-NEXT: {{ $}}
300303
; CHECK-NEXT: renamable $w10 = COPY $wzr
301304
; CHECK-NEXT: renamable $x8 = LDRXui %stack.1, 0 :: (load (s64) from %stack.1)
302305
; CHECK-NEXT: STRWroX $wzr, renamable $x2, renamable $x8, 0, 1 :: (volatile store (s32) into %ir.arrayidx26)
303306
; CHECK-NEXT: {{ $}}
304307
; CHECK-NEXT: bb.10.if.end27:
305308
; CHECK-NEXT: successors: %bb.11(0x04000000), %bb.1(0x7c000000)
306-
; CHECK-NEXT: liveins: $w10, $x2, $x8
309+
; CHECK-NEXT: liveins: $w10, $w11, $x2, $x8
307310
; CHECK-NEXT: {{ $}}
308-
; CHECK-NEXT: renamable $w9 = LDRWui %stack.2, 0 :: (load (s32) from %stack.2)
309-
; CHECK-NEXT: renamable $w9 = nsw SUBSWri killed renamable $w9, 1, 0, implicit-def $nzcv
310-
; CHECK-NEXT: STRWui killed renamable $w9, %stack.2, 0 :: (store (s32) into %stack.2)
311+
; CHECK-NEXT: renamable $w11 = nsw SUBSWri killed renamable $w11, 1, 0, implicit-def $nzcv
311312
; CHECK-NEXT: renamable $x8 = nuw nsw ADDXri killed renamable $x8, 1, 0
312313
; CHECK-NEXT: Bcc 1, %bb.1, implicit $nzcv
313314
; CHECK-NEXT: B %bb.11

llvm/test/DebugInfo/MIR/InstrRef/memory-operand-folding-tieddef.mir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# RUN: llc %s -o - -experimental-debug-variable-locations \
22
# RUN: -start-before=x86-flags-copy-lowering -stop-after=virtregrewriter \
3-
# RUN: -mtriple x86_64-unknown-unknown \
3+
# RUN: -mtriple x86_64-unknown-unknown -enable-split-loopiv-heuristic=false \
44
# RUN: | FileCheck %s
55
#
66
# This test is for stack spill folding -- the INC32r near the end of the MIR

0 commit comments

Comments
 (0)