Skip to content

Commit eac3487

Browse files
[LoopInterchange] Try to achieve the most optimal access pattern after interchange
Motivated by pr43326 (https://bugs.llvm.org/show_bug.cgi?id=43326), where a slightly modified case is as follows. void f(int e[10][10][10], int f[10][10][10]) { for (int a = 0; a < 10; a++) for (int b = 0; b < 10; b++) for (int c = 0; c < 10; c++) f[c][b][a] = e[c][b][a]; } The ideal optimal access pattern after running interchange is supposed to be the following void f(int e[10][10][10], int f[10][10][10]) { for (int c = 0; c < 10; c++) for (int b = 0; b < 10; b++) for (int a = 0; a < 10; a++) f[c][b][a] = e[c][b][a]; } Currently loop interchange is limited to picking up the innermost loop and finding an order that is locally optimal for it. However, the pass failed to produce the globally optimal loop access order. For more complex examples what we get could be quite far from the globally optimal ordering. What is proposed in this patch is to do a "bubble-sort" fashion when doing interchange. By comparing neighbors in `LoopList` in each iteration, we would be able to move each loop onto a most appropriate place, hence this is an approach that tries to achieve the globally optimal ordering. The motivating example above is added as a test case. Reviewed By: Meinersbur Differential Revision: https://reviews.llvm.org/D120386
1 parent 375d934 commit eac3487

File tree

3 files changed

+109
-26
lines changed

3 files changed

+109
-26
lines changed

llvm/lib/Transforms/Scalar/LoopInterchange.cpp

Lines changed: 41 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -266,26 +266,28 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
266266
return true;
267267
}
268268

269-
static LoopVector populateWorklist(Loop &L) {
269+
static void populateWorklist(Loop &L, LoopVector &LoopList) {
270270
LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
271271
<< L.getHeader()->getParent()->getName() << " Loop: %"
272272
<< L.getHeader()->getName() << '\n');
273-
LoopVector LoopList;
273+
assert(LoopList.empty() && "LoopList should initially be empty!");
274274
Loop *CurrentLoop = &L;
275275
const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
276276
while (!Vec->empty()) {
277277
// The current loop has multiple subloops in it hence it is not tightly
278278
// nested.
279279
// Discard all loops above it added into Worklist.
280-
if (Vec->size() != 1)
281-
return {};
280+
if (Vec->size() != 1) {
281+
LoopList = {};
282+
return;
283+
}
282284

283285
LoopList.push_back(CurrentLoop);
284286
CurrentLoop = Vec->front();
285287
Vec = &CurrentLoop->getSubLoops();
286288
}
287289
LoopList.push_back(CurrentLoop);
288-
return LoopList;
290+
return;
289291
}
290292

291293
namespace {
@@ -419,12 +421,13 @@ struct LoopInterchange {
419421
bool run(Loop *L) {
420422
if (L->getParentLoop())
421423
return false;
422-
423-
return processLoopList(populateWorklist(*L));
424+
SmallVector<Loop *, 8> LoopList;
425+
populateWorklist(*L, LoopList);
426+
return processLoopList(LoopList);
424427
}
425428

426429
bool run(LoopNest &LN) {
427-
const auto &LoopList = LN.getLoops();
430+
SmallVector<Loop *, 8> LoopList(LN.getLoops().begin(), LN.getLoops().end());
428431
for (unsigned I = 1; I < LoopList.size(); ++I)
429432
if (LoopList[I]->getParentLoop() != LoopList[I - 1])
430433
return false;
@@ -456,7 +459,7 @@ struct LoopInterchange {
456459
return LoopList.size() - 1;
457460
}
458461

459-
bool processLoopList(ArrayRef<Loop *> LoopList) {
462+
bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
460463
bool Changed = false;
461464
unsigned LoopNestDepth = LoopList.size();
462465
if (LoopNestDepth < 2) {
@@ -496,20 +499,32 @@ struct LoopInterchange {
496499
}
497500

498501
unsigned SelecLoopId = selectLoopForInterchange(LoopList);
499-
// Move the selected loop outwards to the best possible position.
500-
Loop *LoopToBeInterchanged = LoopList[SelecLoopId];
501-
for (unsigned i = SelecLoopId; i > 0; i--) {
502-
bool Interchanged = processLoop(LoopToBeInterchanged, LoopList[i - 1], i,
503-
i - 1, DependencyMatrix);
504-
if (!Interchanged)
505-
return Changed;
506-
// Update the DependencyMatrix
507-
interChangeDependencies(DependencyMatrix, i, i - 1);
502+
// We try to achieve the globally optimal memory access for the loopnest,
503+
// and do interchange based on a bubble-sort fasion. We start from
504+
// the innermost loop, move it outwards to the best possible position
505+
// and repeat this process.
506+
for (unsigned j = SelecLoopId; j > 0; j--) {
507+
bool ChangedPerIter = false;
508+
for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
509+
bool Interchanged = processLoop(LoopList[i], LoopList[i - 1], i, i - 1,
510+
DependencyMatrix);
511+
if (!Interchanged)
512+
continue;
513+
// Loops interchanged, update LoopList accordingly.
514+
std::swap(LoopList[i - 1], LoopList[i]);
515+
// Update the DependencyMatrix
516+
interChangeDependencies(DependencyMatrix, i, i - 1);
508517
#ifdef DUMP_DEP_MATRICIES
509-
LLVM_DEBUG(dbgs() << "Dependence after interchange\n");
510-
printDepMatrix(DependencyMatrix);
518+
LLVM_DEBUG(dbgs() << "Dependence after interchange\n");
519+
printDepMatrix(DependencyMatrix);
511520
#endif
512-
Changed |= Interchanged;
521+
ChangedPerIter |= Interchanged;
522+
Changed |= Interchanged;
523+
}
524+
// Early abort if there was no interchange during an entire round of
525+
// moving loops outwards.
526+
if (!ChangedPerIter)
527+
break;
513528
}
514529
return Changed;
515530
}
@@ -1419,9 +1434,13 @@ static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
14191434

14201435
// Incoming values are guaranteed be instructions currently.
14211436
auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch));
1437+
// In case of multi-level nested loops, follow LCSSA to find the incoming
1438+
// value defined from the innermost loop.
1439+
auto IncIInnerMost = cast<Instruction>(followLCSSA(IncI));
14221440
// Skip phis with incoming values from the inner loop body, excluding the
14231441
// header and latch.
1424-
if (IncI->getParent() != InnerLatch && IncI->getParent() != InnerHeader)
1442+
if (IncIInnerMost->getParent() != InnerLatch &&
1443+
IncIInnerMost->getParent() != InnerHeader)
14251444
continue;
14261445

14271446
assert(all_of(P.users(),

llvm/test/Transforms/LoopInterchange/phi-ordering.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2-
; RUN: opt < %s -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -loop-interchange-threshold=-1000 -S 2>&1 | FileCheck %s
2+
; RUN: opt < %s -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -loop-interchange-threshold=0 -S 2>&1 | FileCheck %s
33
;; Checks the order of the inner phi nodes does not cause havoc.
44
;; The inner loop has a reduction into c. The IV is not the first phi.
55

@@ -9,7 +9,7 @@ target triple = "armv8--linux-gnueabihf"
99

1010

1111
; Function Attrs: norecurse nounwind
12-
define void @test(i32 %T, [90 x i32]* noalias nocapture %C, i16* noalias nocapture readonly %A, i16* noalias nocapture readonly %B) local_unnamed_addr #0 {
12+
define void @test(i32 %T, [90 x i32]* noalias nocapture %C, [90 x [90 x i16]]* noalias nocapture readonly %A, i16* noalias nocapture readonly %B) local_unnamed_addr #0 {
1313
; CHECK-LABEL: @test(
1414
; CHECK-NEXT: entry:
1515
; CHECK-NEXT: br label [[FOR3_PREHEADER:%.*]]
@@ -31,7 +31,7 @@ define void @test(i32 %T, [90 x i32]* noalias nocapture %C, i16* noalias nocaptu
3131
; CHECK-NEXT: br label [[FOR1_HEADER_PREHEADER]]
3232
; CHECK: for3.split1:
3333
; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[K]], [[MUL]]
34-
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[A:%.*]], i32 [[ADD]]
34+
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [90 x [90 x i16]], [90 x [90 x i16]]* [[A:%.*]], i32 [[ADD]], i32 [[J]], i32 [[I]]
3535
; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
3636
; CHECK-NEXT: [[ADD15:%.*]] = add nsw i16 [[TMP0]], 1
3737
; CHECK-NEXT: store i16 [[ADD15]], i16* [[ARRAYIDX]]
@@ -70,7 +70,7 @@ for2.header: ; preds = %for2.inc16, %for1.heade
7070
for3: ; preds = %for3, %for2.header
7171
%k = phi i32 [ 1, %for2.header ], [ %inc, %for3 ]
7272
%add = add nsw i32 %k, %mul
73-
%arrayidx = getelementptr inbounds i16, i16* %A, i32 %add
73+
%arrayidx = getelementptr inbounds [90 x [90 x i16]], [90 x [90 x i16]]* %A, i32 %add, i32 %j, i32 %i
7474
%0 = load i16, i16* %arrayidx, align 2
7575
%add15 = add nsw i16 %0, 1
7676
store i16 %add15, i16* %arrayidx
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
; RUN: opt < %s -basic-aa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S \
2+
; RUN: -verify-dom-info -verify-loop-info -verify-loop-lcssa -stats 2>&1
3+
; RUN: FileCheck --input-file=%t --check-prefix=REMARKS %s
4+
5+
; Triply nested loop, should be able to do interchange three times
6+
; to get the ideal access pattern.
7+
; void f(int e[10][10][10], int f[10][10][10]) {
8+
; for (int a = 0; a < 10; a++) {
9+
; for (int b = 0; b < 10; b++) {
10+
; for (int c = 0; c < 10; c++) {
11+
; f[c][b][a] = e[c][b][a];
12+
; }
13+
; }
14+
; }
15+
; }
16+
17+
; REMARKS: --- !Passed
18+
; REMARKS-NEXT: Pass: loop-interchange
19+
; REMARKS-NEXT: Name: Interchanged
20+
; REMARKS-NEXT: Function: pr43326-triply-nested
21+
; REMARKS: --- !Passed
22+
; REMARKS-NEXT: Pass: loop-interchange
23+
; REMARKS-NEXT: Name: Interchanged
24+
; REMARKS-NEXT: Function: pr43326-triply-nested
25+
; REMARKS: --- !Passed
26+
; REMARKS-NEXT: Pass: loop-interchange
27+
; REMARKS-NEXT: Name: Interchanged
28+
; REMARKS-NEXT: Function: pr43326-triply-nested
29+
30+
define void @pr43326-triply-nested([10 x [10 x i32]]* %e, [10 x [10 x i32]]* %f) {
31+
entry:
32+
br label %for.outermost.header
33+
34+
for.outermost.header: ; preds = %entry, %for.outermost.latch
35+
%indvars.outermost = phi i64 [ 0, %entry ], [ %indvars.outermost.next, %for.outermost.latch ]
36+
br label %for.middle.header
37+
38+
for.cond.cleanup: ; preds = %for.outermost.latch
39+
ret void
40+
41+
for.middle.header: ; preds = %for.outermost.header, %for.middle.latch
42+
%indvars.middle = phi i64 [ 0, %for.outermost.header ], [ %indvars.middle.next, %for.middle.latch ]
43+
br label %for.innermost
44+
45+
for.outermost.latch: ; preds = %for.middle.latch
46+
%indvars.outermost.next = add nuw nsw i64 %indvars.outermost, 1
47+
%exitcond.outermost = icmp ne i64 %indvars.outermost.next, 10
48+
br i1 %exitcond.outermost, label %for.outermost.header, label %for.cond.cleanup
49+
50+
for.middle.latch: ; preds = %for.innermost
51+
%indvars.middle.next = add nuw nsw i64 %indvars.middle, 1
52+
%exitcond.middle = icmp ne i64 %indvars.middle.next, 10
53+
br i1 %exitcond.middle, label %for.middle.header, label %for.outermost.latch
54+
55+
for.innermost: ; preds = %for.middle.header, %for.innermost
56+
%indvars.innermost = phi i64 [ 0, %for.middle.header ], [ %indvars.innermost.next, %for.innermost ]
57+
%arrayidx12 = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* %e, i64 %indvars.innermost, i64 %indvars.middle, i64 %indvars.outermost
58+
%0 = load i32, i32* %arrayidx12
59+
%arrayidx18 = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* %f, i64 %indvars.innermost, i64 %indvars.middle, i64 %indvars.outermost
60+
store i32 %0, i32* %arrayidx18
61+
%indvars.innermost.next = add nuw nsw i64 %indvars.innermost, 1
62+
%exitcond.innermost = icmp ne i64 %indvars.innermost.next, 10
63+
br i1 %exitcond.innermost, label %for.innermost, label %for.middle.latch
64+
}

0 commit comments

Comments
 (0)