[LoopInterchange] Try to achieve the most optimal access pattern after interchange

CongzheUalberta · CongzheUalberta · commit eac348751098 · 2022-04-06T15:31:56.000-04:00
Motivated by pr43326 (https://bugs.llvm.org/show_bug.cgi?id=43326), where a slightly modified case is as follows. void f(int e[10][10][10], int f[10][10][10]) { for (int a = 0; a < 10; a++) for (int b = 0; b < 10; b++) for (int c = 0; c < 10; c++) f[c][b][a] = e[c][b][a]; } The ideal optimal access pattern after running interchange is supposed to be the following void f(int e[10][10][10], int f[10][10][10]) { for (int c = 0; c < 10; c++) for (int b = 0; b < 10; b++) for (int a = 0; a < 10; a++) f[c][b][a] = e[c][b][a]; } Currently loop interchange is limited to picking up the innermost loop and finding an order that is locally optimal for it. However, the pass failed to produce the globally optimal loop access order. For more complex examples what we get could be quite far from the globally optimal ordering. What is proposed in this patch is to do a "bubble-sort" fashion when doing interchange. By comparing neighbors in `LoopList` in each iteration, we would be able to move each loop onto a most appropriate place, hence this is an approach that tries to achieve the globally optimal ordering. The motivating example above is added as a test case. Reviewed By: Meinersbur Differential Revision: https://reviews.llvm.org/D120386
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -266,26 +266,28 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
   return true;
 }
 
-static LoopVector populateWorklist(Loop &L) {
+static void populateWorklist(Loop &L, LoopVector &LoopList) {
   LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
                     << L.getHeader()->getParent()->getName() << " Loop: %"
                     << L.getHeader()->getName() << '\n');
-  LoopVector LoopList;
+  assert(LoopList.empty() && "LoopList should initially be empty!");
   Loop *CurrentLoop = &L;
   const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
   while (!Vec->empty()) {
     // The current loop has multiple subloops in it hence it is not tightly
     // nested.
     // Discard all loops above it added into Worklist.
-    if (Vec->size() != 1)
-      return {};
+    if (Vec->size() != 1) {
+      LoopList = {};
+      return;
+    }
 
     LoopList.push_back(CurrentLoop);
     CurrentLoop = Vec->front();
     Vec = &CurrentLoop->getSubLoops();
   }
   LoopList.push_back(CurrentLoop);
-  return LoopList;
+  return;
 }
 
 namespace {
@@ -419,12 +421,13 @@ struct LoopInterchange {
   bool run(Loop *L) {
     if (L->getParentLoop())
       return false;
-
-    return processLoopList(populateWorklist(*L));
+    SmallVector<Loop *, 8> LoopList;
+    populateWorklist(*L, LoopList);
+    return processLoopList(LoopList);
   }
 
   bool run(LoopNest &LN) {
-    const auto &LoopList = LN.getLoops();
+    SmallVector<Loop *, 8> LoopList(LN.getLoops().begin(), LN.getLoops().end());
     for (unsigned I = 1; I < LoopList.size(); ++I)
       if (LoopList[I]->getParentLoop() != LoopList[I - 1])
         return false;
@@ -456,7 +459,7 @@ struct LoopInterchange {
     return LoopList.size() - 1;
   }
 
-  bool processLoopList(ArrayRef<Loop *> LoopList) {
+  bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
     bool Changed = false;
     unsigned LoopNestDepth = LoopList.size();
     if (LoopNestDepth < 2) {
@@ -496,20 +499,32 @@ struct LoopInterchange {
     }
 
     unsigned SelecLoopId = selectLoopForInterchange(LoopList);
-    // Move the selected loop outwards to the best possible position.
-    Loop *LoopToBeInterchanged = LoopList[SelecLoopId];
-    for (unsigned i = SelecLoopId; i > 0; i--) {
-      bool Interchanged = processLoop(LoopToBeInterchanged, LoopList[i - 1], i,
-                                      i - 1, DependencyMatrix);
-      if (!Interchanged)
-        return Changed;
-      // Update the DependencyMatrix
-      interChangeDependencies(DependencyMatrix, i, i - 1);
+    // We try to achieve the globally optimal memory access for the loopnest,
+    // and do interchange based on a bubble-sort fasion. We start from
+    // the innermost loop, move it outwards to the best possible position
+    // and repeat this process.
+    for (unsigned j = SelecLoopId; j > 0; j--) {
+      bool ChangedPerIter = false;
+      for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
+        bool Interchanged = processLoop(LoopList[i], LoopList[i - 1], i, i - 1,
+                                        DependencyMatrix);
+        if (!Interchanged)
+          continue;
+        // Loops interchanged, update LoopList accordingly.
+        std::swap(LoopList[i - 1], LoopList[i]);
+        // Update the DependencyMatrix
+        interChangeDependencies(DependencyMatrix, i, i - 1);
 #ifdef DUMP_DEP_MATRICIES
-      LLVM_DEBUG(dbgs() << "Dependence after interchange\n");
-      printDepMatrix(DependencyMatrix);
+        LLVM_DEBUG(dbgs() << "Dependence after interchange\n");
+        printDepMatrix(DependencyMatrix);
 #endif
-      Changed |= Interchanged;
+        ChangedPerIter |= Interchanged;
+        Changed |= Interchanged;
+      }
+      // Early abort if there was no interchange during an entire round of
+      // moving loops outwards.
+      if (!ChangedPerIter)
+        break;
     }
     return Changed;
   }
@@ -1419,9 +1434,13 @@ static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
 
     // Incoming values are guaranteed be instructions currently.
     auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch));
+    // In case of multi-level nested loops, follow LCSSA to find the incoming
+    // value defined from the innermost loop.
+    auto IncIInnerMost = cast<Instruction>(followLCSSA(IncI));
     // Skip phis with incoming values from the inner loop body, excluding the
     // header and latch.
-    if (IncI->getParent() != InnerLatch && IncI->getParent() != InnerHeader)
+    if (IncIInnerMost->getParent() != InnerLatch &&
+        IncIInnerMost->getParent() != InnerHeader)
       continue;
 
     assert(all_of(P.users(),
diff --git a/llvm/test/Transforms/LoopInterchange/phi-ordering.ll b/llvm/test/Transforms/LoopInterchange/phi-ordering.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -loop-interchange-threshold=-1000 -S 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -loop-interchange-threshold=0 -S 2>&1 | FileCheck %s
 ;; Checks the order of the inner phi nodes does not cause havoc.
 ;; The inner loop has a reduction into c. The IV is not the first phi.
 
@@ -9,7 +9,7 @@ target triple = "armv8--linux-gnueabihf"
 
 
 ; Function Attrs: norecurse nounwind
-define void @test(i32 %T, [90 x i32]* noalias nocapture %C, i16* noalias nocapture readonly %A, i16* noalias nocapture readonly %B) local_unnamed_addr #0 {
+define void @test(i32 %T, [90 x i32]* noalias nocapture %C, [90 x [90 x i16]]* noalias nocapture readonly %A, i16* noalias nocapture readonly %B) local_unnamed_addr #0 {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[FOR3_PREHEADER:%.*]]
@@ -31,7 +31,7 @@ define void @test(i32 %T, [90 x i32]* noalias nocapture %C, i16* noalias nocaptu
 ; CHECK-NEXT:    br label [[FOR1_HEADER_PREHEADER]]
 ; CHECK:       for3.split1:
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[K]], [[MUL]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[A:%.*]], i32 [[ADD]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [90 x [90 x i16]], [90 x [90 x i16]]* [[A:%.*]], i32 [[ADD]], i32 [[J]], i32 [[I]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
 ; CHECK-NEXT:    [[ADD15:%.*]] = add nsw i16 [[TMP0]], 1
 ; CHECK-NEXT:    store i16 [[ADD15]], i16* [[ARRAYIDX]]
@@ -70,7 +70,7 @@ for2.header:                                  ; preds = %for2.inc16, %for1.heade
 for3:                                        ; preds = %for3, %for2.header
   %k = phi i32 [ 1, %for2.header ], [ %inc, %for3 ]
   %add = add nsw i32 %k, %mul
-  %arrayidx = getelementptr inbounds i16, i16* %A, i32 %add
+  %arrayidx = getelementptr inbounds [90 x [90 x i16]], [90 x [90 x i16]]* %A, i32 %add, i32 %j, i32 %i
   %0 = load i16, i16* %arrayidx, align 2
   %add15 = add nsw i16 %0, 1
   store i16 %add15, i16* %arrayidx
diff --git a/llvm/test/Transforms/LoopInterchange/pr43326-ideal-access-pattern.ll b/llvm/test/Transforms/LoopInterchange/pr43326-ideal-access-pattern.ll
@@ -0,0 +1,64 @@
+; RUN: opt < %s -basic-aa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S \
+; RUN:     -verify-dom-info -verify-loop-info -verify-loop-lcssa -stats 2>&1
+; RUN: FileCheck --input-file=%t --check-prefix=REMARKS %s
+
+; Triply nested loop, should be able to do interchange three times
+; to get the ideal access pattern.
+; void f(int e[10][10][10], int f[10][10][10]) {
+;   for (int a = 0; a < 10; a++) {
+;     for (int b = 0; b < 10; b++) {
+;       for (int c = 0; c < 10; c++) {
+;         f[c][b][a] = e[c][b][a];
+;       }
+;     }
+;   }
+; }
+
+; REMARKS: --- !Passed
+; REMARKS-NEXT: Pass:            loop-interchange
+; REMARKS-NEXT: Name:            Interchanged
+; REMARKS-NEXT: Function:        pr43326-triply-nested
+; REMARKS: --- !Passed
+; REMARKS-NEXT: Pass:            loop-interchange
+; REMARKS-NEXT: Name:            Interchanged
+; REMARKS-NEXT: Function:        pr43326-triply-nested
+; REMARKS: --- !Passed
+; REMARKS-NEXT: Pass:            loop-interchange
+; REMARKS-NEXT: Name:            Interchanged
+; REMARKS-NEXT: Function:        pr43326-triply-nested
+
+define void @pr43326-triply-nested([10 x [10 x i32]]* %e, [10 x [10 x i32]]* %f) {
+entry:
+  br label %for.outermost.header
+
+for.outermost.header:                              ; preds = %entry, %for.outermost.latch
+  %indvars.outermost = phi i64 [ 0, %entry ], [ %indvars.outermost.next, %for.outermost.latch ]
+  br label %for.middle.header
+
+for.cond.cleanup:                                 ; preds = %for.outermost.latch
+  ret void
+
+for.middle.header:                              ; preds = %for.outermost.header, %for.middle.latch
+  %indvars.middle = phi i64 [ 0, %for.outermost.header ], [ %indvars.middle.next, %for.middle.latch ]
+  br label %for.innermost
+
+for.outermost.latch:                                ; preds = %for.middle.latch
+  %indvars.outermost.next = add nuw nsw i64 %indvars.outermost, 1
+  %exitcond.outermost = icmp ne i64 %indvars.outermost.next, 10
+  br i1 %exitcond.outermost, label %for.outermost.header, label %for.cond.cleanup
+
+for.middle.latch:                                ; preds = %for.innermost
+  %indvars.middle.next = add nuw nsw i64 %indvars.middle, 1
+  %exitcond.middle = icmp ne i64 %indvars.middle.next, 10
+  br i1 %exitcond.middle, label %for.middle.header, label %for.outermost.latch
+
+for.innermost:                                        ; preds = %for.middle.header, %for.innermost
+  %indvars.innermost = phi i64 [ 0, %for.middle.header ], [ %indvars.innermost.next, %for.innermost ]
+  %arrayidx12 = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* %e, i64 %indvars.innermost, i64 %indvars.middle, i64 %indvars.outermost
+  %0 = load i32, i32* %arrayidx12
+  %arrayidx18 = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* %f, i64 %indvars.innermost, i64 %indvars.middle, i64 %indvars.outermost
+  store i32 %0, i32* %arrayidx18
+  %indvars.innermost.next = add nuw nsw i64 %indvars.innermost, 1
+  %exitcond.innermost = icmp ne i64 %indvars.innermost.next, 10
+  br i1 %exitcond.innermost, label %for.innermost, label %for.middle.latch
+}