
Commit 17b202f

kasuga-fj and fhahn authored
[LoopInterchange] Add an option to prioritize vectorization (#131988)
The LoopInterchange cost model consists of several decision rules. They are applied one by one, and once a rule can determine the profitability, the subsequent rules aren't consulted. In the current implementation, the rule based on `CacheCostAnalysis` is applied first, and only if it fails to determine the profitability is the rule for vectorization applied. However, there are cases where interchanging loops for vectorization makes the code faster even when the exchange is detrimental to the cache. For example, exchanging the inner two loops in the following code runs about 3x faster on my machine (compiled with `-O3 -mcpu=neoverse-v2 -mllvm -cache-line-size=64`), even though the exchange is rejected by the cache-cost rule. (NOTE: LoopInterchange currently cannot exchange these loops due to its legality checks. This should also be improved.)

```c
__attribute__((aligned(64))) float aa[256][256], bb[256][256], cc[256][256],
                                   dd[256][256], ee[256][256], ff[256][256];

// Alternative of TSVC s231 with more array accesses than the original.
void s231_alternative() {
  for (int nl = 0; nl < 100*(100000/256); nl++) {
    for (int i = 0; i < 256; ++i) {
      for (int j = 1; j < 256; j++) {
        aa[j][i] = aa[j-1][i] + bb[j][i] + cc[i][j] + dd[i][j] + ee[i][j] + ff[i][j];
      }
    }
  }
}
```

This patch introduces a new option to prioritize the vectorization rule over the cache-cost rule.

Related issue: #131130

---------

Co-authored-by: Florian Hahn <[email protected]>
1 parent be258a2 commit 17b202f
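For reference, here is a minimal sketch of how the new flag can be exercised. The opt invocation mirrors the RUN lines of the regression test added below; the clang form is an assumed equivalent (cl::opt flags are forwarded with -mllvm) and is not spelled out in this patch.

```sh
# Run LoopInterchange directly with opt, preferring the vectorization rule
# (mirrors the RUN lines of the new test; input/output names are placeholders).
opt < input.ll -passes=loop-interchange -cache-line-size=64 \
    -loop-interchange-prioritize-vectorization=1 \
    -pass-remarks-output=remarks.yaml -disable-output

# Assumed clang equivalent: forward the flag with -mllvm. Depending on the
# pipeline defaults, LoopInterchange may also need to be enabled explicitly
# (e.g. -mllvm -enable-loopinterchange).
clang -O3 -mcpu=neoverse-v2 -c example.c \
      -mllvm -cache-line-size=64 \
      -mllvm -loop-interchange-prioritize-vectorization=1
```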

File tree

2 files changed, +130 -13 lines changed


llvm/lib/Transforms/Scalar/LoopInterchange.cpp

Lines changed: 49 additions & 13 deletions
```diff
@@ -84,6 +84,11 @@ static cl::opt<unsigned int> MaxLoopNestDepth(
     "loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden,
     cl::desc("Maximum depth of loop nest considered for the transform"));
 
+static cl::opt<bool> PrioritizeVectorization(
+    "loop-interchange-prioritize-vectorization", cl::init(false), cl::Hidden,
+    cl::desc("Prioritize increasing vectorization opportunity over cache cost "
+             "when determining profitability"));
+
 #ifndef NDEBUG
 static void printDepMatrix(CharMatrix &DepMatrix) {
   for (auto &Row : DepMatrix) {
@@ -1193,22 +1198,53 @@ bool LoopInterchangeProfitability::isProfitable(
     unsigned OuterLoopId, CharMatrix &DepMatrix,
     const DenseMap<const Loop *, unsigned> &CostMap,
     std::unique_ptr<CacheCost> &CC) {
-  // isProfitable() is structured to avoid endless loop interchange.
-  // If loop cache analysis could decide the profitability then,
-  // profitability check will stop and return the analysis result.
-  // If cache analysis failed to analyze the loopnest (e.g.,
-  // due to delinearization issues) then only check whether it is
-  // profitable for InstrOrderCost. Likewise, if InstrOrderCost failed to
-  // analysis the profitability then only, isProfitableForVectorization
-  // will decide.
-  std::optional<bool> shouldInterchange =
-      isProfitablePerLoopCacheAnalysis(CostMap, CC);
-  if (!shouldInterchange.has_value()) {
-    shouldInterchange = isProfitablePerInstrOrderCost();
-    if (!shouldInterchange.has_value())
+  // isProfitable() is structured to avoid endless loop interchange. If the
+  // highest priority rule (isProfitablePerLoopCacheAnalysis by default) could
+  // decide the profitability then, profitability check will stop and return the
+  // analysis result. If it failed to determine it (e.g., cache analysis failed
+  // to analyze the loopnest due to delinearization issues) then go ahead the
+  // second highest priority rule (isProfitablePerInstrOrderCost by default).
+  // Likewise, if it failed to analysis the profitability then only, the last
+  // rule (isProfitableForVectorization by default) will decide.
+  enum class RuleTy {
+    PerLoopCacheAnalysis,
+    PerInstrOrderCost,
+    ForVectorization,
+  };
+
+  // We prefer cache cost to vectorization by default.
+  RuleTy RuleOrder[3] = {RuleTy::PerLoopCacheAnalysis,
+                         RuleTy::PerInstrOrderCost, RuleTy::ForVectorization};
+
+  // If we prefer vectorization to cache cost, change the order of application
+  // of each rule.
+  if (PrioritizeVectorization) {
+    RuleOrder[0] = RuleTy::ForVectorization;
+    RuleOrder[1] = RuleTy::PerLoopCacheAnalysis;
+    RuleOrder[2] = RuleTy::PerInstrOrderCost;
+  }
+
+  std::optional<bool> shouldInterchange;
+  for (RuleTy RT : RuleOrder) {
+    switch (RT) {
+    case RuleTy::PerLoopCacheAnalysis:
+      shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC);
+      break;
+    case RuleTy::PerInstrOrderCost:
+      shouldInterchange = isProfitablePerInstrOrderCost();
+      break;
+    case RuleTy::ForVectorization:
       shouldInterchange =
           isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
+      break;
+    }
+
+    // If this rule could determine the profitability, don't call subsequent
+    // rules.
+    if (shouldInterchange.has_value())
+      break;
   }
+
   if (!shouldInterchange.has_value()) {
     ORE->emit([&]() {
       return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
```
Lines changed: 81 additions & 0 deletions
```llvm
; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 \
; RUN:   -pass-remarks-output=%t -disable-output
; RUN: FileCheck -input-file %t --check-prefix=PROFIT-CACHE %s

; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 \
; RUN:   -pass-remarks-output=%t -disable-output -loop-interchange-prioritize-vectorization=1
; RUN: FileCheck -input-file %t --check-prefix=PROFIT-VEC %s

@A = dso_local global [256 x [256 x float]] zeroinitializer
@B = dso_local global [256 x [256 x float]] zeroinitializer
@C = dso_local global [256 x [256 x float]] zeroinitializer
@D = dso_local global [256 x [256 x float]] zeroinitializer
@E = dso_local global [256 x [256 x float]] zeroinitializer
@F = dso_local global [256 x [256 x float]] zeroinitializer

; Check the behavior of the LoopInterchange cost-model. In the below code,
; exchanging the loops is not profitable in terms of cache, but it is necessary
; to vectorize the innermost loop.
;
; for (int i = 0; i < 256; i++)
;   for (int j = 1; j < 256; j++)
;     A[j][i] = A[j-1][i] + B[j][i] + C[i][j] + D[i][j] + E[i][j] + F[i][j];
;

; PROFIT-CACHE:      --- !Missed
; PROFIT-CACHE-NEXT: Pass:     loop-interchange
; PROFIT-CACHE-NEXT: Name:     InterchangeNotProfitable
; PROFIT-CACHE-NEXT: Function: f
; PROFIT-CACHE-NEXT: Args:
; PROFIT-CACHE-NEXT:   - String: Interchanging loops is not considered to improve cache locality nor vectorization.
; PROFIT-CACHE-NEXT: ...

; PROFIT-VEC:      --- !Passed
; PROFIT-VEC-NEXT: Pass:     loop-interchange
; PROFIT-VEC-NEXT: Name:     Interchanged
; PROFIT-VEC-NEXT: Function: f
; PROFIT-VEC-NEXT: Args:
; PROFIT-VEC-NEXT:   - String: Loop interchanged with enclosing loop.
; PROFIT-VEC-NEXT: ...

define void @f() {
entry:
  br label %for.i.header

for.i.header:
  %i = phi i64 [ 0, %entry ], [ %i.next, %for.i.inc ]
  br label %for.j.body

for.j.body:
  %j = phi i64 [ 1, %for.i.header ], [ %j.next, %for.j.body ]
  %j.dec = add nsw i64 %j, -1
  %a.0.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %j.dec, i64 %i
  %b.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @B, i64 %j, i64 %i
  %c.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @C, i64 %i, i64 %j
  %d.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @D, i64 %i, i64 %j
  %e.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @E, i64 %i, i64 %j
  %f.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @F, i64 %i, i64 %j
  %a.0 = load float, ptr %a.0.index, align 4
  %b = load float, ptr %b.index, align 4
  %c = load float, ptr %c.index, align 4
  %d = load float, ptr %d.index, align 4
  %e = load float, ptr %e.index, align 4
  %f = load float, ptr %f.index, align 4
  %add.0 = fadd float %a.0, %b
  %add.1 = fadd float %add.0, %c
  %add.2 = fadd float %add.1, %d
  %add.3 = fadd float %add.2, %e
  %add.4 = fadd float %add.3, %f
  %a.1.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %j, i64 %i
  store float %add.4, ptr %a.1.index, align 4
  %j.next = add nuw nsw i64 %j, 1
  %cmp.j = icmp eq i64 %j.next, 256
  br i1 %cmp.j, label %for.i.inc, label %for.j.body

for.i.inc:
  %i.next = add nuw nsw i64 %i, 1
  %cmp.i = icmp eq i64 %i.next, 256
  br i1 %cmp.i, label %exit, label %for.i.header

exit:
  ret void
}
```
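To run the added test locally, an llvm-lit invocation along these lines should work from a build directory; the exact file name of the new test is not given above, so the whole LoopInterchange test directory is used as the target (the checkout layout is an assumption).

```sh
# From the LLVM build directory: run all LoopInterchange regression tests,
# which include the new profitability test added by this commit.
./bin/llvm-lit -v ../llvm/test/Transforms/LoopInterchange/
```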
