llvm · kasuga-fj · Mar 21, 2025 · Mar 14, 2025 · Mar 19, 2025 · Mar 19, 2025
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -84,6 +84,11 @@ static cl::opt<unsigned int> MaxLoopNestDepth(
     "loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden,
     cl::desc("Maximum depth of loop nest considered for the transform"));
 
+static cl::opt<bool> PrioritizeVectorization(
+    "loop-interchange-prioritize-vectorization", cl::init(false), cl::Hidden,
+    cl::desc("Prioritize increasing vectorization opportunity over cache cost "
+             "when determining profitability"));
+
 #ifndef NDEBUG
 static void printDepMatrix(CharMatrix &DepMatrix) {
   for (auto &Row : DepMatrix) {
@@ -1193,22 +1198,53 @@ bool LoopInterchangeProfitability::isProfitable(
     unsigned OuterLoopId, CharMatrix &DepMatrix,
     const DenseMap<const Loop *, unsigned> &CostMap,
     std::unique_ptr<CacheCost> &CC) {
-  // isProfitable() is structured to avoid endless loop interchange.
-  // If loop cache analysis could decide the profitability then,
-  // profitability check will stop and return the analysis result.
-  // If cache analysis failed to analyze the loopnest (e.g.,
-  // due to delinearization issues) then only check whether it is
-  // profitable for InstrOrderCost. Likewise, if InstrOrderCost failed to
-  // analysis the profitability then only, isProfitableForVectorization
-  // will decide.
-  std::optional<bool> shouldInterchange =
-      isProfitablePerLoopCacheAnalysis(CostMap, CC);
-  if (!shouldInterchange.has_value()) {
-    shouldInterchange = isProfitablePerInstrOrderCost();
-    if (!shouldInterchange.has_value())
+  // isProfitable() is structured to avoid endless loop interchange. If the
+  // highest priority rule (isProfitablePerLoopCacheAnalysis by default) could
+  // decide the profitability then, profitability check will stop and return the
+  // analysis result. If it failed to determine it (e.g., cache analysis failed
+  // to analyze the loopnest due to delinearization issues) then go ahead the
+  // second highest priority rule (isProfitablePerInstrOrderCost by default).
+  // Likewise, if it failed to analysis the profitability then only, the last
+  // rule (isProfitableForVectorization by default) will decide.
+  enum class RuleTy {
+    PerLoopCacheAnalysis,
+    PerInstrOrderCost,
+    ForVectorization,
+  };
+
+  // We prefer cache cost to vectorization by default.
+  RuleTy RuleOrder[3] = {RuleTy::PerLoopCacheAnalysis,
+                         RuleTy::PerInstrOrderCost, RuleTy::ForVectorization};
+
+  // If we prefer vectorization to cache cost, change the order of application
+  // of each rule.
+  if (PrioritizeVectorization) {
+    RuleOrder[0] = RuleTy::ForVectorization;
+    RuleOrder[1] = RuleTy::PerLoopCacheAnalysis;
+    RuleOrder[2] = RuleTy::PerInstrOrderCost;
+  }
+
+  std::optional<bool> shouldInterchange;
+  for (RuleTy RT : RuleOrder) {
+    switch (RT) {
+    case RuleTy::PerLoopCacheAnalysis:
+      shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC);
+      break;
+    case RuleTy::PerInstrOrderCost:
+      shouldInterchange = isProfitablePerInstrOrderCost();
+      break;
+    case RuleTy::ForVectorization:
       shouldInterchange =
           isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
+      break;
+    }
+
+    // If this rule could determine the profitability, don't call subsequent
+    // rules.
+    if (shouldInterchange.has_value())
+      break;
   }
+
   if (!shouldInterchange.has_value()) {
     ORE->emit([&]() {
       return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",

diff --git a/llvm/test/Transforms/LoopInterchange/profitability-vectorization.ll b/llvm/test/Transforms/LoopInterchange/profitability-vectorization.ll
@@ -0,0 +1,81 @@
+; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 \
+; RUN:     -pass-remarks-output=%t -disable-output
+; RUN: FileCheck -input-file %t --check-prefix=PROFIT-CACHE %s
+
+; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 \
+; RUN:     -pass-remarks-output=%t -disable-output -loop-interchange-prioritize-vectorization=1
+; RUN: FileCheck -input-file %t --check-prefix=PROFIT-VEC %s
+
+@A = dso_local global [256 x [256 x float]] zeroinitializer
+@B = dso_local global [256 x [256 x float]] zeroinitializer
+@C = dso_local global [256 x [256 x float]] zeroinitializer
+@D = dso_local global [256 x [256 x float]] zeroinitializer
+@E = dso_local global [256 x [256 x float]] zeroinitializer
+@F = dso_local global [256 x [256 x float]] zeroinitializer
+
+; Check the behavior of the LoopInterchange cost-model. In the below code,
+; exchanging the loops is not profitable in terms of cache, but it is necessary
+; to vectorize the innermost loop.
+;
+; for (int i = 0; i < 256; i++)
+;   for (int j = 1; j < 256; j++)
+;     A[j][i] = A[j-1][i] + B[j][i] + C[i][j] + D[i][j] + E[i][j] + F[i][j];
+;
+
+; PROFIT-CACHE:      --- !Missed
+; PROFIT-CACHE-NEXT: Pass:            loop-interchange
+; PROFIT-CACHE-NEXT: Name:            InterchangeNotProfitable
+; PROFIT-CACHE-NEXT: Function:        f
+; PROFIT-CACHE-NEXT: Args:
+; PROFIT-CACHE-NEXT:   - String:          Interchanging loops is not considered to improve cache locality nor vectorization.
+; PROFIT-CACHE-NEXT: ...
+
+; PROFIT-VEC:      --- !Passed
+; PROFIT-VEC-NEXT: Pass:            loop-interchange
+; PROFIT-VEC-NEXT: Name:            Interchanged
+; PROFIT-VEC-NEXT: Function:        f
+; PROFIT-VEC-NEXT: Args:
+; PROFIT-VEC-NEXT:   - String:          Loop interchanged with enclosing loop.
+; PROFIT-VEC-NEXT: ...
+define void @f() {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.i.inc ]
+  br label %for.j.body
+
+for.j.body:
+  %j = phi i64 [ 1, %for.i.header ], [ %j.next, %for.j.body ]
+  %j.dec = add nsw i64 %j, -1
+  %a.0.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %j.dec, i64 %i
+  %b.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @B, i64 %j, i64 %i
+  %c.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @C, i64 %i, i64 %j
+  %d.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @D, i64 %i, i64 %j
+  %e.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @E, i64 %i, i64 %j
+  %f.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @F, i64 %i, i64 %j
+  %a.0 = load float, ptr %a.0.index, align 4
+  %b = load float, ptr %b.index, align 4
+  %c = load float, ptr %c.index, align 4
+  %d = load float, ptr %d.index, align 4
+  %e = load float, ptr %e.index, align 4
+  %f = load float, ptr %f.index, align 4
+  %add.0 = fadd float %a.0, %b
+  %add.1 = fadd float %add.0, %c
+  %add.2 = fadd float %add.1, %d
+  %add.3 = fadd float %add.2, %e
+  %add.4 = fadd float %add.3, %f
+  %a.1.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %j, i64 %i
+  store float %add.4, ptr %a.1.index, align 4
+  %j.next = add nuw nsw i64 %j, 1
+  %cmp.j = icmp eq i64 %j.next, 256
+  br i1 %cmp.j, label %for.i.inc, label %for.j.body
+
+for.i.inc:
+  %i.next = add nuw nsw i64 %i, 1
+  %cmp.i = icmp eq i64 %i.next, 256
+  br i1 %cmp.i, label %exit, label %for.i.header
+
+exit:
+  ret void
+}