[TTI] Add SCEVExpansionBudget to loop unrolling options. #118316

fhahn · 2024-12-02T16:04:46Z

Add an extra knob to UnrollingPreferences to let backends control the maximum budget for SCEV expansions.

This gives backends more fine-grained control on the cost of the runtime checks for runtime unrolling.

Add an extra knob to UnrollingPreferences to let backends control the maximum budget for SCEV expansions. This gives backends more fine-grained control on the cost of the runtime checks for runtime unrolling.

llvmbot · 2024-12-02T16:05:22Z

@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-llvm-analysis

Author: Florian Hahn (fhahn)

Changes

Add an extra knob to UnrollingPreferences to let backends control the maximum budget for SCEV expansions.

This gives backends more fine-grained control on the cost of the runtime checks for runtime unrolling.

Full diff: https://github.com/llvm/llvm-project/pull/118316.diff

7 Files Affected:

(modified) llvm/include/llvm/Analysis/TargetTransformInfo.h (+3)
(modified) llvm/include/llvm/Transforms/Utils/UnrollLoop.h (+2-1)
(modified) llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp (+3)
(modified) llvm/lib/Transforms/Utils/LoopUnroll.cpp (+5-4)
(modified) llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp (+3-1)
(modified) llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp (+4-3)
(modified) llvm/unittests/Transforms/Utils/UnrollLoopTest.cpp (+1-1)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 89231e23e388a7..aa530b54c5c6d3 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -615,6 +615,9 @@ class TargetTransformInfo {
     unsigned MaxIterationsCountToAnalyze;
     /// Don't disable runtime unroll for the loops which were vectorized.
     bool UnrollVectorizedLoop = false;
+    /// Don't allow runtime unrolling if expanding the trip count takes more
+    /// than SCEVExpansionBudget.
+    unsigned SCEVExpansionBudget;
   };
 
   /// Get target-customized preferences for the generic loop unrolling
diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
index 797c082333a76c..8cf17ced458c82 100644
--- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -75,6 +75,7 @@ struct UnrollLoopOptions {
   bool UnrollRemainder;
   bool ForgetAllSCEV;
   const Instruction *Heart = nullptr;
+  unsigned SCEVExpansionBudget;
 };
 
 LoopUnrollResult UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
@@ -90,7 +91,7 @@ bool UnrollRuntimeLoopRemainder(
     bool UseEpilogRemainder, bool UnrollRemainder, bool ForgetAllSCEV,
     LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
     const TargetTransformInfo *TTI, bool PreserveLCSSA,
-    Loop **ResultLoop = nullptr);
+    unsigned SCEVExpansionBudget, Loop **ResultLoop = nullptr);
 
 LoopUnrollResult UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
                                   unsigned TripMultiple, bool UnrollRemainder,
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 953748f07af028..260cc72c3188d2 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -57,6 +57,7 @@
 #include "llvm/Transforms/Utils/LoopPeel.h"
 #include "llvm/Transforms/Utils/LoopSimplify.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include "llvm/Transforms/Utils/SizeOpts.h"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
 #include <algorithm>
@@ -218,6 +219,7 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
   UP.UnrollAndJam = false;
   UP.UnrollAndJamInnerLoopThreshold = 60;
   UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
+  UP.SCEVExpansionBudget = SCEVCheapExpansionBudget;
 
   // Override with any target specific settings
   TTI.getUnrollingPreferences(L, SE, UP, &ORE);
@@ -1349,6 +1351,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
   ULO.Runtime = UP.Runtime;
   ULO.ForgetAllSCEV = ForgetAllSCEV;
   ULO.Heart = getLoopConvergenceHeart(L);
+  ULO.SCEVExpansionBudget = UP.SCEVExpansionBudget;
   LoopUnrollResult UnrollResult = UnrollLoop(
       L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA);
   if (UnrollResult == LoopUnrollResult::Unmodified)
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 85d4415970a00a..b11d92836a998f 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -60,6 +60,7 @@
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopSimplify.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include "llvm/Transforms/Utils/SimplifyIndVar.h"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
@@ -589,10 +590,10 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
                                               : isEpilogProfitable(L);
 
   if (ULO.Runtime &&
-      !UnrollRuntimeLoopRemainder(L, ULO.Count, ULO.AllowExpensiveTripCount,
-                                  EpilogProfitability, ULO.UnrollRemainder,
-                                  ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI,
-                                  PreserveLCSSA, RemainderLoop)) {
+      !UnrollRuntimeLoopRemainder(
+          L, ULO.Count, ULO.AllowExpensiveTripCount, EpilogProfitability,
+          ULO.UnrollRemainder, ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI,
+          PreserveLCSSA, ULO.SCEVExpansionBudget, RemainderLoop)) {
     if (ULO.Force)
       ULO.Runtime = false;
     else {
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
index 2c2400d9dd7a82..49209e33f2d1dd 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
@@ -48,6 +48,7 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include <assert.h>
@@ -241,7 +242,8 @@ llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
     if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
                                     /*UseEpilogRemainder*/ true,
                                     UnrollRemainder, /*ForgetAllSCEV*/ false,
-                                    LI, SE, DT, AC, TTI, true, EpilogueLoop)) {
+                                    LI, SE, DT, AC, TTI, true,
+                                    SCEVCheapExpansionBudget, EpilogueLoop)) {
       LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
                            "generated when assuming runtime trip count\n");
       return LoopUnrollResult::Unmodified;
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 0179d350b9e0e7..b0bc55cd64c377 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -582,7 +582,8 @@ bool llvm::UnrollRuntimeLoopRemainder(
     Loop *L, unsigned Count, bool AllowExpensiveTripCount,
     bool UseEpilogRemainder, bool UnrollRemainder, bool ForgetAllSCEV,
     LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
-    const TargetTransformInfo *TTI, bool PreserveLCSSA, Loop **ResultLoop) {
+    const TargetTransformInfo *TTI, bool PreserveLCSSA,
+    unsigned SCEVExpansionBudget, Loop **ResultLoop) {
   LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
   LLVM_DEBUG(L->dump());
   LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n"
@@ -672,8 +673,8 @@ bool llvm::UnrollRuntimeLoopRemainder(
   const DataLayout &DL = Header->getDataLayout();
   SCEVExpander Expander(*SE, DL, "loop-unroll");
   if (!AllowExpensiveTripCount &&
-      Expander.isHighCostExpansion(TripCountSC, L, SCEVCheapExpansionBudget,
-                                   TTI, PreHeaderBR)) {
+      Expander.isHighCostExpansion(TripCountSC, L, SCEVExpansionBudget, TTI,
+                                   PreHeaderBR)) {
     LLVM_DEBUG(dbgs() << "High cost for expanding trip count scev!\n");
     return false;
   }
diff --git a/llvm/unittests/Transforms/Utils/UnrollLoopTest.cpp b/llvm/unittests/Transforms/Utils/UnrollLoopTest.cpp
index 241d55bd624576..b49e37d9eee984 100644
--- a/llvm/unittests/Transforms/Utils/UnrollLoopTest.cpp
+++ b/llvm/unittests/Transforms/Utils/UnrollLoopTest.cpp
@@ -73,6 +73,6 @@ while.end:                                        ; preds = %while.cond
 
   bool ret =
       UnrollRuntimeLoopRemainder(L, 4, true, false, false, false, &LI, &SE, &DT,
-                                 &AC, /*TTI=*/nullptr, PreserveLCSSA);
+                                 &AC, /*TTI=*/nullptr, PreserveLCSSA, 4);
   EXPECT_FALSE(ret);
 }

Add initial heuristics to selectively enable runtime unrolling for loops where doing so is expected to be highly beneficial on Apple Silicon CPUs. To start with, we try to runtime-unroll small, single-block loops, if they have load/store dependencies, to expose more parallel memory access streams [1] and to improve instruction delivery [2]. We also explicitly avoid runtime-unrolling for loop structures that may limit the expected gains from runtime unrolling. Such loops include loops with complex control flow (loops that aren't innermost, have multiple exits, or have a large number of blocks), loops whose trip count expansion is expensive, and loops that are expected to execute a small number of iterations. Note that the heuristics here may be overly conservative and we err on the side of avoiding runtime unrolling rather than unrolling excessively. They are all subject to further refinement. [1] 4.6.10 in Apple Silicon CPU Optimization Guide [2] 4.4.4 in Apple Silicon CPU Optimization Guide (Depends on llvm#118316 for TTI changes, which are included in this PR for now)

nikic

LGTM

preames

LGTM

juliannagele · 2024-12-02T16:30:57Z

LGTM

fhahn · 2024-12-02T16:31:57Z

For more context, #118317 adds the first user of the new property

Add initial heuristics to selectively enable runtime unrolling for loops where doing so is expected to be highly beneficial on Apple Silicon CPUs. To start with, we try to runtime-unroll small, single-block loops, if they have load/store dependencies, to expose more parallel memory access streams [1] and to improve instruction delivery [2]. We also explicitly avoid runtime-unrolling for loop structures that may limit the expected gains from runtime unrolling. Such loops include loops with complex control flow (loops that aren't innermost, have multiple exits, or have a large number of blocks), loops whose trip count expansion is expensive, and loops that are expected to execute a small number of iterations. Note that the heuristics here may be overly conservative and we err on the side of avoiding runtime unrolling rather than unrolling excessively. They are all subject to further refinement. [1] 4.6.10 in Apple Silicon CPU Optimization Guide [2] 4.4.4 in Apple Silicon CPU Optimization Guide (Depends on llvm#118316 for TTI changes, which are included in this PR for now)

…s. (#118317) Add initial heuristics to selectively enable runtime unrolling for loops where doing so is expected to be highly beneficial on Apple Silicon CPUs. To start with, we try to runtime-unroll small, single-block loops, if they have load/store dependencies, to expose more parallel memory access streams [1] and to improve instruction delivery [2]. We also explicitly avoid runtime-unrolling for loop structures that may limit the expected gains from runtime unrolling. Such loops include loops with complex control flow (loops that aren't innermost, have multiple exits, or have a large number of blocks), loops whose trip count expansion is expensive, and loops that are expected to execute a small number of iterations. Note that the heuristics here may be overly conservative and we err on the side of avoiding runtime unrolling rather than unrolling excessively. They are all subject to further refinement. Across a large set of workloads, this increases the total number of unrolled loops by 2.9%. [1] 4.6.10 in Apple Silicon CPU Optimization Guide [2] 4.4.4 in Apple Silicon CPU Optimization Guide Depends on #118316 for TTI changes. PR: #118317

Add an extra knob to UnrollingPreferences to let backends control the maximum budget for SCEV expansions. This gives backends more fine-grained control on the cost of the runtime checks for runtime unrolling. PR: llvm#118316 (cherry picked from commit 4226e0a)

…s. (llvm#118317) Add initial heuristics to selectively enable runtime unrolling for loops where doing so is expected to be highly beneficial on Apple Silicon CPUs. To start with, we try to runtime-unroll small, single-block loops, if they have load/store dependencies, to expose more parallel memory access streams [1] and to improve instruction delivery [2]. We also explicitly avoid runtime-unrolling for loop structures that may limit the expected gains from runtime unrolling. Such loops include loops with complex control flow (loops that aren't innermost, have multiple exits, or have a large number of blocks), loops whose trip count expansion is expensive, and loops that are expected to execute a small number of iterations. Note that the heuristics here may be overly conservative and we err on the side of avoiding runtime unrolling rather than unrolling excessively. They are all subject to further refinement. Across a large set of workloads, this increases the total number of unrolled loops by 2.9%. [1] 4.6.10 in Apple Silicon CPU Optimization Guide [2] 4.4.4 in Apple Silicon CPU Optimization Guide Depends on llvm#118316 for TTI changes. PR: llvm#118317 (cherry picked from commit 0bb7bd4)

[TTI] Add SCEVExpansionBudget to loop unrolling options.

c8d968a

Add an extra knob to UnrollingPreferences to let backends control the maximum budget for SCEV expansions. This gives backends more fine-grained control on the cost of the runtime checks for runtime unrolling.

fhahn requested review from aemerson, nikic, preames, juliannagele and aeubanks December 2, 2024 16:04

llvmbot added llvm:analysis Includes value tracking, cost tables and constant folding llvm:transforms labels Dec 2, 2024

fhahn mentioned this pull request Dec 2, 2024

[AArch64] Runtime-unroll small load/store loops for Apple Silicon CPUs. #118317

Merged

nikic approved these changes Dec 2, 2024

View reviewed changes

preames approved these changes Dec 2, 2024

View reviewed changes

juliannagele approved these changes Dec 2, 2024

View reviewed changes

fhahn merged commit 4226e0a into llvm:main Dec 2, 2024
11 checks passed

fhahn deleted the tti-rt-scevexpansionbudget branch December 2, 2024 21:35

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[TTI] Add SCEVExpansionBudget to loop unrolling options. #118316

[TTI] Add SCEVExpansionBudget to loop unrolling options. #118316

Uh oh!

fhahn commented Dec 2, 2024

Uh oh!

llvmbot commented Dec 2, 2024 •

edited

Loading

Uh oh!

nikic left a comment

Uh oh!

preames left a comment

Uh oh!

juliannagele commented Dec 2, 2024

Uh oh!

fhahn commented Dec 2, 2024

Uh oh!

Uh oh!

Uh oh!

[TTI] Add SCEVExpansionBudget to loop unrolling options. #118316

[TTI] Add SCEVExpansionBudget to loop unrolling options. #118316

Uh oh!

Conversation

fhahn commented Dec 2, 2024

Uh oh!

llvmbot commented Dec 2, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

nikic left a comment

Choose a reason for hiding this comment

Uh oh!

preames left a comment

Choose a reason for hiding this comment

Uh oh!

juliannagele commented Dec 2, 2024

Uh oh!

fhahn commented Dec 2, 2024

Uh oh!

Uh oh!

Uh oh!

llvmbot commented Dec 2, 2024 •

edited

Loading