Skip to content

Commit 5d281a4

Browse files
authored
[LoopInterchange] Constrain number of load/stores in a loop (#118973)
In the current state of the code, the transform computes entries for the dependency matrix until it reaches `MaxMemInstrCount`, which is 100. After the 99th entry, it terminates, and the work done up to that point is wasted compile-time. It would be nice if we could compute the total number of entries upfront and exit early if the number of entries > 100. However, computing the number of entries is not always possible, as it depends on two factors: (1) the number of load-store pairs in a loop, and (2) the number of common loop levels for each pair. This patch constrains the whole computation on the number of load and store instructions in the loop. In another approach, I experimented with computing (1) and constraining the number of pairs, but that did not lead to any additional benefit in terms of compile time. However, when other issues are fixed, I can revisit this approach.
1 parent afced70 commit 5d281a4

File tree

2 files changed

+289
-13
lines changed

2 files changed

+289
-13
lines changed

llvm/lib/Transforms/Scalar/LoopInterchange.cpp

Lines changed: 29 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,14 @@ static cl::opt<int> LoopInterchangeCostThreshold(
5757
"loop-interchange-threshold", cl::init(0), cl::Hidden,
5858
cl::desc("Interchange if you gain more than this number"));
5959

60+
// Maximum number of load-stores that can be handled in the dependency matrix.
61+
static cl::opt<unsigned int> MaxMemInstrCount(
62+
"loop-interchange-max-meminstr-count", cl::init(64), cl::Hidden,
63+
cl::desc(
64+
"Maximum number of load-store instructions that should be handled "
65+
"in the dependency matrix. Higher value may lead to more interchanges "
66+
"at the cost of compile-time"));
67+
6068
namespace {
6169

6270
using LoopVector = SmallVector<Loop *, 8>;
@@ -66,9 +74,6 @@ using CharMatrix = std::vector<std::vector<char>>;
6674

6775
} // end anonymous namespace
6876

69-
// Maximum number of dependencies that can be handled in the dependency matrix.
70-
static const unsigned MaxMemInstrCount = 100;
71-
7277
// Maximum loop depth supported.
7378
static const unsigned MaxLoopNestDepth = 10;
7479

@@ -84,7 +89,8 @@ static void printDepMatrix(CharMatrix &DepMatrix) {
8489

8590
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
8691
Loop *L, DependenceInfo *DI,
87-
ScalarEvolution *SE) {
92+
ScalarEvolution *SE,
93+
OptimizationRemarkEmitter *ORE) {
8894
using ValueVector = SmallVector<Value *, 16>;
8995

9096
ValueVector MemInstr;
@@ -109,7 +115,18 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
109115

110116
LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
111117
<< " Loads and Stores to analyze\n");
112-
118+
if (MemInstr.size() > MaxMemInstrCount) {
119+
LLVM_DEBUG(dbgs() << "The transform doesn't support more than "
120+
<< MaxMemInstrCount << " load/stores in a loop\n");
121+
ORE->emit([&]() {
122+
return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoop",
123+
L->getStartLoc(), L->getHeader())
124+
<< "Number of loads/stores exceeded, the supported maximum "
125+
"can be increased with option "
126+
"-loop-interchange-max-meminstr-count.";
127+
});
128+
return false;
129+
}
113130
ValueVector::iterator I, IE, J, JE;
114131
StringSet<> Seen;
115132

@@ -155,12 +172,6 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
155172
// Make sure we only add unique entries to the dependency matrix.
156173
if (Seen.insert(StringRef(Dep.data(), Dep.size())).second)
157174
DepMatrix.push_back(Dep);
158-
159-
if (DepMatrix.size() > MaxMemInstrCount) {
160-
LLVM_DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
161-
<< " dependencies inside loop\n");
162-
return false;
163-
}
164175
}
165176
}
166177
}
@@ -444,7 +455,7 @@ struct LoopInterchange {
444455
CharMatrix DependencyMatrix;
445456
Loop *OuterMostLoop = *(LoopList.begin());
446457
if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
447-
OuterMostLoop, DI, SE)) {
458+
OuterMostLoop, DI, SE, ORE)) {
448459
LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
449460
return false;
450461
}
@@ -1719,10 +1730,15 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
17191730
LPMUpdater &U) {
17201731
Function &F = *LN.getParent();
17211732
SmallVector<Loop *, 8> LoopList(LN.getLoops());
1733+
1734+
if (MaxMemInstrCount < 1) {
1735+
LLVM_DEBUG(dbgs() << "MaxMemInstrCount should be at least 1");
1736+
return PreservedAnalyses::all();
1737+
}
1738+
17221739
// Ensure minimum depth of the loop nest to do the interchange.
17231740
if (!hasMinimumLoopDepth(LoopList))
17241741
return PreservedAnalyses::all();
1725-
17261742
DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
17271743
std::unique_ptr<CacheCost> CC =
17281744
CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI);
Lines changed: 260 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,260 @@
1+
; RUN: opt < %s -passes=loop-interchange --pass-remarks-missed=loop-interchange -disable-output 2>&1 | FileCheck %s
2+
; RUN: opt < %s -passes=loop-interchange --pass-remarks-missed=loop-interchange -loop-interchange-max-meminstr-count=75 \
3+
; RUN:   -disable-output 2>&1 | FileCheck --allow-empty --check-prefix=CHECK-INSTR-COUNT %s
4+
target triple = "aarch64-unknown-linux-gnu"
5+
6+
@A = dso_local local_unnamed_addr global [2048 x [2048 x i32]] zeroinitializer, align 4
7+
@B = dso_local local_unnamed_addr global [2048 x [2048 x i32]] zeroinitializer, align 4
8+
@C = dso_local local_unnamed_addr global [2048 x [2048 x i32]] zeroinitializer, align 4
9+
10+
; CHECK: Number of loads/stores exceeded, the supported maximum
11+
; can be increased with option -loop-interchange-maxmeminstr-count.
12+
; CHECK-INSTR-COUNT-NOT: Number of loads/stores exceeded, the supported maximum
13+
; can be increased with option -loop-interchange-maxmeminstr-count.
14+
define dso_local noundef i32 @many_load_stores() {
15+
br label %1
16+
17+
1: ; preds = %9, %0
18+
%2 = phi i32 [ 0, %0 ], [ %10, %9 ]
19+
%3 = icmp slt i32 %2, 2048
20+
br i1 %3, label %5, label %4
21+
22+
4: ; preds = %1
23+
ret i32 0
24+
25+
5: ; preds = %1
26+
br label %6
27+
28+
6: ; preds = %11, %5
29+
%7 = phi i32 [ 0, %5 ], [ %208, %11 ]
30+
%8 = icmp slt i32 %7, 85
31+
br i1 %8, label %11, label %9
32+
33+
9: ; preds = %6
34+
%10 = add nsw i32 %2, 1
35+
br label %1
36+
37+
11: ; preds = %6
38+
%12 = sext i32 %2 to i64
39+
%13 = getelementptr inbounds [2048 x [2048 x i32]], [2048 x [2048 x i32]]* @B, i64 0, i64 %12
40+
%14 = sext i32 %7 to i64
41+
%15 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %14
42+
%16 = load i32, i32* %15, align 4
43+
%17 = getelementptr inbounds [2048 x [2048 x i32]], [2048 x [2048 x i32]]* @C, i64 0, i64 %12
44+
%18 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %14
45+
%19 = load i32, i32* %18, align 4
46+
%20 = add nsw i32 %16, %19
47+
%21 = getelementptr inbounds [2048 x [2048 x i32]], [2048 x [2048 x i32]]* @A, i64 0, i64 %12
48+
%22 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %14
49+
store i32 %20, i32* %22, align 4
50+
%23 = add nsw i32 %7, 1
51+
%24 = sext i32 %23 to i64
52+
%25 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %24
53+
%26 = load i32, i32* %25, align 4
54+
%27 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %24
55+
%28 = load i32, i32* %27, align 4
56+
%29 = add nsw i32 %26, %28
57+
%30 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %24
58+
store i32 %29, i32* %30, align 4
59+
%31 = add nsw i32 %23, 1
60+
%32 = sext i32 %31 to i64
61+
%33 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %32
62+
%34 = load i32, i32* %33, align 4
63+
%35 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %32
64+
%36 = load i32, i32* %35, align 4
65+
%37 = add nsw i32 %34, %36
66+
%38 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %32
67+
store i32 %37, i32* %38, align 4
68+
%39 = add nsw i32 %31, 1
69+
%40 = sext i32 %39 to i64
70+
%41 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %40
71+
%42 = load i32, i32* %41, align 4
72+
%43 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %40
73+
%44 = load i32, i32* %43, align 4
74+
%45 = add nsw i32 %42, %44
75+
%46 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %40
76+
store i32 %45, i32* %46, align 4
77+
%47 = add nsw i32 %39, 1
78+
%48 = sext i32 %47 to i64
79+
%49 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %48
80+
%50 = load i32, i32* %49, align 4
81+
%51 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %48
82+
%52 = load i32, i32* %51, align 4
83+
%53 = add nsw i32 %50, %52
84+
%54 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %48
85+
store i32 %53, i32* %54, align 4
86+
%55 = add nsw i32 %47, 1
87+
%56 = sext i32 %55 to i64
88+
%57 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %56
89+
%58 = load i32, i32* %57, align 4
90+
%59 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %56
91+
%60 = load i32, i32* %59, align 4
92+
%61 = add nsw i32 %58, %60
93+
%62 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %56
94+
store i32 %61, i32* %62, align 4
95+
%63 = add nsw i32 %55, 1
96+
%64 = sext i32 %63 to i64
97+
%65 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %64
98+
%66 = load i32, i32* %65, align 4
99+
%67 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %64
100+
%68 = load i32, i32* %67, align 4
101+
%69 = add nsw i32 %66, %68
102+
%70 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %64
103+
store i32 %69, i32* %70, align 4
104+
%71 = add nsw i32 %63, 1
105+
%72 = sext i32 %71 to i64
106+
%73 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %72
107+
%74 = load i32, i32* %73, align 4
108+
%75 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %72
109+
%76 = load i32, i32* %75, align 4
110+
%77 = add nsw i32 %74, %76
111+
%78 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %72
112+
store i32 %77, i32* %78, align 4
113+
%79 = add nsw i32 %71, 1
114+
%80 = sext i32 %79 to i64
115+
%81 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %80
116+
%82 = load i32, i32* %81, align 4
117+
%83 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %80
118+
%84 = load i32, i32* %83, align 4
119+
%85 = add nsw i32 %82, %84
120+
%86 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %80
121+
store i32 %85, i32* %86, align 4
122+
%87 = add nsw i32 %79, 1
123+
%88 = sext i32 %87 to i64
124+
%89 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %88
125+
%90 = load i32, i32* %89, align 4
126+
%91 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %88
127+
%92 = load i32, i32* %91, align 4
128+
%93 = add nsw i32 %90, %92
129+
%94 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %88
130+
store i32 %93, i32* %94, align 4
131+
%95 = add nsw i32 %87, 1
132+
%96 = sext i32 %95 to i64
133+
%97 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %96
134+
%98 = load i32, i32* %97, align 4
135+
%99 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %96
136+
%100 = load i32, i32* %99, align 4
137+
%101 = add nsw i32 %98, %100
138+
%102 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %96
139+
store i32 %101, i32* %102, align 4
140+
%103 = add nsw i32 %95, 1
141+
%104 = sext i32 %103 to i64
142+
%105 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %104
143+
%106 = load i32, i32* %105, align 4
144+
%107 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %104
145+
%108 = load i32, i32* %107, align 4
146+
%109 = add nsw i32 %106, %108
147+
%110 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %104
148+
store i32 %109, i32* %110, align 4
149+
%111 = add nsw i32 %103, 1
150+
%112 = sext i32 %111 to i64
151+
%113 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %112
152+
%114 = load i32, i32* %113, align 4
153+
%115 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %112
154+
%116 = load i32, i32* %115, align 4
155+
%117 = add nsw i32 %114, %116
156+
%118 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %112
157+
store i32 %117, i32* %118, align 4
158+
%119 = add nsw i32 %111, 1
159+
%120 = sext i32 %119 to i64
160+
%121 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %120
161+
%122 = load i32, i32* %121, align 4
162+
%123 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %120
163+
%124 = load i32, i32* %123, align 4
164+
%125 = add nsw i32 %122, %124
165+
%126 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %120
166+
store i32 %125, i32* %126, align 4
167+
%127 = add nsw i32 %119, 1
168+
%128 = sext i32 %127 to i64
169+
%129 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %128
170+
%130 = load i32, i32* %129, align 4
171+
%131 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %128
172+
%132 = load i32, i32* %131, align 4
173+
%133 = add nsw i32 %130, %132
174+
%134 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %128
175+
store i32 %133, i32* %134, align 4
176+
%135 = add nsw i32 %127, 1
177+
%136 = sext i32 %135 to i64
178+
%137 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %136
179+
%138 = load i32, i32* %137, align 4
180+
%139 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %136
181+
%140 = load i32, i32* %139, align 4
182+
%141 = add nsw i32 %138, %140
183+
%142 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %136
184+
store i32 %141, i32* %142, align 4
185+
%143 = add nsw i32 %135, 1
186+
%144 = sext i32 %143 to i64
187+
%145 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %144
188+
%146 = load i32, i32* %145, align 4
189+
%147 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %144
190+
%148 = load i32, i32* %147, align 4
191+
%149 = add nsw i32 %146, %148
192+
%150 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %144
193+
store i32 %149, i32* %150, align 4
194+
%151 = add nsw i32 %143, 1
195+
%152 = sext i32 %151 to i64
196+
%153 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %152
197+
%154 = load i32, i32* %153, align 4
198+
%155 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %152
199+
%156 = load i32, i32* %155, align 4
200+
%157 = add nsw i32 %154, %156
201+
%158 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %152
202+
store i32 %157, i32* %158, align 4
203+
%159 = add nsw i32 %151, 1
204+
%160 = sext i32 %159 to i64
205+
%161 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %160
206+
%162 = load i32, i32* %161, align 4
207+
%163 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %160
208+
%164 = load i32, i32* %163, align 4
209+
%165 = add nsw i32 %162, %164
210+
%166 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %160
211+
store i32 %165, i32* %166, align 4
212+
%167 = add nsw i32 %159, 1
213+
%168 = sext i32 %167 to i64
214+
%169 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %168
215+
%170 = load i32, i32* %169, align 4
216+
%171 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %168
217+
%172 = load i32, i32* %171, align 4
218+
%173 = add nsw i32 %170, %172
219+
%174 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %168
220+
store i32 %173, i32* %174, align 4
221+
%175 = add nsw i32 %167, 1
222+
%176 = sext i32 %175 to i64
223+
%177 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %176
224+
%178 = load i32, i32* %177, align 4
225+
%179 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %176
226+
%180 = load i32, i32* %179, align 4
227+
%181 = add nsw i32 %178, %180
228+
%182 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %176
229+
store i32 %181, i32* %182, align 4
230+
%183 = add nsw i32 %175, 1
231+
%184 = sext i32 %183 to i64
232+
%185 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %184
233+
%186 = load i32, i32* %185, align 4
234+
%187 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %184
235+
%188 = load i32, i32* %187, align 4
236+
%189 = add nsw i32 %186, %188
237+
%190 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %184
238+
store i32 %189, i32* %190, align 4
239+
%191 = add nsw i32 %183, 1
240+
%192 = sext i32 %191 to i64
241+
%193 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %192
242+
%194 = load i32, i32* %193, align 4
243+
%195 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %192
244+
%196 = load i32, i32* %195, align 4
245+
%197 = add nsw i32 %194, %196
246+
%198 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %192
247+
store i32 %197, i32* %198, align 4
248+
%199 = add nsw i32 %191, 1
249+
%200 = sext i32 %199 to i64
250+
%201 = getelementptr inbounds [2048 x i32], [2048 x i32]* %13, i64 0, i64 %200
251+
%202 = load i32, i32* %201, align 4
252+
%203 = getelementptr inbounds [2048 x i32], [2048 x i32]* %17, i64 0, i64 %200
253+
%204 = load i32, i32* %203, align 4
254+
%205 = add nsw i32 %202, %204
255+
%206 = getelementptr inbounds [2048 x i32], [2048 x i32]* %21, i64 0, i64 %200
256+
store i32 %205, i32* %206, align 4
257+
%207 = add nsw i32 %199, 1
258+
%208 = add nsw i32 %207, 24
259+
br label %6
260+
}

0 commit comments

Comments
 (0)