
Commit 17b202f

kasuga-fj and fhahn authored
[LoopInterchange] Add an option to prioritize vectorization (#131988)
The LoopInterchange cost model consists of several decision rules. They are applied one by one, and once a rule can determine the profitability, the subsequent rules aren't consulted. In the current implementation, the rule based on `CacheCostAnalysis` is applied first, and only if it fails to determine the profitability is the rule for vectorization applied. However, there are cases where interchanging loops for vectorization makes the code faster even when the exchange is detrimental to the cache. For example, exchanging the inner two loops in the following code runs about 3x faster on my machine (compiled with `-O3 -mcpu=neoverse-v2 -mllvm -cache-line-size=64`), even though the exchange is rejected by the cache-cost rule. (NOTE: LoopInterchange currently cannot exchange these loops due to its legality checks. This should also be improved.)

```c
__attribute__((aligned(64))) float aa[256][256], bb[256][256], cc[256][256],
                                   dd[256][256], ee[256][256], ff[256][256];

// Alternative of TSVC s231 with more array accesses than the original.
void s231_alternative() {
  for (int nl = 0; nl < 100*(100000/256); nl++) {
    for (int i = 0; i < 256; ++i) {
      for (int j = 1; j < 256; j++) {
        aa[j][i] = aa[j-1][i] + bb[j][i] + cc[i][j] + dd[i][j] + ee[i][j] + ff[i][j];
      }
    }
  }
}
```

This patch introduces a new option to prioritize the vectorization rule over the cache-cost rule.

Related issue: #131130

---------

Co-authored-by: Florian Hahn <[email protected]>
1 parent be258a2 commit 17b202f
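For reference, here is a minimal sketch of how the new flag can be exercised. The opt invocation mirrors the RUN lines of the regression test added below; the clang form is an assumed equivalent (cl::opt flags are forwarded with -mllvm) and is not spelled out in this patch.

```sh
# Run LoopInterchange directly with opt, preferring the vectorization rule
# (mirrors the RUN lines of the new test; input/output names are placeholders).
opt < input.ll -passes=loop-interchange -cache-line-size=64 \
    -loop-interchange-prioritize-vectorization=1 \
    -pass-remarks-output=remarks.yaml -disable-output

# Assumed clang equivalent: forward the flag with -mllvm. Depending on the
# pipeline defaults, LoopInterchange may also need to be enabled explicitly
# (e.g. -mllvm -enable-loopinterchange).
clang -O3 -mcpu=neoverse-v2 -c example.c \
      -mllvm -cache-line-size=64 \
      -mllvm -loop-interchange-prioritize-vectorization=1
```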

File tree

2 files changed, +130 -13 lines changed


llvm/lib/Transforms/Scalar/LoopInterchange.cpp

Lines changed: 49 additions & 13 deletions
```diff
@@ -84,6 +84,11 @@ static cl::opt<unsigned int> MaxLoopNestDepth(
     "loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden,
     cl::desc("Maximum depth of loop nest considered for the transform"));
 
+static cl::opt<bool> PrioritizeVectorization(
+    "loop-interchange-prioritize-vectorization", cl::init(false), cl::Hidden,
+    cl::desc("Prioritize increasing vectorization opportunity over cache cost "
+             "when determining profitability"));
+
 #ifndef NDEBUG
 static void printDepMatrix(CharMatrix &DepMatrix) {
   for (auto &Row : DepMatrix) {
@@ -1193,22 +1198,53 @@ bool LoopInterchangeProfitability::isProfitable(
     unsigned OuterLoopId, CharMatrix &DepMatrix,
     const DenseMap<const Loop *, unsigned> &CostMap,
     std::unique_ptr<CacheCost> &CC) {
-  // isProfitable() is structured to avoid endless loop interchange.
-  // If loop cache analysis could decide the profitability then,
-  // profitability check will stop and return the analysis result.
-  // If cache analysis failed to analyze the loopnest (e.g.,
-  // due to delinearization issues) then only check whether it is
-  // profitable for InstrOrderCost. Likewise, if InstrOrderCost failed to
-  // analysis the profitability then only, isProfitableForVectorization
-  // will decide.
-  std::optional<bool> shouldInterchange =
-      isProfitablePerLoopCacheAnalysis(CostMap, CC);
-  if (!shouldInterchange.has_value()) {
-    shouldInterchange = isProfitablePerInstrOrderCost();
-    if (!shouldInterchange.has_value())
+  // isProfitable() is structured to avoid endless loop interchange. If the
+  // highest priority rule (isProfitablePerLoopCacheAnalysis by default) could
+  // decide the profitability then, profitability check will stop and return the
+  // analysis result. If it failed to determine it (e.g., cache analysis failed
+  // to analyze the loopnest due to delinearization issues) then go ahead the
+  // second highest priority rule (isProfitablePerInstrOrderCost by default).
+  // Likewise, if it failed to analysis the profitability then only, the last
+  // rule (isProfitableForVectorization by default) will decide.
+  enum class RuleTy {
+    PerLoopCacheAnalysis,
+    PerInstrOrderCost,
+    ForVectorization,
+  };
+
+  // We prefer cache cost to vectorization by default.
+  RuleTy RuleOrder[3] = {RuleTy::PerLoopCacheAnalysis,
+                         RuleTy::PerInstrOrderCost, RuleTy::ForVectorization};
+
+  // If we prefer vectorization to cache cost, change the order of application
+  // of each rule.
+  if (PrioritizeVectorization) {
+    RuleOrder[0] = RuleTy::ForVectorization;
+    RuleOrder[1] = RuleTy::PerLoopCacheAnalysis;
+    RuleOrder[2] = RuleTy::PerInstrOrderCost;
+  }
+
+  std::optional<bool> shouldInterchange;
+  for (RuleTy RT : RuleOrder) {
+    switch (RT) {
+    case RuleTy::PerLoopCacheAnalysis:
+      shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC);
+      break;
+    case RuleTy::PerInstrOrderCost:
+      shouldInterchange = isProfitablePerInstrOrderCost();
+      break;
+    case RuleTy::ForVectorization:
       shouldInterchange =
           isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
+      break;
+    }
+
+    // If this rule could determine the profitability, don't call subsequent
+    // rules.
+    if (shouldInterchange.has_value())
+      break;
   }
+
   if (!shouldInterchange.has_value()) {
     ORE->emit([&]() {
       return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
```
Lines changed: 81 additions & 0 deletions
```llvm
; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 \
; RUN:   -pass-remarks-output=%t -disable-output
; RUN: FileCheck -input-file %t --check-prefix=PROFIT-CACHE %s

; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 \
; RUN:   -pass-remarks-output=%t -disable-output -loop-interchange-prioritize-vectorization=1
; RUN: FileCheck -input-file %t --check-prefix=PROFIT-VEC %s

@A = dso_local global [256 x [256 x float]] zeroinitializer
@B = dso_local global [256 x [256 x float]] zeroinitializer
@C = dso_local global [256 x [256 x float]] zeroinitializer
@D = dso_local global [256 x [256 x float]] zeroinitializer
@E = dso_local global [256 x [256 x float]] zeroinitializer
@F = dso_local global [256 x [256 x float]] zeroinitializer

; Check the behavior of the LoopInterchange cost-model. In the below code,
; exchanging the loops is not profitable in terms of cache, but it is necessary
; to vectorize the innermost loop.
;
; for (int i = 0; i < 256; i++)
;   for (int j = 1; j < 256; j++)
;     A[j][i] = A[j-1][i] + B[j][i] + C[i][j] + D[i][j] + E[i][j] + F[i][j];
;

; PROFIT-CACHE:      --- !Missed
; PROFIT-CACHE-NEXT: Pass:     loop-interchange
; PROFIT-CACHE-NEXT: Name:     InterchangeNotProfitable
; PROFIT-CACHE-NEXT: Function: f
; PROFIT-CACHE-NEXT: Args:
; PROFIT-CACHE-NEXT:   - String: Interchanging loops is not considered to improve cache locality nor vectorization.
; PROFIT-CACHE-NEXT: ...

; PROFIT-VEC:      --- !Passed
; PROFIT-VEC-NEXT: Pass:     loop-interchange
; PROFIT-VEC-NEXT: Name:     Interchanged
; PROFIT-VEC-NEXT: Function: f
; PROFIT-VEC-NEXT: Args:
; PROFIT-VEC-NEXT:   - String: Loop interchanged with enclosing loop.
; PROFIT-VEC-NEXT: ...

define void @f() {
entry:
  br label %for.i.header

for.i.header:
  %i = phi i64 [ 0, %entry ], [ %i.next, %for.i.inc ]
  br label %for.j.body

for.j.body:
  %j = phi i64 [ 1, %for.i.header ], [ %j.next, %for.j.body ]
  %j.dec = add nsw i64 %j, -1
  %a.0.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %j.dec, i64 %i
  %b.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @B, i64 %j, i64 %i
  %c.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @C, i64 %i, i64 %j
  %d.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @D, i64 %i, i64 %j
  %e.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @E, i64 %i, i64 %j
  %f.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @F, i64 %i, i64 %j
  %a.0 = load float, ptr %a.0.index, align 4
  %b = load float, ptr %b.index, align 4
  %c = load float, ptr %c.index, align 4
  %d = load float, ptr %d.index, align 4
  %e = load float, ptr %e.index, align 4
  %f = load float, ptr %f.index, align 4
  %add.0 = fadd float %a.0, %b
  %add.1 = fadd float %add.0, %c
  %add.2 = fadd float %add.1, %d
  %add.3 = fadd float %add.2, %e
  %add.4 = fadd float %add.3, %f
  %a.1.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %j, i64 %i
  store float %add.4, ptr %a.1.index, align 4
  %j.next = add nuw nsw i64 %j, 1
  %cmp.j = icmp eq i64 %j.next, 256
  br i1 %cmp.j, label %for.i.inc, label %for.j.body

for.i.inc:
  %i.next = add nuw nsw i64 %i, 1
  %cmp.i = icmp eq i64 %i.next, 256
  br i1 %cmp.i, label %exit, label %for.i.header

exit:
  ret void
}
```
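To run the added test locally, an llvm-lit invocation along these lines should work from a build directory; the exact file name of the new test is not given above, so the whole LoopInterchange test directory is used as the target (the checkout layout is an assumption).

```sh
# From the LLVM build directory: run all LoopInterchange regression tests,
# which include the new profitability test added by this commit.
./bin/llvm-lit -v ../llvm/test/Transforms/LoopInterchange/
```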
