Skip to content
This repository was archived by the owner on Mar 28, 2020. It is now read-only.

Commit 8babc52

Browse files
committed
[LSR] Attempt to increase the accuracy of LSR's setup cost
In some loops, we end up generating loop induction variables that look like:
  {(-1 * (zext i16 (%i0 * %i1) to i32))<nsw>,+,1}
as opposed to the simpler:
  {(zext i16 (%i0 * %i1) to i32),+,-1}
i.e. we count up from -limit to 0, not the simpler counting down from limit to 0. This is because the scores, as LSR calculates them, are the same and the second is filtered in place of the first. We end up with a redundant SUB from 0 in the code.

This patch tries to make the calculation of the setup cost a little more thorough, recursing into the SCEV members to better approximate the setup required. The cost function for comparing LSR costs is:
  return std::tie(C1.NumRegs, C1.AddRecCost, C1.NumIVMuls, C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) < std::tie(C2.NumRegs, C2.AddRecCost, C2.NumIVMuls, C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
So this will only alter results if none of the other variables turn out to be different.

Differential Revision: https://reviews.llvm.org/D58770

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@355597 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent deb1a98 commit 8babc52

File tree

5 files changed

+129
-10
lines changed

5 files changed

+129
-10
lines changed

lib/Transforms/Scalar/LoopStrengthReduce.cpp

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@
115115
#include <cstdlib>
116116
#include <iterator>
117117
#include <limits>
118+
#include <numeric>
118119
#include <map>
119120
#include <utility>
120121

@@ -163,6 +164,10 @@ static cl::opt<unsigned> ComplexityLimit(
163164
cl::init(std::numeric_limits<uint16_t>::max()),
164165
cl::desc("LSR search space complexity limit"));
165166

167+
// Hidden command-line flag "-lsr-recursive-setupcost"; it defaults to true.
// getSetupCost() consults it: when the flag is false, only SCEVUnknown and
// SCEVConstant registers are charged and operands are never recursed into.
static cl::opt<bool> EnableRecursiveSetupCost(
168+
"lsr-recursive-setupcost", cl::Hidden, cl::init(true),
169+
cl::desc("Enable more thorough lsr setup cost calculation"));
170+
166171
#ifndef NDEBUG
167172
// Stress test IV chain generation.
168173
static cl::opt<bool> StressIVChain(
@@ -1211,6 +1216,25 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
12111216
bool HasBaseReg, int64_t Scale,
12121217
Instruction *Fixup = nullptr);
12131218

1219+
/// Compute the amount that \p Reg adds to Cost::SetupCost in RateRegister.
///
/// A SCEVUnknown or SCEVConstant counts as 1. When EnableRecursiveSetupCost
/// is set, a compound expression is charged through its parts: an
/// add-recurrence for its start value only, a cast for its operand, an n-ary
/// expression for the sum over all operands, and an unsigned division for
/// both sides. Any other expression costs 0.
///
/// The recursion carries no depth limit and does not cache results, so shared
/// sub-expressions are visited once per use.
static unsigned getSetupCost(const SCEV *Reg) {
1220+
// Leaves of the expression: one unit each.
if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
1221+
return 1;
1222+
// With the flag off, every non-leaf expression is free.
// NOTE(review): the check this replaces in RateRegister charged non-leaf
// registers instead, so flag-off is not the previous heuristic - confirm
// that this is intended.
if (!EnableRecursiveSetupCost)
1223+
return 0;
1224+
// Only the start value is charged; the other operands are ignored.
// Presumably this must precede the n-ary case because SCEVAddRecExpr is
// itself a SCEVNAryExpr - TODO confirm against the SCEV class hierarchy.
if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
1225+
return getSetupCost(S->getStart());
1226+
if (auto S = dyn_cast<SCEVCastExpr>(Reg))
1227+
return getSetupCost(S->getOperand());
1228+
// Sum over every operand. The literal 0 is an int, so std::accumulate keeps
// the running total in an int; it is converted to unsigned on return.
if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
1229+
return std::accumulate(S->op_begin(), S->op_end(), 0,
1230+
[](unsigned i, const SCEV *Reg) {
1231+
return i + getSetupCost(Reg);
1232+
});
1233+
if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
1234+
return getSetupCost(S->getLHS()) + getSetupCost(S->getRHS());
1235+
// Any remaining SCEV kind contributes nothing.
return 0;
1236+
}
1237+
12141238
/// Tally up interesting quantities from the given register.
12151239
void Cost::RateRegister(const Formula &F, const SCEV *Reg,
12161240
SmallPtrSetImpl<const SCEV *> &Regs,
@@ -1276,12 +1300,7 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
12761300

12771301
// Rough heuristic; favor registers which don't require extra setup
12781302
// instructions in the preheader.
1279-
if (!isa<SCEVUnknown>(Reg) &&
1280-
!isa<SCEVConstant>(Reg) &&
1281-
!(isa<SCEVAddRecExpr>(Reg) &&
1282-
(isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
1283-
isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
1284-
++C.SetupCost;
1303+
C.SetupCost += getSetupCost(Reg);
12851304

12861305
C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
12871306
SE.hasComputableLoopEvolution(Reg, L);

test/CodeGen/ARM/lsr-setupcost.ll

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
; RUN: opt -mtriple=thumbv6m-none-eabi -loop-reduce %s -S -o - | FileCheck %s
3+
4+
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
5+
6+
%struct.arm_matrix_instance_q15 = type { i16, i16, i16* }
7+
8+
define i32 @arm_mat_add_q15(%struct.arm_matrix_instance_q15* nocapture readonly %pSrcA, %struct.arm_matrix_instance_q15* nocapture readonly %pSrcB, %struct.arm_matrix_instance_q15* nocapture readonly %pDst) {
9+
; CHECK-LABEL: @arm_mat_add_q15(
10+
; CHECK-NEXT: entry:
11+
; CHECK-NEXT: [[NUMROWS:%.*]] = getelementptr inbounds [[STRUCT_ARM_MATRIX_INSTANCE_Q15:%.*]], %struct.arm_matrix_instance_q15* [[PSRCA:%.*]], i32 0, i32 0
12+
; CHECK-NEXT: [[I0:%.*]] = load i16, i16* [[NUMROWS]], align 4
13+
; CHECK-NEXT: [[NUMCOLS:%.*]] = getelementptr inbounds [[STRUCT_ARM_MATRIX_INSTANCE_Q15]], %struct.arm_matrix_instance_q15* [[PSRCA]], i32 0, i32 1
14+
; CHECK-NEXT: [[I1:%.*]] = load i16, i16* [[NUMCOLS]], align 2
15+
; CHECK-NEXT: [[MUL:%.*]] = mul i16 [[I1]], [[I0]]
16+
; CHECK-NEXT: [[CMP22:%.*]] = icmp eq i16 [[MUL]], 0
17+
; CHECK-NEXT: br i1 [[CMP22]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
18+
; CHECK: while.body.preheader:
19+
; CHECK-NEXT: [[CONV5:%.*]] = zext i16 [[MUL]] to i32
20+
; CHECK-NEXT: [[PDATA2:%.*]] = getelementptr inbounds [[STRUCT_ARM_MATRIX_INSTANCE_Q15]], %struct.arm_matrix_instance_q15* [[PDST:%.*]], i32 0, i32 2
21+
; CHECK-NEXT: [[I2:%.*]] = load i16*, i16** [[PDATA2]], align 4
22+
; CHECK-NEXT: [[PDATA1:%.*]] = getelementptr inbounds [[STRUCT_ARM_MATRIX_INSTANCE_Q15]], %struct.arm_matrix_instance_q15* [[PSRCB:%.*]], i32 0, i32 2
23+
; CHECK-NEXT: [[I3:%.*]] = load i16*, i16** [[PDATA1]], align 4
24+
; CHECK-NEXT: [[PDATA:%.*]] = getelementptr inbounds [[STRUCT_ARM_MATRIX_INSTANCE_Q15]], %struct.arm_matrix_instance_q15* [[PSRCA]], i32 0, i32 2
25+
; CHECK-NEXT: [[I4:%.*]] = load i16*, i16** [[PDATA]], align 4
26+
; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
27+
; CHECK: while.body:
28+
; CHECK-NEXT: [[PINA_026:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[I4]], [[WHILE_BODY_PREHEADER]] ]
29+
; CHECK-NEXT: [[BLKCNT_025:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[CONV5]], [[WHILE_BODY_PREHEADER]] ]
30+
; CHECK-NEXT: [[PINB_024:%.*]] = phi i16* [ [[INCDEC_PTR8:%.*]], [[WHILE_BODY]] ], [ [[I3]], [[WHILE_BODY_PREHEADER]] ]
31+
; CHECK-NEXT: [[POUT_023:%.*]] = phi i16* [ [[INCDEC_PTR11:%.*]], [[WHILE_BODY]] ], [ [[I2]], [[WHILE_BODY_PREHEADER]] ]
32+
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[PINA_026]], i32 1
33+
; CHECK-NEXT: [[I5:%.*]] = load i16, i16* [[PINA_026]], align 2
34+
; CHECK-NEXT: [[CONV7:%.*]] = sext i16 [[I5]] to i32
35+
; CHECK-NEXT: [[INCDEC_PTR8]] = getelementptr inbounds i16, i16* [[PINB_024]], i32 1
36+
; CHECK-NEXT: [[I6:%.*]] = load i16, i16* [[PINB_024]], align 2
37+
; CHECK-NEXT: [[CONV9:%.*]] = sext i16 [[I6]] to i32
38+
; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV9]], [[CONV7]]
39+
; CHECK-NEXT: [[I7:%.*]] = icmp sgt i32 [[ADD]], -32768
40+
; CHECK-NEXT: [[SPEC_SELECT_I:%.*]] = select i1 [[I7]], i32 [[ADD]], i32 -32768
41+
; CHECK-NEXT: [[I8:%.*]] = icmp slt i32 [[SPEC_SELECT_I]], 32767
42+
; CHECK-NEXT: [[CALL21:%.*]] = select i1 [[I8]], i32 [[SPEC_SELECT_I]], i32 32767
43+
; CHECK-NEXT: [[CONV10:%.*]] = trunc i32 [[CALL21]] to i16
44+
; CHECK-NEXT: [[INCDEC_PTR11]] = getelementptr inbounds i16, i16* [[POUT_023]], i32 1
45+
; CHECK-NEXT: store i16 [[CONV10]], i16* [[POUT_023]], align 2
46+
; CHECK-NEXT: [[DEC]] = add nsw i32 [[BLKCNT_025]], -1
47+
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[DEC]], 0
48+
; CHECK-NEXT: br i1 [[CMP]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
49+
; CHECK: while.end.loopexit:
50+
; CHECK-NEXT: br label [[WHILE_END]]
51+
; CHECK: while.end:
52+
; CHECK-NEXT: ret i32 0
53+
;
54+
entry:
55+
%numRows = getelementptr inbounds %struct.arm_matrix_instance_q15, %struct.arm_matrix_instance_q15* %pSrcA, i32 0, i32 0
56+
%i0 = load i16, i16* %numRows, align 4
57+
%numCols = getelementptr inbounds %struct.arm_matrix_instance_q15, %struct.arm_matrix_instance_q15* %pSrcA, i32 0, i32 1
58+
%i1 = load i16, i16* %numCols, align 2
59+
%mul = mul i16 %i1, %i0
60+
%cmp22 = icmp eq i16 %mul, 0
61+
br i1 %cmp22, label %while.end, label %while.body.preheader
62+
63+
while.body.preheader: ; preds = %entry
64+
%conv5 = zext i16 %mul to i32
65+
%pData2 = getelementptr inbounds %struct.arm_matrix_instance_q15, %struct.arm_matrix_instance_q15* %pDst, i32 0, i32 2
66+
%i2 = load i16*, i16** %pData2, align 4
67+
%pData1 = getelementptr inbounds %struct.arm_matrix_instance_q15, %struct.arm_matrix_instance_q15* %pSrcB, i32 0, i32 2
68+
%i3 = load i16*, i16** %pData1, align 4
69+
%pData = getelementptr inbounds %struct.arm_matrix_instance_q15, %struct.arm_matrix_instance_q15* %pSrcA, i32 0, i32 2
70+
%i4 = load i16*, i16** %pData, align 4
71+
br label %while.body
72+
73+
while.body: ; preds = %while.body.preheader, %while.body
74+
%pInA.026 = phi i16* [ %incdec.ptr, %while.body ], [ %i4, %while.body.preheader ]
75+
%blkCnt.025 = phi i32 [ %dec, %while.body ], [ %conv5, %while.body.preheader ]
76+
%pInB.024 = phi i16* [ %incdec.ptr8, %while.body ], [ %i3, %while.body.preheader ]
77+
%pOut.023 = phi i16* [ %incdec.ptr11, %while.body ], [ %i2, %while.body.preheader ]
78+
%incdec.ptr = getelementptr inbounds i16, i16* %pInA.026, i32 1
79+
%i5 = load i16, i16* %pInA.026, align 2
80+
%conv7 = sext i16 %i5 to i32
81+
%incdec.ptr8 = getelementptr inbounds i16, i16* %pInB.024, i32 1
82+
%i6 = load i16, i16* %pInB.024, align 2
83+
%conv9 = sext i16 %i6 to i32
84+
%add = add nsw i32 %conv9, %conv7
85+
%i7 = icmp sgt i32 %add, -32768
86+
%spec.select.i = select i1 %i7, i32 %add, i32 -32768
87+
%i8 = icmp slt i32 %spec.select.i, 32767
88+
%call21 = select i1 %i8, i32 %spec.select.i, i32 32767
89+
%conv10 = trunc i32 %call21 to i16
90+
%incdec.ptr11 = getelementptr inbounds i16, i16* %pOut.023, i32 1
91+
store i16 %conv10, i16* %pOut.023, align 2
92+
%dec = add nsw i32 %blkCnt.025, -1
93+
%cmp = icmp eq i32 %dec, 0
94+
br i1 %cmp, label %while.end, label %while.body
95+
96+
while.end: ; preds = %while.body, %entry
97+
ret i32 0
98+
}
99+
100+

test/CodeGen/Hexagon/swp-carried-1.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; RUN: llc -march=hexagon -rdf-opt=0 -disable-hexagon-misched -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s
1+
; RUN: llc -march=hexagon -rdf-opt=0 -disable-hexagon-misched -hexagon-initial-cfg-cleanup=0 -lsr-recursive-setupcost=0 < %s | FileCheck %s
22

33
; Test that we generate the correct code when a loop carried value
44
; is scheduled one stage earlier than its use. The code in

test/CodeGen/Hexagon/swp-epilog-phi5.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@
77

88
; In this test case, the second loop is pipelined, block b5.
99

10-
; CHECK: loop0
10+
; CHECK: loop1
1111
; CHECK: [[REG0:r([0-9]+)]] += mpyi
1212
; CHECK: [[REG2:r([0-9]+)]] = add([[REG1:r([0-9]+)]],add([[REG0]],#8
13-
; CHECK: endloop0
13+
; CHECK: endloop1
1414

1515
%s.0 = type { %s.1*, %s.4*, %s.7*, i8*, i8, i32, %s.8*, i32, i32, i32, i8, i8, i32, i32, double, i8, i8, i8, i8, i8, i8, i8, i8, i32, i8, i8, i8, i32, i32, i32, i32, i32, i32, i8**, i32, i32, i32, i32, i32, [64 x i32]*, [4 x %s.9*], [4 x %s.10*], [4 x %s.10*], i32, %s.23*, i8, i8, [16 x i8], [16 x i8], [16 x i8], i32, i8, i8, i8, i8, i16, i16, i8, i8, i8, %s.11*, i32, i32, i32, i32, i8*, i32, [4 x %s.23*], i32, i32, i32, [10 x i32], i32, i32, i32, i32, i32, %s.12*, %s.13*, %s.14*, %s.15*, %s.16*, %s.17*, %s.18*, %s.19*, %s.20*, %s.21*, %s.22* }
1616
%s.1 = type { void (%s.2*)*, void (%s.2*, i32)*, void (%s.2*)*, void (%s.2*, i8*)*, void (%s.2*)*, i32, %s.3, i32, i32, i8**, i32, i8**, i32, i32 }

test/Transforms/LoopStrengthReduce/two-combinations-bug.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; RUN: opt < %s -loop-reduce -S | FileCheck %s
1+
; RUN: opt < %s -loop-reduce -lsr-recursive-setupcost=0 -S | FileCheck %s
22

33
; This test is adapted from the n-body test of the LLVM test-suite: A bug in
44
; r345114 caused LSR to generate incorrect code. The test verifies that the

0 commit comments

Comments
 (0)