Skip to content

Commit 28b9d6a

Browse files
committed
[ARM][SLP] Fix incorrect cost function for SLP Vectorization of ZExt/SExt
PR #117350 introduced changes to the SLP vectorizer that caused a regression on ARM vectorization benchmarks. This was because the changes assumed that SExt/ZExt vector instructions have a constant cost. That assumption holds for RISC-V, but not for ARM, where the source and destination types of SExt/ZExt instructions are taken into account when calculating vector cost. Change-Id: I6f995dcde26e5aaf62b779b63e52988fb333f941
1 parent 05bd7d2 commit 28b9d6a

File tree

2 files changed

+309
-2
lines changed

2 files changed

+309
-2
lines changed

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1791,11 +1791,33 @@ InstructionCost ARMTTIImpl::getExtendedReductionCost(
17911791

17921792
int ISD = TLI->InstructionOpcodeToISD(Opcode);
17931793

1794+
auto CastCost = [=]() -> unsigned {
1795+
// MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
1796+
// instruction, i8->i32 is two. i64 zexts are a VAND with a constant, sexts
1797+
// are linearised so take more.
1798+
static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
1799+
{ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1},
1800+
{ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1},
1801+
{ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2},
1802+
{ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2},
1803+
{ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1},
1804+
{ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1},
1805+
};
1806+
1807+
if (ST->hasMVEIntegerOps()) {
1808+
if (const auto *Entry = ConvertCostTableLookup(
1809+
MVEVectorConversionTbl,
1810+
(IsUnsigned) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
1811+
ResVT.getSimpleVT(), ValVT.getSimpleVT()))
1812+
return Entry->Cost;
1813+
}
1814+
return 0;
1815+
};
1816+
17941817
switch (ISD) {
17951818
case ISD::ADD:
17961819
if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
17971820
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1798-
17991821
// The legal cases are:
18001822
// VADDV u/s 8/16/32
18011823
// VADDLV u/s 32
@@ -1807,7 +1829,7 @@ InstructionCost ARMTTIImpl::getExtendedReductionCost(
18071829
((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
18081830
(LT.second == MVT::v8i16 && RevVTSize <= 32) ||
18091831
(LT.second == MVT::v4i32 && RevVTSize <= 64)))
1810-
return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1832+
return CastCost() + ST->getMVEVectorCostFactor(CostKind) * LT.first;
18111833
}
18121834
break;
18131835
default:
Lines changed: 285 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,285 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt < %s -passes="default<O1>,slp-vectorizer" -S -mtriple=arm-none-eabi --mattr=+mve | FileCheck %s
3+
4+
5+
define dso_local i64 @vadd(ptr noundef %0) #0 {
6+
; CHECK-LABEL: define dso_local range(i64 -8589934592, 8589934589) i64 @vadd(
7+
; CHECK-SAME: ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
8+
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
9+
; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
10+
; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP3]])
11+
; CHECK-NEXT: ret i64 [[TMP21]]
12+
;
13+
%2 = alloca ptr, align 4
14+
store ptr %0, ptr %2, align 4
15+
%3 = load ptr, ptr %2, align 4
16+
%4 = getelementptr inbounds i32, ptr %3, i32 0
17+
%5 = load i32, ptr %4, align 4
18+
%6 = sext i32 %5 to i64
19+
%7 = load ptr, ptr %2, align 4
20+
%8 = getelementptr inbounds i32, ptr %7, i32 1
21+
%9 = load i32, ptr %8, align 4
22+
%10 = sext i32 %9 to i64
23+
%11 = add nsw i64 %6, %10
24+
%12 = load ptr, ptr %2, align 4
25+
%13 = getelementptr inbounds i32, ptr %12, i32 2
26+
%14 = load i32, ptr %13, align 4
27+
%15 = sext i32 %14 to i64
28+
%16 = add nsw i64 %11, %15
29+
%17 = load ptr, ptr %2, align 4
30+
%18 = getelementptr inbounds i32, ptr %17, i32 3
31+
%19 = load i32, ptr %18, align 4
32+
%20 = sext i32 %19 to i64
33+
%21 = add nsw i64 %16, %20
34+
ret i64 %21
35+
}
36+
37+
define dso_local i64 @vmul(ptr noundef %0) #0 {
38+
; CHECK-LABEL: define dso_local i64 @vmul(
39+
; CHECK-SAME: ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
40+
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
41+
; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
42+
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 4
43+
; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4
44+
; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
45+
; CHECK-NEXT: [[TMP11:%.*]] = mul nsw i64 [[TMP10]], [[TMP6]]
46+
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 8
47+
; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4
48+
; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
49+
; CHECK-NEXT: [[TMP16:%.*]] = mul nsw i64 [[TMP11]], [[TMP15]]
50+
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 12
51+
; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4
52+
; CHECK-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
53+
; CHECK-NEXT: [[TMP21:%.*]] = mul nsw i64 [[TMP16]], [[TMP20]]
54+
; CHECK-NEXT: ret i64 [[TMP21]]
55+
;
56+
%2 = alloca ptr, align 4
57+
store ptr %0, ptr %2, align 4
58+
%3 = load ptr, ptr %2, align 4
59+
%4 = getelementptr inbounds i32, ptr %3, i32 0
60+
%5 = load i32, ptr %4, align 4
61+
%6 = sext i32 %5 to i64
62+
%7 = load ptr, ptr %2, align 4
63+
%8 = getelementptr inbounds i32, ptr %7, i32 1
64+
%9 = load i32, ptr %8, align 4
65+
%10 = sext i32 %9 to i64
66+
%11 = mul nsw i64 %6, %10
67+
%12 = load ptr, ptr %2, align 4
68+
%13 = getelementptr inbounds i32, ptr %12, i32 2
69+
%14 = load i32, ptr %13, align 4
70+
%15 = sext i32 %14 to i64
71+
%16 = mul nsw i64 %11, %15
72+
%17 = load ptr, ptr %2, align 4
73+
%18 = getelementptr inbounds i32, ptr %17, i32 3
74+
%19 = load i32, ptr %18, align 4
75+
%20 = sext i32 %19 to i64
76+
%21 = mul nsw i64 %16, %20
77+
ret i64 %21
78+
}
79+
80+
define dso_local i64 @vand(ptr noundef %0) #0 {
81+
; CHECK-LABEL: define dso_local range(i64 -2147483648, 2147483648) i64 @vand(
82+
; CHECK-SAME: ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
83+
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP0]], align 4
84+
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 4
85+
; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4
86+
; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP9]], [[TMP2]]
87+
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 8
88+
; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4
89+
; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP5]], [[TMP14]]
90+
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 12
91+
; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4
92+
; CHECK-NEXT: [[TMP11:%.*]] = and i32 [[TMP10]], [[TMP19]]
93+
; CHECK-NEXT: [[TMP21:%.*]] = sext i32 [[TMP11]] to i64
94+
; CHECK-NEXT: ret i64 [[TMP21]]
95+
;
96+
%2 = alloca ptr, align 4
97+
store ptr %0, ptr %2, align 4
98+
%3 = load ptr, ptr %2, align 4
99+
%4 = getelementptr inbounds i32, ptr %3, i32 0
100+
%5 = load i32, ptr %4, align 4
101+
%6 = sext i32 %5 to i64
102+
%7 = load ptr, ptr %2, align 4
103+
%8 = getelementptr inbounds i32, ptr %7, i32 1
104+
%9 = load i32, ptr %8, align 4
105+
%10 = sext i32 %9 to i64
106+
%11 = and i64 %6, %10
107+
%12 = load ptr, ptr %2, align 4
108+
%13 = getelementptr inbounds i32, ptr %12, i32 2
109+
%14 = load i32, ptr %13, align 4
110+
%15 = sext i32 %14 to i64
111+
%16 = and i64 %11, %15
112+
%17 = load ptr, ptr %2, align 4
113+
%18 = getelementptr inbounds i32, ptr %17, i32 3
114+
%19 = load i32, ptr %18, align 4
115+
%20 = sext i32 %19 to i64
116+
%21 = and i64 %16, %20
117+
ret i64 %21
118+
}
119+
120+
define dso_local i64 @vor(ptr noundef %0) #0 {
121+
; CHECK-LABEL: define dso_local range(i64 -2147483648, 2147483648) i64 @vor(
122+
; CHECK-SAME: ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
123+
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP0]], align 4
124+
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 4
125+
; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4
126+
; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP9]], [[TMP2]]
127+
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 8
128+
; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4
129+
; CHECK-NEXT: [[TMP10:%.*]] = or i32 [[TMP5]], [[TMP14]]
130+
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 12
131+
; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4
132+
; CHECK-NEXT: [[TMP11:%.*]] = or i32 [[TMP10]], [[TMP19]]
133+
; CHECK-NEXT: [[TMP21:%.*]] = sext i32 [[TMP11]] to i64
134+
; CHECK-NEXT: ret i64 [[TMP21]]
135+
;
136+
%2 = alloca ptr, align 4
137+
store ptr %0, ptr %2, align 4
138+
%3 = load ptr, ptr %2, align 4
139+
%4 = getelementptr inbounds i32, ptr %3, i32 0
140+
%5 = load i32, ptr %4, align 4
141+
%6 = sext i32 %5 to i64
142+
%7 = load ptr, ptr %2, align 4
143+
%8 = getelementptr inbounds i32, ptr %7, i32 1
144+
%9 = load i32, ptr %8, align 4
145+
%10 = sext i32 %9 to i64
146+
%11 = or i64 %6, %10
147+
%12 = load ptr, ptr %2, align 4
148+
%13 = getelementptr inbounds i32, ptr %12, i32 2
149+
%14 = load i32, ptr %13, align 4
150+
%15 = sext i32 %14 to i64
151+
%16 = or i64 %11, %15
152+
%17 = load ptr, ptr %2, align 4
153+
%18 = getelementptr inbounds i32, ptr %17, i32 3
154+
%19 = load i32, ptr %18, align 4
155+
%20 = sext i32 %19 to i64
156+
%21 = or i64 %16, %20
157+
ret i64 %21
158+
}
159+
160+
define dso_local i64 @vxor(ptr noundef %0) #0 {
161+
; CHECK-LABEL: define dso_local range(i64 -2147483648, 2147483648) i64 @vxor(
162+
; CHECK-SAME: ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
163+
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP0]], align 4
164+
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 4
165+
; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4
166+
; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP9]], [[TMP2]]
167+
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 8
168+
; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4
169+
; CHECK-NEXT: [[TMP10:%.*]] = xor i32 [[TMP5]], [[TMP14]]
170+
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 12
171+
; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4
172+
; CHECK-NEXT: [[TMP11:%.*]] = xor i32 [[TMP10]], [[TMP19]]
173+
; CHECK-NEXT: [[TMP21:%.*]] = sext i32 [[TMP11]] to i64
174+
; CHECK-NEXT: ret i64 [[TMP21]]
175+
;
176+
%2 = alloca ptr, align 4
177+
store ptr %0, ptr %2, align 4
178+
%3 = load ptr, ptr %2, align 4
179+
%4 = getelementptr inbounds i32, ptr %3, i32 0
180+
%5 = load i32, ptr %4, align 4
181+
%6 = sext i32 %5 to i64
182+
%7 = load ptr, ptr %2, align 4
183+
%8 = getelementptr inbounds i32, ptr %7, i32 1
184+
%9 = load i32, ptr %8, align 4
185+
%10 = sext i32 %9 to i64
186+
%11 = xor i64 %6, %10
187+
%12 = load ptr, ptr %2, align 4
188+
%13 = getelementptr inbounds i32, ptr %12, i32 2
189+
%14 = load i32, ptr %13, align 4
190+
%15 = sext i32 %14 to i64
191+
%16 = xor i64 %11, %15
192+
%17 = load ptr, ptr %2, align 4
193+
%18 = getelementptr inbounds i32, ptr %17, i32 3
194+
%19 = load i32, ptr %18, align 4
195+
%20 = sext i32 %19 to i64
196+
%21 = xor i64 %16, %20
197+
ret i64 %21
198+
}
199+
200+
define dso_local double @vfadd(ptr noundef %0) #0 {
201+
; CHECK-LABEL: define dso_local double @vfadd(
202+
; CHECK-SAME: ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
203+
; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP0]], align 4
204+
; CHECK-NEXT: [[TMP6:%.*]] = fpext float [[TMP5]] to double
205+
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 4
206+
; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4
207+
; CHECK-NEXT: [[TMP10:%.*]] = fpext float [[TMP9]] to double
208+
; CHECK-NEXT: [[TMP11:%.*]] = fadd double [[TMP6]], [[TMP10]]
209+
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 8
210+
; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4
211+
; CHECK-NEXT: [[TMP15:%.*]] = fpext float [[TMP14]] to double
212+
; CHECK-NEXT: [[TMP16:%.*]] = fadd double [[TMP11]], [[TMP15]]
213+
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 12
214+
; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4
215+
; CHECK-NEXT: [[TMP20:%.*]] = fpext float [[TMP19]] to double
216+
; CHECK-NEXT: [[TMP21:%.*]] = fadd double [[TMP16]], [[TMP20]]
217+
; CHECK-NEXT: ret double [[TMP21]]
218+
;
219+
%2 = alloca ptr, align 4
220+
store ptr %0, ptr %2, align 4
221+
%3 = load ptr, ptr %2, align 4
222+
%4 = getelementptr inbounds float, ptr %3, i32 0
223+
%5 = load float, ptr %4, align 4
224+
%6 = fpext float %5 to double
225+
%7 = load ptr, ptr %2, align 4
226+
%8 = getelementptr inbounds float, ptr %7, i32 1
227+
%9 = load float, ptr %8, align 4
228+
%10 = fpext float %9 to double
229+
%11 = fadd double %6, %10
230+
%12 = load ptr, ptr %2, align 4
231+
%13 = getelementptr inbounds float, ptr %12, i32 2
232+
%14 = load float, ptr %13, align 4
233+
%15 = fpext float %14 to double
234+
%16 = fadd double %11, %15
235+
%17 = load ptr, ptr %2, align 4
236+
%18 = getelementptr inbounds float, ptr %17, i32 3
237+
%19 = load float, ptr %18, align 4
238+
%20 = fpext float %19 to double
239+
%21 = fadd double %16, %20
240+
ret double %21
241+
}
242+
243+
define dso_local double @vfmul(ptr noundef %0) #0 {
244+
; CHECK-LABEL: define dso_local double @vfmul(
245+
; CHECK-SAME: ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
246+
; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP0]], align 4
247+
; CHECK-NEXT: [[TMP6:%.*]] = fpext float [[TMP5]] to double
248+
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 4
249+
; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4
250+
; CHECK-NEXT: [[TMP10:%.*]] = fpext float [[TMP9]] to double
251+
; CHECK-NEXT: [[TMP11:%.*]] = fmul double [[TMP6]], [[TMP10]]
252+
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 8
253+
; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4
254+
; CHECK-NEXT: [[TMP15:%.*]] = fpext float [[TMP14]] to double
255+
; CHECK-NEXT: [[TMP16:%.*]] = fmul double [[TMP11]], [[TMP15]]
256+
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i32 12
257+
; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4
258+
; CHECK-NEXT: [[TMP20:%.*]] = fpext float [[TMP19]] to double
259+
; CHECK-NEXT: [[TMP21:%.*]] = fmul double [[TMP16]], [[TMP20]]
260+
; CHECK-NEXT: ret double [[TMP21]]
261+
;
262+
%2 = alloca ptr, align 4
263+
store ptr %0, ptr %2, align 4
264+
%3 = load ptr, ptr %2, align 4
265+
%4 = getelementptr inbounds float, ptr %3, i32 0
266+
%5 = load float, ptr %4, align 4
267+
%6 = fpext float %5 to double
268+
%7 = load ptr, ptr %2, align 4
269+
%8 = getelementptr inbounds float, ptr %7, i32 1
270+
%9 = load float, ptr %8, align 4
271+
%10 = fpext float %9 to double
272+
%11 = fmul double %6, %10
273+
%12 = load ptr, ptr %2, align 4
274+
%13 = getelementptr inbounds float, ptr %12, i32 2
275+
%14 = load float, ptr %13, align 4
276+
%15 = fpext float %14 to double
277+
%16 = fmul double %11, %15
278+
%17 = load ptr, ptr %2, align 4
279+
%18 = getelementptr inbounds float, ptr %17, i32 3
280+
%19 = load float, ptr %18, align 4
281+
%20 = fpext float %19 to double
282+
%21 = fmul double %16, %20
283+
ret double %21
284+
}
285+

0 commit comments

Comments
 (0)