Skip to content

Commit 9c47d6b

Browse files
committed
[llvm][sve] Lowering for VLS extending loads
This patch enables extending loads for fixed-length SVE code generation. There is a slight regression here in the mulh tests: since these tests load the parameter and then extend it, the loads are treated as extending loads and are merged, preventing the mulh instruction from being generated. As this affects scalable SVE codegen as well, it should be addressed in a separate patch. Reviewed By: bsmith Differential Revision: https://reviews.llvm.org/D107057
1 parent 39bbbc2 commit 9c47d6b

File tree

3 files changed

+477
-112
lines changed

3 files changed

+477
-112
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1516,6 +1516,8 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
15161516
MVT InnerVT = VT.changeVectorElementType(MVT::i8);
15171517
while (InnerVT != VT) {
15181518
setTruncStoreAction(VT, InnerVT, Custom);
1519+
setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
1520+
setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
15191521
InnerVT = InnerVT.changeVectorElementType(
15201522
MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
15211523
}
@@ -4176,7 +4178,9 @@ bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const {
41764178
}
41774179

41784180
// Returns true when folding a vector extend into its load is desirable:
// always for scalable vectors, and now also for fixed-length vectors that
// are lowered via SVE (OverrideNEON=true), enabling VLS extending loads.
bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
4179-
return ExtVal.getValueType().isScalableVector();
4181+
return ExtVal.getValueType().isScalableVector() ||
4182+
useSVEForFixedLengthVectorVT(ExtVal.getValueType(),
4183+
/*OverrideNEON=*/true);
41804184
}
41814185

41824186
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
2+
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
3+
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
4+
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
5+
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
6+
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
7+
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
8+
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
9+
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
10+
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
11+
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
12+
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
13+
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
14+
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
15+
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
16+
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512
17+
18+
target triple = "aarch64-unknown-linux-gnu"
19+
20+
; Don't use SVE when its registers are no bigger than NEON's.
21+
; NO_SVE-NOT: ptrue
22+
23+
define <4 x i32> @load_zext_v4i16i32(<4 x i16>* %ap) #0 {
; CHECK-LABEL: load_zext_v4i16i32
; CHECK: ldr d[[D0:[0-9]+]], [x0]
; CHECK-NEXT: ushll v[[D0]].4s, v[[D0]].4h, #0
; CHECK-NEXT: ret
  %load = load <4 x i16>, <4 x i16>* %ap
  %ext = zext <4 x i16> %load to <4 x i32>
  ret <4 x i32> %ext
}
32+
33+
define <8 x i32> @load_zext_v8i16i32(<8 x i16>* %ap) #0 {
; CHECK-LABEL: load_zext_v8i16i32
; CHECK: ptrue [[P0:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
; CHECK-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
; CHECK-NEXT: ret
  %load = load <8 x i16>, <8 x i16>* %ap
  %ext = zext <8 x i16> %load to <8 x i32>
  ret <8 x i32> %ext
}
43+
44+
define <16 x i32> @load_zext_v16i16i32(<16 x i16>* %ap) #0 {
; CHECK-LABEL: load_zext_v16i16i32
; VBITS_GE_512: ptrue [[P0:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: ld1h { [[Z0:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: mov x9, sp
; VBITS_EQ_256-DAG: st1h { [[Z0]].h }, [[PG]], [x9]
; VBITS_EQ_256-DAG: ldp q[[R0:[0-9]+]], q[[R1:[0-9]+]], [sp]
; VBITS_EQ_256-DAG: add x9, x8, #32
; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: uunpklo z[[R0]].s, z[[R0]].h
; VBITS_EQ_256-DAG: uunpklo z[[R1]].s, z[[R1]].h
; VBITS_EQ_256-DAG: st1w { z[[R1]].s }, [[PG1]], [x9]
; VBITS_EQ_256-DAG: st1w { z[[R0]].s }, [[PG1]], [x8]
; VBITS_EQ_256-DAG: ret
  %load = load <16 x i16>, <16 x i16>* %ap
  %ext = zext <16 x i16> %load to <16 x i32>
  ret <16 x i32> %ext
}
68+
69+
define <32 x i32> @load_zext_v32i16i32(<32 x i16>* %ap) #0 {
; CHECK-LABEL: load_zext_v32i16i32
; VBITS_GE_1024: ptrue [[P0:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
; VBITS_GE_1024-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
; VBITS_GE_1024-NEXT: ret
  %load = load <32 x i16>, <32 x i16>* %ap
  %ext = zext <32 x i16> %load to <32 x i32>
  ret <32 x i32> %ext
}
79+
80+
define <64 x i32> @load_zext_v64i16i32(<64 x i16>* %ap) #0 {
; CHECK-LABEL: load_zext_v64i16i32
; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
; VBITS_GE_2048-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
; VBITS_GE_2048-NEXT: ret
  %load = load <64 x i16>, <64 x i16>* %ap
  %ext = zext <64 x i16> %load to <64 x i32>
  ret <64 x i32> %ext
}
90+
91+
define <4 x i32> @load_sext_v4i16i32(<4 x i16>* %ap) #0 {
; CHECK-LABEL: load_sext_v4i16i32
; CHECK: ldr d[[D0:[0-9]+]], [x0]
; CHECK-NEXT: sshll v[[D0]].4s, v[[D0]].4h, #0
; CHECK-NEXT: ret
  %load = load <4 x i16>, <4 x i16>* %ap
  %ext = sext <4 x i16> %load to <4 x i32>
  ret <4 x i32> %ext
}
100+
101+
define <8 x i32> @load_sext_v8i16i32(<8 x i16>* %ap) #0 {
; CHECK-LABEL: load_sext_v8i16i32
; CHECK: ptrue [[P0:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
; CHECK-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
; CHECK-NEXT: ret
  %load = load <8 x i16>, <8 x i16>* %ap
  %ext = sext <8 x i16> %load to <8 x i32>
  ret <8 x i32> %ext
}
111+
112+
define <16 x i32> @load_sext_v16i16i32(<16 x i16>* %ap) #0 {
; CHECK-LABEL: load_sext_v16i16i32
; VBITS_GE_512: ptrue [[P0:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: ld1h { [[Z0:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: mov x9, sp
; VBITS_EQ_256-DAG: st1h { [[Z0]].h }, [[PG]], [x9]
; VBITS_EQ_256-DAG: ldp q[[R0:[0-9]+]], q[[R1:[0-9]+]], [sp]
; VBITS_EQ_256-DAG: add x9, x8, #32
; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: sunpklo z[[R0]].s, z[[R0]].h
; VBITS_EQ_256-DAG: sunpklo z[[R1]].s, z[[R1]].h
; VBITS_EQ_256-DAG: st1w { z[[R1]].s }, [[PG1]], [x9]
; VBITS_EQ_256-DAG: st1w { z[[R0]].s }, [[PG1]], [x8]
; VBITS_EQ_256-DAG: ret
  %load = load <16 x i16>, <16 x i16>* %ap
  %ext = sext <16 x i16> %load to <16 x i32>
  ret <16 x i32> %ext
}
136+
137+
define <32 x i32> @load_sext_v32i16i32(<32 x i16>* %ap) #0 {
; CHECK-LABEL: load_sext_v32i16i32
; VBITS_GE_1024: ptrue [[P0:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
; VBITS_GE_1024-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
; VBITS_GE_1024-NEXT: ret
  %load = load <32 x i16>, <32 x i16>* %ap
  %ext = sext <32 x i16> %load to <32 x i32>
  ret <32 x i32> %ext
}
147+
148+
define <64 x i32> @load_sext_v64i16i32(<64 x i16>* %ap) #0 {
; CHECK-LABEL: load_sext_v64i16i32
; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
; VBITS_GE_2048-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
; VBITS_GE_2048-NEXT: ret
  %load = load <64 x i16>, <64 x i16>* %ap
  %ext = sext <64 x i16> %load to <64 x i32>
  ret <64 x i32> %ext
}
158+
159+
define <32 x i64> @load_zext_v32i8i64(<32 x i8>* %ap) #0 {
; CHECK-LABEL: load_zext_v32i8i64
; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1b { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0]
; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8]
; VBITS_GE_2048-NEXT: ret
  %load = load <32 x i8>, <32 x i8>* %ap
  %ext = zext <32 x i8> %load to <32 x i64>
  ret <32 x i64> %ext
}
169+
170+
define <32 x i64> @load_sext_v32i8i64(<32 x i8>* %ap) #0 {
; CHECK-LABEL: load_sext_v32i8i64
; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1sb { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0]
; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8]
; VBITS_GE_2048-NEXT: ret
  %load = load <32 x i8>, <32 x i8>* %ap
  %ext = sext <32 x i8> %load to <32 x i64>
  ret <32 x i64> %ext
}
180+
181+
define <32 x i64> @load_zext_v32i16i64(<32 x i16>* %ap) #0 {
; CHECK-LABEL: load_zext_v32i16i64
; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1h { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0]
; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8]
; VBITS_GE_2048-NEXT: ret
  %load = load <32 x i16>, <32 x i16>* %ap
  %ext = zext <32 x i16> %load to <32 x i64>
  ret <32 x i64> %ext
}
191+
192+
define <32 x i64> @load_sext_v32i16i64(<32 x i16>* %ap) #0 {
; CHECK-LABEL: load_sext_v32i16i64
; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1sh { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0]
; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8]
; VBITS_GE_2048-NEXT: ret
  %load = load <32 x i16>, <32 x i16>* %ap
  %ext = sext <32 x i16> %load to <32 x i64>
  ret <32 x i64> %ext
}
202+
203+
define <32 x i64> @load_zext_v32i32i64(<32 x i32>* %ap) #0 {
; CHECK-LABEL: load_zext_v32i32i64
; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1w { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0]
; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8]
; VBITS_GE_2048-NEXT: ret
  %load = load <32 x i32>, <32 x i32>* %ap
  %ext = zext <32 x i32> %load to <32 x i64>
  ret <32 x i64> %ext
}
213+
214+
define <32 x i64> @load_sext_v32i32i64(<32 x i32>* %ap) #0 {
; CHECK-LABEL: load_sext_v32i32i64
; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1sw { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0]
; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8]
; VBITS_GE_2048-NEXT: ret
  %load = load <32 x i32>, <32 x i32>* %ap
  %ext = sext <32 x i32> %load to <32 x i64>
  ret <32 x i64> %ext
}
224+
225+
attributes #0 = { "target-features"="+sve" }

0 commit comments

Comments
 (0)