Skip to content

Commit 3a6eb5f

Browse files
committed
[ARM] Disable VLD4 under MVE
Alas, using half the available vector registers in a single instruction is just too much for the register allocator to handle. The mve-vldst4.ll test added here currently fails when these instructions are enabled. This patch disables the generation of VLD4 and VST4 by adding an mve-max-interleave-factor option, which currently defaults to 2. Differential Revision: https://reviews.llvm.org/D71109
1 parent e8716a6 commit 3a6eb5f

File tree

7 files changed

+400
-89
lines changed

7 files changed

+400
-89
lines changed

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,11 @@ static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
142142
cl::desc("Maximum size of ALL constants to promote into a constant pool"),
143143
cl::init(128));
144144

145+
static cl::opt<unsigned>
146+
MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
147+
cl::desc("Maximum interleave factor for MVE VLDn to generate."),
148+
cl::init(2));
149+
145150
// The APCS parameter registers.
146151
static const MCPhysReg GPRArgRegs[] = {
147152
ARM::R0, ARM::R1, ARM::R2, ARM::R3
@@ -16786,7 +16791,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
1678616791
if (Subtarget->hasNEON())
1678716792
return 4;
1678816793
if (Subtarget->hasMVEIntegerOps())
16789-
return 4;
16794+
return MVEMaxSupportedInterleaveFactor;
1679016795
return TargetLoweringBase::getMaxSupportedInterleaveFactor();
1679116796
}
1679216797

llvm/test/CodeGen/Thumb2/mve-vld4.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s
2+
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp,+fp64 -mve-max-interleave-factor=4 -verify-machineinstrs %s -o - | FileCheck %s
33

44
; i32
55

Lines changed: 262 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,262 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
3+
4+
define void @vldst4(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %numRows, i32 %numCols, i32 %scale.coerce) #0 {
5+
; CHECK-LABEL: vldst4:
6+
; CHECK: @ %bb.0: @ %entry
7+
; CHECK-NEXT: .save {r7, lr}
8+
; CHECK-NEXT: push {r7, lr}
9+
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
10+
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
11+
; CHECK-NEXT: .pad #40
12+
; CHECK-NEXT: sub sp, #40
13+
; CHECK-NEXT: muls r2, r3, r2
14+
; CHECK-NEXT: movs r3, #0
15+
; CHECK-NEXT: cmp.w r3, r2, lsr #2
16+
; CHECK-NEXT: beq.w .LBB0_3
17+
; CHECK-NEXT: @ %bb.1: @ %vector.ph
18+
; CHECK-NEXT: mvn r3, #7
19+
; CHECK-NEXT: and.w r2, r3, r2, lsr #2
20+
; CHECK-NEXT: vldr.16 s0, [sp, #112]
21+
; CHECK-NEXT: subs r2, #8
22+
; CHECK-NEXT: movs r3, #1
23+
; CHECK-NEXT: sub.w r12, r0, #64
24+
; CHECK-NEXT: add.w lr, r3, r2, lsr #3
25+
; CHECK-NEXT: vmov r2, s0
26+
; CHECK-NEXT: vdup.16 q0, r2
27+
; CHECK-NEXT: subs r1, #64
28+
; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
29+
; CHECK-NEXT: dls lr, lr
30+
; CHECK-NEXT: .LBB0_2: @ %vector.body
31+
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
32+
; CHECK-NEXT: vldrh.u16 q0, [r12, #64]!
33+
; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload
34+
; CHECK-NEXT: vmov r3, s0
35+
; CHECK-NEXT: vldrh.u16 q7, [r12, #16]
36+
; CHECK-NEXT: vmov r2, s2
37+
; CHECK-NEXT: vmov.16 q1[0], r3
38+
; CHECK-NEXT: vmov.16 q1[1], r2
39+
; CHECK-NEXT: vmov r2, s28
40+
; CHECK-NEXT: vldrh.u16 q6, [r12, #32]
41+
; CHECK-NEXT: vmov.16 q1[2], r2
42+
; CHECK-NEXT: vmov r2, s30
43+
; CHECK-NEXT: vldrh.u16 q5, [r12, #48]
44+
; CHECK-NEXT: vmov.16 q1[3], r2
45+
; CHECK-NEXT: vmov r2, s24
46+
; CHECK-NEXT: vmov.16 q1[4], r2
47+
; CHECK-NEXT: vmov r2, s26
48+
; CHECK-NEXT: vmov.16 q1[5], r2
49+
; CHECK-NEXT: vmov r2, s20
50+
; CHECK-NEXT: vmov.16 q1[6], r2
51+
; CHECK-NEXT: vmov r2, s22
52+
; CHECK-NEXT: vmov.16 q1[7], r2
53+
; CHECK-NEXT: vmov q4, q2
54+
; CHECK-NEXT: vmul.f16 q1, q1, q2
55+
; CHECK-NEXT: vmovx.f16 s8, s2
56+
; CHECK-NEXT: vmov r3, s8
57+
; CHECK-NEXT: vmovx.f16 s8, s0
58+
; CHECK-NEXT: vmov r0, s8
59+
; CHECK-NEXT: vmovx.f16 s12, s28
60+
; CHECK-NEXT: vmov.16 q2[0], r0
61+
; CHECK-NEXT: vmov r0, s12
62+
; CHECK-NEXT: vmov.16 q2[1], r3
63+
; CHECK-NEXT: vmovx.f16 s12, s30
64+
; CHECK-NEXT: vmov.16 q2[2], r0
65+
; CHECK-NEXT: vmov r0, s12
66+
; CHECK-NEXT: vmovx.f16 s12, s24
67+
; CHECK-NEXT: vmov.16 q2[3], r0
68+
; CHECK-NEXT: vmov r0, s12
69+
; CHECK-NEXT: vmovx.f16 s12, s26
70+
; CHECK-NEXT: vmov.16 q2[4], r0
71+
; CHECK-NEXT: vmov r0, s12
72+
; CHECK-NEXT: vmovx.f16 s12, s20
73+
; CHECK-NEXT: vmov.16 q2[5], r0
74+
; CHECK-NEXT: vmov r0, s12
75+
; CHECK-NEXT: vmovx.f16 s12, s22
76+
; CHECK-NEXT: vmov.16 q2[6], r0
77+
; CHECK-NEXT: vmov r0, s12
78+
; CHECK-NEXT: vmov.16 q2[7], r0
79+
; CHECK-NEXT: vmov r2, s4
80+
; CHECK-NEXT: vmul.f16 q2, q2, q4
81+
; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill
82+
; CHECK-NEXT: vmov q1, q4
83+
; CHECK-NEXT: vmov.16 q4[0], r2
84+
; CHECK-NEXT: vmov r0, s8
85+
; CHECK-NEXT: vmovx.f16 s0, s3
86+
; CHECK-NEXT: vmov.16 q4[1], r0
87+
; CHECK-NEXT: vmov r0, s1
88+
; CHECK-NEXT: vmov.16 q3[0], r0
89+
; CHECK-NEXT: vmov r2, s3
90+
; CHECK-NEXT: vmov.16 q3[1], r2
91+
; CHECK-NEXT: vmov r0, s29
92+
; CHECK-NEXT: vmov.16 q3[2], r0
93+
; CHECK-NEXT: vmov r0, s31
94+
; CHECK-NEXT: vmov.16 q3[3], r0
95+
; CHECK-NEXT: vmov r0, s25
96+
; CHECK-NEXT: vmov.16 q3[4], r0
97+
; CHECK-NEXT: vmov r0, s27
98+
; CHECK-NEXT: vmov.16 q3[5], r0
99+
; CHECK-NEXT: vmov r0, s21
100+
; CHECK-NEXT: vmov.16 q3[6], r0
101+
; CHECK-NEXT: vmov r0, s23
102+
; CHECK-NEXT: vmov.16 q3[7], r0
103+
; CHECK-NEXT: vmov r2, s0
104+
; CHECK-NEXT: vmul.f16 q3, q3, q1
105+
; CHECK-NEXT: vmovx.f16 s4, s1
106+
; CHECK-NEXT: vmov r0, s12
107+
; CHECK-NEXT: vmov.16 q4[2], r0
108+
; CHECK-NEXT: vmov r0, s4
109+
; CHECK-NEXT: vmov.16 q0[0], r0
110+
; CHECK-NEXT: vmovx.f16 s4, s29
111+
; CHECK-NEXT: vmov.16 q0[1], r2
112+
; CHECK-NEXT: vmov r0, s4
113+
; CHECK-NEXT: vmovx.f16 s4, s31
114+
; CHECK-NEXT: vmov.16 q0[2], r0
115+
; CHECK-NEXT: vmov r0, s4
116+
; CHECK-NEXT: vmovx.f16 s4, s25
117+
; CHECK-NEXT: vmov.16 q0[3], r0
118+
; CHECK-NEXT: vmov r0, s4
119+
; CHECK-NEXT: vmovx.f16 s4, s27
120+
; CHECK-NEXT: vmov.16 q0[4], r0
121+
; CHECK-NEXT: vmov r0, s4
122+
; CHECK-NEXT: vmovx.f16 s4, s21
123+
; CHECK-NEXT: vmov.16 q0[5], r0
124+
; CHECK-NEXT: vmov r0, s4
125+
; CHECK-NEXT: vmovx.f16 s4, s23
126+
; CHECK-NEXT: vmov.16 q0[6], r0
127+
; CHECK-NEXT: vmov r0, s4
128+
; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload
129+
; CHECK-NEXT: vmov.16 q0[7], r0
130+
; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload
131+
; CHECK-NEXT: vmul.f16 q5, q0, q1
132+
; CHECK-NEXT: vmov r2, s10
133+
; CHECK-NEXT: vmov r0, s20
134+
; CHECK-NEXT: vmovx.f16 s0, s24
135+
; CHECK-NEXT: vmov.16 q4[3], r0
136+
; CHECK-NEXT: vmov r0, s0
137+
; CHECK-NEXT: vmovx.f16 s0, s8
138+
; CHECK-NEXT: vmov.16 q4[4], r0
139+
; CHECK-NEXT: vmov r0, s0
140+
; CHECK-NEXT: vmovx.f16 s0, s12
141+
; CHECK-NEXT: vmov.16 q4[5], r0
142+
; CHECK-NEXT: vmov r0, s0
143+
; CHECK-NEXT: vmovx.f16 s0, s20
144+
; CHECK-NEXT: vmov.16 q4[6], r0
145+
; CHECK-NEXT: vmov r0, s0
146+
; CHECK-NEXT: vmovx.f16 s4, s26
147+
; CHECK-NEXT: vmov.16 q4[7], r0
148+
; CHECK-NEXT: vmov r0, s26
149+
; CHECK-NEXT: vmov.16 q0[0], r0
150+
; CHECK-NEXT: vmov r0, s14
151+
; CHECK-NEXT: vmov.16 q0[1], r2
152+
; CHECK-NEXT: vstrb.8 q4, [r1, #64]!
153+
; CHECK-NEXT: vmov.16 q0[2], r0
154+
; CHECK-NEXT: vmov r0, s22
155+
; CHECK-NEXT: vmov.16 q0[3], r0
156+
; CHECK-NEXT: vmov r0, s4
157+
; CHECK-NEXT: vmovx.f16 s4, s10
158+
; CHECK-NEXT: vmov.16 q0[4], r0
159+
; CHECK-NEXT: vmov r0, s4
160+
; CHECK-NEXT: vmovx.f16 s4, s14
161+
; CHECK-NEXT: vmov.16 q0[5], r0
162+
; CHECK-NEXT: vmov r0, s4
163+
; CHECK-NEXT: vmovx.f16 s4, s22
164+
; CHECK-NEXT: vmov.16 q0[6], r0
165+
; CHECK-NEXT: vmov r0, s4
166+
; CHECK-NEXT: vmov q4, q6
167+
; CHECK-NEXT: vmov.16 q0[7], r0
168+
; CHECK-NEXT: vmov r0, s19
169+
; CHECK-NEXT: vstrh.16 q0, [r1, #32]
170+
; CHECK-NEXT: vmov.16 q0[0], r0
171+
; CHECK-NEXT: vmov r2, s11
172+
; CHECK-NEXT: vmovx.f16 s4, s19
173+
; CHECK-NEXT: vmov.16 q0[1], r2
174+
; CHECK-NEXT: vmov r0, s15
175+
; CHECK-NEXT: vmov.16 q0[2], r0
176+
; CHECK-NEXT: vmov r0, s23
177+
; CHECK-NEXT: vmov.16 q0[3], r0
178+
; CHECK-NEXT: vmov r0, s4
179+
; CHECK-NEXT: vmovx.f16 s4, s11
180+
; CHECK-NEXT: vmov.16 q0[4], r0
181+
; CHECK-NEXT: vmov r0, s4
182+
; CHECK-NEXT: vmovx.f16 s4, s15
183+
; CHECK-NEXT: vmov.16 q0[5], r0
184+
; CHECK-NEXT: vmov r0, s4
185+
; CHECK-NEXT: vmovx.f16 s4, s23
186+
; CHECK-NEXT: vmov.16 q0[6], r0
187+
; CHECK-NEXT: vmov r0, s4
188+
; CHECK-NEXT: vmovx.f16 s4, s17
189+
; CHECK-NEXT: vmov.16 q0[7], r0
190+
; CHECK-NEXT: vmov r2, s17
191+
; CHECK-NEXT: vstrh.16 q0, [r1, #48]
192+
; CHECK-NEXT: vmov r0, s9
193+
; CHECK-NEXT: vmov.16 q0[0], r2
194+
; CHECK-NEXT: vmov.16 q0[1], r0
195+
; CHECK-NEXT: vmov r0, s13
196+
; CHECK-NEXT: vmov.16 q0[2], r0
197+
; CHECK-NEXT: vmov r0, s21
198+
; CHECK-NEXT: vmov.16 q0[3], r0
199+
; CHECK-NEXT: vmov r0, s4
200+
; CHECK-NEXT: vmovx.f16 s4, s9
201+
; CHECK-NEXT: vmov.16 q0[4], r0
202+
; CHECK-NEXT: vmov r0, s4
203+
; CHECK-NEXT: vmovx.f16 s4, s13
204+
; CHECK-NEXT: vmov.16 q0[5], r0
205+
; CHECK-NEXT: vmov r0, s4
206+
; CHECK-NEXT: vmovx.f16 s4, s21
207+
; CHECK-NEXT: vmov.16 q0[6], r0
208+
; CHECK-NEXT: vmov r0, s4
209+
; CHECK-NEXT: vmov.16 q0[7], r0
210+
; CHECK-NEXT: vstrh.16 q0, [r1, #16]
211+
; CHECK-NEXT: le lr, .LBB0_2
212+
; CHECK-NEXT: .LBB0_3: @ %while.end
213+
; CHECK-NEXT: add sp, #40
214+
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
215+
; CHECK-NEXT: pop {r7, pc}
216+
entry:
217+
%tmp.0.extract.trunc = trunc i32 %scale.coerce to i16
218+
%l0 = bitcast i16 %tmp.0.extract.trunc to half
219+
%mul = mul i32 %numCols, %numRows
220+
%shr = lshr i32 %mul, 2
221+
%cmp38 = icmp eq i32 %shr, 0
222+
br i1 %cmp38, label %while.end, label %vector.ph
223+
224+
vector.ph: ; preds = %entry
225+
%n.vec = and i32 %shr, 1073741816
226+
%l2 = shl nuw i32 %n.vec, 2
227+
%ind.end = getelementptr half, half* %pIn, i32 %l2
228+
%l3 = shl nuw i32 %n.vec, 2
229+
%ind.end48 = getelementptr half, half* %pOut, i32 %l3
230+
%ind.end50 = sub nsw i32 %shr, %n.vec
231+
%broadcast.splatinsert55 = insertelement <8 x half> undef, half %l0, i32 0
232+
%broadcast.splat56 = shufflevector <8 x half> %broadcast.splatinsert55, <8 x half> undef, <8 x i32> zeroinitializer
233+
br label %vector.body
234+
235+
vector.body: ; preds = %vector.body, %vector.ph
236+
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
237+
%l4 = shl i32 %index, 2
238+
%next.gep = getelementptr half, half* %pIn, i32 %l4
239+
%l5 = shl i32 %index, 2
240+
%l6 = bitcast half* %next.gep to <32 x half>*
241+
%wide.vec = load <32 x half>, <32 x half>* %l6, align 2
242+
%strided.vec = shufflevector <32 x half> %wide.vec, <32 x half> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
243+
%strided.vec52 = shufflevector <32 x half> %wide.vec, <32 x half> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
244+
%strided.vec53 = shufflevector <32 x half> %wide.vec, <32 x half> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
245+
%strided.vec54 = shufflevector <32 x half> %wide.vec, <32 x half> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
246+
%l7 = fmul <8 x half> %strided.vec, %broadcast.splat56
247+
%l8 = fmul <8 x half> %strided.vec52, %broadcast.splat56
248+
%l9 = fmul <8 x half> %strided.vec53, %broadcast.splat56
249+
%l10 = fmul <8 x half> %strided.vec54, %broadcast.splat56
250+
%l11 = getelementptr inbounds half, half* %pOut, i32 %l5
251+
%l12 = bitcast half* %l11 to <32 x half>*
252+
%l13 = shufflevector <8 x half> %l7, <8 x half> %l8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
253+
%l14 = shufflevector <8 x half> %l9, <8 x half> %l10, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
254+
%interleaved.vec = shufflevector <16 x half> %l13, <16 x half> %l14, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
255+
store <32 x half> %interleaved.vec, <32 x half>* %l12, align 2
256+
%index.next = add i32 %index, 8
257+
%l15 = icmp eq i32 %index.next, %n.vec
258+
br i1 %l15, label %while.end, label %vector.body
259+
260+
while.end: ; preds = %vector.body, %entry
261+
ret void
262+
}

llvm/test/CodeGen/Thumb2/mve-vst4.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s
2+
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp,+fp64 -mve-max-interleave-factor=4 -verify-machineinstrs %s -o - | FileCheck %s
33

44
; i32
55

0 commit comments

Comments
 (0)