Skip to content

Commit e73b20c

Browse files
committed
[ARM][MVE] Disallow VPSEL for tail predication
Due to the current way that we collect predicated instructions, we can't easily handle vpsel in tail predicated loops. There are a couple of issues: 1) It will use the VPR as a predicate operand, but doesn't have to be inside a VPT block, which means we can assert while building up the VPT block because we don't find another VPST to begin a new one. 2) VPSEL still requires a VPR operand even after tail predicating, which means we can't remove it unless there is another instruction, such as vcmp, that can provide the VPR def. The first issue should be a relatively simple fix in the logic of the LowOverheadLoops pass, whereas the second will require us to represent the 'implicit' tail predication with an explicit value. Differential Revision: https://reviews.llvm.org/D72629
1 parent d6ea8ff commit e73b20c

File tree

7 files changed

+1199
-4
lines changed

7 files changed

+1199
-4
lines changed

llvm/lib/Target/ARM/ARMInstrMVE.td

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5712,7 +5712,6 @@ def MVE_VPSEL : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary,
57125712
let Inst{4} = 0b0;
57135713
let Inst{3-1} = Qm{2-0};
57145714
let Inst{0} = 0b1;
5715-
let validForTailPredication = 1;
57165715
}
57175716

57185717
foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32",

llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -485,6 +485,9 @@ void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils,
485485
}
486486

487487
bool LowOverheadLoop::RecordVPTBlocks(MachineInstr* MI) {
488+
if (CannotTailPredicate)
489+
return false;
490+
488491
// Only support a single vctp.
489492
if (isVCTP(MI) && VCTP)
490493
return false;
@@ -494,10 +497,20 @@ bool LowOverheadLoop::RecordVPTBlocks(MachineInstr* MI) {
494497
VPTBlocks.emplace_back(MI, CurrentPredicate);
495498
CurrentBlock = &VPTBlocks.back();
496499
return true;
497-
}
498-
499-
if (isVCTP(MI))
500+
} else if (isVCTP(MI))
500501
VCTP = MI;
502+
else if (MI->getOpcode() == ARM::MVE_VPSEL ||
503+
MI->getOpcode() == ARM::MVE_VPNOT)
504+
return false;
505+
506+
// TODO: Allow VPSEL and VPNOT, we currently cannot because:
507+
// 1) It will use the VPR as a predicate operand, but doesn't have to be
508+
// inside a VPT block, which means we can assert while building up
509+
// the VPT block because we don't find another VPST to begin a new
510+
// one.
511+
// 2) VPSEL still requires a VPR operand even after tail predicating,
512+
// which means we can't remove it unless there is another
513+
// instruction, such as vcmp, that can provide the VPR def.
501514

502515
unsigned VPROpNum = MI->getNumOperands() - 1;
503516
bool IsUse = false;
Lines changed: 235 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,235 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2+
# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - --verify-machineinstrs | FileCheck %s
3+
4+
# Test that VPNOTs cannot be within a tail predicated loop.
5+
6+
--- |
7+
define dso_local void @inloop_vpnot(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %c, i16* nocapture readonly %d, i32* nocapture %e, i32 %N) local_unnamed_addr #0 {
8+
entry:
9+
%cmp9 = icmp eq i32 %N, 0
10+
%tmp = add i32 %N, 3
11+
%tmp1 = lshr i32 %tmp, 2
12+
%tmp2 = shl nuw i32 %tmp1, 2
13+
%tmp3 = add i32 %tmp2, -4
14+
%tmp4 = lshr i32 %tmp3, 2
15+
%tmp5 = add nuw nsw i32 %tmp4, 1
16+
br i1 %cmp9, label %for.cond.cleanup, label %vector.ph
17+
18+
vector.ph: ; preds = %entry
19+
call void @llvm.set.loop.iterations.i32(i32 %tmp5)
20+
br label %vector.body
21+
22+
vector.body: ; preds = %vector.body, %vector.ph
23+
%lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %tmp5, %vector.ph ]
24+
%lsr.iv.e = phi i32* [ %scevgep.e, %vector.body ], [ %e, %vector.ph ]
25+
%lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
26+
%lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
27+
%lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
28+
%lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
29+
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp14, %vector.body ]
30+
%tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
31+
%lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>*
32+
%lsr.iv1820 = bitcast i16* %lsr.iv18 to <4 x i16>*
33+
%lsr.iv1820.c = bitcast i16* %lsr.iv.c to <4 x i16>*
34+
%lsr.iv17.d = bitcast i16* %lsr.iv.d to <4 x i16>*
35+
%lsr.cast.e = bitcast i32* %lsr.iv.e to <4 x i32>*
36+
%tmp8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp7)
37+
%tmp9 = sub i32 %tmp7, 4
38+
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
39+
%tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32>
40+
%wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
41+
%tmp11 = sext <4 x i16> %wide.masked.load14 to <4 x i32>
42+
%wide.masked.load.c = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820.c, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
43+
%sext.load.c = sext <4 x i16> %wide.masked.load.c to <4 x i32>
44+
%wide.masked.load.d = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17.d, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
45+
%sext.load.d = sext <4 x i16> %wide.masked.load.d to <4 x i32>
46+
%tmp12 = mul nsw <4 x i32> %tmp11, %tmp10
47+
%mul.2 = mul nsw <4 x i32> %sext.load.c, %sext.load.d
48+
%tmp13 = add <4 x i32> %tmp12, %mul.2
49+
%tmp14 = add <4 x i32> %tmp13, %vec.phi
50+
%vpnot = xor <4 x i1> %tmp8, <i1 true, i1 true, i1 true, i1 true>
51+
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp14, <4 x i32>* %lsr.cast.e, i32 4, <4 x i1> %vpnot)
52+
%scevgep = getelementptr i16, i16* %lsr.iv, i32 4
53+
%scevgep19 = getelementptr i16, i16* %lsr.iv18, i32 4
54+
%scevgep.c = getelementptr i16, i16* %lsr.iv.c, i32 4
55+
%scevgep.d = getelementptr i16, i16* %lsr.iv.d, i32 4
56+
%scevgep.e = getelementptr i32, i32* %lsr.iv.e, i32 4
57+
%tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1)
58+
%tmp16 = icmp ne i32 %tmp15, 0
59+
%lsr.iv.next = add nsw i32 %lsr.iv1, -1
60+
br i1 %tmp16, label %vector.body, label %for.cond.cleanup
61+
62+
for.cond.cleanup: ; preds = %vector.body, %entry
63+
ret void
64+
}
65+
declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
66+
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
67+
declare void @llvm.set.loop.iterations.i32(i32) #3
68+
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
69+
declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
70+
71+
...
72+
---
73+
name: inloop_vpnot
74+
alignment: 2
75+
exposesReturnsTwice: false
76+
legalized: false
77+
regBankSelected: false
78+
selected: false
79+
failedISel: false
80+
tracksRegLiveness: true
81+
hasWinCFI: false
82+
registers: []
83+
liveins:
84+
- { reg: '$r0', virtual-reg: '' }
85+
- { reg: '$r1', virtual-reg: '' }
86+
- { reg: '$r2', virtual-reg: '' }
87+
- { reg: '$r3', virtual-reg: '' }
88+
frameInfo:
89+
isFrameAddressTaken: false
90+
isReturnAddressTaken: false
91+
hasStackMap: false
92+
hasPatchPoint: false
93+
stackSize: 16
94+
offsetAdjustment: 0
95+
maxAlignment: 4
96+
adjustsStack: false
97+
hasCalls: false
98+
stackProtector: ''
99+
maxCallFrameSize: 0
100+
cvBytesOfCalleeSavedRegisters: 0
101+
hasOpaqueSPAdjustment: false
102+
hasVAStart: false
103+
hasMustTailInVarArgFunc: false
104+
localFrameSize: 0
105+
savePoint: ''
106+
restorePoint: ''
107+
fixedStack:
108+
- { id: 0, type: default, offset: 4, size: 4, alignment: 4, stack-id: default,
109+
isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
110+
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
111+
- { id: 1, type: default, offset: 0, size: 4, alignment: 8, stack-id: default,
112+
isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
113+
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
114+
stack:
115+
- { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
116+
stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
117+
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
118+
- { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
119+
stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
120+
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
121+
- { id: 2, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4,
122+
stack-id: default, callee-saved-register: '$r5', callee-saved-restored: true,
123+
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
124+
- { id: 3, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4,
125+
stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true,
126+
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
127+
callSites: []
128+
constants: []
129+
machineFunctionInfo: {}
130+
body: |
131+
; CHECK-LABEL: name: inloop_vpnot
132+
; CHECK: bb.0.entry:
133+
; CHECK: successors: %bb.3(0x30000000), %bb.1(0x50000000)
134+
; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r7, $lr
135+
; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r7, killed $lr, implicit-def $sp, implicit $sp
136+
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 16
137+
; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
138+
; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
139+
; CHECK: frame-setup CFI_INSTRUCTION offset $r5, -12
140+
; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -16
141+
; CHECK: renamable $r12 = t2LDRi12 $sp, 20, 14, $noreg :: (load 4 from %fixed-stack.1)
142+
; CHECK: t2CMPri renamable $r12, 0, 14, $noreg, implicit-def $cpsr
143+
; CHECK: tBcc %bb.3, 0, killed $cpsr
144+
; CHECK: bb.1.vector.ph:
145+
; CHECK: successors: %bb.2(0x80000000)
146+
; CHECK: liveins: $r0, $r1, $r2, $r3, $r12
147+
; CHECK: renamable $lr = t2ADDri renamable $r12, 3, 14, $noreg, $noreg
148+
; CHECK: renamable $r4, dead $cpsr = tMOVi8 1, 14, $noreg
149+
; CHECK: renamable $lr = t2BICri killed renamable $lr, 3, 14, $noreg, $noreg
150+
; CHECK: renamable $r5 = tLDRspi $sp, 4, 14, $noreg :: (load 4 from %fixed-stack.0, align 8)
151+
; CHECK: renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg
152+
; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
153+
; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg
154+
; CHECK: $lr = t2DLS renamable $lr
155+
; CHECK: $r4 = tMOVr killed $lr, 14, $noreg
156+
; CHECK: bb.2.vector.body:
157+
; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000)
158+
; CHECK: liveins: $q0, $r0, $r1, $r2, $r3, $r4, $r5, $r12
159+
; CHECK: renamable $vpr = MVE_VCTP32 renamable $r12, 0, $noreg
160+
; CHECK: MVE_VPST 4, implicit $vpr
161+
; CHECK: renamable $r3, renamable $q1 = MVE_VLDRHS32_post killed renamable $r3, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17.d, align 2)
162+
; CHECK: renamable $r2, renamable $q2 = MVE_VLDRHS32_post killed renamable $r2, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1820.c, align 2)
163+
; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
164+
; CHECK: MVE_VPST 4, implicit $vpr
165+
; CHECK: renamable $r0, renamable $q2 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2)
166+
; CHECK: renamable $r1, renamable $q3 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2)
167+
; CHECK: renamable $q2 = nsw MVE_VMULi32 killed renamable $q3, killed renamable $q2, 0, $noreg, undef renamable $q2
168+
; CHECK: $lr = tMOVr $r4, 14, $noreg
169+
; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
170+
; CHECK: renamable $r4, dead $cpsr = nsw tSUBi8 killed $r4, 1, 14, $noreg
171+
; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
172+
; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
173+
; CHECK: renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg
174+
; CHECK: MVE_VPST 8, implicit $vpr
175+
; CHECK: renamable $r5 = MVE_VSTRWU32_post renamable $q0, killed renamable $r5, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.cast.e, align 4)
176+
; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2
177+
; CHECK: bb.3.for.cond.cleanup:
178+
; CHECK: tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc
179+
bb.0.entry:
180+
successors: %bb.3(0x30000000), %bb.1(0x50000000)
181+
liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r7, $lr
182+
183+
frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r7, killed $lr, implicit-def $sp, implicit $sp
184+
frame-setup CFI_INSTRUCTION def_cfa_offset 16
185+
frame-setup CFI_INSTRUCTION offset $lr, -4
186+
frame-setup CFI_INSTRUCTION offset $r7, -8
187+
frame-setup CFI_INSTRUCTION offset $r5, -12
188+
frame-setup CFI_INSTRUCTION offset $r4, -16
189+
renamable $r12 = t2LDRi12 $sp, 20, 14, $noreg :: (load 4 from %fixed-stack.0)
190+
t2CMPri renamable $r12, 0, 14, $noreg, implicit-def $cpsr
191+
tBcc %bb.3, 0, killed $cpsr
192+
193+
bb.1.vector.ph:
194+
successors: %bb.2(0x80000000)
195+
liveins: $r0, $r1, $r2, $r3, $r12
196+
197+
renamable $lr = t2ADDri renamable $r12, 3, 14, $noreg, $noreg
198+
renamable $r4, dead $cpsr = tMOVi8 1, 14, $noreg
199+
renamable $lr = t2BICri killed renamable $lr, 3, 14, $noreg, $noreg
200+
renamable $r5 = tLDRspi $sp, 4, 14, $noreg :: (load 4 from %fixed-stack.1, align 8)
201+
renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg
202+
renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
203+
renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg
204+
t2DoLoopStart renamable $lr
205+
$r4 = tMOVr killed $lr, 14, $noreg
206+
207+
bb.2.vector.body:
208+
successors: %bb.2(0x7c000000), %bb.3(0x04000000)
209+
liveins: $q0, $r0, $r1, $r2, $r3, $r4, $r5, $r12
210+
211+
renamable $vpr = MVE_VCTP32 renamable $r12, 0, $noreg
212+
MVE_VPST 4, implicit $vpr
213+
renamable $r3, renamable $q1 = MVE_VLDRHS32_post killed renamable $r3, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17.d, align 2)
214+
renamable $r2, renamable $q2 = MVE_VLDRHS32_post killed renamable $r2, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1820.c, align 2)
215+
renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
216+
MVE_VPST 4, implicit $vpr
217+
renamable $r0, renamable $q2 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2)
218+
renamable $r1, renamable $q3 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2)
219+
renamable $q2 = nsw MVE_VMULi32 killed renamable $q3, killed renamable $q2, 0, $noreg, undef renamable $q2
220+
$lr = tMOVr $r4, 14, $noreg
221+
renamable $q1 = MVE_VADDi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
222+
renamable $r4, dead $cpsr = nsw tSUBi8 killed $r4, 1, 14, $noreg
223+
renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
224+
renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
225+
renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg
226+
MVE_VPST 8, implicit $vpr
227+
renamable $r5 = MVE_VSTRWU32_post renamable $q0, killed renamable $r5, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.cast.e, align 4)
228+
renamable $lr = t2LoopDec killed renamable $lr, 1
229+
t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr
230+
tB %bb.3, 14, $noreg
231+
232+
bb.3.for.cond.cleanup:
233+
tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc
234+
235+
...

0 commit comments

Comments
 (0)