Skip to content

Commit 850f30c

Browse files
committed
[ARM][MVE] Don't allow tail-predication with else predicates
The test case contains a vpt block with an else-predicated instruction. This might not be very realistic, but it currently crashes because the else predicate cannot be handled; the instruction would need to be removed. This patch adds some extra checks that none of the instructions in a vpt block are else-predicated; if one is, the loop is left using vctp rather than being tail-predicated.
1 parent 1c6746e commit 850f30c

File tree

2 files changed

+206
-2
lines changed

2 files changed

+206
-2
lines changed

llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,9 @@ namespace {
251251
SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI];
252252
if (Exclusive && Predicates.size() != 1)
253253
return false;
254+
// We do not know how to convert an else predicate of a VCTP.
255+
if (getVPTInstrPredicate(*MI) == ARMVCC::Else)
256+
return false;
254257
return llvm::any_of(Predicates, isVCTP);
255258
}
256259

@@ -305,8 +308,12 @@ namespace {
305308
// isn't predicated on entry, check whether the vctp is within the block
306309
// and that all other instructions are then predicated on it.
307310
for (auto &Block : Blocks) {
308-
if (isEntryPredicatedOnVCTP(Block, false) ||
309-
hasImplicitlyValidVPT(Block, RDA))
311+
if (isEntryPredicatedOnVCTP(Block, false) &&
312+
!any_of(drop_begin(Block.getInsts()), [](const MachineInstr *MI) {
313+
return getVPTInstrPredicate(*MI) == ARMVCC::Else;
314+
}))
315+
continue;
316+
if (hasImplicitlyValidVPT(Block, RDA))
310317
continue;
311318

312319
SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2+
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
3+
4+
; This loop has a vpt block that should not block tail-predication
5+
define void @convert_vptblock(ptr %pchTarget, i16 signext %iTargetStride, ptr %pwLineMask, ptr %ptCopySize, i8 zeroext %chColour, i8 zeroext %chOpacity) {
6+
; CHECK-LABEL: convert_vptblock:
7+
; CHECK: @ %bb.0: @ %entry
8+
; CHECK-NEXT: ldrsh.w r12, [r3, #2]
9+
; CHECK-NEXT: cmp.w r12, #1
10+
; CHECK-NEXT: it lt
11+
; CHECK-NEXT: bxlt lr
12+
; CHECK-NEXT: .LBB0_1: @ %for.body.lr.ph
13+
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
14+
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
15+
; CHECK-NEXT: .pad #4
16+
; CHECK-NEXT: sub sp, #4
17+
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
18+
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
19+
; CHECK-NEXT: ldrsh.w r10, [r3]
20+
; CHECK-NEXT: mov.w r8, #0
21+
; CHECK-NEXT: ldrd r4, r5, [sp, #88]
22+
; CHECK-NEXT: mov r7, r0
23+
; CHECK-NEXT: cmp.w r10, #8
24+
; CHECK-NEXT: mov.w r0, #1
25+
; CHECK-NEXT: mov r3, r10
26+
; CHECK-NEXT: mov.w r11, #0
27+
; CHECK-NEXT: it ge
28+
; CHECK-NEXT: movge r3, #8
29+
; CHECK-NEXT: vidup.u16 q0, r8, #4
30+
; CHECK-NEXT: sub.w r3, r10, r3
31+
; CHECK-NEXT: vmov.i32 q1, #0x0
32+
; CHECK-NEXT: adds r3, #7
33+
; CHECK-NEXT: vmov.i16 q2, #0x100
34+
; CHECK-NEXT: vmov.i16 q3, #0xff
35+
; CHECK-NEXT: add.w r9, r0, r3, lsr #3
36+
; CHECK-NEXT: .LBB0_2: @ %for.body
37+
; CHECK-NEXT: @ =>This Loop Header: Depth=1
38+
; CHECK-NEXT: @ Child Loop BB0_3 Depth 2
39+
; CHECK-NEXT: mov r3, r10
40+
; CHECK-NEXT: vmov q4, q0
41+
; CHECK-NEXT: mov r6, r8
42+
; CHECK-NEXT: mov r0, r7
43+
; CHECK-NEXT: dls lr, r9
44+
; CHECK-NEXT: .LBB0_3: @ %do.body
45+
; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1
46+
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
47+
; CHECK-NEXT: vctp.16 r3
48+
; CHECK-NEXT: vpst
49+
; CHECK-NEXT: vldrbt.u16 q5, [r2, q4]
50+
; CHECK-NEXT: vmul.i16 q4, q5, r5
51+
; CHECK-NEXT: vshr.u16 q4, q4, #8
52+
; CHECK-NEXT: vsub.i16 q5, q2, q4
53+
; CHECK-NEXT: vpt.i16 eq, q4, q3
54+
; CHECK-NEXT: vmovt q5, q1
55+
; CHECK-NEXT: vctp.16 r3
56+
; CHECK-NEXT: vpst
57+
; CHECK-NEXT: vldrbt.u16 q6, [r0]
58+
; CHECK-NEXT: vsub.i16 q4, q2, q5
59+
; CHECK-NEXT: subs r3, #8
60+
; CHECK-NEXT: vmul.i16 q5, q5, q6
61+
; CHECK-NEXT: vmla.i16 q5, q4, r4
62+
; CHECK-NEXT: vshr.u16 q4, q5, #8
63+
; CHECK-NEXT: vpst
64+
; CHECK-NEXT: vstrbt.16 q4, [r0], #8
65+
; CHECK-NEXT: vidup.u16 q4, r6, #4
66+
; CHECK-NEXT: le lr, .LBB0_3
67+
; CHECK-NEXT: @ %bb.4: @ %do.end
68+
; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1
69+
; CHECK-NEXT: add.w r0, r11, #1
70+
; CHECK-NEXT: add r7, r1
71+
; CHECK-NEXT: sxth.w r11, r0
72+
; CHECK-NEXT: cmp r11, r12
73+
; CHECK-NEXT: blt .LBB0_2
74+
; CHECK-NEXT: @ %bb.5:
75+
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
76+
; CHECK-NEXT: add sp, #4
77+
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
78+
; CHECK-NEXT: bx lr
79+
entry:
80+
%iHeight1 = getelementptr inbounds i8, ptr %ptCopySize, i32 2
81+
%0 = load i16, ptr %iHeight1, align 2
82+
%cmp28 = icmp sgt i16 %0, 0
83+
br i1 %cmp28, label %for.body.lr.ph, label %for.cond.cleanup
84+
85+
for.body.lr.ph: ; preds = %entry
86+
%1 = load i16, ptr %ptCopySize, align 2
87+
%conv5 = sext i16 %1 to i32
88+
%2 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32 0, i32 4)
89+
%conv6 = zext i8 %chOpacity to i16
90+
%.splatinsert = insertelement <8 x i16> poison, i16 %conv6, i64 0
91+
%.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> poison, <8 x i32> zeroinitializer
92+
%conv7 = zext i8 %chColour to i16
93+
%.splatinsert.i = insertelement <8 x i16> poison, i16 %conv7, i64 0
94+
%.splat.i = shufflevector <8 x i16> %.splatinsert.i, <8 x i16> poison, <8 x i32> zeroinitializer
95+
%conv11 = sext i16 %iTargetStride to i32
96+
br label %for.body
97+
98+
for.cond.cleanup: ; preds = %do.end, %entry
99+
ret void
100+
101+
for.body: ; preds = %for.body.lr.ph, %do.end
102+
%pchTarget.addr.030 = phi ptr [ %pchTarget, %for.body.lr.ph ], [ %add.ptr12, %do.end ]
103+
%y.029 = phi i16 [ 0, %for.body.lr.ph ], [ %inc, %do.end ]
104+
br label %do.body
105+
106+
do.body: ; preds = %do.body, %for.body
107+
%blkCnt.0 = phi i32 [ %conv5, %for.body ], [ %sub8, %do.body ]
108+
%.pn = phi { <8 x i16>, i32 } [ %2, %for.body ], [ %13, %do.body ]
109+
%pchTargetLine.0 = phi ptr [ %pchTarget.addr.030, %for.body ], [ %add.ptr, %do.body ]
110+
%vStride4Offs.0 = extractvalue { <8 x i16>, i32 } %.pn, 0
111+
%incr.0 = extractvalue { <8 x i16>, i32 } %.pn, 1
112+
%3 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %blkCnt.0)
113+
%4 = tail call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0.v8i16.v8i1(ptr %pwLineMask, <8 x i16> %vStride4Offs.0, i32 8, i32 0, i32 1, <8 x i1> %3)
114+
%5 = mul <8 x i16> %4, %.splat
115+
%shr = lshr <8 x i16> %5, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
116+
%6 = icmp eq <8 x i16> %shr, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
117+
%7 = sub nuw nsw <8 x i16> <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>, %shr
118+
%sub = select <8 x i1> %6, <8 x i16> zeroinitializer, <8 x i16> %7
119+
%8 = tail call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %pchTargetLine.0, i32 1, <8 x i1> %3, <8 x i8> zeroinitializer)
120+
%9 = zext <8 x i8> %8 to <8 x i16>
121+
%sub.i = sub nsw <8 x i16> <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>, %sub
122+
%10 = mul <8 x i16> %sub.i, %.splat.i
123+
%11 = mul <8 x i16> %sub, %9
124+
%add.i = add <8 x i16> %10, %11
125+
%shr.i = lshr <8 x i16> %add.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
126+
%12 = trunc nuw <8 x i16> %shr.i to <8 x i8>
127+
tail call void @llvm.masked.store.v8i8.p0(<8 x i8> %12, ptr %pchTargetLine.0, i32 1, <8 x i1> %3)
128+
%13 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32 %incr.0, i32 4)
129+
%add.ptr = getelementptr inbounds i8, ptr %pchTargetLine.0, i32 8
130+
%sub8 = add nsw i32 %blkCnt.0, -8
131+
%cmp9 = icmp sgt i32 %blkCnt.0, 8
132+
br i1 %cmp9, label %do.body, label %do.end
133+
134+
do.end: ; preds = %do.body
135+
%add.ptr12 = getelementptr inbounds i8, ptr %pchTarget.addr.030, i32 %conv11
136+
%inc = add nuw nsw i16 %y.029, 1
137+
%cmp = icmp slt i16 %inc, %0
138+
br i1 %cmp, label %for.body, label %for.cond.cleanup
139+
}
140+
141+
; This loop has an else predicate on the vqshl, which is not very realistic but
142+
; prevents us from converting to a vptblock without being able to remove it.
143+
define i32 @else(ptr %s1, ptr %s2, i32 %x, ptr %d, i32 %n) {
144+
; CHECK-LABEL: else:
145+
; CHECK: @ %bb.0: @ %entry
146+
; CHECK-NEXT: .save {r7, lr}
147+
; CHECK-NEXT: push {r7, lr}
148+
; CHECK-NEXT: ldr r2, [sp, #8]
149+
; CHECK-NEXT: cmp r2, #4
150+
; CHECK-NEXT: mov r3, r2
151+
; CHECK-NEXT: it ge
152+
; CHECK-NEXT: movge r3, #4
153+
; CHECK-NEXT: subs r3, r2, r3
154+
; CHECK-NEXT: add.w r12, r3, #3
155+
; CHECK-NEXT: movs r3, #1
156+
; CHECK-NEXT: add.w r12, r3, r12, lsr #2
157+
; CHECK-NEXT: movs r3, #98
158+
; CHECK-NEXT: dls lr, r12
159+
; CHECK-NEXT: .LBB1_1: @ %do.body
160+
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
161+
; CHECK-NEXT: vctp.32 r2
162+
; CHECK-NEXT: subs r2, #4
163+
; CHECK-NEXT: vpstt
164+
; CHECK-NEXT: vldrwt.u32 q1, [r1], #16
165+
; CHECK-NEXT: vldrwt.u32 q0, [r0]
166+
; CHECK-NEXT: vmov q2, q1
167+
; CHECK-NEXT: vpstet
168+
; CHECK-NEXT: vqdmlsdht.s32 q2, q1, q0
169+
; CHECK-NEXT: vqshle.u32 q2, r3
170+
; CHECK-NEXT: vstrwt.32 q2, [r0], #16
171+
; CHECK-NEXT: le lr, .LBB1_1
172+
; CHECK-NEXT: @ %bb.2: @ %do.end
173+
; CHECK-NEXT: movs r0, #0
174+
; CHECK-NEXT: pop {r7, pc}
175+
entry:
176+
br label %do.body
177+
178+
do.body: ; preds = %do.body, %entry
179+
%n.addr.0 = phi i32 [ %n, %entry ], [ %sub, %do.body ]
180+
%s2.addr.0 = phi ptr [ %s2, %entry ], [ %add.ptr1, %do.body ]
181+
%s1.addr.0 = phi ptr [ %s1, %entry ], [ %add.ptr, %do.body ]
182+
%0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %n.addr.0)
183+
%1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s1.addr.0, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
184+
%2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s2.addr.0, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
185+
%3 = tail call <4 x i32> @llvm.arm.mve.vqdmlad.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %2, <4 x i32> %1, i32 0, i32 0, i32 1, <4 x i1> %0)
186+
%4 = xor <4 x i1> %0, <i1 true, i1 true, i1 true, i1 true>
187+
%5 = tail call <4 x i32> @llvm.arm.mve.vshl.scalar.predicated.v4i32.v4i1(<4 x i32> %3, i32 98, i32 1, i32 0, i32 1, <4 x i1> %4)
188+
tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %5, ptr %s1.addr.0, i32 4, <4 x i1> %0)
189+
%add.ptr = getelementptr inbounds i8, ptr %s1.addr.0, i32 16
190+
%add.ptr1 = getelementptr inbounds i8, ptr %s2.addr.0, i32 16
191+
%sub = add nsw i32 %n.addr.0, -4
192+
%cmp = icmp sgt i32 %n.addr.0, 4
193+
br i1 %cmp, label %do.body, label %do.end
194+
195+
do.end: ; preds = %do.body
196+
ret i32 0
197+
}

0 commit comments

Comments
 (0)