Skip to content

Commit b81c57d

Browse files
committed
[ARM][LowOverheadLoops] Allow tail predication on predicated instructions with unknown lane values
The effects of unpredicated vector instructions with unknown lanes cannot be predicted, and therefore such instructions cannot be tail predicated. This does not apply to predicated vector instructions, so this patch allows tail predication on them. Differential Revision: https://reviews.llvm.org/D87376
1 parent 6313f55 commit b81c57d

File tree

3 files changed

+50
-16
lines changed

3 files changed

+50
-16
lines changed

llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -723,7 +723,7 @@ bool LowOverheadLoop::ValidateLiveOuts() {
723723
continue;
724724
else if (!isPredicated && retainsOrReduces)
725725
return false;
726-
else
726+
else if (!isPredicated)
727727
FalseLanesUnknown.insert(&MI);
728728
}
729729

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -O3 -tail-predication=force-enabled-no-reductions %s -o - | FileCheck %s
3+
4+
define arm_aapcs_vfpcc <4 x float> @arm_max_no_idx_f32_mve(float* %pSrc, i32 %blockSize, float* nocapture %pResult) {
5+
; CHECK-LABEL: arm_max_no_idx_f32_mve:
6+
; CHECK: @ %bb.0: @ %entry
7+
; CHECK-NEXT: .save {r7, lr}
8+
; CHECK-NEXT: push {r7, lr}
9+
; CHECK-NEXT: subs r2, r1, #4
10+
; CHECK-NEXT: adr r3, .LCPI0_0
11+
; CHECK-NEXT: vldrw.u32 q0, [r3]
12+
; CHECK-NEXT: dlstp.32 lr, r1
13+
; CHECK-NEXT: .LBB0_1: @ %do.body
14+
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
15+
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
16+
; CHECK-NEXT: vmaxnm.f32 q0, q1, q0
17+
; CHECK-NEXT: letp lr, .LBB0_1
18+
; CHECK-NEXT: @ %bb.2: @ %do.end
19+
; CHECK-NEXT: pop {r7, pc}
20+
entry:
21+
br label %do.body
22+
23+
do.body: ; preds = %do.body, %entry
24+
%blockSize.addr.0 = phi i32 [ %blockSize, %entry ], [ %sub, %do.body ]
25+
%curExtremValVec.0 = phi <4 x float> [ <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000>, %entry ], [ %3, %do.body ]
26+
%pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ]
27+
%0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blockSize.addr.0)
28+
%1 = bitcast float* %pSrc.addr.0 to <4 x float>*
29+
%2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
30+
%3 = tail call fast <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> %curExtremValVec.0, i32 0, <4 x i1> %0, <4 x float> %curExtremValVec.0)
31+
%add.ptr = getelementptr inbounds float, float* %pSrc.addr.0, i32 4
32+
%sub = add i32 %blockSize.addr.0, -4
33+
%cmp = icmp sgt i32 %sub, 0
34+
br i1 %cmp, label %do.body, label %do.end
35+
36+
do.end: ; preds = %do.body
37+
ret <4 x float> %3
38+
}
39+
40+
declare <4 x i1> @llvm.arm.mve.vctp32(i32)
41+
42+
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
43+
44+
declare <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float>, <4 x float>, i32, <4 x i1>, <4 x float>)

llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll

Lines changed: 5 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -9,32 +9,22 @@ define void @arm_min_helium_f32(float* %pSrc, i32 %blockSize, float* nocapture %
99
; CHECK-NEXT: .vsave {d8, d9}
1010
; CHECK-NEXT: vpush {d8, d9}
1111
; CHECK-NEXT: movs r6, #0
12-
; CHECK-NEXT: mov r12, r1
1312
; CHECK-NEXT: vidup.u32 q2, r6, #1
14-
; CHECK-NEXT: cmp r1, #4
15-
; CHECK-NEXT: it ge
16-
; CHECK-NEXT: movge.w r12, #4
17-
; CHECK-NEXT: sub.w r6, r1, r12
18-
; CHECK-NEXT: adds r6, #3
19-
; CHECK-NEXT: mov.w lr, #1
2013
; CHECK-NEXT: adr r4, .LCPI0_0
2114
; CHECK-NEXT: vmov.i32 q0, #0x0
22-
; CHECK-NEXT: add.w lr, lr, r6, lsr #2
2315
; CHECK-NEXT: vldrw.u32 q1, [r4]
2416
; CHECK-NEXT: vmov.i32 q3, #0x4
2517
; CHECK-NEXT: mov r12, r1
26-
; CHECK-NEXT: dls lr, lr
18+
; CHECK-NEXT: dlstp.32 lr, r12
2719
; CHECK-NEXT: .LBB0_1: @ %do.body
2820
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
29-
; CHECK-NEXT: vctp.32 r12
30-
; CHECK-NEXT: sub.w r12, r12, #4
31-
; CHECK-NEXT: vpstttt
32-
; CHECK-NEXT: vldrwt.u32 q4, [r0], #16
33-
; CHECK-NEXT: vcmpt.f32 ge, q1, q4
21+
; CHECK-NEXT: vldrw.u32 q4, [r0], #16
22+
; CHECK-NEXT: vcmp.f32 ge, q1, q4
23+
; CHECK-NEXT: vpstt
3424
; CHECK-NEXT: vmovt q1, q4
3525
; CHECK-NEXT: vmovt q0, q2
3626
; CHECK-NEXT: vadd.i32 q2, q2, q3
37-
; CHECK-NEXT: le lr, .LBB0_1
27+
; CHECK-NEXT: letp lr, .LBB0_1
3828
; CHECK-NEXT: @ %bb.2: @ %do.end
3929
; CHECK-NEXT: vldr s8, .LCPI0_1
4030
; CHECK-NEXT: vdup.32 q3, r1

0 commit comments

Comments
 (0)