Skip to content

Commit 5fe7307

Browse files
authored
[ARM] Don't block tail-predication from unrelated VPT blocks. (#94239)
VPT blocks that do not produce an interesting 'output' (like a stored value or reduction result) do not need to be predicated on vctp for the whole loop to be tail-predicated. Producing results for just the valid tail-predication lanes should be enough.
1 parent b3b9f8d commit 5fe7307

File tree

2 files changed

+21
-26
lines changed

2 files changed

+21
-26
lines changed

llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,12 @@ static bool shouldInspect(MachineInstr &MI) {
115115
return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI);
116116
}
117117

118+
static bool isHorizontalReduction(const MachineInstr &MI) {
119+
const MCInstrDesc &MCID = MI.getDesc();
120+
uint64_t Flags = MCID.TSFlags;
121+
return (Flags & ARMII::HorizontalReduction) != 0;
122+
}
123+
118124
namespace {
119125

120126
using InstSet = SmallPtrSetImpl<MachineInstr *>;
@@ -275,6 +281,16 @@ namespace {
275281
if (VPT->getOpcode() == ARM::MVE_VPST)
276282
return false;
277283

284+
// If the VPT block does not define something that is an "output", then
285+
// the tail-predicated version will just perform a subset of the original
286+
// vpt block, where the last lanes should not be used.
287+
if (isVPTOpcode(VPT->getOpcode()) &&
288+
all_of(Block.getInsts(), [](const MachineInstr *MI) {
289+
return !MI->mayStore() && !MI->mayLoad() &&
290+
!isHorizontalReduction(*MI) && !isVCTP(MI);
291+
}))
292+
return true;
293+
278294
auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) {
279295
MachineInstr *Op = RDA.getMIOperand(MI, MI->getOperand(Idx));
280296
return Op && PredicatedInsts.count(Op) && isPredicatedOnVCTP(Op);
@@ -813,12 +829,6 @@ static bool producesDoubleWidthResult(const MachineInstr &MI) {
813829
return (Flags & ARMII::DoubleWidthResult) != 0;
814830
}
815831

816-
static bool isHorizontalReduction(const MachineInstr &MI) {
817-
const MCInstrDesc &MCID = MI.getDesc();
818-
uint64_t Flags = MCID.TSFlags;
819-
return (Flags & ARMII::HorizontalReduction) != 0;
820-
}
821-
822832
// Can this instruction generate a non-zero result when given only zeroed
823833
// operands? This allows us to know that, given operands with false bytes
824834
// zeroed by masked loads, that the result will also contain zeros in those

llvm/test/CodeGen/Thumb2/mve-tailpred-vptblock.ll

Lines changed: 5 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -20,50 +20,35 @@ define void @convert_vptblock(ptr %pchTarget, i16 signext %iTargetStride, ptr %p
2020
; CHECK-NEXT: mov.w r8, #0
2121
; CHECK-NEXT: ldrd r4, r5, [sp, #88]
2222
; CHECK-NEXT: mov r7, r0
23-
; CHECK-NEXT: cmp.w r10, #8
24-
; CHECK-NEXT: mov.w r0, #1
25-
; CHECK-NEXT: mov r3, r10
2623
; CHECK-NEXT: mov.w r11, #0
27-
; CHECK-NEXT: it ge
28-
; CHECK-NEXT: movge r3, #8
2924
; CHECK-NEXT: vidup.u16 q0, r8, #4
30-
; CHECK-NEXT: sub.w r3, r10, r3
3125
; CHECK-NEXT: vmov.i32 q1, #0x0
32-
; CHECK-NEXT: adds r3, #7
3326
; CHECK-NEXT: vmov.i16 q2, #0x100
3427
; CHECK-NEXT: vmov.i16 q3, #0xff
35-
; CHECK-NEXT: add.w r9, r0, r3, lsr #3
3628
; CHECK-NEXT: .LBB0_2: @ %for.body
3729
; CHECK-NEXT: @ =>This Loop Header: Depth=1
3830
; CHECK-NEXT: @ Child Loop BB0_3 Depth 2
39-
; CHECK-NEXT: mov r3, r10
4031
; CHECK-NEXT: vmov q4, q0
4132
; CHECK-NEXT: mov r6, r8
4233
; CHECK-NEXT: mov r0, r7
43-
; CHECK-NEXT: dls lr, r9
34+
; CHECK-NEXT: dlstp.16 lr, r10
4435
; CHECK-NEXT: .LBB0_3: @ %do.body
4536
; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1
4637
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
47-
; CHECK-NEXT: vctp.16 r3
48-
; CHECK-NEXT: vpst
49-
; CHECK-NEXT: vldrbt.u16 q5, [r2, q4]
38+
; CHECK-NEXT: vldrb.u16 q5, [r2, q4]
5039
; CHECK-NEXT: vmul.i16 q4, q5, r5
5140
; CHECK-NEXT: vshr.u16 q4, q4, #8
5241
; CHECK-NEXT: vsub.i16 q5, q2, q4
5342
; CHECK-NEXT: vpt.i16 eq, q4, q3
5443
; CHECK-NEXT: vmovt q5, q1
55-
; CHECK-NEXT: vctp.16 r3
56-
; CHECK-NEXT: vpst
57-
; CHECK-NEXT: vldrbt.u16 q6, [r0]
44+
; CHECK-NEXT: vldrb.u16 q6, [r0]
5845
; CHECK-NEXT: vsub.i16 q4, q2, q5
59-
; CHECK-NEXT: subs r3, #8
6046
; CHECK-NEXT: vmul.i16 q5, q5, q6
6147
; CHECK-NEXT: vmla.i16 q5, q4, r4
6248
; CHECK-NEXT: vshr.u16 q4, q5, #8
63-
; CHECK-NEXT: vpst
64-
; CHECK-NEXT: vstrbt.16 q4, [r0], #8
49+
; CHECK-NEXT: vstrb.16 q4, [r0], #8
6550
; CHECK-NEXT: vidup.u16 q4, r6, #4
66-
; CHECK-NEXT: le lr, .LBB0_3
51+
; CHECK-NEXT: letp lr, .LBB0_3
6752
; CHECK-NEXT: @ %bb.4: @ %do.end
6853
; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1
6954
; CHECK-NEXT: add.w r0, r11, #1

0 commit comments

Comments
 (0)