-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[ARM] Don't block tail-predication from unrelated VPT blocks. #94239
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-arm Author: David Green (davemgreen) ChangesVPT blocks that do not produce an interesting 'output' (like a stored value or reduction result), do not need to be predicated on vctp for the whole loop to be tail-predicated. Just producing results for the valid tail predication lanes should be enough. Full diff: https://github.com/llvm/llvm-project/pull/94239.diff 2 Files Affected:
diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index a46c383115e2d..d7c89ca6c6a57 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -115,6 +115,12 @@ static bool shouldInspect(MachineInstr &MI) {
return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI);
}
+static bool isHorizontalReduction(const MachineInstr &MI) {
+ const MCInstrDesc &MCID = MI.getDesc();
+ uint64_t Flags = MCID.TSFlags;
+ return (Flags & ARMII::HorizontalReduction) != 0;
+}
+
namespace {
using InstSet = SmallPtrSetImpl<MachineInstr *>;
@@ -275,6 +281,16 @@ namespace {
if (VPT->getOpcode() == ARM::MVE_VPST)
return false;
+ // If the VPT block does not define something that is an "output", then the
+ // tail-predicated version will just perform a subset of the original vpt
+ // block, where the last lanes should not be used.
+ if (isVPTOpcode(VPT->getOpcode()) &&
+ all_of(Block.getInsts(), [](const MachineInstr *MI) {
+ return !MI->mayStore() && !isHorizontalReduction(*MI) &&
+ !isVCTP(MI);
+ }))
+ return true;
+
auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) {
MachineInstr *Op = RDA.getMIOperand(MI, MI->getOperand(Idx));
return Op && PredicatedInsts.count(Op) && isPredicatedOnVCTP(Op);
@@ -813,12 +829,6 @@ static bool producesDoubleWidthResult(const MachineInstr &MI) {
return (Flags & ARMII::DoubleWidthResult) != 0;
}
-static bool isHorizontalReduction(const MachineInstr &MI) {
- const MCInstrDesc &MCID = MI.getDesc();
- uint64_t Flags = MCID.TSFlags;
- return (Flags & ARMII::HorizontalReduction) != 0;
-}
-
// Can this instruction generate a non-zero result when given only zeroed
// operands? This allows us to know that, given operands with false bytes
// zeroed by masked loads, that the result will also contain zeros in those
diff --git a/llvm/test/CodeGen/Thumb2/mve-tailpred-vptblock.ll b/llvm/test/CodeGen/Thumb2/mve-tailpred-vptblock.ll
index f9b3757bb6d2c..6392452e06cdd 100644
--- a/llvm/test/CodeGen/Thumb2/mve-tailpred-vptblock.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-tailpred-vptblock.ll
@@ -20,50 +20,35 @@ define void @convert_vptblock(ptr %pchTarget, i16 signext %iTargetStride, ptr %p
; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: ldrd r4, r5, [sp, #88]
; CHECK-NEXT: mov r7, r0
-; CHECK-NEXT: cmp.w r10, #8
-; CHECK-NEXT: mov.w r0, #1
-; CHECK-NEXT: mov r3, r10
; CHECK-NEXT: mov.w r11, #0
-; CHECK-NEXT: it ge
-; CHECK-NEXT: movge r3, #8
; CHECK-NEXT: vidup.u16 q0, r8, #4
-; CHECK-NEXT: sub.w r3, r10, r3
; CHECK-NEXT: vmov.i32 q1, #0x0
-; CHECK-NEXT: adds r3, #7
; CHECK-NEXT: vmov.i16 q2, #0x100
; CHECK-NEXT: vmov.i16 q3, #0xff
-; CHECK-NEXT: add.w r9, r0, r3, lsr #3
; CHECK-NEXT: .LBB0_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB0_3 Depth 2
-; CHECK-NEXT: mov r3, r10
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: mov r6, r8
; CHECK-NEXT: mov r0, r7
-; CHECK-NEXT: dls lr, r9
+; CHECK-NEXT: dlstp.16 lr, r10
; CHECK-NEXT: .LBB0_3: @ %do.body
; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vctp.16 r3
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrbt.u16 q5, [r2, q4]
+; CHECK-NEXT: vldrb.u16 q5, [r2, q4]
; CHECK-NEXT: vmul.i16 q4, q5, r5
; CHECK-NEXT: vshr.u16 q4, q4, #8
; CHECK-NEXT: vsub.i16 q5, q2, q4
; CHECK-NEXT: vpt.i16 eq, q4, q3
; CHECK-NEXT: vmovt q5, q1
-; CHECK-NEXT: vctp.16 r3
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrbt.u16 q6, [r0]
+; CHECK-NEXT: vldrb.u16 q6, [r0]
; CHECK-NEXT: vsub.i16 q4, q2, q5
-; CHECK-NEXT: subs r3, #8
; CHECK-NEXT: vmul.i16 q5, q5, q6
; CHECK-NEXT: vmla.i16 q5, q4, r4
; CHECK-NEXT: vshr.u16 q4, q5, #8
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vstrbt.16 q4, [r0], #8
+; CHECK-NEXT: vstrb.16 q4, [r0], #8
; CHECK-NEXT: vidup.u16 q4, r6, #4
-; CHECK-NEXT: le lr, .LBB0_3
+; CHECK-NEXT: letp lr, .LBB0_3
; CHECK-NEXT: @ %bb.4: @ %do.end
; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: add.w r0, r11, #1
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
VPT blocks that do not produce an interesting 'output' (like a stored value or reduction result), do not need to be predicated on vctp for the whole loop to be tail-predicated. Just producing results for the valid tail predication lanes should be enough.
03c6299
to
c5368ff
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Okay, cheers! LGTM
VPT blocks that do not produce an interesting 'output' (like a stored value or reduction result), do not need to be predicated on vctp for the whole loop to be tail-predicated. Just producing results for the valid tail predication lanes should be enough.