Skip to content

Commit d5eb7ff

Browse files
author
Pierre-vh
committed
[Target][ARM] Fold or(A, B) more aggressively for I1 vectors
This patch makes the folding of or(A, B) into not(and(not(A), not(B))) more aggressive for i1 vectors. This only affects Thumb2 MVE and improves codegen, because it removes a lot of msr/mrs instructions on VPR.P0. This patch also adds an xor(vcmp) -> !vcmp fold for MVE. Differential Revision: https://reviews.llvm.org/D77202
1 parent ffdda49 commit d5eb7ff

File tree

6 files changed

+157
-168
lines changed

6 files changed

+157
-168
lines changed

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 47 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -12651,58 +12651,44 @@ static bool isValidMVECond(unsigned CC, bool IsFloat) {
1265112651
};
1265212652
}
1265312653

12654+
/// Extract the condition code from an ARMISD::VCMP or ARMISD::VCMPZ node.
/// The constant is read from operand 2 for VCMP and from operand 1 for VCMPZ.
/// Any other opcode is a caller error and hits llvm_unreachable.
static ARMCC::CondCodes getVCMPCondCode(SDValue N) {
12655+
if (N->getOpcode() == ARMISD::VCMP)
12656+
return (ARMCC::CondCodes)N->getConstantOperandVal(2); // VCMP: condition code is operand 2
12657+
else if (N->getOpcode() == ARMISD::VCMPZ)
12658+
return (ARMCC::CondCodes)N->getConstantOperandVal(1); // VCMPZ: condition code is operand 1
12659+
else
12660+
llvm_unreachable("Not a VCMP/VCMPZ!");
12661+
}
12662+
12663+
/// Return true when the opposite of N's condition code is accepted by
/// isValidMVECond for the type of N's first operand (floating point or not).
/// N is passed to getVCMPCondCode, so it must be a VCMP or VCMPZ node.
static bool CanInvertMVEVCMP(SDValue N) {
12664+
ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N));
12665+
// The float/integer distinction is taken from operand 0 of the compare.
return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
12666+
}
12667+
1265412668
static SDValue PerformORCombine_i1(SDNode *N,
1265512669
TargetLowering::DAGCombinerInfo &DCI,
1265612670
const ARMSubtarget *Subtarget) {
1265712671
// Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
1265812672
// together with predicates
1265912673
EVT VT = N->getValueType(0);
12674+
SDLoc DL(N);
1266012675
SDValue N0 = N->getOperand(0);
1266112676
SDValue N1 = N->getOperand(1);
1266212677

12663-
ARMCC::CondCodes CondCode0 = ARMCC::AL;
12664-
ARMCC::CondCodes CondCode1 = ARMCC::AL;
12665-
if (N0->getOpcode() == ARMISD::VCMP)
12666-
CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(2))
12667-
->getZExtValue();
12668-
else if (N0->getOpcode() == ARMISD::VCMPZ)
12669-
CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(1))
12670-
->getZExtValue();
12671-
if (N1->getOpcode() == ARMISD::VCMP)
12672-
CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(2))
12673-
->getZExtValue();
12674-
else if (N1->getOpcode() == ARMISD::VCMPZ)
12675-
CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(1))
12676-
->getZExtValue();
12677-
12678-
if (CondCode0 == ARMCC::AL || CondCode1 == ARMCC::AL)
12679-
return SDValue();
12680-
12681-
unsigned Opposite0 = ARMCC::getOppositeCondition(CondCode0);
12682-
unsigned Opposite1 = ARMCC::getOppositeCondition(CondCode1);
12678+
auto IsFreelyInvertable = [&](SDValue V) {
12679+
if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
12680+
return CanInvertMVEVCMP(V);
12681+
return false;
12682+
};
1268312683

12684-
if (!isValidMVECond(Opposite0,
12685-
N0->getOperand(0)->getValueType(0).isFloatingPoint()) ||
12686-
!isValidMVECond(Opposite1,
12687-
N1->getOperand(0)->getValueType(0).isFloatingPoint()))
12684+
// At least one operand must be freely invertable.
12685+
if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
1268812686
return SDValue();
1268912687

12690-
SmallVector<SDValue, 4> Ops0;
12691-
Ops0.push_back(N0->getOperand(0));
12692-
if (N0->getOpcode() == ARMISD::VCMP)
12693-
Ops0.push_back(N0->getOperand(1));
12694-
Ops0.push_back(DCI.DAG.getConstant(Opposite0, SDLoc(N0), MVT::i32));
12695-
SmallVector<SDValue, 4> Ops1;
12696-
Ops1.push_back(N1->getOperand(0));
12697-
if (N1->getOpcode() == ARMISD::VCMP)
12698-
Ops1.push_back(N1->getOperand(1));
12699-
Ops1.push_back(DCI.DAG.getConstant(Opposite1, SDLoc(N1), MVT::i32));
12700-
12701-
SDValue NewN0 = DCI.DAG.getNode(N0->getOpcode(), SDLoc(N0), VT, Ops0);
12702-
SDValue NewN1 = DCI.DAG.getNode(N1->getOpcode(), SDLoc(N1), VT, Ops1);
12703-
SDValue And = DCI.DAG.getNode(ISD::AND, SDLoc(N), VT, NewN0, NewN1);
12704-
return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VT, And,
12705-
DCI.DAG.getAllOnesConstant(SDLoc(N), VT));
12688+
SDValue NewN0 = DCI.DAG.getLogicalNOT(DL, N0, VT);
12689+
SDValue NewN1 = DCI.DAG.getLogicalNOT(DL, N1, VT);
12690+
SDValue And = DCI.DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
12691+
return DCI.DAG.getLogicalNOT(DL, And, VT);
1270612692
}
1270712693

1270812694
/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
@@ -12823,6 +12809,27 @@ static SDValue PerformXORCombine(SDNode *N,
1282312809
return Result;
1282412810
}
1282512811

12812+
if (Subtarget->hasMVEIntegerOps()) {
12813+
// fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
12814+
SDValue N0 = N->getOperand(0);
12815+
SDValue N1 = N->getOperand(1);
12816+
const TargetLowering *TLI = Subtarget->getTargetLowering();
12817+
if (TLI->isConstTrueVal(N1.getNode()) &&
12818+
(N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
12819+
if (CanInvertMVEVCMP(N0)) {
12820+
SDLoc DL(N0);
12821+
ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0));
12822+
12823+
SmallVector<SDValue, 4> Ops;
12824+
Ops.push_back(N0->getOperand(0));
12825+
if (N0->getOpcode() == ARMISD::VCMP)
12826+
Ops.push_back(N0->getOperand(1));
12827+
Ops.push_back(DCI.DAG.getConstant(CC, DL, MVT::i32));
12828+
return DCI.DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
12829+
}
12830+
}
12831+
}
12832+
1282612833
return SDValue();
1282712834
}
1282812835

llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll

Lines changed: 10 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -296,9 +296,8 @@ for.cond.cleanup: ; preds = %middle.block, %entr
296296
define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
297297
; CHECK-LABEL: or_mul_reduce_add:
298298
; CHECK: @ %bb.0: @ %entry
299-
; CHECK-NEXT: push {r4, r5, r6, lr}
300-
; CHECK-NEXT: sub sp, #4
301-
; CHECK-NEXT: ldr.w r12, [sp, #20]
299+
; CHECK-NEXT: push {r4, r5, r7, lr}
300+
; CHECK-NEXT: ldr.w r12, [sp, #16]
302301
; CHECK-NEXT: cmp.w r12, #0
303302
; CHECK-NEXT: beq .LBB3_4
304303
; CHECK-NEXT: @ %bb.1: @ %vector.ph
@@ -315,34 +314,27 @@ define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32*
315314
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
316315
; CHECK-NEXT: vctp.32 r12
317316
; CHECK-NEXT: vmov q0, q1
318-
; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
319-
; CHECK-NEXT: sub.w r12, r12, #4
320317
; CHECK-NEXT: vpstt
321318
; CHECK-NEXT: vldrwt.u32 q1, [r1], #16
322319
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
320+
; CHECK-NEXT: vpnot
323321
; CHECK-NEXT: vsub.i32 q1, q2, q1
324-
; CHECK-NEXT: vcmp.i32 eq, q1, zr
325-
; CHECK-NEXT: vmrs r5, p0
326-
; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
327-
; CHECK-NEXT: vmrs r6, p0
328-
; CHECK-NEXT: orrs r5, r6
329-
; CHECK-NEXT: vmsr p0, r5
330-
; CHECK-NEXT: vpstt
331-
; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
332-
; CHECK-NEXT: vldrwt.u32 q2, [r2], #16
322+
; CHECK-NEXT: sub.w r12, r12, #4
323+
; CHECK-NEXT: vpstee
324+
; CHECK-NEXT: vcmpt.i32 ne, q1, zr
325+
; CHECK-NEXT: vldrwe.u32 q1, [r3], #16
326+
; CHECK-NEXT: vldrwe.u32 q2, [r2], #16
333327
; CHECK-NEXT: vmul.i32 q1, q2, q1
334328
; CHECK-NEXT: vadd.i32 q1, q1, q0
335329
; CHECK-NEXT: le lr, .LBB3_2
336330
; CHECK-NEXT: @ %bb.3: @ %middle.block
337331
; CHECK-NEXT: vctp.32 r4
338332
; CHECK-NEXT: vpsel q0, q1, q0
339333
; CHECK-NEXT: vaddv.u32 r0, q0
340-
; CHECK-NEXT: add sp, #4
341-
; CHECK-NEXT: pop {r4, r5, r6, pc}
334+
; CHECK-NEXT: pop {r4, r5, r7, pc}
342335
; CHECK-NEXT: .LBB3_4:
343336
; CHECK-NEXT: movs r0, #0
344-
; CHECK-NEXT: add sp, #4
345-
; CHECK-NEXT: pop {r4, r5, r6, pc}
337+
; CHECK-NEXT: pop {r4, r5, r7, pc}
346338
entry:
347339
%cmp8 = icmp eq i32 %N, 0
348340
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

llvm/test/CodeGen/Thumb2/mve-pred-or.ll

Lines changed: 20 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -124,12 +124,10 @@ define arm_aapcs_vfpcc <4 x i32> @cmpulez_v4i1(<4 x i32> %a, <4 x i32> %b) {
124124
; CHECK-LABEL: cmpulez_v4i1:
125125
; CHECK: @ %bb.0: @ %entry
126126
; CHECK-NEXT: vcmp.u32 cs, q1, zr
127-
; CHECK-NEXT: vmrs r0, p0
128-
; CHECK-NEXT: vcmp.i32 eq, q0, zr
129-
; CHECK-NEXT: vmrs r1, p0
130-
; CHECK-NEXT: orrs r0, r1
131-
; CHECK-NEXT: vmsr p0, r0
132-
; CHECK-NEXT: vpsel q0, q0, q1
127+
; CHECK-NEXT: vpnot
128+
; CHECK-NEXT: vpst
129+
; CHECK-NEXT: vcmpt.i32 ne, q0, zr
130+
; CHECK-NEXT: vpsel q0, q1, q0
133131
; CHECK-NEXT: bx lr
134132
entry:
135133
%c1 = icmp eq <4 x i32> %a, zeroinitializer
@@ -247,12 +245,10 @@ define arm_aapcs_vfpcc <4 x i32> @cmpult_v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i
247245
; CHECK-LABEL: cmpult_v4i1:
248246
; CHECK: @ %bb.0: @ %entry
249247
; CHECK-NEXT: vcmp.u32 hi, q2, q1
250-
; CHECK-NEXT: vmrs r0, p0
251-
; CHECK-NEXT: vcmp.i32 eq, q0, zr
252-
; CHECK-NEXT: vmrs r1, p0
253-
; CHECK-NEXT: orrs r0, r1
254-
; CHECK-NEXT: vmsr p0, r0
255-
; CHECK-NEXT: vpsel q0, q0, q1
248+
; CHECK-NEXT: vpnot
249+
; CHECK-NEXT: vpst
250+
; CHECK-NEXT: vcmpt.i32 ne, q0, zr
251+
; CHECK-NEXT: vpsel q0, q1, q0
256252
; CHECK-NEXT: bx lr
257253
entry:
258254
%c1 = icmp eq <4 x i32> %a, zeroinitializer
@@ -266,12 +262,10 @@ define arm_aapcs_vfpcc <4 x i32> @cmpugt_v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i
266262
; CHECK-LABEL: cmpugt_v4i1:
267263
; CHECK: @ %bb.0: @ %entry
268264
; CHECK-NEXT: vcmp.u32 hi, q1, q2
269-
; CHECK-NEXT: vmrs r0, p0
270-
; CHECK-NEXT: vcmp.i32 eq, q0, zr
271-
; CHECK-NEXT: vmrs r1, p0
272-
; CHECK-NEXT: orrs r0, r1
273-
; CHECK-NEXT: vmsr p0, r0
274-
; CHECK-NEXT: vpsel q0, q0, q1
265+
; CHECK-NEXT: vpnot
266+
; CHECK-NEXT: vpst
267+
; CHECK-NEXT: vcmpt.i32 ne, q0, zr
268+
; CHECK-NEXT: vpsel q0, q1, q0
275269
; CHECK-NEXT: bx lr
276270
entry:
277271
%c1 = icmp eq <4 x i32> %a, zeroinitializer
@@ -285,12 +279,10 @@ define arm_aapcs_vfpcc <4 x i32> @cmpule_v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i
285279
; CHECK-LABEL: cmpule_v4i1:
286280
; CHECK: @ %bb.0: @ %entry
287281
; CHECK-NEXT: vcmp.u32 cs, q2, q1
288-
; CHECK-NEXT: vmrs r0, p0
289-
; CHECK-NEXT: vcmp.i32 eq, q0, zr
290-
; CHECK-NEXT: vmrs r1, p0
291-
; CHECK-NEXT: orrs r0, r1
292-
; CHECK-NEXT: vmsr p0, r0
293-
; CHECK-NEXT: vpsel q0, q0, q1
282+
; CHECK-NEXT: vpnot
283+
; CHECK-NEXT: vpst
284+
; CHECK-NEXT: vcmpt.i32 ne, q0, zr
285+
; CHECK-NEXT: vpsel q0, q1, q0
294286
; CHECK-NEXT: bx lr
295287
entry:
296288
%c1 = icmp eq <4 x i32> %a, zeroinitializer
@@ -304,12 +296,10 @@ define arm_aapcs_vfpcc <4 x i32> @cmpuge_v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i
304296
; CHECK-LABEL: cmpuge_v4i1:
305297
; CHECK: @ %bb.0: @ %entry
306298
; CHECK-NEXT: vcmp.u32 cs, q1, q2
307-
; CHECK-NEXT: vmrs r0, p0
308-
; CHECK-NEXT: vcmp.i32 eq, q0, zr
309-
; CHECK-NEXT: vmrs r1, p0
310-
; CHECK-NEXT: orrs r0, r1
311-
; CHECK-NEXT: vmsr p0, r0
312-
; CHECK-NEXT: vpsel q0, q0, q1
299+
; CHECK-NEXT: vpnot
300+
; CHECK-NEXT: vpst
301+
; CHECK-NEXT: vcmpt.i32 ne, q0, zr
302+
; CHECK-NEXT: vpsel q0, q1, q0
313303
; CHECK-NEXT: bx lr
314304
entry:
315305
%c1 = icmp eq <4 x i32> %a, zeroinitializer

llvm/test/CodeGen/Thumb2/mve-vcmpf.ll

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -483,8 +483,8 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_ugt_v4f32(<4 x float> %src, <4 x float>
483483
;
484484
; CHECK-MVEFP-LABEL: vcmp_ugt_v4f32:
485485
; CHECK-MVEFP: @ %bb.0: @ %entry
486-
; CHECK-MVEFP-NEXT: vcmp.f32 ge, q1, q0
487-
; CHECK-MVEFP-NEXT: vpsel q0, q3, q2
486+
; CHECK-MVEFP-NEXT: vcmp.f32 lt, q1, q0
487+
; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
488488
; CHECK-MVEFP-NEXT: bx lr
489489
entry:
490490
%c = fcmp ugt <4 x float> %src, %src2
@@ -535,8 +535,8 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_uge_v4f32(<4 x float> %src, <4 x float>
535535
;
536536
; CHECK-MVEFP-LABEL: vcmp_uge_v4f32:
537537
; CHECK-MVEFP: @ %bb.0: @ %entry
538-
; CHECK-MVEFP-NEXT: vcmp.f32 gt, q1, q0
539-
; CHECK-MVEFP-NEXT: vpsel q0, q3, q2
538+
; CHECK-MVEFP-NEXT: vcmp.f32 le, q1, q0
539+
; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
540540
; CHECK-MVEFP-NEXT: bx lr
541541
entry:
542542
%c = fcmp uge <4 x float> %src, %src2
@@ -587,8 +587,8 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_ult_v4f32(<4 x float> %src, <4 x float>
587587
;
588588
; CHECK-MVEFP-LABEL: vcmp_ult_v4f32:
589589
; CHECK-MVEFP: @ %bb.0: @ %entry
590-
; CHECK-MVEFP-NEXT: vcmp.f32 ge, q0, q1
591-
; CHECK-MVEFP-NEXT: vpsel q0, q3, q2
590+
; CHECK-MVEFP-NEXT: vcmp.f32 lt, q0, q1
591+
; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
592592
; CHECK-MVEFP-NEXT: bx lr
593593
entry:
594594
%c = fcmp ult <4 x float> %src, %src2
@@ -639,8 +639,8 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_ule_v4f32(<4 x float> %src, <4 x float>
639639
;
640640
; CHECK-MVEFP-LABEL: vcmp_ule_v4f32:
641641
; CHECK-MVEFP: @ %bb.0: @ %entry
642-
; CHECK-MVEFP-NEXT: vcmp.f32 gt, q0, q1
643-
; CHECK-MVEFP-NEXT: vpsel q0, q3, q2
642+
; CHECK-MVEFP-NEXT: vcmp.f32 le, q0, q1
643+
; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
644644
; CHECK-MVEFP-NEXT: bx lr
645645
entry:
646646
%c = fcmp ule <4 x float> %src, %src2
@@ -1897,8 +1897,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, <8 x half> %s
18971897
;
18981898
; CHECK-MVEFP-LABEL: vcmp_ugt_v8f16:
18991899
; CHECK-MVEFP: @ %bb.0: @ %entry
1900-
; CHECK-MVEFP-NEXT: vcmp.f16 ge, q1, q0
1901-
; CHECK-MVEFP-NEXT: vpsel q0, q3, q2
1900+
; CHECK-MVEFP-NEXT: vcmp.f16 lt, q1, q0
1901+
; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
19021902
; CHECK-MVEFP-NEXT: bx lr
19031903
entry:
19041904
%c = fcmp ugt <8 x half> %src, %src2
@@ -2021,8 +2021,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, <8 x half> %s
20212021
;
20222022
; CHECK-MVEFP-LABEL: vcmp_uge_v8f16:
20232023
; CHECK-MVEFP: @ %bb.0: @ %entry
2024-
; CHECK-MVEFP-NEXT: vcmp.f16 gt, q1, q0
2025-
; CHECK-MVEFP-NEXT: vpsel q0, q3, q2
2024+
; CHECK-MVEFP-NEXT: vcmp.f16 le, q1, q0
2025+
; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
20262026
; CHECK-MVEFP-NEXT: bx lr
20272027
entry:
20282028
%c = fcmp uge <8 x half> %src, %src2
@@ -2145,8 +2145,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, <8 x half> %s
21452145
;
21462146
; CHECK-MVEFP-LABEL: vcmp_ult_v8f16:
21472147
; CHECK-MVEFP: @ %bb.0: @ %entry
2148-
; CHECK-MVEFP-NEXT: vcmp.f16 ge, q0, q1
2149-
; CHECK-MVEFP-NEXT: vpsel q0, q3, q2
2148+
; CHECK-MVEFP-NEXT: vcmp.f16 lt, q0, q1
2149+
; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
21502150
; CHECK-MVEFP-NEXT: bx lr
21512151
entry:
21522152
%c = fcmp ult <8 x half> %src, %src2
@@ -2269,8 +2269,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, <8 x half> %s
22692269
;
22702270
; CHECK-MVEFP-LABEL: vcmp_ule_v8f16:
22712271
; CHECK-MVEFP: @ %bb.0: @ %entry
2272-
; CHECK-MVEFP-NEXT: vcmp.f16 gt, q0, q1
2273-
; CHECK-MVEFP-NEXT: vpsel q0, q3, q2
2272+
; CHECK-MVEFP-NEXT: vcmp.f16 le, q0, q1
2273+
; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
22742274
; CHECK-MVEFP-NEXT: bx lr
22752275
entry:
22762276
%c = fcmp ule <8 x half> %src, %src2

0 commit comments

Comments
 (0)