[DAGCombine] Count leading ones: refine post DAG/Type Legalisation if promotion #102877
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-x86

Author: None (v01dXYZ)

Changes

This PR is related to #99591. In this PR, instead of modifying how the legalisation occurs depending on surrounding instructions, we refine after legalisation. This PR has two parts:

- `SDPatternMatch/MatchContext`: Modify the code a little bit to make matching Operands (used by `m_Node(...)`) and Unary/Binary/Ternary patterns compatible with `VPMatchContext`, instead of only `m_Opc` being supported. Some tests were added to ensure no regressions.
- `DAGCombiner`: Add a `foldSubCtlzNot` which detects and rewrites the patterns using a matching context.

Remaining Tasks:

- `foldSubCtlzNot` in another location for style consistency purpose?
@topperc

Patch is 23.31 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/102877.diff

9 Files Affected:
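Not part of the patch: below is a minimal host-side sketch, in plain C++ with invented helper names (ctlz32, refCtlo8, foldedCtlo8), of the identity the DAG-legalisation fold relies on, using an i8 source promoted to i32 (BitWidthDiff = 24). It checks that (sub (ctlz (zero_extend (not Src))) 24) and (ctlz_zero_undef (not (shl (any_extend Src) 24))) agree for every i8 value.

#include <cassert>
#include <cstdint>

// Leading zeros of a 32-bit value; returns 32 for zero input.
static int ctlz32(uint32_t x) {
  int n = 0;
  for (uint32_t bit = 0x80000000u; bit != 0 && (x & bit) == 0; bit >>= 1)
    ++n;
  return n;
}

// What the legalised DAG computes today: ctlz(zext(not Src)) - BitWidthDiff.
static int refCtlo8(uint8_t src) {
  uint32_t zext = static_cast<uint8_t>(~src); // zero_extend (not Src)
  return ctlz32(zext) - 24;                   // BitWidthDiff = 32 - 8
}

// What the fold emits: ctlz_zero_undef(not (shl (any_extend Src) 24)). The 24
// shifted-in low bits become ones after the NOT, so the CTLZ operand is never
// zero and the zero-undef variant is safe.
static int foldedCtlo8(uint8_t src) {
  uint32_t shifted = static_cast<uint32_t>(src) << 24;
  return ctlz32(~shifted);
}

int main() {
  for (unsigned v = 0; v < 256; ++v)
    assert(refCtlo8(static_cast<uint8_t>(v)) ==
           foldedCtlo8(static_cast<uint8_t>(v)));
  return 0;
}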
diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index 96ece1559bc437..b30efc9a25a39e 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -47,6 +47,8 @@ class BasicMatchContext {
bool match(SDValue N, unsigned Opcode) const {
return N->getOpcode() == Opcode;
}
+
+ static constexpr bool IsVP = false;
};
template <typename Pattern, typename MatchContext>
@@ -390,7 +392,8 @@ template <unsigned OpIdx, typename... OpndPreds> struct Operands_match {
template <typename MatchContext>
bool match(const MatchContext &Ctx, SDValue N) {
// Returns false if there are more operands than predicates;
- return N->getNumOperands() == OpIdx;
+ // Ignores the last two operands if both the Context and the Node are VP
+ return N->getNumOperands() == (OpIdx + 2 * Ctx.IsVP * N->isVPOpcode());
}
};
@@ -464,7 +467,7 @@ struct TernaryOpc_match {
bool match(const MatchContext &Ctx, SDValue N) {
if (sd_context_match(N, Ctx, m_Opc(Opcode))) {
EffectiveOperands<ExcludeChain> EO(N);
- assert(EO.Size == 3);
+ assert(EO.Size == 3U + 2 * N->isVPOpcode());
return ((Op0.match(Ctx, N->getOperand(EO.FirstIndex)) &&
Op1.match(Ctx, N->getOperand(EO.FirstIndex + 1))) ||
(Commutable && Op0.match(Ctx, N->getOperand(EO.FirstIndex + 1)) &&
@@ -516,7 +519,7 @@ struct BinaryOpc_match {
bool match(const MatchContext &Ctx, SDValue N) {
if (sd_context_match(N, Ctx, m_Opc(Opcode))) {
EffectiveOperands<ExcludeChain> EO(N);
- assert(EO.Size == 2);
+ assert(EO.Size == 2U + 2 * N->isVPOpcode());
return (LHS.match(Ctx, N->getOperand(EO.FirstIndex)) &&
RHS.match(Ctx, N->getOperand(EO.FirstIndex + 1))) ||
(Commutable && LHS.match(Ctx, N->getOperand(EO.FirstIndex + 1)) &&
@@ -668,7 +671,7 @@ template <typename Opnd_P, bool ExcludeChain = false> struct UnaryOpc_match {
bool match(const MatchContext &Ctx, SDValue N) {
if (sd_context_match(N, Ctx, m_Opc(Opcode))) {
EffectiveOperands<ExcludeChain> EO(N);
- assert(EO.Size == 1);
+ assert(EO.Size == 1U + 2 * N->isVPOpcode());
return Opnd.match(Ctx, N->getOperand(EO.FirstIndex));
}
diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def
index a4a1000d37259e..5366092d1740c5 100644
--- a/llvm/include/llvm/IR/VPIntrinsics.def
+++ b/llvm/include/llvm/IR/VPIntrinsics.def
@@ -269,6 +269,7 @@ VP_PROPERTY_FUNCTIONAL_INTRINSIC(ctlz)
VP_PROPERTY_FUNCTIONAL_SDOPC(CTLZ)
END_REGISTER_VP_SDNODE(VP_CTLZ)
BEGIN_REGISTER_VP_SDNODE(VP_CTLZ_ZERO_UNDEF, -1, vp_ctlz_zero_undef, 1, 2)
+VP_PROPERTY_FUNCTIONAL_SDOPC(CTLZ_ZERO_UNDEF)
END_REGISTER_VP_SDNODE(VP_CTLZ_ZERO_UNDEF)
END_REGISTER_VP_INTRINSIC(vp_ctlz)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f0c459d61a4d74..a8f4dc8c8ffa46 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3753,6 +3753,51 @@ SDValue DAGCombiner::foldSubToUSubSat(EVT DstVT, SDNode *N, const SDLoc &DL) {
return SDValue();
}
+template <class MatchContextClass>
+static SDValue foldSubCtlzNot(SDNode *N, SelectionDAG &DAG) {
+ const SDLoc DL(N);
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N0.getValueType();
+ unsigned BitWidth = VT.getScalarSizeInBits();
+
+ MatchContextClass Matcher(DAG, DAG.getTargetLoweringInfo(), N);
+
+ APInt AndMask;
+ APInt XorMask;
+ APInt BitWidthDiff;
+
+ SDValue CtlzOp;
+ SDValue Src;
+
+ if (!sd_context_match(
+ N, Matcher,
+ m_Sub(m_Node(ISD::CTLZ, m_Value(CtlzOp)), m_ConstInt(BitWidthDiff))))
+ return SDValue();
+
+ if (sd_context_match(CtlzOp, Matcher, m_ZExt(m_Not(m_Value(Src))))) {
+ // (sub (ctlz (zero_extend (not Op)) BitWidthDiff))
+ if ((BitWidth - Src.getValueType().getScalarSizeInBits()) != BitWidthDiff)
+ return SDValue();
+
+ Src = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Src);
+ } else if (sd_context_match(CtlzOp, Matcher,
+ m_And(m_Xor(m_Value(Src), m_ConstInt(XorMask)),
+ m_ConstInt(AndMask)))) {
+ // (sub (ctlz (and (xor Op XorMask) AndMask)) BitWidthDiff)
+ unsigned AndMaskWidth = BitWidth - BitWidthDiff.getZExtValue();
+ if (!(AndMask.isMask(AndMaskWidth) && XorMask.countr_one() >= AndMaskWidth))
+ return SDValue();
+ } else
+ return SDValue();
+
+ SDValue ShiftConst = DAG.getShiftAmountConstant(BitWidthDiff, VT, DL);
+ SDValue LShift = Matcher.getNode(ISD::SHL, DL, VT, Src, ShiftConst);
+ SDValue Not =
+ Matcher.getNode(ISD::XOR, DL, VT, LShift, DAG.getAllOnesConstant(DL, VT));
+
+ return Matcher.getNode(ISD::CTLZ_ZERO_UNDEF, DL, VT, Not);
+}
+
// Since it may not be valid to emit a fold to zero for vector initializers
// check if we can before folding.
static SDValue tryFoldToZero(const SDLoc &DL, const TargetLowering &TLI, EVT VT,
@@ -3777,6 +3822,9 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
return N;
};
+ if (SDValue V = foldSubCtlzNot<EmptyMatchContext>(N, DAG))
+ return V;
+
// fold (sub x, x) -> 0
// FIXME: Refactor this and xor and other similar operations together.
if (PeekThroughFreeze(N0) == PeekThroughFreeze(N1))
@@ -26767,6 +26815,8 @@ SDValue DAGCombiner::visitVPOp(SDNode *N) {
return visitVP_SELECT(N);
case ISD::VP_MUL:
return visitMUL<VPMatchContext>(N);
+ case ISD::VP_SUB:
+ return foldSubCtlzNot<VPMatchContext>(N, DAG);
default:
break;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/MatchContext.h b/llvm/lib/CodeGen/SelectionDAG/MatchContext.h
index f965cb952f97a2..c1b3f7259aae33 100644
--- a/llvm/lib/CodeGen/SelectionDAG/MatchContext.h
+++ b/llvm/lib/CodeGen/SelectionDAG/MatchContext.h
@@ -46,6 +46,8 @@ class EmptyMatchContext {
bool LegalOnly = false) const {
return TLI.isOperationLegalOrCustom(Op, VT, LegalOnly);
}
+
+ static constexpr bool IsVP = false;
};
class VPMatchContext {
@@ -170,6 +172,8 @@ class VPMatchContext {
unsigned VPOp = ISD::getVPForBaseOpcode(Op);
return TLI.isOperationLegalOrCustom(VPOp, VT, LegalOnly);
}
+
+ static constexpr bool IsVP = true;
};
} // end anonymous namespace
#endif
diff --git a/llvm/test/CodeGen/AArch64/ctlo.ll b/llvm/test/CodeGen/AArch64/ctlo.ll
new file mode 100644
index 00000000000000..e047545b38cfa5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ctlo.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mtriple=aarch64 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s --mtriple=aarch64 -global-isel -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+declare i8 @llvm.ctlz.i8(i8, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+define i8 @ctlo_i8(i8 %x) {
+; CHECK-SD-LABEL: ctlo_i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: mov w8, #-1 // =0xffffffff
+; CHECK-SD-NEXT: eor w8, w8, w0, lsl #24
+; CHECK-SD-NEXT: clz w0, w8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ctlo_i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov w8, #255 // =0xff
+; CHECK-GI-NEXT: bic w8, w8, w0
+; CHECK-GI-NEXT: clz w8, w8
+; CHECK-GI-NEXT: sub w0, w8, #24
+; CHECK-GI-NEXT: ret
+ %tmp1 = xor i8 %x, -1
+ %tmp2 = call i8 @llvm.ctlz.i8( i8 %tmp1, i1 false )
+ ret i8 %tmp2
+}
+
+define i8 @ctlo_i8_undef(i8 %x) {
+; CHECK-SD-LABEL: ctlo_i8_undef:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: mvn w8, w0
+; CHECK-SD-NEXT: lsl w8, w8, #24
+; CHECK-SD-NEXT: clz w0, w8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ctlo_i8_undef:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov w8, #255 // =0xff
+; CHECK-GI-NEXT: bic w8, w8, w0
+; CHECK-GI-NEXT: clz w8, w8
+; CHECK-GI-NEXT: sub w0, w8, #24
+; CHECK-GI-NEXT: ret
+ %tmp1 = xor i8 %x, -1
+ %tmp2 = call i8 @llvm.ctlz.i8( i8 %tmp1, i1 true )
+ ret i8 %tmp2
+}
+
+define i16 @ctlo_i16(i16 %x) {
+; CHECK-SD-LABEL: ctlo_i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: mov w8, #-1 // =0xffffffff
+; CHECK-SD-NEXT: eor w8, w8, w0, lsl #16
+; CHECK-SD-NEXT: clz w0, w8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ctlo_i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov w8, #65535 // =0xffff
+; CHECK-GI-NEXT: bic w8, w8, w0
+; CHECK-GI-NEXT: clz w8, w8
+; CHECK-GI-NEXT: sub w0, w8, #16
+; CHECK-GI-NEXT: ret
+ %tmp1 = xor i16 %x, -1
+ %tmp2 = call i16 @llvm.ctlz.i16( i16 %tmp1, i1 false )
+ ret i16 %tmp2
+}
+
+define i16 @ctlo_i16_undef(i16 %x) {
+; CHECK-SD-LABEL: ctlo_i16_undef:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: mvn w8, w0
+; CHECK-SD-NEXT: lsl w8, w8, #16
+; CHECK-SD-NEXT: clz w0, w8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ctlo_i16_undef:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov w8, #65535 // =0xffff
+; CHECK-GI-NEXT: bic w8, w8, w0
+; CHECK-GI-NEXT: clz w8, w8
+; CHECK-GI-NEXT: sub w0, w8, #16
+; CHECK-GI-NEXT: ret
+ %tmp1 = xor i16 %x, -1
+ %tmp2 = call i16 @llvm.ctlz.i16( i16 %tmp1, i1 true )
+ ret i16 %tmp2
+}
+
+define i32 @ctlo_i32(i32 %x) {
+; CHECK-LABEL: ctlo_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn w8, w0
+; CHECK-NEXT: clz w0, w8
+; CHECK-NEXT: ret
+ %tmp1 = xor i32 %x, -1
+ %tmp2 = call i32 @llvm.ctlz.i32( i32 %tmp1, i1 false )
+ ret i32 %tmp2
+}
+
+define i32 @ctlo_i32_undef(i32 %x) {
+; CHECK-LABEL: ctlo_i32_undef:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn w8, w0
+; CHECK-NEXT: clz w0, w8
+; CHECK-NEXT: ret
+ %tmp1 = xor i32 %x, -1
+ %tmp2 = call i32 @llvm.ctlz.i32( i32 %tmp1, i1 true )
+ ret i32 %tmp2
+}
+
+define i64 @ctlo_i64(i64 %x) {
+; CHECK-LABEL: ctlo_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn x8, x0
+; CHECK-NEXT: clz x0, x8
+; CHECK-NEXT: ret
+ %tmp1 = xor i64 %x, -1
+ %tmp2 = call i64 @llvm.ctlz.i64( i64 %tmp1, i1 false )
+ ret i64 %tmp2
+}
+
+define i64 @ctlo_i64_undef(i64 %x) {
+; CHECK-LABEL: ctlo_i64_undef:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn x8, x0
+; CHECK-NEXT: clz x0, x8
+; CHECK-NEXT: ret
+ %tmp1 = xor i64 %x, -1
+ %tmp2 = call i64 @llvm.ctlz.i64( i64 %tmp1, i1 true )
+ ret i64 %tmp2
+}
diff --git a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
index f17cec231f3236..e993ecfcdf3b81 100644
--- a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
@@ -89,18 +89,14 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
define i8 @test_not_ctlz_i8(i8 %a) nounwind {
; LA32-LABEL: test_not_ctlz_i8:
; LA32: # %bb.0:
-; LA32-NEXT: ori $a1, $zero, 255
-; LA32-NEXT: andn $a0, $a1, $a0
-; LA32-NEXT: clz.w $a0, $a0
-; LA32-NEXT: addi.w $a0, $a0, -24
+; LA32-NEXT: slli.w $a0, $a0, 24
+; LA32-NEXT: clo.w $a0, $a0
; LA32-NEXT: ret
;
; LA64-LABEL: test_not_ctlz_i8:
; LA64: # %bb.0:
-; LA64-NEXT: ori $a1, $zero, 255
-; LA64-NEXT: andn $a0, $a1, $a0
-; LA64-NEXT: clz.d $a0, $a0
-; LA64-NEXT: addi.d $a0, $a0, -56
+; LA64-NEXT: slli.d $a0, $a0, 56
+; LA64-NEXT: clo.d $a0, $a0
; LA64-NEXT: ret
%neg = xor i8 %a, -1
%tmp = call i8 @llvm.ctlz.i8(i8 %neg, i1 false)
@@ -110,18 +106,14 @@ define i8 @test_not_ctlz_i8(i8 %a) nounwind {
define i16 @test_not_ctlz_i16(i16 %a) nounwind {
; LA32-LABEL: test_not_ctlz_i16:
; LA32: # %bb.0:
-; LA32-NEXT: nor $a0, $a0, $zero
-; LA32-NEXT: bstrpick.w $a0, $a0, 15, 0
-; LA32-NEXT: clz.w $a0, $a0
-; LA32-NEXT: addi.w $a0, $a0, -16
+; LA32-NEXT: slli.w $a0, $a0, 16
+; LA32-NEXT: clo.w $a0, $a0
; LA32-NEXT: ret
;
; LA64-LABEL: test_not_ctlz_i16:
; LA64: # %bb.0:
-; LA64-NEXT: nor $a0, $a0, $zero
-; LA64-NEXT: bstrpick.d $a0, $a0, 15, 0
-; LA64-NEXT: clz.d $a0, $a0
-; LA64-NEXT: addi.d $a0, $a0, -48
+; LA64-NEXT: slli.d $a0, $a0, 48
+; LA64-NEXT: clo.d $a0, $a0
; LA64-NEXT: ret
%neg = xor i16 %a, -1
%tmp = call i16 @llvm.ctlz.i16(i16 %neg, i1 false)
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
index 58882525e55c4c..9ea1394a1dd2c4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
@@ -2624,6 +2624,153 @@ define <vscale x 1 x i9> @vp_ctlz_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vsca
%v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va, i1 true, <vscale x 1 x i1> %m, i32 %evl)
ret <vscale x 1 x i9> %v
}
+define <vscale x 1 x i9> @vp_ctlo_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ctlo_nxv1i9:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-NEXT: vnot.v v8, v8, v0.t
+; CHECK-NEXT: vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsrl.vi v8, v9, 23, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT: li a0, 142
+; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vp_ctlo_nxv1i9:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-ZVBB-NEXT: vnot.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT: vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT: ret
+ %va.not = xor <vscale x 1 x i9> %va, splat (i9 -1)
+ %v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 false, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i9> %v
+}
+define <vscale x 1 x i9> @vp_ctlo_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ctlo_zero_undef_nxv1i9:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 511
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vxor.vx v8, v8, a1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-NEXT: vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsrl.vi v8, v9, 23, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT: li a0, 142
+; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vp_ctlo_zero_undef_nxv1i9:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: li a1, 511
+; CHECK-ZVBB-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vxor.vx v8, v8, a1
+; CHECK-ZVBB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-ZVBB-NEXT: vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT: ret
+ %va.not = xor <vscale x 1 x i9> %va, splat (i9 -1)
+ %v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 true, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i9> %v
+}
+
+define <vscale x 1 x i9> @vp_ctlo_nxv1i9_vp_xor(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ctlo_nxv1i9_vp_xor:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-NEXT: vnot.v v8, v8, v0.t
+; CHECK-NEXT: vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsrl.vi v8, v9, 23, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT: li a0, 142
+; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vp_ctlo_nxv1i9_vp_xor:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-ZVBB-NEXT: vnot.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT: vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT: ret
+ %va.not = call <vscale x 1 x i9> @llvm.vp.xor.nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i9> splat (i9 -1), <vscale x 1 x i1> %m, i32 %evl)
+ %v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 false, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i9> %v
+}
+
+define <vscale x 1 x i9> @vp_ctlo_zero_undef_nxv1i9_vp_xor(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ctlo_zero_undef_nxv1i9_vp_xor:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 511
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vxor.vx v8, v8, a1, v0.t
+; CHECK-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-NEXT: vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsrl.vi v8, v9, 23, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT: li a0, 142
+; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vp_ctlo_zero_undef_nxv1i9_vp_xor:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: li a1, 511
+; CHECK-ZVBB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vxor.vx v8, v8, a1, v0.t
+; CHECK-ZVBB-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-ZVBB-NEXT: vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT: ret
+ %va.not = call <vscale x 1 x i9> @llvm.vp.xor.nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i9> splat (i9 -1), <vscale x 1 x i1> %m, i32 %evl)
+ %v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 true, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i9> %v
+}
+
+define <vscale x 1 x i9> @vp_ctlo_zero_nxv1i9_unpredicated_ctlz_with_vp_xor(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ctlo_zero_nxv1i9_unpredicated_ctlz_with_vp_xor:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 511
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vxor.vx v8, v8, a1, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vand.vx v8, v8, a1
+; CHECK-NEXT: vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT: vnsrl.wi v8, v9, 23
+; CHECK-NEXT: li a0, 142
+; CHECK-NEXT: vrsub.vx v8, v8, a0
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: vminu.vx v8, v8, a0
+; CHECK-NEXT: li a0, 7
+; CHECK-NEXT: vsub.vx v8, v8, a0
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vp_ctlo_zero_nxv1i9_unpredicated_ctlz_with_vp_xor:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: li a1, 511
+; CHECK-ZVBB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vxor.vx v8, v8, a1, v0.t
+; CHECK-ZVBB-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vand.vx v8, v8, a1
+; CHECK-ZVBB-NEXT: vclz.v v8, v8
+; CHECK-ZVBB-NEXT: li a0, 7
+; CHECK-ZVBB-NEXT: vsub.vx v8, v8, a0
+; CHECK-ZVBB-NEXT: ret
+ %va.not = call <vscale x 1 x i9> @llvm.vp.xor.nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i9> splat (i9 -1), <vscale x 1 x i1> %m, i32 %evl)
+ %v = call <vscale x 1 x i9> @llvm.ctlz(<vscale x 1 x i9> %va.not, i1 false)
+ ret <vscale x 1 x i9> %v
+}
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; RV32: {{.*}}
; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/X86/ctlo.ll b/llvm/test/CodeGen/X86/ctlo.ll
index bb80279e28f3d3..e4f7c666f9ea37 100644
--- a/llvm/test/CodeGen/X86/ctlo.ll
+++ b/llvm/test/CodeGen/X86/ctlo.ll
@@ -46,20 +46,18 @@ define i8 @ctlo_i8(i8 %x) {
;
; X86-CLZ-LABEL: ctlo_i8:
; X86-CLZ: # %bb.0:
-; X86-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-CLZ-NEXT: notb %al
-; X86-CLZ-NEXT: movzbl %al, %eax
+; X86-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-CLZ-NEXT: shll $24, %eax
+; X86-CLZ-NEXT: notl %eax
; X86-CLZ-NEXT: lzcntl %eax, %eax
-; X86-CLZ-NEXT: addl $-24, %eax
; X86-CLZ-NEXT: # kill: def $al killed $al killed $eax
; X86-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlo_i8:
; X64-CLZ: # %bb.0:
-; X64-CLZ-NEXT: notb %dil
-; X64-CLZ-NEXT: movzbl %dil, %eax
-; X64-CLZ-NEXT: lzcntl %eax, %eax
-; X64-CLZ-NEXT: addl $-24, %eax
+; X64-CLZ-NEXT: shll $24, %edi
+; X64-CLZ-NEXT: notl %edi
+; X64-CLZ-NEXT: lzcntl %edi, %eax
; X64-CLZ-NEXT: # kill: def $al killed $al killed...
[truncated]
could you split this PR into at least two separate ones
@@ -464,7 +467,7 @@ struct TernaryOpc_match {
bool match(const MatchContext &Ctx, SDValue N) {
if (sd_context_match(N, Ctx, m_Opc(Opcode))) {
EffectiveOperands<ExcludeChain> EO(N);
assert(EO.Size == 3);
assert(EO.Size == 3U + 2 * N->isVPOpcode());
would it be better if we put this into `EffectiveOperands`? `EffectiveOperands` was invented to abstract away "don't care" operands, like input chain, under certain circumstances, so I feel like it fits well with what you're doing here.
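Only a rough standalone sketch of the suggestion above, not the actual LLVM implementation: an EffectiveOperands-style helper could hide both the leading chain operand and the trailing VP (mask, EVL) pair, so callers assert on the semantic arity directly. MockNode and ToyEffectiveOperands are invented names.

#include <cassert>

// Mock node; the real code works on SDNode / SDValue.
struct MockNode {
  unsigned NumOperands;
  bool HasChain; // some nodes carry a leading chain operand
  bool IsVP;     // VP nodes carry trailing mask + EVL operands
};

// Sketch of the suggested refactor: hide "don't care" operands (chain in
// front, mask/EVL at the back) inside the helper itself.
struct ToyEffectiveOperands {
  unsigned FirstIndex;
  unsigned Size;
  explicit ToyEffectiveOperands(const MockNode &N) {
    FirstIndex = N.HasChain ? 1u : 0u;
    Size = N.NumOperands - FirstIndex - (N.IsVP ? 2u : 0u);
  }
};

int main() {
  MockNode Sub{2, false, false};  // (sub a, b)
  MockNode VPSub{4, false, true}; // (vp.sub a, b, mask, evl)
  assert(ToyEffectiveOperands(Sub).Size == 2);
  assert(ToyEffectiveOperands(VPSub).Size == 2); // same semantic arity
  return 0;
}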
I'll do that as the SDPatternMatch part seems to require work on its own.
Putting this PR back to Draft as it is blocked by the PR for SDPatternMatch.
Currently, when using a VP match context with `sd_context_match`, only Opcode matching is possible (`m_Opc(Opcode)`). This PR suggests a way to make patterns with Operands (e.g. `m_Node`, `m_Add`, ...) work with a VP context. This PR blocks another PR #102877. Co-authored-by: v01dxyz <[email protected]>
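Not from either PR: a self-contained toy model of the operand-count rule that change introduces. A VP node carries a mask and an EVL as its last two operands, so a pattern written for the scalar form should only insist on the leading operands when both the matching context and the node are VP. ToyNode and matchesOperandCount are invented names.

#include <cassert>

// Toy stand-in; the real types are SDNode and the SDPatternMatch contexts.
struct ToyNode {
  unsigned NumOperands; // total operands, including mask + EVL for VP nodes
  bool IsVP;            // models SDNode::isVPOpcode()
};

// Mirrors the Operands_match check: a pattern with OpIdx operand predicates
// accepts a node with exactly OpIdx operands, ignoring the trailing
// (mask, EVL) pair when both the context and the node are VP.
static bool matchesOperandCount(const ToyNode &N, unsigned OpIdx, bool CtxIsVP) {
  return N.NumOperands == OpIdx + 2u * (CtxIsVP && N.IsVP);
}

int main() {
  ToyNode PlainSub{2, false}; // (sub a, b)
  ToyNode VPSub{4, true};     // (vp.sub a, b, mask, evl)

  assert(matchesOperandCount(PlainSub, 2, /*CtxIsVP=*/false));
  assert(matchesOperandCount(VPSub, 2, /*CtxIsVP=*/true));   // mask/EVL ignored
  assert(!matchesOperandCount(VPSub, 2, /*CtxIsVP=*/false)); // scalar context rejects VP node
  return 0;
}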
LGTM w/ minor comment
@@ -3755,6 +3755,50 @@ SDValue DAGCombiner::foldSubToUSubSat(EVT DstVT, SDNode *N, const SDLoc &DL) {
return SDValue();
}

template <class MatchContextClass>
static SDValue foldSubCtlzNot(SDNode *N, SelectionDAG &DAG) {
could you add a brief comment explaining what this function does, primarily the pattern?
I added a comment. It's lengthy because there are two patterns that are matched, and the replacing subDAG is slightly different (one has an any-extend while the other does not).
Ping
Detect and rewrite patterns created by DAG/Type Legalisation when CTLZ is used for counting leading ones.

Replace a SUB + CTLZ + ZERO_EXTEND with a CTLZ_ZERO_UNDEF + SHL. The VP path is supported too.

DAG Legalisation Pattern:

  (sub (ctlz (zeroextend (not Src))) BitWidthDiff)
    if BitWidthDiff == BitWidth(Node) - BitWidth(Src)
    --> (ctlz_zero_undef (not (shl (anyextend Src) BitWidthDiff)))

Type Legalisation Pattern:

  (sub (ctlz (and (xor Src XorMask) AndMask)) BitWidthDiff)
    if AndMask has only trailing ones
       and MaskBitWidth(AndMask) == BitWidth(Node) - BitWidthDiff
       and XorMask has more trailing ones than AndMask
    --> (ctlz_zero_undef (not (shl Src BitWidthDiff)))
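Again only a hedged sketch in standalone C++ (not from the patch), this time for the type-legalisation shape: an i9 value promoted to i16, so BitWidthDiff = 7, AndMask = 0x1FF, and an XorMask with at least nine trailing ones. It verifies the rewrite for every possible 16-bit register content, including garbage above the original nine bits.

#include <cassert>
#include <cstdint>

// Leading zeros of a 16-bit value; returns 16 for zero input.
static int ctlz16(uint16_t x) {
  int n = 0;
  for (uint16_t bit = 0x8000; bit != 0 && (x & bit) == 0; bit >>= 1)
    ++n;
  return n;
}

int main() {
  const uint16_t AndMask = 0x01FF; // nine trailing ones (i9 zero-extension mask)
  const uint16_t XorMask = 0x01FF; // countr_one(XorMask) >= 9, as required
  const int BitWidthDiff = 7;      // 16 - 9

  for (uint32_t v = 0; v <= 0xFFFF; ++v) {
    uint16_t src = static_cast<uint16_t>(v);

    // Pattern produced by type legalisation:
    //   (sub (ctlz (and (xor Src XorMask) AndMask)) BitWidthDiff)
    uint16_t masked = static_cast<uint16_t>((src ^ XorMask) & AndMask);
    int ref = ctlz16(masked) - BitWidthDiff;

    // Rewritten form: (ctlz_zero_undef (not (shl Src BitWidthDiff))).
    // Bits above the i9 value are shifted out, and the shifted-in low bits
    // become ones after the NOT, so the CTLZ operand is never zero.
    uint16_t shifted = static_cast<uint16_t>(src << BitWidthDiff);
    int folded = ctlz16(static_cast<uint16_t>(~shifted));

    assert(ref == folded);
  }
  return 0;
}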
This PR is related to #99591. In this PR, instead of modifying how the legalisation occurs depending on surrounding instructions, we refine after legalisation.

This PR has two parts:

- `SDPatternMatch/MatchContext`: Modify the code a little bit to make matching Operands (used by `m_Node(...)`) and Unary/Binary/Ternary patterns compatible with `VPMatchContext`, instead of only `m_Opc` being supported. Some tests were added to ensure no regressions.
- `DAGCombiner`: Add a `foldSubCtlzNot` which detects and rewrites the patterns using a matching context.

Remaining Tasks:

- `foldSubCtlzNot` in another location for style consistency purpose?

@topperc