Skip to content

Commit 3b786f2

Browse files
[AArch64] Add intrinsic to count trailing zero elements
This patch introduces an experimental intrinsic for counting the trailing zero elements in a vector. The intrinsic has generic expansion in SelectionDAGBuilder, and for AArch64 there is a pattern which matches to brkb & cntp instructions where SVE is enabled. The intrinsic has a second operand, is_zero_poison, similar to the existing cttz intrinsic. These changes have been split out from D158291.
1 parent 00a8314 commit 3b786f2

File tree

13 files changed

+972
-0
lines changed

13 files changed

+972
-0
lines changed

llvm/docs/LangRef.rst

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18497,6 +18497,45 @@ Arguments:
1849718497
Both arguments must be vectors of the same type whereby their logical
1849818498
concatenation matches the result type.
1849918499

18500+
'``llvm.experimental.cttz.elts``' Intrinsic
18501+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
18502+
18503+
Syntax:
18504+
"""""""
18505+
18506+
This is an overloaded intrinsic. You can use ``llvm.experimental.cttz.elts``
18507+
on any vector of integer elements, both fixed width and scalable.
18508+
18509+
::
18510+
18511+
declare i8 @llvm.experimental.cttz.elts.i8.v8i1(<8 x i1> <src>, i1 <is_zero_poison>)
18512+
18513+
Overview:
18514+
"""""""""
18515+
18516+
The '``llvm.experimental.cttz.elts``' intrinsic counts the number of trailing
18517+
zero elements of a vector.
18518+
18519+
Arguments:
18520+
""""""""""
18521+
18522+
The first argument is the vector to be counted. This argument must be a vector
18523+
with integer element type. The return type must also be an integer type which is
18524+
wide enough to hold the maximum number of elements of the source vector. The
18525+
behaviour of this intrinsic is undefined if the return type is not wide enough
18526+
for the number of elements in the input vector.
18527+
18528+
The second argument is a constant flag that indicates whether the intrinsic
18529+
returns a valid result if the first argument is all zero. If the first argument
18530+
is all zero and the second argument is true, the result is poison.
18531+
18532+
Semantics:
18533+
""""""""""
18534+
18535+
The '``llvm.experimental.cttz.elts``' intrinsic counts the trailing (least
18536+
significant) zero elements in a vector. If every element of ``src`` is zero the result is the
18537+
number of elements in the input vector.
18538+
1850018539
'``llvm.experimental.vector.splice``' Intrinsic
1850118540
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1850218541

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -465,6 +465,10 @@ class TargetLoweringBase {
465465
return true;
466466
}
467467

468+
/// Return true if the @llvm.experimental.cttz.elts intrinsic should be
469+
/// expanded using generic code in SelectionDAGBuilder.
470+
virtual bool shouldExpandCttzElements(EVT VT) const { return true; }
471+
468472
// Return true if op(vecreduce(x), vecreduce(y)) should be reassociated to
469473
// vecreduce(op(x, y)) for the reduction opcode RedOpc.
470474
virtual bool shouldReassociateReduction(unsigned RedOpc, EVT VT) const {

llvm/include/llvm/IR/Intrinsics.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2182,6 +2182,11 @@ def int_experimental_get_vector_length:
21822182
[IntrNoMem, IntrNoSync, IntrWillReturn,
21832183
ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
21842184

2185+
def int_experimental_cttz_elts:
2186+
DefaultAttrsIntrinsic<[llvm_anyint_ty],
2187+
[llvm_anyvector_ty, llvm_i1_ty],
2188+
[IntrNoMem, IntrNoSync, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
2189+
21852190
def int_experimental_vp_splice:
21862191
DefaultAttrsIntrinsic<[llvm_anyvector_ty],
21872192
[LLVMMatchType<0>,

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7514,6 +7514,62 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
75147514
setValue(&I, Trunc);
75157515
return;
75167516
}
7517+
case Intrinsic::experimental_cttz_elts: {
7518+
auto DL = getCurSDLoc();
7519+
SDValue Op = getValue(I.getOperand(0));
7520+
EVT OpVT = Op.getValueType();
7521+
7522+
if (!TLI.shouldExpandCttzElements(OpVT)) {
7523+
visitTargetIntrinsic(I, Intrinsic);
7524+
return;
7525+
}
7526+
7527+
if (OpVT.getScalarType() != MVT::i1) {
7528+
// Compare the input vector elements to zero & use to count trailing zeros
7529+
SDValue AllZero = DAG.getConstant(0, DL, OpVT);
7530+
OpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
7531+
OpVT.getVectorElementCount());
7532+
Op = DAG.getSetCC(DL, OpVT, Op, AllZero, ISD::SETNE);
7533+
}
7534+
7535+
// Find the smallest "sensible" element type to use for the expansion.
7536+
ConstantRange CR(
7537+
APInt(64, OpVT.getVectorElementCount().getKnownMinValue()));
7538+
if (OpVT.isScalableVT())
7539+
CR = CR.umul_sat(getVScaleRange(I.getCaller(), 64));
7540+
7541+
// If the zero-is-poison flag is set, we can assume the upper limit
7542+
// of the result is VF-1.
7543+
if (!cast<ConstantSDNode>(getValue(I.getOperand(1)))->isZero())
7544+
CR = CR.subtract(APInt(64, 1));
7545+
7546+
unsigned EltWidth = I.getType()->getScalarSizeInBits();
7547+
EltWidth = std::min(EltWidth, (unsigned)CR.getActiveBits());
7548+
EltWidth = std::max(llvm::bit_ceil(EltWidth), (unsigned)8);
7549+
7550+
MVT NewEltTy = MVT::getIntegerVT(EltWidth);
7551+
7552+
// Create the new vector type & get the vector length
7553+
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), NewEltTy,
7554+
OpVT.getVectorElementCount());
7555+
7556+
SDValue VL =
7557+
DAG.getElementCount(DL, NewEltTy, OpVT.getVectorElementCount());
7558+
7559+
SDValue StepVec = DAG.getStepVector(DL, NewVT);
7560+
SDValue SplatVL = DAG.getSplat(NewVT, DL, VL);
7561+
SDValue StepVL = DAG.getNode(ISD::SUB, DL, NewVT, SplatVL, StepVec);
7562+
SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, Op);
7563+
SDValue And = DAG.getNode(ISD::AND, DL, NewVT, StepVL, Ext);
7564+
SDValue Max = DAG.getNode(ISD::VECREDUCE_UMAX, DL, NewEltTy, And);
7565+
SDValue Sub = DAG.getNode(ISD::SUB, DL, NewEltTy, VL, Max);
7566+
7567+
EVT RetTy = TLI.getValueType(DAG.getDataLayout(), I.getType());
7568+
SDValue Ret = DAG.getZExtOrTrunc(Sub, DL, RetTy);
7569+
7570+
setValue(&I, Ret);
7571+
return;
7572+
}
75177573
case Intrinsic::vector_insert: {
75187574
SDValue Vec = getValue(I.getOperand(0));
75197575
SDValue SubVec = getValue(I.getOperand(1));

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1791,6 +1791,10 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
17911791
return false;
17921792
}
17931793

1794+
bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
1795+
return !Subtarget->hasSVEorSME() || VT != MVT::nxv16i1;
1796+
}
1797+
17941798
void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT,
17951799
bool StreamingSVE) {
17961800
assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
@@ -2634,6 +2638,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
26342638
MAKE_CASE(AArch64ISD::MRRS)
26352639
MAKE_CASE(AArch64ISD::MSRR)
26362640
MAKE_CASE(AArch64ISD::RSHRNB_I)
2641+
MAKE_CASE(AArch64ISD::CTTZ_ELTS)
26372642
}
26382643
#undef MAKE_CASE
26392644
return nullptr;
@@ -5338,6 +5343,12 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
53385343
}
53395344
return SDValue();
53405345
}
5346+
case Intrinsic::experimental_cttz_elts: {
5347+
SDValue NewCttzElts =
5348+
DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, Op.getOperand(1));
5349+
5350+
return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
5351+
}
53415352
}
53425353
}
53435354

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,8 @@ enum NodeType : unsigned {
335335
PTEST_ANY,
336336
PTRUE,
337337

338+
CTTZ_ELTS,
339+
338340
BITREVERSE_MERGE_PASSTHRU,
339341
BSWAP_MERGE_PASSTHRU,
340342
REVH_MERGE_PASSTHRU,
@@ -927,6 +929,8 @@ class AArch64TargetLowering : public TargetLowering {
927929

928930
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override;
929931

932+
bool shouldExpandCttzElements(EVT VT) const override;
933+
930934
/// If a change in streaming mode is required on entry to/return from a
931935
/// function call it emits and returns the corresponding SMSTART or SMSTOP node.
932936
/// \p Entry tells whether this is before/after the Call, which is necessary

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -842,6 +842,9 @@ def AArch64rshrnb_pf : PatFrags<(ops node:$rs, node:$i),
842842
[(AArch64rshrnb node:$rs, node:$i),
843843
(int_aarch64_sve_rshrnb node:$rs, node:$i)]>;
844844

845+
def AArch64CttzElts : SDNode<"AArch64ISD::CTTZ_ELTS", SDTypeProfile<1, 1,
846+
[SDTCisInt<0>, SDTCisVec<1>]>, []>;
847+
845848
// Match add node and also treat an 'or' node as an 'add' if the or'ed operands
846849
// have no common bits.
847850
def add_and_or_is_add : PatFrags<(ops node:$lhs, node:$rhs),

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1964,6 +1964,11 @@ let Predicates = [HasSVEorSME] in {
19641964
defm CNTW_XPiI : sve_int_count<0b100, "cntw", int_aarch64_sve_cntw>;
19651965
defm CNTD_XPiI : sve_int_count<0b110, "cntd", int_aarch64_sve_cntd>;
19661966
defm CNTP_XPP : sve_int_pcount_pred<0b0000, "cntp", int_aarch64_sve_cntp>;
1967+
1968+
def : Pat<(i64 (AArch64CttzElts nxv16i1:$Op1)),
1969+
(i64 (!cast<Instruction>(CNTP_XPP_B)
1970+
(nxv16i1 (!cast<Instruction>(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op1)),
1971+
(nxv16i1 (!cast<Instruction>(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op1))))>;
19671972
}
19681973

19691974
defm INCB_XPiI : sve_int_pred_pattern_a<0b000, "incb", add, int_aarch64_sve_cntb>;
@@ -2049,6 +2054,17 @@ let Predicates = [HasSVEorSME] in {
20492054
defm INCP_ZP : sve_int_count_v<0b10000, "incp">;
20502055
defm DECP_ZP : sve_int_count_v<0b10100, "decp">;
20512056

2057+
def : Pat<(i64 (add GPR64:$Op1, (i64 (AArch64CttzElts nxv16i1:$Op2)))),
2058+
(i64 (!cast<Instruction>(INCP_XP_B)
2059+
(nxv16i1 (!cast<Instruction>(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op2)),
2060+
GPR64:$Op1))>;
2061+
2062+
def : Pat<(i32 (add GPR32:$Op1, (trunc (i64 (AArch64CttzElts nxv16i1:$Op2))))),
2063+
(i32 (EXTRACT_SUBREG (i64 (!cast<Instruction>(INCP_XP_B)
2064+
(nxv16i1 (!cast<Instruction>(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op2)),
2065+
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Op1, sub_32))),
2066+
sub_32))>;
2067+
20522068
defm INDEX_RR : sve_int_index_rr<"index", AArch64mul_p_oneuse>;
20532069
defm INDEX_IR : sve_int_index_ir<"index", AArch64mul_p, AArch64mul_p_oneuse>;
20542070
defm INDEX_RI : sve_int_index_ri<"index">;

0 commit comments

Comments
 (0)