Skip to content

Commit b563e76

Browse files
authored
[AArch64] Improve scalar and Neon popcount with SVE CNT. (#143870)
When available, we can use SVE's CNT instruction to improve the lowering of scalar and fixed-length popcount (CTPOP) since the SVE instruction supports types that the Neon variant doesn't.

For the scalar types, I see the following speedups on NVIDIA Grace CPU:

| size (bits) | before (Gibit/s) | after (Gibit/s) | speedup |
|------------:|-----------------:|----------------:|--------:|
| 32 | 75.20 | 86.79 | 1.15 |
| 64 | 149.87 | 173.70 | 1.16 |
| 128 | 158.56 | 164.88 | 1.04 |
1 parent 597ee88 commit b563e76

File tree

3 files changed

+282
-103
lines changed

3 files changed

+282
-103
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10683,12 +10683,9 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
1068310683

1068410684
EVT VT = Op.getValueType();
1068510685
if (VT.isScalableVector() ||
10686-
useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
10686+
useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
1068710687
return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
1068810688

10689-
if (!Subtarget->isNeonAvailable())
10690-
return SDValue();
10691-
1069210689
bool IsParity = Op.getOpcode() == ISD::PARITY;
1069310690
SDValue Val = Op.getOperand(0);
1069410691
SDLoc DL(Op);
@@ -10698,6 +10695,36 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
1069810695
if (VT == MVT::i32 && IsParity)
1069910696
return SDValue();
1070010697

10698+
if (Subtarget->isSVEorStreamingSVEAvailable()) {
10699+
if (VT == MVT::i32 || VT == MVT::i64) {
10700+
EVT ContainerVT = VT == MVT::i32 ? MVT::nxv4i32 : MVT::nxv2i64;
10701+
Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
10702+
DAG.getUNDEF(ContainerVT), Val,
10703+
DAG.getVectorIdxConstant(0, DL));
10704+
Val = DAG.getNode(ISD::CTPOP, DL, ContainerVT, Val);
10705+
Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Val,
10706+
DAG.getVectorIdxConstant(0, DL));
10707+
if (IsParity)
10708+
Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
10709+
return Val;
10710+
}
10711+
10712+
if (VT == MVT::i128) {
10713+
Val = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Val);
10714+
Val = convertToScalableVector(DAG, MVT::nxv2i64, Val);
10715+
Val = DAG.getNode(ISD::CTPOP, DL, MVT::nxv2i64, Val);
10716+
Val = convertFromScalableVector(DAG, MVT::v2i64, Val);
10717+
Val = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i64, Val);
10718+
Val = DAG.getZExtOrTrunc(Val, DL, VT);
10719+
if (IsParity)
10720+
Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
10721+
return Val;
10722+
}
10723+
}
10724+
10725+
if (!Subtarget->isNeonAvailable())
10726+
return SDValue();
10727+
1070110728
// If there is no CNT instruction available, GPR popcount can
1070210729
// be more efficiently lowered to the following sequence that uses
1070310730
// AdvSIMD registers/instructions as long as the copies to/from

0 commit comments

Comments
 (0)