-
Notifications
You must be signed in to change notification settings - Fork 14.4k
[AArch64] Improve scalar and Neon popcount with SVE CNT. #143870
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
When available, we can use SVE's CNT instruction to improve the lowering of scalar and fixed-length popcount (CTPOP) since the SVE instruction supports types that the Neon variant doesn't. For the scalar types, I see the following speedups on NVIDIA Grace CPU:

| type | before (Gibit/s) | after (Gibit/s) | speedup |
|------|------------------|-----------------|---------|
| u32  | 75.20            | 86.79           | 1.15    |
| u64  | 149.87           | 173.70          | 1.16    |
| u128 | 158.56           | 164.88          | 1.04    |
@llvm/pr-subscribers-backend-aarch64

Author: Ricardo Jesus (rj-jesus)

Changes

When available, we can use SVE's CNT instruction to improve the lowering of scalar and fixed-length popcount (CTPOP) since the SVE instruction supports types that the Neon variant doesn't. For the scalar types, I see the following speedups on NVIDIA Grace CPU:
Full diff: https://github.com/llvm/llvm-project/pull/143870.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9f51caef6d228..6c4ca9e4e5233 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10614,13 +10614,13 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
return SDValue();
EVT VT = Op.getValueType();
- if (VT.isScalableVector() ||
- useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
+ assert((!Subtarget->isNeonAvailable() ||
+ (VT != MVT::v8i8 && VT != MVT::v16i8)) &&
+ "Unexpected custom lowering for B vectors with Neon available.");
+ bool OverrideNEON = !Subtarget->isNeonAvailable() || VT.isFixedLengthVector();
+ if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
- if (!Subtarget->isNeonAvailable())
- return SDValue();
-
bool IsParity = Op.getOpcode() == ISD::PARITY;
SDValue Val = Op.getOperand(0);
SDLoc DL(Op);
@@ -10630,6 +10630,34 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
if (VT == MVT::i32 && IsParity)
return SDValue();
+ if (Subtarget->isSVEorStreamingSVEAvailable()) {
+ assert((VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i128) &&
+ "Unexpected type for custom ctpop lowering.");
+ if (VT == MVT::i32 || VT == MVT::i64) {
+ EVT ContainerVT = VT == MVT::i32 ? MVT::nxv4i32 : MVT::nxv2i64;
+ Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
+ DAG.getUNDEF(ContainerVT), Val,
+ DAG.getVectorIdxConstant(0, DL));
+ Val = DAG.getNode(ISD::CTPOP, DL, ContainerVT, Val);
+ Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Val,
+ DAG.getVectorIdxConstant(0, DL));
+ } else if (VT == MVT::i128) {
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Val);
+ Val = convertToScalableVector(DAG, MVT::nxv2i64, Val);
+ Val = DAG.getNode(ISD::CTPOP, DL, MVT::nxv2i64, Val);
+ Val = convertFromScalableVector(DAG, MVT::v2i64, Val);
+ Val = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i64, Val);
+ Val = DAG.getZExtOrTrunc(Val, DL, VT);
+ } else
+ llvm_unreachable("Unexpected type!");
+ if (IsParity)
+ Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
+ return Val;
+ }
+
+ if (!Subtarget->isNeonAvailable())
+ return SDValue();
+
// If there is no CNT instruction available, GPR popcount can
// be more efficiently lowered to the following sequence that uses
// AdvSIMD registers/instructions as long as the copies to/from
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index 61f221988777f..e4d9c62664b58 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -23,15 +23,36 @@ define i8 @popcount128(ptr nocapture nonnull readonly %0) {
; CHECKO0-NEXT: fmov w0, s0
; CHECKO0-NEXT: ret
;
-; CHECK-LABEL: popcount128:
-; CHECK: // %bb.0: // %Entry
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: add x8, x0, #8
-; CHECK-NEXT: ld1 { v0.d }[1], [x8]
-; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: addv b0, v0.16b
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; NEON-LABEL: popcount128:
+; NEON: // %bb.0: // %Entry
+; NEON-NEXT: ldr d0, [x0]
+; NEON-NEXT: add x8, x0, #8
+; NEON-NEXT: ld1 { v0.d }[1], [x8]
+; NEON-NEXT: cnt v0.16b, v0.16b
+; NEON-NEXT: addv b0, v0.16b
+; NEON-NEXT: fmov w0, s0
+; NEON-NEXT: ret
+;
+; DOT-LABEL: popcount128:
+; DOT: // %bb.0: // %Entry
+; DOT-NEXT: ldr d0, [x0]
+; DOT-NEXT: add x8, x0, #8
+; DOT-NEXT: ld1 { v0.d }[1], [x8]
+; DOT-NEXT: cnt v0.16b, v0.16b
+; DOT-NEXT: addv b0, v0.16b
+; DOT-NEXT: fmov w0, s0
+; DOT-NEXT: ret
+;
+; SVE-LABEL: popcount128:
+; SVE: // %bb.0: // %Entry
+; SVE-NEXT: ldr d0, [x0]
+; SVE-NEXT: add x8, x0, #8
+; SVE-NEXT: ptrue p0.d
+; SVE-NEXT: ld1 { v0.d }[1], [x8]
+; SVE-NEXT: cnt z0.d, p0/m, z0.d
+; SVE-NEXT: addp d0, v0.2d
+; SVE-NEXT: fmov w0, s0
+; SVE-NEXT: ret
;
; BE-LABEL: popcount128:
; BE: // %bb.0: // %Entry
@@ -107,22 +128,55 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) {
; CHECKO0-NEXT: mov w0, w8
; CHECKO0-NEXT: ret
;
-; CHECK-LABEL: popcount256:
-; CHECK: // %bb.0: // %Entry
-; CHECK-NEXT: ldr d0, [x0, #16]
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: add x8, x0, #8
-; CHECK-NEXT: add x9, x0, #24
-; CHECK-NEXT: ld1 { v0.d }[1], [x9]
-; CHECK-NEXT: ld1 { v1.d }[1], [x8]
-; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: cnt v1.16b, v1.16b
-; CHECK-NEXT: addv b0, v0.16b
-; CHECK-NEXT: addv b1, v1.16b
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: add w0, w9, w8
-; CHECK-NEXT: ret
+; NEON-LABEL: popcount256:
+; NEON: // %bb.0: // %Entry
+; NEON-NEXT: ldr d0, [x0, #16]
+; NEON-NEXT: ldr d1, [x0]
+; NEON-NEXT: add x8, x0, #8
+; NEON-NEXT: add x9, x0, #24
+; NEON-NEXT: ld1 { v0.d }[1], [x9]
+; NEON-NEXT: ld1 { v1.d }[1], [x8]
+; NEON-NEXT: cnt v0.16b, v0.16b
+; NEON-NEXT: cnt v1.16b, v1.16b
+; NEON-NEXT: addv b0, v0.16b
+; NEON-NEXT: addv b1, v1.16b
+; NEON-NEXT: fmov w8, s0
+; NEON-NEXT: fmov w9, s1
+; NEON-NEXT: add w0, w9, w8
+; NEON-NEXT: ret
+;
+; DOT-LABEL: popcount256:
+; DOT: // %bb.0: // %Entry
+; DOT-NEXT: ldr d0, [x0, #16]
+; DOT-NEXT: ldr d1, [x0]
+; DOT-NEXT: add x8, x0, #8
+; DOT-NEXT: add x9, x0, #24
+; DOT-NEXT: ld1 { v0.d }[1], [x9]
+; DOT-NEXT: ld1 { v1.d }[1], [x8]
+; DOT-NEXT: cnt v0.16b, v0.16b
+; DOT-NEXT: cnt v1.16b, v1.16b
+; DOT-NEXT: addv b0, v0.16b
+; DOT-NEXT: addv b1, v1.16b
+; DOT-NEXT: fmov w8, s0
+; DOT-NEXT: fmov w9, s1
+; DOT-NEXT: add w0, w9, w8
+; DOT-NEXT: ret
+;
+; SVE-LABEL: popcount256:
+; SVE: // %bb.0: // %Entry
+; SVE-NEXT: ldr d0, [x0, #16]
+; SVE-NEXT: ldr d1, [x0]
+; SVE-NEXT: add x8, x0, #8
+; SVE-NEXT: add x9, x0, #24
+; SVE-NEXT: ptrue p0.d
+; SVE-NEXT: ld1 { v0.d }[1], [x9]
+; SVE-NEXT: ld1 { v1.d }[1], [x8]
+; SVE-NEXT: cnt z0.d, p0/m, z0.d
+; SVE-NEXT: cnt z1.d, p0/m, z1.d
+; SVE-NEXT: add v0.2d, v1.2d, v0.2d
+; SVE-NEXT: addp d0, v0.2d
+; SVE-NEXT: fmov w0, s0
+; SVE-NEXT: ret
;
; BE-LABEL: popcount256:
; BE: // %bb.0: // %Entry
@@ -223,15 +277,36 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) {
; CHECKO0-NEXT: bfi x0, x8, #32, #32
; CHECKO0-NEXT: ret
;
-; CHECK-LABEL: popcount1x128:
-; CHECK: // %bb.0: // %Entry
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: mov v0.d[1], x1
-; CHECK-NEXT: mov x1, xzr
-; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: addv b0, v0.16b
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: ret
+; NEON-LABEL: popcount1x128:
+; NEON: // %bb.0: // %Entry
+; NEON-NEXT: fmov d0, x0
+; NEON-NEXT: mov v0.d[1], x1
+; NEON-NEXT: mov x1, xzr
+; NEON-NEXT: cnt v0.16b, v0.16b
+; NEON-NEXT: addv b0, v0.16b
+; NEON-NEXT: fmov x0, d0
+; NEON-NEXT: ret
+;
+; DOT-LABEL: popcount1x128:
+; DOT: // %bb.0: // %Entry
+; DOT-NEXT: fmov d0, x0
+; DOT-NEXT: mov v0.d[1], x1
+; DOT-NEXT: mov x1, xzr
+; DOT-NEXT: cnt v0.16b, v0.16b
+; DOT-NEXT: addv b0, v0.16b
+; DOT-NEXT: fmov x0, d0
+; DOT-NEXT: ret
+;
+; SVE-LABEL: popcount1x128:
+; SVE: // %bb.0: // %Entry
+; SVE-NEXT: fmov d0, x0
+; SVE-NEXT: ptrue p0.d
+; SVE-NEXT: mov v0.d[1], x1
+; SVE-NEXT: mov x1, xzr
+; SVE-NEXT: cnt z0.d, p0/m, z0.d
+; SVE-NEXT: addp d0, v0.2d
+; SVE-NEXT: fmov x0, d0
+; SVE-NEXT: ret
;
; BE-LABEL: popcount1x128:
; BE: // %bb.0: // %Entry
@@ -305,10 +380,10 @@ define <2 x i64> @popcount2x64(<2 x i64> %0) {
;
; SVE-LABEL: popcount2x64:
; SVE: // %bb.0: // %Entry
-; SVE-NEXT: cnt v0.16b, v0.16b
-; SVE-NEXT: uaddlp v0.8h, v0.16b
-; SVE-NEXT: uaddlp v0.4s, v0.8h
-; SVE-NEXT: uaddlp v0.2d, v0.4s
+; SVE-NEXT: ptrue p0.d, vl2
+; SVE-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE-NEXT: cnt z0.d, p0/m, z0.d
+; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
; SVE-NEXT: ret
;
; BE-LABEL: popcount2x64:
@@ -374,13 +449,29 @@ define <1 x i64> @popcount1x64(<1 x i64> %0) {
; CHECKO0-NEXT: fmov d0, x8
; CHECKO0-NEXT: ret
;
-; CHECK-LABEL: popcount1x64:
-; CHECK: // %bb.0: // %Entry
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: uaddlp v0.4h, v0.8b
-; CHECK-NEXT: uaddlp v0.2s, v0.4h
-; CHECK-NEXT: uaddlp v0.1d, v0.2s
-; CHECK-NEXT: ret
+; NEON-LABEL: popcount1x64:
+; NEON: // %bb.0: // %Entry
+; NEON-NEXT: cnt v0.8b, v0.8b
+; NEON-NEXT: uaddlp v0.4h, v0.8b
+; NEON-NEXT: uaddlp v0.2s, v0.4h
+; NEON-NEXT: uaddlp v0.1d, v0.2s
+; NEON-NEXT: ret
+;
+; DOT-LABEL: popcount1x64:
+; DOT: // %bb.0: // %Entry
+; DOT-NEXT: cnt v0.8b, v0.8b
+; DOT-NEXT: uaddlp v0.4h, v0.8b
+; DOT-NEXT: uaddlp v0.2s, v0.4h
+; DOT-NEXT: uaddlp v0.1d, v0.2s
+; DOT-NEXT: ret
+;
+; SVE-LABEL: popcount1x64:
+; SVE: // %bb.0: // %Entry
+; SVE-NEXT: ptrue p0.d, vl1
+; SVE-NEXT: // kill: def $d0 killed $d0 def $z0
+; SVE-NEXT: cnt z0.d, p0/m, z0.d
+; SVE-NEXT: // kill: def $d0 killed $d0 killed $z0
+; SVE-NEXT: ret
;
; BE-LABEL: popcount1x64:
; BE: // %bb.0: // %Entry
@@ -442,9 +533,10 @@ define <4 x i32> @popcount4x32(<4 x i32> %0) {
;
; SVE-LABEL: popcount4x32:
; SVE: // %bb.0: // %Entry
-; SVE-NEXT: cnt v0.16b, v0.16b
-; SVE-NEXT: uaddlp v0.8h, v0.16b
-; SVE-NEXT: uaddlp v0.4s, v0.8h
+; SVE-NEXT: ptrue p0.s, vl4
+; SVE-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE-NEXT: cnt z0.s, p0/m, z0.s
+; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
; SVE-NEXT: ret
;
; BE-LABEL: popcount4x32:
@@ -520,9 +612,10 @@ define <2 x i32> @popcount2x32(<2 x i32> %0) {
;
; SVE-LABEL: popcount2x32:
; SVE: // %bb.0: // %Entry
-; SVE-NEXT: cnt v0.8b, v0.8b
-; SVE-NEXT: uaddlp v0.4h, v0.8b
-; SVE-NEXT: uaddlp v0.2s, v0.4h
+; SVE-NEXT: ptrue p0.s, vl2
+; SVE-NEXT: // kill: def $d0 killed $d0 def $z0
+; SVE-NEXT: cnt z0.s, p0/m, z0.s
+; SVE-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE-NEXT: ret
;
; BE-LABEL: popcount2x32:
@@ -577,11 +670,25 @@ define <8 x i16> @popcount8x16(<8 x i16> %0) {
; CHECKO0-NEXT: uaddlp v0.8h, v0.16b
; CHECKO0-NEXT: ret
;
-; CHECK-LABEL: popcount8x16:
-; CHECK: // %bb.0: // %Entry
-; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
-; CHECK-NEXT: ret
+; NEON-LABEL: popcount8x16:
+; NEON: // %bb.0: // %Entry
+; NEON-NEXT: cnt v0.16b, v0.16b
+; NEON-NEXT: uaddlp v0.8h, v0.16b
+; NEON-NEXT: ret
+;
+; DOT-LABEL: popcount8x16:
+; DOT: // %bb.0: // %Entry
+; DOT-NEXT: cnt v0.16b, v0.16b
+; DOT-NEXT: uaddlp v0.8h, v0.16b
+; DOT-NEXT: ret
+;
+; SVE-LABEL: popcount8x16:
+; SVE: // %bb.0: // %Entry
+; SVE-NEXT: ptrue p0.h, vl8
+; SVE-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE-NEXT: cnt z0.h, p0/m, z0.h
+; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
+; SVE-NEXT: ret
;
; BE-LABEL: popcount8x16:
; BE: // %bb.0: // %Entry
@@ -618,11 +725,25 @@ define <4 x i16> @popcount4x16(<4 x i16> %0) {
; CHECKO0-NEXT: uaddlp v0.4h, v0.8b
; CHECKO0-NEXT: ret
;
-; CHECK-LABEL: popcount4x16:
-; CHECK: // %bb.0: // %Entry
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: uaddlp v0.4h, v0.8b
-; CHECK-NEXT: ret
+; NEON-LABEL: popcount4x16:
+; NEON: // %bb.0: // %Entry
+; NEON-NEXT: cnt v0.8b, v0.8b
+; NEON-NEXT: uaddlp v0.4h, v0.8b
+; NEON-NEXT: ret
+;
+; DOT-LABEL: popcount4x16:
+; DOT: // %bb.0: // %Entry
+; DOT-NEXT: cnt v0.8b, v0.8b
+; DOT-NEXT: uaddlp v0.4h, v0.8b
+; DOT-NEXT: ret
+;
+; SVE-LABEL: popcount4x16:
+; SVE: // %bb.0: // %Entry
+; SVE-NEXT: ptrue p0.h, vl4
+; SVE-NEXT: // kill: def $d0 killed $d0 def $z0
+; SVE-NEXT: cnt z0.h, p0/m, z0.h
+; SVE-NEXT: // kill: def $d0 killed $d0 killed $z0
+; SVE-NEXT: ret
;
; BE-LABEL: popcount4x16:
; BE: // %bb.0: // %Entry
@@ -676,20 +797,49 @@ define i32 @ctpop_into_extract(ptr %p) {
; CHECKO0-NEXT: mov w0, wzr
; CHECKO0-NEXT: ret
;
-; CHECK-LABEL: ctpop_into_extract:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: movi v2.2d, #0xffffffffffffffff
-; CHECK-NEXT: mov x8, x0
-; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: fmov s1, w9
-; CHECK-NEXT: cnt v1.8b, v1.8b
-; CHECK-NEXT: addv b1, v1.8b
-; CHECK-NEXT: mov v2.s[1], v1.s[0]
-; CHECK-NEXT: sub v0.2s, v0.2s, v2.2s
-; CHECK-NEXT: str d0, [x8]
-; CHECK-NEXT: ret
+; NEON-LABEL: ctpop_into_extract:
+; NEON: // %bb.0:
+; NEON-NEXT: ldr d0, [x0]
+; NEON-NEXT: movi v2.2d, #0xffffffffffffffff
+; NEON-NEXT: mov x8, x0
+; NEON-NEXT: mov w0, wzr
+; NEON-NEXT: fmov w9, s0
+; NEON-NEXT: fmov s1, w9
+; NEON-NEXT: cnt v1.8b, v1.8b
+; NEON-NEXT: addv b1, v1.8b
+; NEON-NEXT: mov v2.s[1], v1.s[0]
+; NEON-NEXT: sub v0.2s, v0.2s, v2.2s
+; NEON-NEXT: str d0, [x8]
+; NEON-NEXT: ret
+;
+; DOT-LABEL: ctpop_into_extract:
+; DOT: // %bb.0:
+; DOT-NEXT: ldr d0, [x0]
+; DOT-NEXT: movi v2.2d, #0xffffffffffffffff
+; DOT-NEXT: mov x8, x0
+; DOT-NEXT: mov w0, wzr
+; DOT-NEXT: fmov w9, s0
+; DOT-NEXT: fmov s1, w9
+; DOT-NEXT: cnt v1.8b, v1.8b
+; DOT-NEXT: addv b1, v1.8b
+; DOT-NEXT: mov v2.s[1], v1.s[0]
+; DOT-NEXT: sub v0.2s, v0.2s, v2.2s
+; DOT-NEXT: str d0, [x8]
+; DOT-NEXT: ret
+;
+; SVE-LABEL: ctpop_into_extract:
+; SVE: // %bb.0:
+; SVE-NEXT: ldr d0, [x0]
+; SVE-NEXT: ptrue p0.s
+; SVE-NEXT: movi v2.2d, #0xffffffffffffffff
+; SVE-NEXT: mov x8, x0
+; SVE-NEXT: mov w0, wzr
+; SVE-NEXT: movprfx z1, z0
+; SVE-NEXT: cnt z1.s, p0/m, z0.s
+; SVE-NEXT: mov v2.s[1], v1.s[0]
+; SVE-NEXT: sub v0.2s, v0.2s, v2.2s
+; SVE-NEXT: str d0, [x8]
+; SVE-NEXT: ret
;
; BE-LABEL: ctpop_into_extract:
; BE: // %bb.0:
@@ -758,3 +908,5 @@ define i32 @ctpop_into_extract(ptr %p) {
}
declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll
index 1e71c4b66156c..b62b850434469 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll
@@ -457,23 +457,25 @@ define void @ctpop_v256i8(ptr %a) vscale_range(16,0) #0 {
ret void
}
-; Don't use SVE for 64-bit vectors.
define <4 x i16> @ctpop_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctpop_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: uaddlp v0.4h, v0.8b
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: cnt z0.h, p0/m, z0.h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%res = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %op)
ret <4 x i16> %res
}
-; Don't use SVE for 128-bit vectors.
define <8 x i16> @ctpop_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctpop_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: cnt z0.h, p0/m, z0.h
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %op)
ret <8 x i16> %res
@@ -547,25 +549,25 @@ define void @ctpop_v128i16(ptr %a) vscale_range(16,0) #0 {
ret void
}
-; Don't use SVE for 64-bit vectors.
define <2 x i32> @ctpop_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctpop_v2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: uaddlp v0.4h, v0.8b
-; CHECK-NEXT: uaddlp v0.2s, v0.4h
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: cnt z0.s, p0/m, z0.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%res = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %op)
ret <2 x i32> %res
}
-; Don't use SVE for 128-bit vectors.
define <4 x i32> @ctpop_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctpop_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
-; CHECK-NEXT: uaddlp v0.4s, v0.8h
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: cnt z0.s, p0/m, z0.s
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %op)
ret <4 x i32> %res
@@ -639,27 +641,25 @@ define void @ctpop_v64i32(ptr %a) vscale_range(16,0) #0 {
ret void
}
-; Don't use SVE for 64-bit vectors.
define <1 x i64> @ctpop_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctpop_v1i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: uaddlp v0.4h, v0.8b
-; CHECK-NEXT: uaddlp v0.2s, v0.4h
-; CHECK-NEXT: uaddlp v0.1d, v0.2s
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: cnt z0.d, p0/m, z0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%res = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %op)
ret <1 x i64> %res
}
-; Don't use SVE for 128-bit vectors.
define <2 x i64> @ctpop_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctpop_v2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
-; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: uaddlp v0.2d, v0.4s
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: cnt z0.d, p0/m, z0.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %op)
ret <2 x i64> %res
|
When available, we can use SVE's CNT instruction to improve the lowering of scalar and fixed-length popcount (CTPOP) since the SVE instruction supports types that the Neon variant doesn't. For the scalar types, I see the following speedups on NVIDIA Grace CPU:

| size (bits) | before (Gibit/s) | after (Gibit/s) | speedup |
|------------:|-----------------:|----------------:|--------:|
| 32          | 75.20            | 86.79           | 1.15    |
| 64          | 149.87           | 173.70          | 1.16    |
| 128         | 158.56           | 164.88          | 1.04    |
When available, we can use SVE's CNT instruction to improve the lowering of scalar and fixed-length popcount (CTPOP) since the SVE instruction supports types that the Neon variant doesn't. For the scalar types, I see the following speedups on NVIDIA Grace CPU:

| size (bits) | before (Gibit/s) | after (Gibit/s) | speedup |
|------------:|-----------------:|----------------:|--------:|
| 32          | 75.20            | 86.79           | 1.15    |
| 64          | 149.87           | 173.70          | 1.16    |
| 128         | 158.56           | 164.88          | 1.04    |
When available, we can use SVE's CNT instruction to improve the lowering of scalar and fixed-length popcount (CTPOP) since the SVE instruction supports types that the Neon variant doesn't.
For the scalar types, I see the following speedups on NVIDIA Grace CPU: