Skip to content

Commit 696805d

Browse files
authored
[AArch64] Improve non-SVE popcount for 32-bit and 64-bit using udot (#95881)
Fixes #95860. Use `udot` instead of a sequence of `uaddlp` instructions when summing up lanes for `popcount`.
1 parent b48623c commit 696805d

File tree

2 files changed

+286
-46
lines changed

2 files changed

+286
-46
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9972,6 +9972,26 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
99729972
Val = DAG.getBitcast(VT8Bit, Val);
99739973
Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
99749974

9975+
if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16 &&
9976+
VT.getVectorNumElements() >= 2) {
9977+
EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
9978+
SDValue Zeros = DAG.getConstant(0, DL, DT);
9979+
SDValue Ones = DAG.getConstant(1, DL, VT8Bit);
9980+
9981+
if (VT == MVT::v2i64) {
9982+
Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
9983+
Val = DAG.getNode(AArch64ISD::UADDLP, DL, VT, Val);
9984+
} else if (VT == MVT::v2i32) {
9985+
Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
9986+
} else if (VT == MVT::v4i32) {
9987+
Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
9988+
} else {
9989+
llvm_unreachable("Unexpected type for custom ctpop lowering");
9990+
}
9991+
9992+
return Val;
9993+
}
9994+
99759995
// Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
99769996
unsigned EltSize = 8;
99779997
unsigned NumElts = VT.is64BitVector() ? 8 : 16;

llvm/test/CodeGen/AArch64/popcount.ll

Lines changed: 266 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,28 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc < %s -O0 -mtriple=aarch64-unknown-unknown | FileCheck %s
2+
; RUN: llc < %s -O0 -mtriple=aarch64-unknown-unknown | FileCheck %s --check-prefix=CHECKO0
3+
; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mattr=+neon | FileCheck %s --check-prefixes=CHECK,NEON
4+
; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mattr=+neon,+dotprod | FileCheck %s --check-prefixes=CHECK,DOT
5+
; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mattr=+sve | FileCheck %s --check-prefixes=CHECK,SVE
36

47
; Function Attrs: nobuiltin nounwind readonly
58
define i8 @popcount128(ptr nocapture nonnull readonly %0) {
9+
; CHECKO0-LABEL: popcount128:
10+
; CHECKO0: // %bb.0: // %Entry
11+
; CHECKO0-NEXT: ldr q0, [x0]
12+
; CHECKO0-NEXT: cnt v0.16b, v0.16b
13+
; CHECKO0-NEXT: uaddlv h0, v0.16b
14+
; CHECKO0-NEXT: // kill: def $q0 killed $h0
15+
; CHECKO0-NEXT: // kill: def $s0 killed $s0 killed $q0
16+
; CHECKO0-NEXT: fmov w0, s0
17+
; CHECKO0-NEXT: ret
18+
;
619
; CHECK-LABEL: popcount128:
720
; CHECK: // %bb.0: // %Entry
8-
; CHECK-NEXT: ldr q0, [x0]
21+
; CHECK-NEXT: ldr d0, [x0]
22+
; CHECK-NEXT: add x8, x0, #8
23+
; CHECK-NEXT: ld1 { v0.d }[1], [x8]
924
; CHECK-NEXT: cnt v0.16b, v0.16b
1025
; CHECK-NEXT: uaddlv h0, v0.16b
11-
; CHECK-NEXT: // kill: def $q0 killed $h0
12-
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
1326
; CHECK-NEXT: fmov w0, s0
1427
; CHECK-NEXT: ret
1528
Entry:
@@ -24,37 +37,54 @@ declare i128 @llvm.ctpop.i128(i128)
2437

2538
; Function Attrs: nobuiltin nounwind readonly
2639
define i16 @popcount256(ptr nocapture nonnull readonly %0) {
40+
; CHECKO0-LABEL: popcount256:
41+
; CHECKO0: // %bb.0: // %Entry
42+
; CHECKO0-NEXT: ldr x11, [x0]
43+
; CHECKO0-NEXT: ldr x10, [x0, #8]
44+
; CHECKO0-NEXT: ldr x9, [x0, #16]
45+
; CHECKO0-NEXT: ldr x8, [x0, #24]
46+
; CHECKO0-NEXT: // implicit-def: $q1
47+
; CHECKO0-NEXT: mov v1.d[0], x11
48+
; CHECKO0-NEXT: mov v1.d[1], x10
49+
; CHECKO0-NEXT: // implicit-def: $q0
50+
; CHECKO0-NEXT: mov v0.d[0], x9
51+
; CHECKO0-NEXT: mov v0.d[1], x8
52+
; CHECKO0-NEXT: cnt v1.16b, v1.16b
53+
; CHECKO0-NEXT: uaddlv h1, v1.16b
54+
; CHECKO0-NEXT: // kill: def $q1 killed $h1
55+
; CHECKO0-NEXT: // kill: def $s1 killed $s1 killed $q1
56+
; CHECKO0-NEXT: fmov w0, s1
57+
; CHECKO0-NEXT: mov w10, wzr
58+
; CHECKO0-NEXT: mov w9, w0
59+
; CHECKO0-NEXT: mov w8, w10
60+
; CHECKO0-NEXT: bfi x9, x8, #32, #32
61+
; CHECKO0-NEXT: cnt v0.16b, v0.16b
62+
; CHECKO0-NEXT: uaddlv h0, v0.16b
63+
; CHECKO0-NEXT: // kill: def $q0 killed $h0
64+
; CHECKO0-NEXT: // kill: def $s0 killed $s0 killed $q0
65+
; CHECKO0-NEXT: fmov w0, s0
66+
; CHECKO0-NEXT: mov w8, w0
67+
; CHECKO0-NEXT: // kill: def $x10 killed $w10
68+
; CHECKO0-NEXT: bfi x8, x10, #32, #32
69+
; CHECKO0-NEXT: adds x8, x8, x9
70+
; CHECKO0-NEXT: mov w0, w8
71+
; CHECKO0-NEXT: ret
72+
;
2773
; CHECK-LABEL: popcount256:
2874
; CHECK: // %bb.0: // %Entry
29-
; CHECK-NEXT: ldr x11, [x0]
30-
; CHECK-NEXT: ldr x10, [x0, #8]
31-
; CHECK-NEXT: ldr x9, [x0, #16]
32-
; CHECK-NEXT: ldr x8, [x0, #24]
33-
; CHECK-NEXT: // implicit-def: $q1
34-
; CHECK-NEXT: mov v1.d[0], x11
35-
; CHECK-NEXT: mov v1.d[1], x10
36-
; CHECK-NEXT: // implicit-def: $q0
37-
; CHECK-NEXT: mov v0.d[0], x9
38-
; CHECK-NEXT: mov v0.d[1], x8
39-
; CHECK-NEXT: cnt v1.16b, v1.16b
40-
; CHECK-NEXT: uaddlv h1, v1.16b
41-
; CHECK-NEXT: // kill: def $q1 killed $h1
42-
; CHECK-NEXT: // kill: def $s1 killed $s1 killed $q1
43-
; CHECK-NEXT: fmov w0, s1
44-
; CHECK-NEXT: mov w10, wzr
45-
; CHECK-NEXT: mov w9, w0
46-
; CHECK-NEXT: mov w8, w10
47-
; CHECK-NEXT: bfi x9, x8, #32, #32
75+
; CHECK-NEXT: ldr d0, [x0, #16]
76+
; CHECK-NEXT: ldr d1, [x0]
77+
; CHECK-NEXT: add x8, x0, #8
78+
; CHECK-NEXT: add x9, x0, #24
79+
; CHECK-NEXT: ld1 { v0.d }[1], [x9]
80+
; CHECK-NEXT: ld1 { v1.d }[1], [x8]
4881
; CHECK-NEXT: cnt v0.16b, v0.16b
82+
; CHECK-NEXT: cnt v1.16b, v1.16b
4983
; CHECK-NEXT: uaddlv h0, v0.16b
50-
; CHECK-NEXT: // kill: def $q0 killed $h0
51-
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
52-
; CHECK-NEXT: fmov w0, s0
53-
; CHECK-NEXT: mov w8, w0
54-
; CHECK-NEXT: // kill: def $x10 killed $w10
55-
; CHECK-NEXT: bfi x8, x10, #32, #32
56-
; CHECK-NEXT: adds x8, x8, x9
57-
; CHECK-NEXT: mov w0, w8
84+
; CHECK-NEXT: uaddlv h1, v1.16b
85+
; CHECK-NEXT: fmov w8, s0
86+
; CHECK-NEXT: fmov w9, s1
87+
; CHECK-NEXT: add w0, w9, w8
5888
; CHECK-NEXT: ret
5989
Entry:
6090
%1 = load i256, ptr %0, align 16
@@ -67,25 +97,215 @@ Entry:
6797
declare i256 @llvm.ctpop.i256(i256)
6898

6999
define <1 x i128> @popcount1x128(<1 x i128> %0) {
100+
; CHECKO0-LABEL: popcount1x128:
101+
; CHECKO0: // %bb.0: // %Entry
102+
; CHECKO0-NEXT: // implicit-def: $q0
103+
; CHECKO0-NEXT: mov v0.d[0], x0
104+
; CHECKO0-NEXT: mov v0.d[1], x1
105+
; CHECKO0-NEXT: cnt v0.16b, v0.16b
106+
; CHECKO0-NEXT: uaddlv h0, v0.16b
107+
; CHECKO0-NEXT: // kill: def $q0 killed $h0
108+
; CHECKO0-NEXT: mov x1, xzr
109+
; CHECKO0-NEXT: // kill: def $s0 killed $s0 killed $q0
110+
; CHECKO0-NEXT: fmov w0, s0
111+
; CHECKO0-NEXT: mov w8, wzr
112+
; CHECKO0-NEXT: // kill: def $x0 killed $w0
113+
; CHECKO0-NEXT: // kill: def $x8 killed $w8
114+
; CHECKO0-NEXT: bfi x0, x8, #32, #32
115+
; CHECKO0-NEXT: ret
116+
;
70117
; CHECK-LABEL: popcount1x128:
71118
; CHECK: // %bb.0: // %Entry
72-
; CHECK-NEXT: // implicit-def: $q0
73-
; CHECK-NEXT: mov v0.d[0], x0
74-
; CHECK-NEXT: mov v0.d[1], x1
75-
; CHECK-NEXT: cnt v0.16b, v0.16b
76-
; CHECK-NEXT: uaddlv h0, v0.16b
77-
; CHECK-NEXT: // kill: def $q0 killed $h0
78-
; CHECK-NEXT: mov x1, xzr
79-
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
80-
; CHECK-NEXT: fmov w0, s0
81-
; CHECK-NEXT: mov w8, wzr
82-
; CHECK-NEXT: // kill: def $x0 killed $w0
83-
; CHECK-NEXT: // kill: def $x8 killed $w8
84-
; CHECK-NEXT: bfi x0, x8, #32, #32
119+
; CHECK-NEXT: fmov d1, x0
120+
; CHECK-NEXT: movi v0.2d, #0000000000000000
121+
; CHECK-NEXT: mov v1.d[1], x1
122+
; CHECK-NEXT: cnt v1.16b, v1.16b
123+
; CHECK-NEXT: uaddlv h1, v1.16b
124+
; CHECK-NEXT: mov v0.s[0], v1.s[0]
125+
; CHECK-NEXT: mov x1, v0.d[1]
126+
; CHECK-NEXT: fmov x0, d0
85127
; CHECK-NEXT: ret
86128
Entry:
87-
%1 = tail call <1 x i128> @llvm.ctpop.v1.i128(<1 x i128> %0)
129+
%1 = tail call <1 x i128> @llvm.ctpop.v1i128(<1 x i128> %0)
88130
ret <1 x i128> %1
89131
}
90132

91-
declare <1 x i128> @llvm.ctpop.v1.i128(<1 x i128>)
133+
declare <1 x i128> @llvm.ctpop.v1i128(<1 x i128>)
134+
135+
define <2 x i64> @popcount2x64(<2 x i64> %0) {
136+
; CHECKO0-LABEL: popcount2x64:
137+
; CHECKO0: // %bb.0: // %Entry
138+
; CHECKO0-NEXT: cnt v0.16b, v0.16b
139+
; CHECKO0-NEXT: uaddlp v0.8h, v0.16b
140+
; CHECKO0-NEXT: uaddlp v0.4s, v0.8h
141+
; CHECKO0-NEXT: uaddlp v0.2d, v0.4s
142+
; CHECKO0-NEXT: ret
143+
;
144+
; NEON-LABEL: popcount2x64:
145+
; NEON: // %bb.0: // %Entry
146+
; NEON-NEXT: cnt v0.16b, v0.16b
147+
; NEON-NEXT: uaddlp v0.8h, v0.16b
148+
; NEON-NEXT: uaddlp v0.4s, v0.8h
149+
; NEON-NEXT: uaddlp v0.2d, v0.4s
150+
; NEON-NEXT: ret
151+
;
152+
; DOT-LABEL: popcount2x64:
153+
; DOT: // %bb.0: // %Entry
154+
; DOT-NEXT: movi v1.16b, #1
155+
; DOT-NEXT: cnt v0.16b, v0.16b
156+
; DOT-NEXT: movi v2.2d, #0000000000000000
157+
; DOT-NEXT: udot v2.4s, v1.16b, v0.16b
158+
; DOT-NEXT: uaddlp v0.2d, v2.4s
159+
; DOT-NEXT: ret
160+
;
161+
; SVE-LABEL: popcount2x64:
162+
; SVE: // %bb.0: // %Entry
163+
; SVE-NEXT: cnt v0.16b, v0.16b
164+
; SVE-NEXT: uaddlp v0.8h, v0.16b
165+
; SVE-NEXT: uaddlp v0.4s, v0.8h
166+
; SVE-NEXT: uaddlp v0.2d, v0.4s
167+
; SVE-NEXT: ret
168+
Entry:
169+
%1 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
170+
ret <2 x i64> %1
171+
}
172+
173+
declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
174+
175+
define <1 x i64> @popcount1x64(<1 x i64> %0) {
176+
; CHECKO0-LABEL: popcount1x64:
177+
; CHECKO0: // %bb.0: // %Entry
178+
; CHECKO0-NEXT: fmov x0, d0
179+
; CHECKO0-NEXT: fmov d0, x0
180+
; CHECKO0-NEXT: cnt v0.8b, v0.8b
181+
; CHECKO0-NEXT: uaddlv h0, v0.8b
182+
; CHECKO0-NEXT: // kill: def $q0 killed $h0
183+
; CHECKO0-NEXT: mov w8, v0.s[0]
184+
; CHECKO0-NEXT: // kill: def $x8 killed $w8
185+
; CHECKO0-NEXT: fmov d0, x8
186+
; CHECKO0-NEXT: ret
187+
;
188+
; CHECK-LABEL: popcount1x64:
189+
; CHECK: // %bb.0: // %Entry
190+
; CHECK-NEXT: cnt v0.8b, v0.8b
191+
; CHECK-NEXT: uaddlp v0.4h, v0.8b
192+
; CHECK-NEXT: uaddlp v0.2s, v0.4h
193+
; CHECK-NEXT: uaddlp v0.1d, v0.2s
194+
; CHECK-NEXT: ret
195+
Entry:
196+
%1 = tail call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %0)
197+
ret <1 x i64> %1
198+
}
199+
200+
declare <1 x i64> @llvm.ctpop.v1i64(<1 x i64>)
201+
202+
define <4 x i32> @popcount4x32(<4 x i32> %0) {
203+
; CHECKO0-LABEL: popcount4x32:
204+
; CHECKO0: // %bb.0: // %Entry
205+
; CHECKO0-NEXT: cnt v0.16b, v0.16b
206+
; CHECKO0-NEXT: uaddlp v0.8h, v0.16b
207+
; CHECKO0-NEXT: uaddlp v0.4s, v0.8h
208+
; CHECKO0-NEXT: ret
209+
;
210+
; NEON-LABEL: popcount4x32:
211+
; NEON: // %bb.0: // %Entry
212+
; NEON-NEXT: cnt v0.16b, v0.16b
213+
; NEON-NEXT: uaddlp v0.8h, v0.16b
214+
; NEON-NEXT: uaddlp v0.4s, v0.8h
215+
; NEON-NEXT: ret
216+
;
217+
; DOT-LABEL: popcount4x32:
218+
; DOT: // %bb.0: // %Entry
219+
; DOT-NEXT: movi v1.16b, #1
220+
; DOT-NEXT: cnt v2.16b, v0.16b
221+
; DOT-NEXT: movi v0.2d, #0000000000000000
222+
; DOT-NEXT: udot v0.4s, v1.16b, v2.16b
223+
; DOT-NEXT: ret
224+
;
225+
; SVE-LABEL: popcount4x32:
226+
; SVE: // %bb.0: // %Entry
227+
; SVE-NEXT: cnt v0.16b, v0.16b
228+
; SVE-NEXT: uaddlp v0.8h, v0.16b
229+
; SVE-NEXT: uaddlp v0.4s, v0.8h
230+
; SVE-NEXT: ret
231+
Entry:
232+
%1 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
233+
ret <4 x i32> %1
234+
}
235+
236+
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
237+
238+
define <2 x i32> @popcount2x32(<2 x i32> %0) {
239+
; CHECKO0-LABEL: popcount2x32:
240+
; CHECKO0: // %bb.0: // %Entry
241+
; CHECKO0-NEXT: cnt v0.8b, v0.8b
242+
; CHECKO0-NEXT: uaddlp v0.4h, v0.8b
243+
; CHECKO0-NEXT: uaddlp v0.2s, v0.4h
244+
; CHECKO0-NEXT: ret
245+
;
246+
; NEON-LABEL: popcount2x32:
247+
; NEON: // %bb.0: // %Entry
248+
; NEON-NEXT: cnt v0.8b, v0.8b
249+
; NEON-NEXT: uaddlp v0.4h, v0.8b
250+
; NEON-NEXT: uaddlp v0.2s, v0.4h
251+
; NEON-NEXT: ret
252+
;
253+
; DOT-LABEL: popcount2x32:
254+
; DOT: // %bb.0: // %Entry
255+
; DOT-NEXT: movi v1.2d, #0000000000000000
256+
; DOT-NEXT: cnt v0.8b, v0.8b
257+
; DOT-NEXT: movi v2.8b, #1
258+
; DOT-NEXT: udot v1.2s, v2.8b, v0.8b
259+
; DOT-NEXT: fmov d0, d1
260+
; DOT-NEXT: ret
261+
;
262+
; SVE-LABEL: popcount2x32:
263+
; SVE: // %bb.0: // %Entry
264+
; SVE-NEXT: cnt v0.8b, v0.8b
265+
; SVE-NEXT: uaddlp v0.4h, v0.8b
266+
; SVE-NEXT: uaddlp v0.2s, v0.4h
267+
; SVE-NEXT: ret
268+
Entry:
269+
%1 = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %0)
270+
ret <2 x i32> %1
271+
}
272+
273+
declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>)
274+
275+
define <8 x i16> @popcount8x16(<8 x i16> %0) {
276+
; CHECKO0-LABEL: popcount8x16:
277+
; CHECKO0: // %bb.0: // %Entry
278+
; CHECKO0-NEXT: cnt v0.16b, v0.16b
279+
; CHECKO0-NEXT: uaddlp v0.8h, v0.16b
280+
; CHECKO0-NEXT: ret
281+
;
282+
; CHECK-LABEL: popcount8x16:
283+
; CHECK: // %bb.0: // %Entry
284+
; CHECK-NEXT: cnt v0.16b, v0.16b
285+
; CHECK-NEXT: uaddlp v0.8h, v0.16b
286+
; CHECK-NEXT: ret
287+
Entry:
288+
%1 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
289+
ret <8 x i16> %1
290+
}
291+
292+
declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
293+
294+
define <4 x i16> @popcount4x16(<4 x i16> %0) {
295+
; CHECKO0-LABEL: popcount4x16:
296+
; CHECKO0: // %bb.0: // %Entry
297+
; CHECKO0-NEXT: cnt v0.8b, v0.8b
298+
; CHECKO0-NEXT: uaddlp v0.4h, v0.8b
299+
; CHECKO0-NEXT: ret
300+
;
301+
; CHECK-LABEL: popcount4x16:
302+
; CHECK: // %bb.0: // %Entry
303+
; CHECK-NEXT: cnt v0.8b, v0.8b
304+
; CHECK-NEXT: uaddlp v0.4h, v0.8b
305+
; CHECK-NEXT: ret
306+
Entry:
307+
%1 = tail call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %0)
308+
ret <4 x i16> %1
309+
}
310+
311+
declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>)

0 commit comments

Comments
 (0)