Skip to content

Commit 2ee86a1

Browse files
authored
[AArch64][GlobalISel] Improve non-SVE popcount for 32-bit and 64-bit using udot (#96409)
Follow-up for #95881. Use udot instead of a sequence of uaddlp instructions when summing up lanes for popcount.
1 parent 93e0ffa commit 2ee86a1

File tree

2 files changed

+264
-0
lines changed

2 files changed

+264
-0
lines changed

llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1908,6 +1908,31 @@ bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
19081908
auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val);
19091909

19101910
// Sum across lanes.
1911+
1912+
if (ST->hasDotProd() && Ty.isVector() && Ty.getNumElements() >= 2 &&
1913+
Ty.getScalarSizeInBits() != 16) {
1914+
LLT Dt = Ty == LLT::fixed_vector(2, 64) ? LLT::fixed_vector(4, 32) : Ty;
1915+
auto Zeros = MIRBuilder.buildConstant(Dt, 0);
1916+
auto Ones = MIRBuilder.buildConstant(VTy, 1);
1917+
MachineInstrBuilder Sum;
1918+
1919+
if (Ty == LLT::fixed_vector(2, 64)) {
1920+
auto UDOT =
1921+
MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
1922+
Sum = MIRBuilder.buildInstr(AArch64::G_UADDLP, {Ty}, {UDOT});
1923+
} else if (Ty == LLT::fixed_vector(4, 32)) {
1924+
Sum = MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
1925+
} else if (Ty == LLT::fixed_vector(2, 32)) {
1926+
Sum = MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
1927+
} else {
1928+
llvm_unreachable("unexpected vector shape");
1929+
}
1930+
1931+
Sum->getOperand(0).setReg(Dst);
1932+
MI.eraseFromParent();
1933+
return true;
1934+
}
1935+
19111936
Register HSum = CTPOP.getReg(0);
19121937
unsigned Opc;
19131938
SmallVector<LLT> HAddTys;

llvm/test/CodeGen/AArch64/popcount.ll

Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,12 @@
33
; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mattr=+neon | FileCheck %s --check-prefixes=CHECK,NEON
44
; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mattr=+neon,+dotprod | FileCheck %s --check-prefixes=CHECK,DOT
55
; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mattr=+sve | FileCheck %s --check-prefixes=CHECK,SVE
6+
; RUN: llc < %s -global-isel -mtriple=aarch64-unknown-unknown | FileCheck %s --check-prefix=GISEL
7+
; RUN: llc < %s -O0 -global-isel -mtriple=aarch64-unknown-unknown | FileCheck %s --check-prefix=GISELO0
8+
; RUN: llc < %s -global-isel -mtriple=aarch64-unknown-unknown -mattr=+neon | FileCheck %s --check-prefixes=GISEL,NEON-GISEL
9+
; RUN: llc < %s -global-isel -mtriple=aarch64-unknown-unknown -mattr=+neon,+dotprod | FileCheck %s --check-prefixes=GISEL,DOT-GISEL
10+
; RUN: llc < %s -global-isel -mtriple=aarch64-unknown-unknown -mattr=+sve | FileCheck %s --check-prefixes=GISEL,SVE-GISEL
11+
612

713
; Function Attrs: nobuiltin nounwind readonly
814
define i8 @popcount128(ptr nocapture nonnull readonly %0) {
@@ -25,6 +31,24 @@ define i8 @popcount128(ptr nocapture nonnull readonly %0) {
2531
; CHECK-NEXT: uaddlv h0, v0.16b
2632
; CHECK-NEXT: fmov w0, s0
2733
; CHECK-NEXT: ret
34+
;
35+
; GISEL-LABEL: popcount128:
36+
; GISEL: // %bb.0: // %Entry
37+
; GISEL-NEXT: ldr q0, [x0]
38+
; GISEL-NEXT: cnt v0.16b, v0.16b
39+
; GISEL-NEXT: uaddlv h0, v0.16b
40+
; GISEL-NEXT: fmov w0, s0
41+
; GISEL-NEXT: ret
42+
;
43+
; GISELO0-LABEL: popcount128:
44+
; GISELO0: // %bb.0: // %Entry
45+
; GISELO0-NEXT: ldr q0, [x0]
46+
; GISELO0-NEXT: cnt v0.16b, v0.16b
47+
; GISELO0-NEXT: uaddlv h0, v0.16b
48+
; GISELO0-NEXT: // kill: def $q0 killed $h0
49+
; GISELO0-NEXT: // kill: def $s0 killed $s0 killed $q0
50+
; GISELO0-NEXT: fmov w0, s0
51+
; GISELO0-NEXT: ret
2852
Entry:
2953
%1 = load i128, ptr %0, align 16
3054
%2 = tail call i128 @llvm.ctpop.i128(i128 %1)
@@ -86,6 +110,57 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) {
86110
; CHECK-NEXT: fmov w9, s1
87111
; CHECK-NEXT: add w0, w9, w8
88112
; CHECK-NEXT: ret
113+
;
114+
; GISEL-LABEL: popcount256:
115+
; GISEL: // %bb.0: // %Entry
116+
; GISEL-NEXT: ldp x8, x9, [x0, #16]
117+
; GISEL-NEXT: mov v0.d[0], x8
118+
; GISEL-NEXT: ldp x8, x10, [x0]
119+
; GISEL-NEXT: mov v1.d[0], x8
120+
; GISEL-NEXT: mov v0.d[1], x9
121+
; GISEL-NEXT: mov v1.d[1], x10
122+
; GISEL-NEXT: cnt v0.16b, v0.16b
123+
; GISEL-NEXT: cnt v1.16b, v1.16b
124+
; GISEL-NEXT: uaddlv h0, v0.16b
125+
; GISEL-NEXT: uaddlv h1, v1.16b
126+
; GISEL-NEXT: mov w8, v0.s[0]
127+
; GISEL-NEXT: fmov w9, s1
128+
; GISEL-NEXT: add x0, x8, w9, uxtw
129+
; GISEL-NEXT: // kill: def $w0 killed $w0 killed $x0
130+
; GISEL-NEXT: ret
131+
;
132+
; GISELO0-LABEL: popcount256:
133+
; GISELO0: // %bb.0: // %Entry
134+
; GISELO0-NEXT: ldr x11, [x0]
135+
; GISELO0-NEXT: ldr x10, [x0, #8]
136+
; GISELO0-NEXT: ldr x9, [x0, #16]
137+
; GISELO0-NEXT: ldr x8, [x0, #24]
138+
; GISELO0-NEXT: // implicit-def: $q1
139+
; GISELO0-NEXT: mov v1.d[0], x11
140+
; GISELO0-NEXT: mov v1.d[1], x10
141+
; GISELO0-NEXT: // implicit-def: $q0
142+
; GISELO0-NEXT: mov v0.d[0], x9
143+
; GISELO0-NEXT: mov v0.d[1], x8
144+
; GISELO0-NEXT: cnt v1.16b, v1.16b
145+
; GISELO0-NEXT: uaddlv h1, v1.16b
146+
; GISELO0-NEXT: // kill: def $q1 killed $h1
147+
; GISELO0-NEXT: // kill: def $s1 killed $s1 killed $q1
148+
; GISELO0-NEXT: fmov w0, s1
149+
; GISELO0-NEXT: mov w10, wzr
150+
; GISELO0-NEXT: mov w9, w0
151+
; GISELO0-NEXT: mov w8, w10
152+
; GISELO0-NEXT: bfi x9, x8, #32, #32
153+
; GISELO0-NEXT: cnt v0.16b, v0.16b
154+
; GISELO0-NEXT: uaddlv h0, v0.16b
155+
; GISELO0-NEXT: // kill: def $q0 killed $h0
156+
; GISELO0-NEXT: // kill: def $s0 killed $s0 killed $q0
157+
; GISELO0-NEXT: fmov w0, s0
158+
; GISELO0-NEXT: mov w8, w0
159+
; GISELO0-NEXT: // kill: def $x10 killed $w10
160+
; GISELO0-NEXT: bfi x8, x10, #32, #32
161+
; GISELO0-NEXT: adds x8, x8, x9
162+
; GISELO0-NEXT: mov w0, w8
163+
; GISELO0-NEXT: ret
89164
Entry:
90165
%1 = load i256, ptr %0, align 16
91166
%2 = tail call i256 @llvm.ctpop.i256(i256 %1)
@@ -125,6 +200,33 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) {
125200
; CHECK-NEXT: mov x1, v0.d[1]
126201
; CHECK-NEXT: fmov x0, d0
127202
; CHECK-NEXT: ret
203+
;
204+
; GISEL-LABEL: popcount1x128:
205+
; GISEL: // %bb.0: // %Entry
206+
; GISEL-NEXT: mov v0.d[0], x0
207+
; GISEL-NEXT: mov v0.d[1], x1
208+
; GISEL-NEXT: mov x1, xzr
209+
; GISEL-NEXT: cnt v0.16b, v0.16b
210+
; GISEL-NEXT: uaddlv h0, v0.16b
211+
; GISEL-NEXT: mov w0, v0.s[0]
212+
; GISEL-NEXT: ret
213+
;
214+
; GISELO0-LABEL: popcount1x128:
215+
; GISELO0: // %bb.0: // %Entry
216+
; GISELO0-NEXT: // implicit-def: $q0
217+
; GISELO0-NEXT: mov v0.d[0], x0
218+
; GISELO0-NEXT: mov v0.d[1], x1
219+
; GISELO0-NEXT: cnt v0.16b, v0.16b
220+
; GISELO0-NEXT: uaddlv h0, v0.16b
221+
; GISELO0-NEXT: // kill: def $q0 killed $h0
222+
; GISELO0-NEXT: mov x1, xzr
223+
; GISELO0-NEXT: // kill: def $s0 killed $s0 killed $q0
224+
; GISELO0-NEXT: fmov w0, s0
225+
; GISELO0-NEXT: mov w8, wzr
226+
; GISELO0-NEXT: // kill: def $x0 killed $w0
227+
; GISELO0-NEXT: // kill: def $x8 killed $w8
228+
; GISELO0-NEXT: bfi x0, x8, #32, #32
229+
; GISELO0-NEXT: ret
128230
Entry:
129231
%1 = tail call <1 x i128> @llvm.ctpop.v1i128(<1 x i128> %0)
130232
ret <1 x i128> %1
@@ -165,6 +267,39 @@ define <2 x i64> @popcount2x64(<2 x i64> %0) {
165267
; SVE-NEXT: uaddlp v0.4s, v0.8h
166268
; SVE-NEXT: uaddlp v0.2d, v0.4s
167269
; SVE-NEXT: ret
270+
;
271+
; GISELO0-LABEL: popcount2x64:
272+
; GISELO0: // %bb.0: // %Entry
273+
; GISELO0-NEXT: cnt v0.16b, v0.16b
274+
; GISELO0-NEXT: uaddlp v0.8h, v0.16b
275+
; GISELO0-NEXT: uaddlp v0.4s, v0.8h
276+
; GISELO0-NEXT: uaddlp v0.2d, v0.4s
277+
; GISELO0-NEXT: ret
278+
;
279+
; NEON-GISEL-LABEL: popcount2x64:
280+
; NEON-GISEL: // %bb.0: // %Entry
281+
; NEON-GISEL-NEXT: cnt v0.16b, v0.16b
282+
; NEON-GISEL-NEXT: uaddlp v0.8h, v0.16b
283+
; NEON-GISEL-NEXT: uaddlp v0.4s, v0.8h
284+
; NEON-GISEL-NEXT: uaddlp v0.2d, v0.4s
285+
; NEON-GISEL-NEXT: ret
286+
;
287+
; DOT-GISEL-LABEL: popcount2x64:
288+
; DOT-GISEL: // %bb.0: // %Entry
289+
; DOT-GISEL-NEXT: movi v1.2d, #0000000000000000
290+
; DOT-GISEL-NEXT: cnt v0.16b, v0.16b
291+
; DOT-GISEL-NEXT: movi v2.16b, #1
292+
; DOT-GISEL-NEXT: udot v1.4s, v2.16b, v0.16b
293+
; DOT-GISEL-NEXT: uaddlp v0.2d, v1.4s
294+
; DOT-GISEL-NEXT: ret
295+
;
296+
; SVE-GISEL-LABEL: popcount2x64:
297+
; SVE-GISEL: // %bb.0: // %Entry
298+
; SVE-GISEL-NEXT: cnt v0.16b, v0.16b
299+
; SVE-GISEL-NEXT: uaddlp v0.8h, v0.16b
300+
; SVE-GISEL-NEXT: uaddlp v0.4s, v0.8h
301+
; SVE-GISEL-NEXT: uaddlp v0.2d, v0.4s
302+
; SVE-GISEL-NEXT: ret
168303
Entry:
169304
%1 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
170305
ret <2 x i64> %1
@@ -192,6 +327,26 @@ define <1 x i64> @popcount1x64(<1 x i64> %0) {
192327
; CHECK-NEXT: uaddlp v0.2s, v0.4h
193328
; CHECK-NEXT: uaddlp v0.1d, v0.2s
194329
; CHECK-NEXT: ret
330+
;
331+
; GISEL-LABEL: popcount1x64:
332+
; GISEL: // %bb.0: // %Entry
333+
; GISEL-NEXT: cnt v0.8b, v0.8b
334+
; GISEL-NEXT: uaddlv h0, v0.8b
335+
; GISEL-NEXT: mov w8, v0.s[0]
336+
; GISEL-NEXT: fmov d0, x8
337+
; GISEL-NEXT: ret
338+
;
339+
; GISELO0-LABEL: popcount1x64:
340+
; GISELO0: // %bb.0: // %Entry
341+
; GISELO0-NEXT: fmov x0, d0
342+
; GISELO0-NEXT: fmov d0, x0
343+
; GISELO0-NEXT: cnt v0.8b, v0.8b
344+
; GISELO0-NEXT: uaddlv h0, v0.8b
345+
; GISELO0-NEXT: // kill: def $q0 killed $h0
346+
; GISELO0-NEXT: mov w8, v0.s[0]
347+
; GISELO0-NEXT: // kill: def $x8 killed $w8
348+
; GISELO0-NEXT: fmov d0, x8
349+
; GISELO0-NEXT: ret
195350
Entry:
196351
%1 = tail call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %0)
197352
ret <1 x i64> %1
@@ -228,6 +383,36 @@ define <4 x i32> @popcount4x32(<4 x i32> %0) {
228383
; SVE-NEXT: uaddlp v0.8h, v0.16b
229384
; SVE-NEXT: uaddlp v0.4s, v0.8h
230385
; SVE-NEXT: ret
386+
;
387+
; GISELO0-LABEL: popcount4x32:
388+
; GISELO0: // %bb.0: // %Entry
389+
; GISELO0-NEXT: cnt v0.16b, v0.16b
390+
; GISELO0-NEXT: uaddlp v0.8h, v0.16b
391+
; GISELO0-NEXT: uaddlp v0.4s, v0.8h
392+
; GISELO0-NEXT: ret
393+
;
394+
; NEON-GISEL-LABEL: popcount4x32:
395+
; NEON-GISEL: // %bb.0: // %Entry
396+
; NEON-GISEL-NEXT: cnt v0.16b, v0.16b
397+
; NEON-GISEL-NEXT: uaddlp v0.8h, v0.16b
398+
; NEON-GISEL-NEXT: uaddlp v0.4s, v0.8h
399+
; NEON-GISEL-NEXT: ret
400+
;
401+
; DOT-GISEL-LABEL: popcount4x32:
402+
; DOT-GISEL: // %bb.0: // %Entry
403+
; DOT-GISEL-NEXT: movi v1.2d, #0000000000000000
404+
; DOT-GISEL-NEXT: cnt v0.16b, v0.16b
405+
; DOT-GISEL-NEXT: movi v2.16b, #1
406+
; DOT-GISEL-NEXT: udot v1.4s, v2.16b, v0.16b
407+
; DOT-GISEL-NEXT: mov v0.16b, v1.16b
408+
; DOT-GISEL-NEXT: ret
409+
;
410+
; SVE-GISEL-LABEL: popcount4x32:
411+
; SVE-GISEL: // %bb.0: // %Entry
412+
; SVE-GISEL-NEXT: cnt v0.16b, v0.16b
413+
; SVE-GISEL-NEXT: uaddlp v0.8h, v0.16b
414+
; SVE-GISEL-NEXT: uaddlp v0.4s, v0.8h
415+
; SVE-GISEL-NEXT: ret
231416
Entry:
232417
%1 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
233418
ret <4 x i32> %1
@@ -265,6 +450,36 @@ define <2 x i32> @popcount2x32(<2 x i32> %0) {
265450
; SVE-NEXT: uaddlp v0.4h, v0.8b
266451
; SVE-NEXT: uaddlp v0.2s, v0.4h
267452
; SVE-NEXT: ret
453+
;
454+
; GISELO0-LABEL: popcount2x32:
455+
; GISELO0: // %bb.0: // %Entry
456+
; GISELO0-NEXT: cnt v0.8b, v0.8b
457+
; GISELO0-NEXT: uaddlp v0.4h, v0.8b
458+
; GISELO0-NEXT: uaddlp v0.2s, v0.4h
459+
; GISELO0-NEXT: ret
460+
;
461+
; NEON-GISEL-LABEL: popcount2x32:
462+
; NEON-GISEL: // %bb.0: // %Entry
463+
; NEON-GISEL-NEXT: cnt v0.8b, v0.8b
464+
; NEON-GISEL-NEXT: uaddlp v0.4h, v0.8b
465+
; NEON-GISEL-NEXT: uaddlp v0.2s, v0.4h
466+
; NEON-GISEL-NEXT: ret
467+
;
468+
; DOT-GISEL-LABEL: popcount2x32:
469+
; DOT-GISEL: // %bb.0: // %Entry
470+
; DOT-GISEL-NEXT: movi v1.2d, #0000000000000000
471+
; DOT-GISEL-NEXT: cnt v0.8b, v0.8b
472+
; DOT-GISEL-NEXT: movi v2.8b, #1
473+
; DOT-GISEL-NEXT: udot v1.2s, v2.8b, v0.8b
474+
; DOT-GISEL-NEXT: fmov d0, d1
475+
; DOT-GISEL-NEXT: ret
476+
;
477+
; SVE-GISEL-LABEL: popcount2x32:
478+
; SVE-GISEL: // %bb.0: // %Entry
479+
; SVE-GISEL-NEXT: cnt v0.8b, v0.8b
480+
; SVE-GISEL-NEXT: uaddlp v0.4h, v0.8b
481+
; SVE-GISEL-NEXT: uaddlp v0.2s, v0.4h
482+
; SVE-GISEL-NEXT: ret
268483
Entry:
269484
%1 = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %0)
270485
ret <2 x i32> %1
@@ -284,6 +499,18 @@ define <8 x i16> @popcount8x16(<8 x i16> %0) {
284499
; CHECK-NEXT: cnt v0.16b, v0.16b
285500
; CHECK-NEXT: uaddlp v0.8h, v0.16b
286501
; CHECK-NEXT: ret
502+
;
503+
; GISEL-LABEL: popcount8x16:
504+
; GISEL: // %bb.0: // %Entry
505+
; GISEL-NEXT: cnt v0.16b, v0.16b
506+
; GISEL-NEXT: uaddlp v0.8h, v0.16b
507+
; GISEL-NEXT: ret
508+
;
509+
; GISELO0-LABEL: popcount8x16:
510+
; GISELO0: // %bb.0: // %Entry
511+
; GISELO0-NEXT: cnt v0.16b, v0.16b
512+
; GISELO0-NEXT: uaddlp v0.8h, v0.16b
513+
; GISELO0-NEXT: ret
287514
Entry:
288515
%1 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
289516
ret <8 x i16> %1
@@ -303,6 +530,18 @@ define <4 x i16> @popcount4x16(<4 x i16> %0) {
303530
; CHECK-NEXT: cnt v0.8b, v0.8b
304531
; CHECK-NEXT: uaddlp v0.4h, v0.8b
305532
; CHECK-NEXT: ret
533+
;
534+
; GISEL-LABEL: popcount4x16:
535+
; GISEL: // %bb.0: // %Entry
536+
; GISEL-NEXT: cnt v0.8b, v0.8b
537+
; GISEL-NEXT: uaddlp v0.4h, v0.8b
538+
; GISEL-NEXT: ret
539+
;
540+
; GISELO0-LABEL: popcount4x16:
541+
; GISELO0: // %bb.0: // %Entry
542+
; GISELO0-NEXT: cnt v0.8b, v0.8b
543+
; GISELO0-NEXT: uaddlp v0.4h, v0.8b
544+
; GISELO0-NEXT: ret
306545
Entry:
307546
%1 = tail call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %0)
308547
ret <4 x i16> %1

0 commit comments

Comments
 (0)