Skip to content

Commit 7f292b8

Browse files
author
Rin Dobrescu
authored
[AArch64] Convert concat(uhadd(a,b), uhadd(c,d)) to uhadd(concat(a,c), concat(b,d)) (#80674)
We can convert concat(v4i16 uhadd(a,b), v4i16 uhadd(c,d)) to v8i16 uhadd(concat(a,c), concat(b,d)), which can lead to further simplifications.
1 parent c302909 commit 7f292b8

File tree

3 files changed

+190
-104
lines changed

3 files changed

+190
-104
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 11 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -18299,50 +18299,23 @@ static SDValue performConcatVectorsCombine(SDNode *N,
1829918299
if (DCI.isBeforeLegalizeOps())
1830018300
return SDValue();
1830118301

18302-
// Optimise concat_vectors of two [us]avgceils or [us]avgfloors that use
18303-
// extracted subvectors from the same original vectors. Combine these into a
18304-
// single avg that operates on the two original vectors.
18305-
// avgceil is the target independant name for rhadd, avgfloor is a hadd.
18306-
// Example:
18307-
// (concat_vectors (v8i8 (avgceils (extract_subvector (v16i8 OpA, <0>),
18308-
// extract_subvector (v16i8 OpB, <0>))),
18309-
// (v8i8 (avgceils (extract_subvector (v16i8 OpA, <8>),
18310-
// extract_subvector (v16i8 OpB, <8>)))))
18311-
// ->
18312-
// (v16i8(avgceils(v16i8 OpA, v16i8 OpB)))
18313-
if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
18302+
// Optimise concat_vectors of two [us]avgceils or [us]avgfloors with a 128-bit
18303+
// destination size, combine into an avg of two concats of the source
18304+
// vectors. eg: concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c),
18305+
// concat(b, d))
18306+
if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
1831418307
(N0Opc == ISD::AVGCEILU || N0Opc == ISD::AVGCEILS ||
18315-
N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS)) {
18308+
N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS) &&
18309+
N0->hasOneUse() && N1->hasOneUse()) {
1831618310
SDValue N00 = N0->getOperand(0);
1831718311
SDValue N01 = N0->getOperand(1);
1831818312
SDValue N10 = N1->getOperand(0);
1831918313
SDValue N11 = N1->getOperand(1);
1832018314

18321-
EVT N00VT = N00.getValueType();
18322-
EVT N10VT = N10.getValueType();
18323-
18324-
if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
18325-
N01->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
18326-
N10->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
18327-
N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) {
18328-
SDValue N00Source = N00->getOperand(0);
18329-
SDValue N01Source = N01->getOperand(0);
18330-
SDValue N10Source = N10->getOperand(0);
18331-
SDValue N11Source = N11->getOperand(0);
18332-
18333-
if (N00Source == N10Source && N01Source == N11Source &&
18334-
N00Source.getValueType() == VT && N01Source.getValueType() == VT) {
18335-
assert(N0.getValueType() == N1.getValueType());
18336-
18337-
uint64_t N00Index = N00.getConstantOperandVal(1);
18338-
uint64_t N01Index = N01.getConstantOperandVal(1);
18339-
uint64_t N10Index = N10.getConstantOperandVal(1);
18340-
uint64_t N11Index = N11.getConstantOperandVal(1);
18341-
18342-
if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 &&
18343-
N10Index == N00VT.getVectorNumElements())
18344-
return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source);
18345-
}
18315+
if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
18316+
SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N00, N10);
18317+
SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N01, N11);
18318+
return DAG.getNode(N0Opc, dl, VT, Concat0, Concat1);
1834618319
}
1834718320
}
1834818321

llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll

Lines changed: 27 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,75 +1,36 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
22
; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
33

4-
define i32 @lower_lshr(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h) {
5-
; CHECK-LABEL: lower_lshr:
6-
; CHECK: // %bb.0:
7-
; CHECK-NEXT: addv s0, v0.4s
8-
; CHECK-NEXT: addv s1, v1.4s
9-
; CHECK-NEXT: addv s4, v4.4s
10-
; CHECK-NEXT: addv s5, v5.4s
11-
; CHECK-NEXT: addv s2, v2.4s
12-
; CHECK-NEXT: addv s6, v6.4s
13-
; CHECK-NEXT: mov v0.s[1], v1.s[0]
14-
; CHECK-NEXT: addv s1, v3.4s
15-
; CHECK-NEXT: addv s3, v7.4s
16-
; CHECK-NEXT: mov v4.s[1], v5.s[0]
17-
; CHECK-NEXT: mov v0.s[2], v2.s[0]
18-
; CHECK-NEXT: mov v4.s[2], v6.s[0]
19-
; CHECK-NEXT: mov v0.s[3], v1.s[0]
20-
; CHECK-NEXT: mov v4.s[3], v3.s[0]
21-
; CHECK-NEXT: xtn v1.4h, v0.4s
22-
; CHECK-NEXT: shrn v0.4h, v0.4s, #16
23-
; CHECK-NEXT: xtn v2.4h, v4.4s
24-
; CHECK-NEXT: shrn v3.4h, v4.4s, #16
25-
; CHECK-NEXT: uhadd v0.4h, v1.4h, v0.4h
26-
; CHECK-NEXT: uhadd v1.4h, v2.4h, v3.4h
27-
; CHECK-NEXT: mov v0.d[1], v1.d[0]
28-
; CHECK-NEXT: uaddlv s0, v0.8h
29-
; CHECK-NEXT: fmov w0, s0
30-
; CHECK-NEXT: ret
31-
%l87 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
32-
%l174 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
33-
%l257 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %c)
34-
%l340 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %d)
35-
%l427 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %e)
36-
%l514 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %f)
37-
%l597 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %g)
38-
%l680 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %h)
39-
%l681 = insertelement <8 x i32> poison, i32 %l87, i32 0
40-
%l682 = insertelement <8 x i32> %l681, i32 %l174, i32 1
41-
%l683 = insertelement <8 x i32> %l682, i32 %l257, i32 2
42-
%l684 = insertelement <8 x i32> %l683, i32 %l340, i32 3
43-
%l685 = insertelement <8 x i32> %l684, i32 %l427, i32 4
44-
%l686 = insertelement <8 x i32> %l685, i32 %l514, i32 5
45-
%l687 = insertelement <8 x i32> %l686, i32 %l597, i32 6
46-
%l688 = insertelement <8 x i32> %l687, i32 %l680, i32 7
47-
%l689 = and <8 x i32> %l688, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
48-
%l690 = lshr <8 x i32> %l688, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
49-
%l691 = add nuw nsw <8 x i32> %l689, %l690
50-
%l692 = lshr <8 x i32> %l691, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
51-
%l693 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %l692)
52-
ret i32 %l693
53-
}
54-
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
55-
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
56-
574
define <16 x i8> @lower_trunc_16xi8(i16 %a, i16 %b, i16 %c, i16 %d, i16 %e, i16 %f, i16 %g, i16 %h, i16 %i, i16 %j, i16 %k, i16 %l, i16 %m, i16 %n, i16 %o, i16 %p) {
585
; CHECK-LABEL: lower_trunc_16xi8:
596
; CHECK: // %bb.0:
607
; CHECK-NEXT: fmov s0, w0
61-
; CHECK-NEXT: add x8, sp, #56
62-
; CHECK-NEXT: ld1r { v1.8h }, [x8]
8+
; CHECK-NEXT: ldr h1, [sp]
9+
; CHECK-NEXT: add x8, sp, #8
10+
; CHECK-NEXT: ld1 { v1.h }[1], [x8]
11+
; CHECK-NEXT: add x8, sp, #16
6312
; CHECK-NEXT: mov v0.h[1], w1
64-
; CHECK-NEXT: add v3.8h, v1.8h, v1.8h
13+
; CHECK-NEXT: ld1 { v1.h }[2], [x8]
14+
; CHECK-NEXT: add x8, sp, #24
6515
; CHECK-NEXT: mov v0.h[2], w2
16+
; CHECK-NEXT: ld1 { v1.h }[3], [x8]
17+
; CHECK-NEXT: add x8, sp, #32
6618
; CHECK-NEXT: mov v0.h[3], w3
19+
; CHECK-NEXT: ld1 { v1.h }[4], [x8]
20+
; CHECK-NEXT: add x8, sp, #40
21+
; CHECK-NEXT: ld1 { v1.h }[5], [x8]
22+
; CHECK-NEXT: add x8, sp, #48
6723
; CHECK-NEXT: mov v0.h[4], w4
24+
; CHECK-NEXT: ld1 { v1.h }[6], [x8]
25+
; CHECK-NEXT: add x8, sp, #56
6826
; CHECK-NEXT: mov v0.h[5], w5
27+
; CHECK-NEXT: ld1 { v1.h }[7], [x8]
6928
; CHECK-NEXT: mov v0.h[6], w6
70-
; CHECK-NEXT: add v2.8h, v0.8h, v0.8h
29+
; CHECK-NEXT: add v2.8h, v1.8h, v1.8h
30+
; CHECK-NEXT: mov v0.h[7], w7
31+
; CHECK-NEXT: add v3.8h, v0.8h, v0.8h
7132
; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b
72-
; CHECK-NEXT: uzp1 v1.16b, v2.16b, v3.16b
33+
; CHECK-NEXT: uzp1 v1.16b, v3.16b, v2.16b
7334
; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
7435
; CHECK-NEXT: ret
7536
%a1 = insertelement <16 x i16> poison, i16 %a, i16 0
@@ -80,14 +41,14 @@ define <16 x i8> @lower_trunc_16xi8(i16 %a, i16 %b, i16 %c, i16 %d, i16 %e, i16
8041
%f1 = insertelement <16 x i16> %e1, i16 %f, i16 5
8142
%g1 = insertelement <16 x i16> %f1, i16 %g, i16 6
8243
%h1 = insertelement <16 x i16> %g1, i16 %h, i16 7
83-
%i1 = insertelement <16 x i16> %f1, i16 %i, i16 8
84-
%j1 = insertelement <16 x i16> %g1, i16 %j, i16 9
85-
%k1 = insertelement <16 x i16> %f1, i16 %k, i16 10
86-
%l1 = insertelement <16 x i16> %g1, i16 %l, i16 11
87-
%m1 = insertelement <16 x i16> %f1, i16 %m, i16 12
88-
%n1 = insertelement <16 x i16> %g1, i16 %n, i16 13
89-
%o1 = insertelement <16 x i16> %f1, i16 %o, i16 14
90-
%p1 = insertelement <16 x i16> %g1, i16 %p, i16 15
44+
%i1 = insertelement <16 x i16> %h1, i16 %i, i16 8
45+
%j1 = insertelement <16 x i16> %i1, i16 %j, i16 9
46+
%k1 = insertelement <16 x i16> %j1, i16 %k, i16 10
47+
%l1 = insertelement <16 x i16> %k1, i16 %l, i16 11
48+
%m1 = insertelement <16 x i16> %l1, i16 %m, i16 12
49+
%n1 = insertelement <16 x i16> %m1, i16 %n, i16 13
50+
%o1 = insertelement <16 x i16> %n1, i16 %o, i16 14
51+
%p1 = insertelement <16 x i16> %o1, i16 %p, i16 15
9152
%t = trunc <16 x i16> %p1 to <16 x i8>
9253
%s = add <16 x i16> %p1, %p1
9354
%t2 = trunc <16 x i16> %s to <16 x i8>
Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2+
; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
3+
4+
define i16 @combine_add_16xi16(i16 %a, i16 %b, i16 %c, i16 %d, i16 %e, i16 %f, i16 %g, i16 %h, i16 %i, i16 %j, i16 %k, i16 %l, i16 %m, i16 %n, i16 %o, i16 %p) {
5+
; CHECK-LABEL: combine_add_16xi16:
6+
; CHECK: // %bb.0:
7+
; CHECK-NEXT: fmov s0, w0
8+
; CHECK-NEXT: ldr h1, [sp]
9+
; CHECK-NEXT: add x8, sp, #8
10+
; CHECK-NEXT: ld1 { v1.h }[1], [x8]
11+
; CHECK-NEXT: add x8, sp, #16
12+
; CHECK-NEXT: mov v0.h[1], w1
13+
; CHECK-NEXT: ld1 { v1.h }[2], [x8]
14+
; CHECK-NEXT: add x8, sp, #24
15+
; CHECK-NEXT: mov v0.h[2], w2
16+
; CHECK-NEXT: ld1 { v1.h }[3], [x8]
17+
; CHECK-NEXT: add x8, sp, #32
18+
; CHECK-NEXT: mov v0.h[3], w3
19+
; CHECK-NEXT: ld1 { v1.h }[4], [x8]
20+
; CHECK-NEXT: add x8, sp, #40
21+
; CHECK-NEXT: ld1 { v1.h }[5], [x8]
22+
; CHECK-NEXT: add x8, sp, #48
23+
; CHECK-NEXT: mov v0.h[4], w4
24+
; CHECK-NEXT: ld1 { v1.h }[6], [x8]
25+
; CHECK-NEXT: add x8, sp, #56
26+
; CHECK-NEXT: mov v0.h[5], w5
27+
; CHECK-NEXT: ld1 { v1.h }[7], [x8]
28+
; CHECK-NEXT: mov v0.h[6], w6
29+
; CHECK-NEXT: mov v0.h[7], w7
30+
; CHECK-NEXT: uzp2 v2.16b, v0.16b, v1.16b
31+
; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b
32+
; CHECK-NEXT: uhadd v0.16b, v0.16b, v2.16b
33+
; CHECK-NEXT: uaddlv h0, v0.16b
34+
; CHECK-NEXT: umov w0, v0.h[0]
35+
; CHECK-NEXT: ret
36+
%a1 = insertelement <16 x i16> poison, i16 %a, i16 0
37+
%b1 = insertelement <16 x i16> %a1, i16 %b, i16 1
38+
%c1 = insertelement <16 x i16> %b1, i16 %c, i16 2
39+
%d1 = insertelement <16 x i16> %c1, i16 %d, i16 3
40+
%e1 = insertelement <16 x i16> %d1, i16 %e, i16 4
41+
%f1 = insertelement <16 x i16> %e1, i16 %f, i16 5
42+
%g1 = insertelement <16 x i16> %f1, i16 %g, i16 6
43+
%h1 = insertelement <16 x i16> %g1, i16 %h, i16 7
44+
%i1 = insertelement <16 x i16> %h1, i16 %i, i16 8
45+
%j1 = insertelement <16 x i16> %i1, i16 %j, i16 9
46+
%k1 = insertelement <16 x i16> %j1, i16 %k, i16 10
47+
%l1 = insertelement <16 x i16> %k1, i16 %l, i16 11
48+
%m1 = insertelement <16 x i16> %l1, i16 %m, i16 12
49+
%n1 = insertelement <16 x i16> %m1, i16 %n, i16 13
50+
%o1 = insertelement <16 x i16> %n1, i16 %o, i16 14
51+
%p1 = insertelement <16 x i16> %o1, i16 %p, i16 15
52+
%x = and <16 x i16> %p1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
53+
%sh1 = lshr <16 x i16> %p1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
54+
%s = add nuw nsw <16 x i16> %x, %sh1
55+
%sh2 = lshr <16 x i16> %s, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
56+
%res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %sh2)
57+
ret i16 %res
58+
}
59+
60+
define i32 @combine_add_8xi32(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) local_unnamed_addr #0 {
61+
; CHECK-LABEL: combine_add_8xi32:
62+
; CHECK: // %bb.0:
63+
; CHECK-NEXT: fmov s0, w4
64+
; CHECK-NEXT: fmov s1, w0
65+
; CHECK-NEXT: mov v0.s[1], w5
66+
; CHECK-NEXT: mov v1.s[1], w1
67+
; CHECK-NEXT: mov v0.s[2], w6
68+
; CHECK-NEXT: mov v1.s[2], w2
69+
; CHECK-NEXT: mov v0.s[3], w7
70+
; CHECK-NEXT: mov v1.s[3], w3
71+
; CHECK-NEXT: uzp2 v2.8h, v1.8h, v0.8h
72+
; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h
73+
; CHECK-NEXT: uhadd v0.8h, v0.8h, v2.8h
74+
; CHECK-NEXT: uaddlv s0, v0.8h
75+
; CHECK-NEXT: fmov w0, s0
76+
; CHECK-NEXT: ret
77+
%a1 = insertelement <8 x i32> poison, i32 %a, i32 0
78+
%b1 = insertelement <8 x i32> %a1, i32 %b, i32 1
79+
%c1 = insertelement <8 x i32> %b1, i32 %c, i32 2
80+
%d1 = insertelement <8 x i32> %c1, i32 %d, i32 3
81+
%e1 = insertelement <8 x i32> %d1, i32 %e, i32 4
82+
%f1 = insertelement <8 x i32> %e1, i32 %f, i32 5
83+
%g1 = insertelement <8 x i32> %f1, i32 %g, i32 6
84+
%h1 = insertelement <8 x i32> %g1, i32 %h, i32 7
85+
%x = and <8 x i32> %h1, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
86+
%sh1 = lshr <8 x i32> %h1, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
87+
%s = add nuw nsw <8 x i32> %x, %sh1
88+
%sh2 = lshr <8 x i32> %s, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
89+
%res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %sh2)
90+
ret i32 %res
91+
}
92+
93+
define i32 @combine_undef_add_8xi32(i32 %a, i32 %b, i32 %c, i32 %d) local_unnamed_addr #0 {
94+
; CHECK-LABEL: combine_undef_add_8xi32:
95+
; CHECK: // %bb.0:
96+
; CHECK-NEXT: fmov s1, w0
97+
; CHECK-NEXT: movi v0.2d, #0000000000000000
98+
; CHECK-NEXT: mov v1.s[1], w1
99+
; CHECK-NEXT: uhadd v0.4h, v0.4h, v0.4h
100+
; CHECK-NEXT: mov v1.s[2], w2
101+
; CHECK-NEXT: mov v1.s[3], w3
102+
; CHECK-NEXT: xtn v2.4h, v1.4s
103+
; CHECK-NEXT: shrn v1.4h, v1.4s, #16
104+
; CHECK-NEXT: uhadd v1.4h, v2.4h, v1.4h
105+
; CHECK-NEXT: mov v1.d[1], v0.d[0]
106+
; CHECK-NEXT: uaddlv s0, v1.8h
107+
; CHECK-NEXT: fmov w0, s0
108+
; CHECK-NEXT: ret
109+
%a1 = insertelement <8 x i32> poison, i32 %a, i32 0
110+
%b1 = insertelement <8 x i32> %a1, i32 %b, i32 1
111+
%c1 = insertelement <8 x i32> %b1, i32 %c, i32 2
112+
%d1 = insertelement <8 x i32> %c1, i32 %d, i32 3
113+
%e1 = insertelement <8 x i32> %d1, i32 undef, i32 4
114+
%f1 = insertelement <8 x i32> %e1, i32 undef, i32 5
115+
%g1 = insertelement <8 x i32> %f1, i32 undef, i32 6
116+
%h1 = insertelement <8 x i32> %g1, i32 undef, i32 7
117+
%x = and <8 x i32> %h1, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
118+
%sh1 = lshr <8 x i32> %h1, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
119+
%s = add nuw nsw <8 x i32> %x, %sh1
120+
%sh2 = lshr <8 x i32> %s, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
121+
%res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %sh2)
122+
ret i32 %res
123+
}
124+
125+
define i64 @combine_add_4xi64(i64 %a, i64 %b, i64 %c, i64 %d) local_unnamed_addr #0 {
126+
; CHECK-LABEL: combine_add_4xi64:
127+
; CHECK: // %bb.0:
128+
; CHECK-NEXT: fmov d0, x2
129+
; CHECK-NEXT: fmov d1, x0
130+
; CHECK-NEXT: mov v0.d[1], x3
131+
; CHECK-NEXT: mov v1.d[1], x1
132+
; CHECK-NEXT: uzp2 v2.4s, v1.4s, v0.4s
133+
; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s
134+
; CHECK-NEXT: uhadd v0.4s, v0.4s, v2.4s
135+
; CHECK-NEXT: uaddlv d0, v0.4s
136+
; CHECK-NEXT: fmov x0, d0
137+
; CHECK-NEXT: ret
138+
%a1 = insertelement <4 x i64> poison, i64 %a, i64 0
139+
%b1 = insertelement <4 x i64> %a1, i64 %b, i64 1
140+
%c1 = insertelement <4 x i64> %b1, i64 %c, i64 2
141+
%d1 = insertelement <4 x i64> %c1, i64 %d, i64 3
142+
%x = and <4 x i64> %d1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
143+
%sh1 = lshr <4 x i64> %d1, <i64 32, i64 32, i64 32, i64 32>
144+
%s = add nuw nsw <4 x i64> %x, %sh1
145+
%sh2 = lshr <4 x i64> %s, <i64 1, i64 1, i64 1, i64 1>
146+
%res = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %sh2)
147+
ret i64 %res
148+
}
149+
150+
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
151+
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
152+
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)

0 commit comments

Comments
 (0)