Skip to content

Commit 1126bef

Browse files
[AArch64][SVE] Only generate wide adds when SVE2 or StreamingSVE is available (#118838)
1 parent 3dbff90 commit 1126bef

File tree

2 files changed

+81
-32
lines changed

2 files changed

+81
-32
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -21817,7 +21817,7 @@ SDValue tryLowerPartialReductionToWideAdd(SDNode *N,
2181721817
Intrinsic::experimental_vector_partial_reduce_add &&
2181821818
"Expected a partial reduction node");
2181921819

21820-
if (!Subtarget->isSVEorStreamingSVEAvailable())
21820+
if (!Subtarget->hasSVE2() && !Subtarget->isStreamingSVEAvailable())
2182121821
return SDValue();
2182221822

2182321823
SDLoc DL(N);

llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll

Lines changed: 80 additions & 31 deletions
Original file line number · Diff line number · Diff line change
@@ -1,72 +1,121 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2-
; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s
2+
; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE2
3+
; RUN: llc -mtriple=aarch64 -mattr=+sve %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE
34

45
define <vscale x 2 x i64> @signed_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <vscale x 4 x i32> %input){
5-
; CHECK-LABEL: signed_wide_add_nxv4i32:
6-
; CHECK: // %bb.0: // %entry
7-
; CHECK-NEXT: saddwb z0.d, z0.d, z1.s
8-
; CHECK-NEXT: saddwt z0.d, z0.d, z1.s
9-
; CHECK-NEXT: ret
6+
; CHECK-SVE2-LABEL: signed_wide_add_nxv4i32:
7+
; CHECK-SVE2: // %bb.0: // %entry
8+
; CHECK-SVE2-NEXT: saddwb z0.d, z0.d, z1.s
9+
; CHECK-SVE2-NEXT: saddwt z0.d, z0.d, z1.s
10+
; CHECK-SVE2-NEXT: ret
11+
;
12+
; CHECK-SVE-LABEL: signed_wide_add_nxv4i32:
13+
; CHECK-SVE: // %bb.0: // %entry
14+
; CHECK-SVE-NEXT: sunpklo z2.d, z1.s
15+
; CHECK-SVE-NEXT: sunpkhi z1.d, z1.s
16+
; CHECK-SVE-NEXT: add z0.d, z0.d, z2.d
17+
; CHECK-SVE-NEXT: add z0.d, z1.d, z0.d
18+
; CHECK-SVE-NEXT: ret
1019
entry:
1120
%input.wide = sext <vscale x 4 x i32> %input to <vscale x 4 x i64>
1221
%partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64(<vscale x 2 x i64> %acc, <vscale x 4 x i64> %input.wide)
1322
ret <vscale x 2 x i64> %partial.reduce
1423
}
1524

1625
define <vscale x 2 x i64> @unsigned_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <vscale x 4 x i32> %input){
17-
; CHECK-LABEL: unsigned_wide_add_nxv4i32:
18-
; CHECK: // %bb.0: // %entry
19-
; CHECK-NEXT: uaddwb z0.d, z0.d, z1.s
20-
; CHECK-NEXT: uaddwt z0.d, z0.d, z1.s
21-
; CHECK-NEXT: ret
26+
; CHECK-SVE2-LABEL: unsigned_wide_add_nxv4i32:
27+
; CHECK-SVE2: // %bb.0: // %entry
28+
; CHECK-SVE2-NEXT: uaddwb z0.d, z0.d, z1.s
29+
; CHECK-SVE2-NEXT: uaddwt z0.d, z0.d, z1.s
30+
; CHECK-SVE2-NEXT: ret
31+
;
32+
; CHECK-SVE-LABEL: unsigned_wide_add_nxv4i32:
33+
; CHECK-SVE: // %bb.0: // %entry
34+
; CHECK-SVE-NEXT: uunpklo z2.d, z1.s
35+
; CHECK-SVE-NEXT: uunpkhi z1.d, z1.s
36+
; CHECK-SVE-NEXT: add z0.d, z0.d, z2.d
37+
; CHECK-SVE-NEXT: add z0.d, z1.d, z0.d
38+
; CHECK-SVE-NEXT: ret
2239
entry:
2340
%input.wide = zext <vscale x 4 x i32> %input to <vscale x 4 x i64>
2441
%partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64(<vscale x 2 x i64> %acc, <vscale x 4 x i64> %input.wide)
2542
ret <vscale x 2 x i64> %partial.reduce
2643
}
2744

2845
define <vscale x 4 x i32> @signed_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <vscale x 8 x i16> %input){
29-
; CHECK-LABEL: signed_wide_add_nxv8i16:
30-
; CHECK: // %bb.0: // %entry
31-
; CHECK-NEXT: saddwb z0.s, z0.s, z1.h
32-
; CHECK-NEXT: saddwt z0.s, z0.s, z1.h
33-
; CHECK-NEXT: ret
46+
; CHECK-SVE2-LABEL: signed_wide_add_nxv8i16:
47+
; CHECK-SVE2: // %bb.0: // %entry
48+
; CHECK-SVE2-NEXT: saddwb z0.s, z0.s, z1.h
49+
; CHECK-SVE2-NEXT: saddwt z0.s, z0.s, z1.h
50+
; CHECK-SVE2-NEXT: ret
51+
;
52+
; CHECK-SVE-LABEL: signed_wide_add_nxv8i16:
53+
; CHECK-SVE: // %bb.0: // %entry
54+
; CHECK-SVE-NEXT: sunpklo z2.s, z1.h
55+
; CHECK-SVE-NEXT: sunpkhi z1.s, z1.h
56+
; CHECK-SVE-NEXT: add z0.s, z0.s, z2.s
57+
; CHECK-SVE-NEXT: add z0.s, z1.s, z0.s
58+
; CHECK-SVE-NEXT: ret
3459
entry:
3560
%input.wide = sext <vscale x 8 x i16> %input to <vscale x 8 x i32>
3661
%partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %input.wide)
3762
ret <vscale x 4 x i32> %partial.reduce
3863
}
3964

4065
define <vscale x 4 x i32> @unsigned_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <vscale x 8 x i16> %input){
41-
; CHECK-LABEL: unsigned_wide_add_nxv8i16:
42-
; CHECK: // %bb.0: // %entry
43-
; CHECK-NEXT: uaddwb z0.s, z0.s, z1.h
44-
; CHECK-NEXT: uaddwt z0.s, z0.s, z1.h
45-
; CHECK-NEXT: ret
66+
; CHECK-SVE2-LABEL: unsigned_wide_add_nxv8i16:
67+
; CHECK-SVE2: // %bb.0: // %entry
68+
; CHECK-SVE2-NEXT: uaddwb z0.s, z0.s, z1.h
69+
; CHECK-SVE2-NEXT: uaddwt z0.s, z0.s, z1.h
70+
; CHECK-SVE2-NEXT: ret
71+
;
72+
; CHECK-SVE-LABEL: unsigned_wide_add_nxv8i16:
73+
; CHECK-SVE: // %bb.0: // %entry
74+
; CHECK-SVE-NEXT: uunpklo z2.s, z1.h
75+
; CHECK-SVE-NEXT: uunpkhi z1.s, z1.h
76+
; CHECK-SVE-NEXT: add z0.s, z0.s, z2.s
77+
; CHECK-SVE-NEXT: add z0.s, z1.s, z0.s
78+
; CHECK-SVE-NEXT: ret
4679
entry:
4780
%input.wide = zext <vscale x 8 x i16> %input to <vscale x 8 x i32>
4881
%partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %input.wide)
4982
ret <vscale x 4 x i32> %partial.reduce
5083
}
5184

5285
define <vscale x 8 x i16> @signed_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <vscale x 16 x i8> %input){
53-
; CHECK-LABEL: signed_wide_add_nxv16i8:
54-
; CHECK: // %bb.0: // %entry
55-
; CHECK-NEXT: saddwb z0.h, z0.h, z1.b
56-
; CHECK-NEXT: saddwt z0.h, z0.h, z1.b
57-
; CHECK-NEXT: ret
86+
; CHECK-SVE2-LABEL: signed_wide_add_nxv16i8:
87+
; CHECK-SVE2: // %bb.0: // %entry
88+
; CHECK-SVE2-NEXT: saddwb z0.h, z0.h, z1.b
89+
; CHECK-SVE2-NEXT: saddwt z0.h, z0.h, z1.b
90+
; CHECK-SVE2-NEXT: ret
91+
;
92+
; CHECK-SVE-LABEL: signed_wide_add_nxv16i8:
93+
; CHECK-SVE: // %bb.0: // %entry
94+
; CHECK-SVE-NEXT: sunpklo z2.h, z1.b
95+
; CHECK-SVE-NEXT: sunpkhi z1.h, z1.b
96+
; CHECK-SVE-NEXT: add z0.h, z0.h, z2.h
97+
; CHECK-SVE-NEXT: add z0.h, z1.h, z0.h
98+
; CHECK-SVE-NEXT: ret
5899
entry:
59100
%input.wide = sext <vscale x 16 x i8> %input to <vscale x 16 x i16>
60101
%partial.reduce = tail call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i16(<vscale x 8 x i16> %acc, <vscale x 16 x i16> %input.wide)
61102
ret <vscale x 8 x i16> %partial.reduce
62103
}
63104

64105
define <vscale x 8 x i16> @unsigned_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <vscale x 16 x i8> %input){
65-
; CHECK-LABEL: unsigned_wide_add_nxv16i8:
66-
; CHECK: // %bb.0: // %entry
67-
; CHECK-NEXT: uaddwb z0.h, z0.h, z1.b
68-
; CHECK-NEXT: uaddwt z0.h, z0.h, z1.b
69-
; CHECK-NEXT: ret
106+
; CHECK-SVE2-LABEL: unsigned_wide_add_nxv16i8:
107+
; CHECK-SVE2: // %bb.0: // %entry
108+
; CHECK-SVE2-NEXT: uaddwb z0.h, z0.h, z1.b
109+
; CHECK-SVE2-NEXT: uaddwt z0.h, z0.h, z1.b
110+
; CHECK-SVE2-NEXT: ret
111+
;
112+
; CHECK-SVE-LABEL: unsigned_wide_add_nxv16i8:
113+
; CHECK-SVE: // %bb.0: // %entry
114+
; CHECK-SVE-NEXT: uunpklo z2.h, z1.b
115+
; CHECK-SVE-NEXT: uunpkhi z1.h, z1.b
116+
; CHECK-SVE-NEXT: add z0.h, z0.h, z2.h
117+
; CHECK-SVE-NEXT: add z0.h, z1.h, z0.h
118+
; CHECK-SVE-NEXT: ret
70119
entry:
71120
%input.wide = zext <vscale x 16 x i8> %input to <vscale x 16 x i16>
72121
%partial.reduce = tail call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i16(<vscale x 8 x i16> %acc, <vscale x 16 x i16> %input.wide)

0 commit comments

Comments (0)