Skip to content

Commit d57be19

Browse files
authored
[AArch64] replace SVE intrinsics with no active lanes with zero (#107413)
This patch extends #73964 and folds calls to SVE intrinsics into zero constants when the governing predicate has no active lanes (is all-zero).
1 parent 476b1a6 commit d57be19

File tree

2 files changed

+266
-10
lines changed

2 files changed

+266
-10
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1110,10 +1110,10 @@ instCombineSVENoActiveUnaryErase(InstCombiner &IC, IntrinsicInst &II,
11101110
return std::nullopt;
11111111
}
11121112

1113-
// Simplify unary operation where predicate has all inactive lanes by replacing
1113+
// Simplify operation where predicate has all inactive lanes by replacing
11141114
// instruction with zeroed object
11151115
static std::optional<Instruction *>
1116-
instCombineSVENoActiveUnaryZero(InstCombiner &IC, IntrinsicInst &II) {
1116+
instCombineSVENoActiveZero(InstCombiner &IC, IntrinsicInst &II) {
11171117
if (match(II.getOperand(0), m_ZeroInt())) {
11181118
Constant *Node;
11191119
Type *RetTy = II.getType();
@@ -1126,10 +1126,9 @@ instCombineSVENoActiveUnaryZero(InstCombiner &IC, IntrinsicInst &II) {
11261126
: ConstantInt::get(VecT, 0));
11271127
}
11281128
Node = ConstantStruct::get(StructT, ZerVec);
1129-
} else if (RetTy->isFPOrFPVectorTy())
1130-
Node = ConstantFP::get(RetTy, 0.0);
1131-
else
1132-
Node = ConstantInt::get(II.getType(), 0);
1129+
} else
1130+
Node = RetTy->isFPOrFPVectorTy() ? ConstantFP::get(RetTy, 0.0)
1131+
: ConstantInt::get(II.getType(), 0);
11331132

11341133
IC.replaceInstUsesWith(II, Node);
11351134
return IC.eraseInstFromFunction(II);
@@ -1188,7 +1187,7 @@ static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
11881187
LLVMContext &Ctx = II.getContext();
11891188

11901189
// Replace by zero constant when all lanes are inactive
1191-
if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
1190+
if (auto II_NA = instCombineSVENoActiveZero(IC, II))
11921191
return II_NA;
11931192

11941193
// Check that the predicate is all active
@@ -1556,7 +1555,7 @@ instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
15561555
Type *VecTy = II.getType();
15571556

15581557
// Replace by zero constant when all lanes are inactive
1559-
if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
1558+
if (auto II_NA = instCombineSVENoActiveZero(IC, II))
15601559
return II_NA;
15611560

15621561
if (isAllActivePredicate(Pred)) {
@@ -1907,7 +1906,7 @@ instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
19071906
Value *PassThru = ConstantAggregateZero::get(Ty);
19081907

19091908
// Replace by zero constant when all lanes are inactive
1910-
if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
1909+
if (auto II_NA = instCombineSVENoActiveZero(IC, II))
19111910
return II_NA;
19121911

19131912
// Contiguous gather => masked load.
@@ -2197,6 +2196,31 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
21972196
case Intrinsic::aarch64_sve_st4:
21982197
case Intrinsic::aarch64_sve_st4q:
21992198
return instCombineSVENoActiveUnaryErase(IC, II, 4);
2199+
case Intrinsic::aarch64_sve_addqv:
2200+
case Intrinsic::aarch64_sve_and_z:
2201+
case Intrinsic::aarch64_sve_bic_z:
2202+
case Intrinsic::aarch64_sve_brka_z:
2203+
case Intrinsic::aarch64_sve_brkb_z:
2204+
case Intrinsic::aarch64_sve_brkn_z:
2205+
case Intrinsic::aarch64_sve_brkpa_z:
2206+
case Intrinsic::aarch64_sve_brkpb_z:
2207+
case Intrinsic::aarch64_sve_cntp:
2208+
case Intrinsic::aarch64_sve_compact:
2209+
case Intrinsic::aarch64_sve_eor_z:
2210+
case Intrinsic::aarch64_sve_eorv:
2211+
case Intrinsic::aarch64_sve_eorqv:
2212+
case Intrinsic::aarch64_sve_nand_z:
2213+
case Intrinsic::aarch64_sve_nor_z:
2214+
case Intrinsic::aarch64_sve_orn_z:
2215+
case Intrinsic::aarch64_sve_orr_z:
2216+
case Intrinsic::aarch64_sve_orv:
2217+
case Intrinsic::aarch64_sve_orqv:
2218+
case Intrinsic::aarch64_sve_pnext:
2219+
case Intrinsic::aarch64_sve_rdffr_z:
2220+
case Intrinsic::aarch64_sve_saddv:
2221+
case Intrinsic::aarch64_sve_uaddv:
2222+
case Intrinsic::aarch64_sve_umaxv:
2223+
case Intrinsic::aarch64_sve_umaxqv:
22002224
case Intrinsic::aarch64_sve_cmpeq:
22012225
case Intrinsic::aarch64_sve_cmpeq_wide:
22022226
case Intrinsic::aarch64_sve_cmpge:
@@ -2251,7 +2275,7 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
22512275
case Intrinsic::aarch64_sve_ldnt1_gather_index:
22522276
case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
22532277
case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
2254-
return instCombineSVENoActiveUnaryZero(IC, II);
2278+
return instCombineSVENoActiveZero(IC, II);
22552279
case Intrinsic::aarch64_sve_prf:
22562280
case Intrinsic::aarch64_sve_prfb_gather_index:
22572281
case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -passes=instcombine < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"

; Every test below passes an all-false governing predicate (zeroinitializer) to
; an SVE intrinsic; instcombine is expected to fold the whole call away to a
; zero constant of the intrinsic's result type.

define <16 x i8> @addqv_i8(<vscale x 16 x i8> %a) {
; CHECK-LABEL: define <16 x i8> @addqv_i8(
; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]]) {
; CHECK-NEXT:    ret <16 x i8> zeroinitializer
;
  %res = call <16 x i8> @llvm.aarch64.sve.addqv.v16i8.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a)
  ret <16 x i8> %res
}

define <vscale x 4 x i1> @and_4(<vscale x 4 x i1> %Pn, <vscale x 4 x i1> %Pd) {
; CHECK-LABEL: define <vscale x 4 x i1> @and_4(
; CHECK-SAME: <vscale x 4 x i1> [[PN:%.*]], <vscale x 4 x i1> [[PD:%.*]]) {
; CHECK-NEXT:    ret <vscale x 4 x i1> zeroinitializer
;
  %res = call <vscale x 4 x i1> @llvm.aarch64.sve.and.z.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> %Pn, <vscale x 4 x i1> %Pd)
  ret <vscale x 4 x i1> %res
}

define <vscale x 16 x i1> @bic_16(<vscale x 16 x i1> %Pn, <vscale x 16 x i1> %Pd) {
; CHECK-LABEL: define <vscale x 16 x i1> @bic_16(
; CHECK-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PD:%.*]]) {
; CHECK-NEXT:    ret <vscale x 16 x i1> zeroinitializer
;
  %res = call <vscale x 16 x i1> @llvm.aarch64.sve.bic.z.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> %Pn, <vscale x 16 x i1> %Pd)
  ret <vscale x 16 x i1> %res
}

define <vscale x 16 x i1> @brka_z_b8(<vscale x 16 x i1> %a) {
; CHECK-LABEL: define <vscale x 16 x i1> @brka_z_b8(
; CHECK-SAME: <vscale x 16 x i1> [[A:%.*]]) {
; CHECK-NEXT:    ret <vscale x 16 x i1> zeroinitializer
;
  %out = call <vscale x 16 x i1> @llvm.aarch64.sve.brka.z.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> %a)
  ret <vscale x 16 x i1> %out
}

define <vscale x 16 x i1> @brkb_z_b8(<vscale x 16 x i1> %a) {
; CHECK-LABEL: define <vscale x 16 x i1> @brkb_z_b8(
; CHECK-SAME: <vscale x 16 x i1> [[A:%.*]]) {
; CHECK-NEXT:    ret <vscale x 16 x i1> zeroinitializer
;
  %out = call <vscale x 16 x i1> @llvm.aarch64.sve.brkb.z.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> %a)
  ret <vscale x 16 x i1> %out
}

define <vscale x 16 x i1> @brkn_b8(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b) {
; CHECK-LABEL: define <vscale x 16 x i1> @brkn_b8(
; CHECK-SAME: <vscale x 16 x i1> [[A:%.*]], <vscale x 16 x i1> [[B:%.*]]) {
; CHECK-NEXT:    ret <vscale x 16 x i1> zeroinitializer
;
  %out = call <vscale x 16 x i1> @llvm.aarch64.sve.brkn.z.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
  ret <vscale x 16 x i1> %out
}

define <vscale x 16 x i1> @brkpa_b8(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b) {
; CHECK-LABEL: define <vscale x 16 x i1> @brkpa_b8(
; CHECK-SAME: <vscale x 16 x i1> [[A:%.*]], <vscale x 16 x i1> [[B:%.*]]) {
; CHECK-NEXT:    ret <vscale x 16 x i1> zeroinitializer
;
  %out = call <vscale x 16 x i1> @llvm.aarch64.sve.brkpa.z.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
  ret <vscale x 16 x i1> %out
}

define <vscale x 16 x i1> @brkpb_b8(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b) {
; CHECK-LABEL: define <vscale x 16 x i1> @brkpb_b8(
; CHECK-SAME: <vscale x 16 x i1> [[A:%.*]], <vscale x 16 x i1> [[B:%.*]]) {
; CHECK-NEXT:    ret <vscale x 16 x i1> zeroinitializer
;
  %out = call <vscale x 16 x i1> @llvm.aarch64.sve.brkpb.z.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
  ret <vscale x 16 x i1> %out
}

; NOTE(review): stray USE_SCALAR_INC check lines (copy-paste residue from a
; CodeGen assembly test) were removed here; no such FileCheck prefix is enabled
; by the RUN line, so they were dead and misleading in this instcombine test.
define i64 @cntp_b64(<vscale x 2 x i1> %a) {
; CHECK-LABEL: define i64 @cntp_b64(
; CHECK-SAME: <vscale x 2 x i1> [[A:%.*]]) {
; CHECK-NEXT:    ret i64 0
;
  %out = call i64 @llvm.aarch64.sve.cntp.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> %a)
  ret i64 %out
}

define <vscale x 4 x i32> @compact_i32(<vscale x 4 x i32> %a) {
; CHECK-LABEL: define <vscale x 4 x i32> @compact_i32(
; CHECK-SAME: <vscale x 4 x i32> [[A:%.*]]) {
; CHECK-NEXT:    ret <vscale x 4 x i32> zeroinitializer
;
  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.compact.nxv4i32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i32> %a)
  ret <vscale x 4 x i32> %out
}

define <vscale x 16 x i1> @eor_16(<vscale x 16 x i1> %Pn, <vscale x 16 x i1> %Pd) {
; CHECK-LABEL: define <vscale x 16 x i1> @eor_16(
; CHECK-SAME: <vscale x 16 x i1> [[PN:%.*]], <vscale x 16 x i1> [[PD:%.*]]) {
; CHECK-NEXT:    ret <vscale x 16 x i1> zeroinitializer
;
  %res = call <vscale x 16 x i1> @llvm.aarch64.sve.eor.z.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> %Pn, <vscale x 16 x i1> %Pd)
  ret <vscale x 16 x i1> %res
}

define i32 @eorv_i32(<vscale x 4 x i32> %a) {
; CHECK-LABEL: define i32 @eorv_i32(
; CHECK-SAME: <vscale x 4 x i32> [[A:%.*]]) {
; CHECK-NEXT:    ret i32 0
;
  %out = call i32 @llvm.aarch64.sve.eorv.nxv4i32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i32> %a)
  ret i32 %out
}

define <4 x i32> @eorqv_i32(<vscale x 4 x i32> %a) {
; CHECK-LABEL: define <4 x i32> @eorqv_i32(
; CHECK-SAME: <vscale x 4 x i32> [[A:%.*]]) {
; CHECK-NEXT:    ret <4 x i32> zeroinitializer
;
  %res = call <4 x i32> @llvm.aarch64.sve.eorqv.v4i32.nxv4i32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i32> %a)
  ret <4 x i32> %res
}

define <vscale x 8 x i1> @nand_8(<vscale x 8 x i1> %Pn, <vscale x 8 x i1> %Pd) {
; CHECK-LABEL: define <vscale x 8 x i1> @nand_8(
; CHECK-SAME: <vscale x 8 x i1> [[PN:%.*]], <vscale x 8 x i1> [[PD:%.*]]) {
; CHECK-NEXT:    ret <vscale x 8 x i1> zeroinitializer
;
  %res = call <vscale x 8 x i1> @llvm.aarch64.sve.nand.z.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> %Pn, <vscale x 8 x i1> %Pd)
  ret <vscale x 8 x i1> %res
}

define <vscale x 4 x i1> @nor_4(<vscale x 4 x i1> %Pn, <vscale x 4 x i1> %Pd) {
; CHECK-LABEL: define <vscale x 4 x i1> @nor_4(
; CHECK-SAME: <vscale x 4 x i1> [[PN:%.*]], <vscale x 4 x i1> [[PD:%.*]]) {
; CHECK-NEXT:    ret <vscale x 4 x i1> zeroinitializer
;
  %res = call <vscale x 4 x i1> @llvm.aarch64.sve.nor.z.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> %Pn, <vscale x 4 x i1> %Pd)
  ret <vscale x 4 x i1> %res
}

define <vscale x 4 x i1> @orn_4(<vscale x 4 x i1> %Pn, <vscale x 4 x i1> %Pd) {
; CHECK-LABEL: define <vscale x 4 x i1> @orn_4(
; CHECK-SAME: <vscale x 4 x i1> [[PN:%.*]], <vscale x 4 x i1> [[PD:%.*]]) {
; CHECK-NEXT:    ret <vscale x 4 x i1> zeroinitializer
;
  %res = call <vscale x 4 x i1> @llvm.aarch64.sve.orn.z.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> %Pn, <vscale x 4 x i1> %Pd)
  ret <vscale x 4 x i1> %res
}

define <vscale x 2 x i1> @orr_2(<vscale x 2 x i1> %Pn, <vscale x 2 x i1> %Pd) {
; CHECK-LABEL: define <vscale x 2 x i1> @orr_2(
; CHECK-SAME: <vscale x 2 x i1> [[PN:%.*]], <vscale x 2 x i1> [[PD:%.*]]) {
; CHECK-NEXT:    ret <vscale x 2 x i1> zeroinitializer
;
  %res = call <vscale x 2 x i1> @llvm.aarch64.sve.orr.z.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> %Pn, <vscale x 2 x i1> %Pd)
  ret <vscale x 2 x i1> %res
}

define i8 @orv_i8(<vscale x 16 x i8> %a) {
; CHECK-LABEL: define i8 @orv_i8(
; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]]) {
; CHECK-NEXT:    ret i8 0
;
  %out = call i8 @llvm.aarch64.sve.orv.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a)
  ret i8 %out
}

define <8 x i16> @orqv_i16(<vscale x 8 x i16> %a) {
; CHECK-LABEL: define <8 x i16> @orqv_i16(
; CHECK-SAME: <vscale x 8 x i16> [[A:%.*]]) {
; CHECK-NEXT:    ret <8 x i16> zeroinitializer
;
  %res = call <8 x i16> @llvm.aarch64.sve.orqv.v8i16.nxv8i16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i16> %a)
  ret <8 x i16> %res
}

define <vscale x 4 x i1> @pnext_b32(<vscale x 4 x i1> %a) {
; CHECK-LABEL: define <vscale x 4 x i1> @pnext_b32(
; CHECK-SAME: <vscale x 4 x i1> [[A:%.*]]) {
; CHECK-NEXT:    ret <vscale x 4 x i1> zeroinitializer
;
  %out = call <vscale x 4 x i1> @llvm.aarch64.sve.pnext.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> %a)
  ret <vscale x 4 x i1> %out
}

define <vscale x 16 x i1> @rdffr_z() {
; CHECK-LABEL: define <vscale x 16 x i1> @rdffr_z() {
; CHECK-NEXT:    ret <vscale x 16 x i1> zeroinitializer
;
  %out = call <vscale x 16 x i1> @llvm.aarch64.sve.rdffr.z(<vscale x 16 x i1> zeroinitializer)
  ret <vscale x 16 x i1> %out
}

define i64 @saddv_i64(<vscale x 2 x i64> %a) {
; CHECK-LABEL: define i64 @saddv_i64(
; CHECK-SAME: <vscale x 2 x i64> [[A:%.*]]) {
; CHECK-NEXT:    ret i64 0
;
  %out = call i64 @llvm.aarch64.sve.saddv.nxv2i64(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i64> %a)
  ret i64 %out
}

define i64 @uaddv_i8(<vscale x 16 x i8> %a) {
; CHECK-LABEL: define i64 @uaddv_i8(
; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]]) {
; CHECK-NEXT:    ret i64 0
;
  %out = call i64 @llvm.aarch64.sve.uaddv.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a)
  ret i64 %out
}

define i8 @umaxv_i8(<vscale x 16 x i8> %a) {
; CHECK-LABEL: define i8 @umaxv_i8(
; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]]) {
; CHECK-NEXT:    ret i8 0
;
  %out = call i8 @llvm.aarch64.sve.umaxv.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a)
  ret i8 %out
}

define <8 x i16> @umaxqv_i16(<vscale x 8 x i16> %a) {
; CHECK-LABEL: define <8 x i16> @umaxqv_i16(
; CHECK-SAME: <vscale x 8 x i16> [[A:%.*]]) {
; CHECK-NEXT:    ret <8 x i16> zeroinitializer
;
  %res = call <8 x i16> @llvm.aarch64.sve.umaxqv.v8i16.nxv8i16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i16> %a)
  ret <8 x i16> %res
}

0 commit comments

Comments
 (0)