Commit 29cb1e6

[AArch64] optimise SVE cmp intrinsics with no active lanes (#104779)
This patch extends #73964 and optimises SVE cmp intrinsics to a zero vector when the governing predicate has no active lanes.
1 parent 26a8a85 commit 29cb1e6
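
Concretely: when a compare such as @llvm.aarch64.sve.cmpeq is called with a zeroinitializer governing predicate, no lane is active and no result element can be true, so instcombine folds the whole call to a zeroinitializer result. The new test file below exercises this for every affected intrinsic.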

File tree

2 files changed: +270 -0 lines changed


llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 25 additions & 0 deletions
@@ -1160,6 +1160,10 @@ static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
                                                         IntrinsicInst &II) {
   LLVMContext &Ctx = II.getContext();
 
+  // Replace by zero constant when all lanes are inactive
+  if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
+    return II_NA;
+
   // Check that the predicate is all active
   auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
   if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
@@ -2131,6 +2135,27 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
   case Intrinsic::aarch64_sve_st4:
   case Intrinsic::aarch64_sve_st4q:
     return instCombineSVENoActiveUnaryErase(IC, II, 4);
+  case Intrinsic::aarch64_sve_cmpeq:
+  case Intrinsic::aarch64_sve_cmpeq_wide:
+  case Intrinsic::aarch64_sve_cmpge:
+  case Intrinsic::aarch64_sve_cmpge_wide:
+  case Intrinsic::aarch64_sve_cmpgt:
+  case Intrinsic::aarch64_sve_cmpgt_wide:
+  case Intrinsic::aarch64_sve_cmphi:
+  case Intrinsic::aarch64_sve_cmphi_wide:
+  case Intrinsic::aarch64_sve_cmphs:
+  case Intrinsic::aarch64_sve_cmphs_wide:
+  case Intrinsic::aarch64_sve_cmple_wide:
+  case Intrinsic::aarch64_sve_cmplo_wide:
+  case Intrinsic::aarch64_sve_cmpls_wide:
+  case Intrinsic::aarch64_sve_cmplt_wide:
+  case Intrinsic::aarch64_sve_facge:
+  case Intrinsic::aarch64_sve_facgt:
+  case Intrinsic::aarch64_sve_fcmpeq:
+  case Intrinsic::aarch64_sve_fcmpge:
+  case Intrinsic::aarch64_sve_fcmpgt:
+  case Intrinsic::aarch64_sve_fcmpne:
+  case Intrinsic::aarch64_sve_fcmpuo:
   case Intrinsic::aarch64_sve_ld1_gather:
   case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
   case Intrinsic::aarch64_sve_ld1_gather_sxtw:
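
For context, the fold itself lives in instCombineSVENoActiveUnaryZero, whose body is not part of this diff. A minimal sketch of its likely shape (the signature is inferred from the call site in the first hunk; the body below is an assumption, not the committed implementation):

// Hypothetical sketch -- the committed helper is defined elsewhere in
// AArch64TargetTransformInfo.cpp and may differ in detail.
#include "llvm/IR/Constants.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;

static std::optional<Instruction *>
instCombineSVENoActiveUnaryZero(InstCombiner &IC, IntrinsicInst &II) {
  // Operand 0 of these intrinsics is the governing predicate; an
  // all-false predicate appears as the zeroinitializer constant.
  if (!match(II.getOperand(0), m_ZeroInt()))
    return std::nullopt;

  // With no active lanes the compare can produce no true elements,
  // so the call folds to a zero vector of its return type.
  IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
  return IC.eraseInstFromFunction(II);
}

Two reading notes on the hunk above. First, aarch64_sve_cmpne and aarch64_sve_cmpne_wide are absent from the new case list even though the test file covers them, presumably because they are dispatched to instCombineSVECmpNE, which the first hunk teaches to try this fold before its all-active ptrue check. Second, the new compare cases appear to fall through, together with the gather-load cases after them, to a shared return of this helper; that shared return sits outside the visible context lines.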
Lines changed: 245 additions & 0 deletions
@@ -0,0 +1,245 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 16 x i1> @test_cmpeq(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpeq(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpeq.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmpeq_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpeq_wide(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpeq.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
+  ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmpge(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpge(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpge.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmpge_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpge_wide(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpge.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
+  ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmpgt(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpgt(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpgt.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmpgt_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpgt_wide(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpgt.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
+  ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmphi(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmphi(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmphi.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmphi_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmphi_wide(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmphi.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
+  ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmphs(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmphs(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmphs.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmphs_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmphs_wide(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmphs.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
+  ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmple_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmple_wide(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmple.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
+  ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmplo_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmplo_wide(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmplo.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
+  ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmpls_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpls_wide(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpls.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
+  ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmplt_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmplt_wide(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmplt.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
+  ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmpne(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpne(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpne.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 16 x i1> @test_cmpne_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
+; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpne_wide(
+; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpne.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
+  ret <vscale x 16 x i1> %0
+}
+
+define <vscale x 8 x i1> @test_facge(<vscale x 8 x half> %a, <vscale x 8 x half> %b){
+; CHECK-LABEL: define <vscale x 8 x i1> @test_facge(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i1> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.facge.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  ret <vscale x 8 x i1> %0
+}
+
+define <vscale x 8 x i1> @test_facgt(<vscale x 8 x half> %a, <vscale x 8 x half> %b){
+; CHECK-LABEL: define <vscale x 8 x i1> @test_facgt(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i1> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.facgt.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  ret <vscale x 8 x i1> %0
+}
+
+define <vscale x 8 x i1> @test_fcmpeq(<vscale x 8 x half> %a, <vscale x 8 x half> %b){
+; CHECK-LABEL: define <vscale x 8 x i1> @test_fcmpeq(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i1> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.fcmpeq.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  ret <vscale x 8 x i1> %0
+}
+
+define <vscale x 8 x i1> @test_fcmpge(<vscale x 8 x half> %a, <vscale x 8 x half> %b){
+; CHECK-LABEL: define <vscale x 8 x i1> @test_fcmpge(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i1> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.fcmpge.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  ret <vscale x 8 x i1> %0
+}
+
+define <vscale x 8 x i1> @test_fcmpgt(<vscale x 8 x half> %a, <vscale x 8 x half> %b){
+; CHECK-LABEL: define <vscale x 8 x i1> @test_fcmpgt(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i1> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.fcmpgt.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  ret <vscale x 8 x i1> %0
+}
+
+define <vscale x 8 x i1> @test_fcmpne(<vscale x 8 x half> %a, <vscale x 8 x half> %b){
+; CHECK-LABEL: define <vscale x 8 x i1> @test_fcmpne(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i1> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.fcmpne.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  ret <vscale x 8 x i1> %0
+}
+
+define <vscale x 8 x i1> @test_fcmpuo(<vscale x 8 x half> %a, <vscale x 8 x half> %b){
+; CHECK-LABEL: define <vscale x 8 x i1> @test_fcmpuo(
+; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i1> zeroinitializer
+;
+entry:
+  %0 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.fcmpuo.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  ret <vscale x 8 x i1> %0
+}
+
