Skip to content

Commit 2238363

Browse files
authored
[AArch64] Prevent v1f16 vselect/setcc type expansion. (#72048)
PR #71614 identified an issue in the lowering of v1f16 vector compares, where the `v1i1 setcc` is expanded to a `v1i16 setcc`, and an attempt is then made to expand the `v1i16 setcc` to a `v2i16 setcc`, which fails. For floating-point types, though, we can let them scalarize instead, generating a `setcc f16` that can be lowered using the normal fp16 lowering. 07a8ff4 added a special-case combine for v1 vselect that expands the predicate type to the same size as the fcmp operands. This patch turns that combine off for floating-point types, allowing them to scalarize naturally, which hopefully fixes the issue by preventing the `v1i16 setcc` from being created, meaning it won't try to widen to larger vectors. The codegen might not be optimal, but as far as I can tell everything is generated successfully, provided that no `v1i16 setcc v1f16` instructions get generated.
1 parent dc6d077 commit 2238363

File tree

2 files changed

+151
-4
lines changed

2 files changed

+151
-4
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22389,13 +22389,14 @@ static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
2238922389
}
2239022390
}
2239122391

22392+
EVT CmpVT = N0.getOperand(0).getValueType();
2239222393
if (N0.getOpcode() != ISD::SETCC ||
2239322394
CCVT.getVectorElementCount() != ElementCount::getFixed(1) ||
22394-
CCVT.getVectorElementType() != MVT::i1)
22395+
CCVT.getVectorElementType() != MVT::i1 ||
22396+
CmpVT.getVectorElementType().isFloatingPoint())
2239522397
return SDValue();
2239622398

2239722399
EVT ResVT = N->getValueType(0);
22398-
EVT CmpVT = N0.getOperand(0).getValueType();
2239922400
// Only combine when the result type is of the same size as the compared
2240022401
// operands.
2240122402
if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
@@ -22438,8 +22439,10 @@ static SDValue performSelectCombine(SDNode *N,
2243822439
EVT SrcVT = N0.getOperand(0).getValueType();
2243922440

2244022441
// Don't try to do this optimization when the setcc itself has i1 operands.
22441-
// There are no legal vectors of i1, so this would be pointless.
22442-
if (SrcVT == MVT::i1)
22442+
// There are no legal vectors of i1, so this would be pointless. v1f16 is
22443+
// ruled out to prevent the creation of setcc that need to be scalarized.
22444+
if (SrcVT == MVT::i1 ||
22445+
(SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
2244322446
return SDValue();
2244422447

2244522448
int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();

llvm/test/CodeGen/AArch64/arm64-neon-v1i1-setcc.ll

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,3 +105,147 @@ if.then:
105105
if.end:
106106
ret i32 1;
107107
}
108+
109+
110+
define <1 x float> @test_vselect_f32(<1 x float> %i105, <1 x float> %in) {
111+
; CHECK-LABEL: test_vselect_f32:
112+
; CHECK: // %bb.0:
113+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
114+
; CHECK-NEXT: fcmp s0, s0
115+
; CHECK-NEXT: cset w8, vs
116+
; CHECK-NEXT: fmov s2, w8
117+
; CHECK-NEXT: shl v2.2s, v2.2s, #31
118+
; CHECK-NEXT: cmlt v2.2s, v2.2s, #0
119+
; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b
120+
; CHECK-NEXT: ret
121+
%i179 = fcmp uno <1 x float> %i105, zeroinitializer
122+
%i180 = select <1 x i1> %i179, <1 x float> %in, <1 x float> %i105
123+
ret <1 x float> %i180
124+
}
125+
126+
define <1 x half> @test_vselect_f16(<1 x half> %i105, <1 x half> %in) {
127+
; CHECK-LABEL: test_vselect_f16:
128+
; CHECK: // %bb.0:
129+
; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
130+
; CHECK-NEXT: fcvt s2, h0
131+
; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
132+
; CHECK-NEXT: fcmp s2, s2
133+
; CHECK-NEXT: fcsel s0, s1, s0, vs
134+
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
135+
; CHECK-NEXT: ret
136+
%i179 = fcmp uno <1 x half> %i105, zeroinitializer
137+
%i180 = select <1 x i1> %i179, <1 x half> %in, <1 x half> %i105
138+
ret <1 x half> %i180
139+
}
140+
141+
define <1 x half> @test_select_f16(half %a, half %b, <1 x half> %c, <1 x half> %d ) {
142+
; CHECK-LABEL: test_select_f16:
143+
; CHECK: // %bb.0:
144+
; CHECK-NEXT: fcvt s1, h1
145+
; CHECK-NEXT: fcvt s0, h0
146+
; CHECK-NEXT: // kill: def $h3 killed $h3 def $s3
147+
; CHECK-NEXT: // kill: def $h2 killed $h2 def $s2
148+
; CHECK-NEXT: fcmp s0, s1
149+
; CHECK-NEXT: fcsel s0, s2, s3, eq
150+
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
151+
; CHECK-NEXT: ret
152+
%cmp31 = fcmp oeq half %a, %b
153+
%e = select i1 %cmp31, <1 x half> %c, <1 x half> %d
154+
ret <1 x half> %e
155+
}
156+
157+
define <1 x i16> @test_vselect_f16_i16(<1 x half> %i105, <1 x half> %in, <1 x i16> %x, <1 x i16> %y) {
158+
; CHECK-LABEL: test_vselect_f16_i16:
159+
; CHECK: // %bb.0:
160+
; CHECK-NEXT: fcvt s0, h0
161+
; CHECK-NEXT: fcmp s0, s0
162+
; CHECK-NEXT: cset w8, vs
163+
; CHECK-NEXT: fmov s0, w8
164+
; CHECK-NEXT: shl v0.4h, v0.4h, #15
165+
; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
166+
; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
167+
; CHECK-NEXT: ret
168+
%i179 = fcmp uno <1 x half> %i105, zeroinitializer
169+
%i180 = select <1 x i1> %i179, <1 x i16> %x, <1 x i16> %y
170+
ret <1 x i16> %i180
171+
}
172+
173+
define <1 x i16> @test_select_f16_i16(half %i105, half %in, <1 x i16> %x, <1 x i16> %y) {
174+
; CHECK-LABEL: test_select_f16_i16:
175+
; CHECK: // %bb.0:
176+
; CHECK-NEXT: fcvt s0, h0
177+
; CHECK-NEXT: fcmp s0, s0
178+
; CHECK-NEXT: csetm w8, vs
179+
; CHECK-NEXT: dup v0.4h, w8
180+
; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
181+
; CHECK-NEXT: ret
182+
%i179 = fcmp uno half %i105, zeroinitializer
183+
%i180 = select i1 %i179, <1 x i16> %x, <1 x i16> %y
184+
ret <1 x i16> %i180
185+
}
186+
187+
define <1 x i32> @test_vselect_f16_i32(<1 x half> %i105, <1 x half> %in, <1 x i32> %x, <1 x i32> %y) {
188+
; CHECK-LABEL: test_vselect_f16_i32:
189+
; CHECK: // %bb.0:
190+
; CHECK-NEXT: fcvt s0, h0
191+
; CHECK-NEXT: fcmp s0, s0
192+
; CHECK-NEXT: cset w8, vs
193+
; CHECK-NEXT: fmov s0, w8
194+
; CHECK-NEXT: shl v0.2s, v0.2s, #31
195+
; CHECK-NEXT: cmlt v0.2s, v0.2s, #0
196+
; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
197+
; CHECK-NEXT: ret
198+
%i179 = fcmp uno <1 x half> %i105, zeroinitializer
199+
%i180 = select <1 x i1> %i179, <1 x i32> %x, <1 x i32> %y
200+
ret <1 x i32> %i180
201+
}
202+
203+
define i64 @test_sext_extr_cmp_half(<1 x half> %v1, <1 x half> %v2) {
204+
; CHECK-LABEL: test_sext_extr_cmp_half:
205+
; CHECK: // %bb.0:
206+
; CHECK-NEXT: fcvt s1, h1
207+
; CHECK-NEXT: fcvt s0, h0
208+
; CHECK-NEXT: fcmp s0, s1
209+
; CHECK-NEXT: cset w8, eq
210+
; CHECK-NEXT: sbfx x0, x8, #0, #1
211+
; CHECK-NEXT: ret
212+
%1 = fcmp oeq <1 x half> %v1, %v2
213+
%2 = extractelement <1 x i1> %1, i32 0
214+
%vget_lane = sext i1 %2 to i64
215+
ret i64 %vget_lane
216+
}
217+
218+
define <1 x i64> @test_select_v1i1_half(half %lhs, half %rhs, <1 x i64> %v3) {
219+
; CHECK-LABEL: test_select_v1i1_half:
220+
; CHECK: // %bb.0:
221+
; CHECK-NEXT: fcvt s1, h1
222+
; CHECK-NEXT: fcvt s0, h0
223+
; CHECK-NEXT: fcmp s0, s1
224+
; CHECK-NEXT: csetm x8, eq
225+
; CHECK-NEXT: fmov d0, x8
226+
; CHECK-NEXT: bic v0.8b, v2.8b, v0.8b
227+
; CHECK-NEXT: ret
228+
%tst = fcmp oeq half %lhs, %rhs
229+
%evil = insertelement <1 x i1> undef, i1 %tst, i32 0
230+
%res = select <1 x i1> %evil, <1 x i64> zeroinitializer, <1 x i64> %v3
231+
ret <1 x i64> %res
232+
}
233+
234+
define i32 @test_br_extr_cmp_half(<1 x half> %v1, <1 x half> %v2) {
235+
; CHECK-LABEL: test_br_extr_cmp_half:
236+
; CHECK: // %bb.0: // %common.ret
237+
; CHECK-NEXT: fcvt s1, h1
238+
; CHECK-NEXT: fcvt s0, h0
239+
; CHECK-NEXT: fcmp s0, s1
240+
; CHECK-NEXT: cset w0, eq
241+
; CHECK-NEXT: ret
242+
%1 = fcmp oeq <1 x half> %v1, %v2
243+
%2 = extractelement <1 x i1> %1, i32 0
244+
br i1 %2, label %if.end, label %if.then
245+
246+
if.then:
247+
ret i32 0;
248+
249+
if.end:
250+
ret i32 1;
251+
}

0 commit comments

Comments
 (0)