Skip to content

Commit 092dd9c

Browse files
authored
[AArch64] Remove redundant instructions in int-to-fp of lowest vector element (#98602)
When converting the lowest element (the one in lane 0) of a vector from an integer to a floating-point value, LLVM should select the SIMD scalar variant of SCVTF/UCVTF (https://developer.arm.com/documentation/dui0801/g/A64-SIMD-Scalar-Instructions/SCVTF--scalar--integer-) to avoid the FPR-to-GPR register transfers that are required to use the general floating-point variant (https://developer.arm.com/documentation/dui0801/g/A64-Floating-point-Instructions/SCVTF--scalar--integer-). This is possible because the lowest element can be referred to by the corresponding scalar sub-register whose width matches that of the vector's constituent elements. This patch adds new TableGen patterns to remove these redundant instructions for AArch64, as well as back-end tests to ensure the new preferred instruction selection result is produced. Existing tests that relied on the previous selection result have also been updated.
1 parent e5df657 commit 092dd9c

File tree

4 files changed

+193
-14
lines changed

4 files changed

+193
-14
lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6084,6 +6084,34 @@ def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint f16:$Rn)))),
60846084
def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
60856085
(UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>;
60866086
}
6087+
6088+
// int -> float conversion of the value in lane 0 of a SIMD vector should use
6089+
// the SIMD scalar [SU]CVTF variant to avoid costly FPR <-> GPR register transfers.
6090+
def : Pat<(f32 (sint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))),
6091+
(SCVTFv1i32 (i32 (EXTRACT_SUBREG (v4i32 FPR128:$Rn), ssub)))>;
6092+
6093+
def : Pat<(f32 (uint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))),
6094+
(UCVTFv1i32 (i32 (EXTRACT_SUBREG (v4i32 FPR128:$Rn), ssub)))>;
6095+
6096+
def : Pat<(f64 (sint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
6097+
(SCVTFv1i64 (i64 (EXTRACT_SUBREG (v2i64 FPR128:$Rn), dsub)))>;
6098+
6099+
def : Pat<(f64 (uint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
6100+
(UCVTFv1i64 (i64 (EXTRACT_SUBREG (v2i64 FPR128:$Rn), dsub)))>;
6101+
6102+
// fp16: an integer extracted from a vector must be at least 32 bits wide to be
6103+
// legal, so the signed extract appears as an in-reg sign-extension of the lower 16 bits.
6104+
let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
6105+
def : Pat<(f16 (sint_to_fp (i32 (sext_inreg (i32 (vector_extract
6106+
(v8i16 FPR128:$Rn), (i64 0))), i16)))),
6107+
(SCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>;
6108+
6109+
// unsigned 32-bit extracted element is truncated to 16-bits using AND
6110+
def : Pat<(f16 (uint_to_fp (i32 (and (i32 (vector_extract
6111+
(v8i16 FPR128:$Rn), (i64 0))), (i32 65535))))),
6112+
(UCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>;
6113+
}
6114+
60876115
// If an integer is about to be converted to a floating point value,
60886116
// just load it on the floating point unit.
60896117
// Here are the patterns for 8 and 16-bits to float.

llvm/test/CodeGen/AArch64/arm64-neon-copy.ll

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1127,8 +1127,7 @@ define <8 x i8> @test_bitcastv1f64tov8i8(<1 x i64> %a) #0 {
11271127
; CHECK-SD-LABEL: test_bitcastv1f64tov8i8:
11281128
; CHECK-SD: // %bb.0:
11291129
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
1130-
; CHECK-SD-NEXT: fmov x8, d0
1131-
; CHECK-SD-NEXT: scvtf d0, x8
1130+
; CHECK-SD-NEXT: scvtf d0, d0
11321131
; CHECK-SD-NEXT: neg v0.8b, v0.8b
11331132
; CHECK-SD-NEXT: ret
11341133
;
@@ -1147,8 +1146,7 @@ define <4 x i16> @test_bitcastv1f64tov4i16(<1 x i64> %a) #0 {
11471146
; CHECK-SD-LABEL: test_bitcastv1f64tov4i16:
11481147
; CHECK-SD: // %bb.0:
11491148
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
1150-
; CHECK-SD-NEXT: fmov x8, d0
1151-
; CHECK-SD-NEXT: scvtf d0, x8
1149+
; CHECK-SD-NEXT: scvtf d0, d0
11521150
; CHECK-SD-NEXT: neg v0.4h, v0.4h
11531151
; CHECK-SD-NEXT: ret
11541152
;
@@ -1167,8 +1165,7 @@ define <2 x i32> @test_bitcastv1f64tov2i32(<1 x i64> %a) #0 {
11671165
; CHECK-SD-LABEL: test_bitcastv1f64tov2i32:
11681166
; CHECK-SD: // %bb.0:
11691167
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
1170-
; CHECK-SD-NEXT: fmov x8, d0
1171-
; CHECK-SD-NEXT: scvtf d0, x8
1168+
; CHECK-SD-NEXT: scvtf d0, d0
11721169
; CHECK-SD-NEXT: neg v0.2s, v0.2s
11731170
; CHECK-SD-NEXT: ret
11741171
;
@@ -1187,8 +1184,7 @@ define <1 x i64> @test_bitcastv1f64tov1i64(<1 x i64> %a) #0 {
11871184
; CHECK-SD-LABEL: test_bitcastv1f64tov1i64:
11881185
; CHECK-SD: // %bb.0:
11891186
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
1190-
; CHECK-SD-NEXT: fmov x8, d0
1191-
; CHECK-SD-NEXT: scvtf d0, x8
1187+
; CHECK-SD-NEXT: scvtf d0, d0
11921188
; CHECK-SD-NEXT: neg d0, d0
11931189
; CHECK-SD-NEXT: ret
11941190
;
@@ -1209,8 +1205,7 @@ define <2 x float> @test_bitcastv1f64tov2f32(<1 x i64> %a) #0 {
12091205
; CHECK-LABEL: test_bitcastv1f64tov2f32:
12101206
; CHECK: // %bb.0:
12111207
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1212-
; CHECK-NEXT: fmov x8, d0
1213-
; CHECK-NEXT: scvtf d0, x8
1208+
; CHECK-NEXT: scvtf d0, d0
12141209
; CHECK-NEXT: fneg v0.2s, v0.2s
12151210
; CHECK-NEXT: ret
12161211
%vcvt.i = sitofp <1 x i64> %a to <1 x double>

llvm/test/CodeGen/AArch64/fixed-point-conv-vec-pat.ll

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,4 +101,162 @@ define <8 x half> @h_v8_s8(<8 x i16> %u) #0 {
101101
ret <8 x half> %v
102102
}
103103

104+
; int-to-fp conversion of element in lane 0 should apply
105+
; cvtf on vector subregister to avoid fpr->gpr trip
106+
define float @l0_extract_f_v2s(<2 x i32> %u) {
107+
; CHECK-LABEL: l0_extract_f_v2s:
108+
; CHECK: // %bb.0:
109+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
110+
; CHECK-NEXT: scvtf s0, s0
111+
; CHECK-NEXT: ret
112+
%i = extractelement <2 x i32> %u, i64 0
113+
%f = sitofp i32 %i to float
114+
ret float %f
115+
}
116+
117+
; cvtf to use ssub for bottom 32-bits from v2i32
118+
define float @l0_extract_f_v2u(<2 x i32> %u) {
119+
; CHECK-LABEL: l0_extract_f_v2u:
120+
; CHECK: // %bb.0:
121+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
122+
; CHECK-NEXT: ucvtf s0, s0
123+
; CHECK-NEXT: ret
124+
%i = extractelement <2 x i32> %u, i64 0
125+
%f = uitofp i32 %i to float
126+
ret float %f
127+
}
128+
129+
; Pattern should only apply when it is known to be lane 0
130+
define float @ln_extract_f_v2s(<2 x i32> %u, i64 %n) {
131+
; CHECK-LABEL: ln_extract_f_v2s:
132+
; CHECK: // %bb.0:
133+
; CHECK-NEXT: sub sp, sp, #16
134+
; CHECK-NEXT: .cfi_def_cfa_offset 16
135+
; CHECK-NEXT: add x8, sp, #8
136+
; CHECK-NEXT: str d0, [sp, #8]
137+
; CHECK-NEXT: bfi x8, x0, #2, #1
138+
; CHECK-NEXT: ldr s0, [x8]
139+
; CHECK-NEXT: scvtf s0, s0
140+
; CHECK-NEXT: add sp, sp, #16
141+
; CHECK-NEXT: ret
142+
%i = extractelement <2 x i32> %u, i64 %n
143+
%f = sitofp i32 %i to float
144+
ret float %f
145+
}
146+
147+
; cvtf to use ssub for bottom 32-bits from v4i32
148+
define float @l0_extract_f_v4s(<4 x i32> %u) {
149+
; CHECK-LABEL: l0_extract_f_v4s:
150+
; CHECK: // %bb.0:
151+
; CHECK-NEXT: scvtf s0, s0
152+
; CHECK-NEXT: ret
153+
%i = extractelement <4 x i32> %u, i64 0
154+
%f = sitofp i32 %i to float
155+
ret float %f
156+
}
157+
158+
define float @l0_extract_f_v4u(<4 x i32> %u) {
159+
; CHECK-LABEL: l0_extract_f_v4u:
160+
; CHECK: // %bb.0:
161+
; CHECK-NEXT: ucvtf s0, s0
162+
; CHECK-NEXT: ret
163+
%i = extractelement <4 x i32> %u, i64 0
164+
%f = uitofp i32 %i to float
165+
ret float %f
166+
}
167+
168+
define float @ln_extract_f_v4s(<4 x i32> %u, i64 %n) {
169+
; CHECK-LABEL: ln_extract_f_v4s:
170+
; CHECK: // %bb.0:
171+
; CHECK-NEXT: sub sp, sp, #16
172+
; CHECK-NEXT: .cfi_def_cfa_offset 16
173+
; CHECK-NEXT: mov x8, sp
174+
; CHECK-NEXT: str q0, [sp]
175+
; CHECK-NEXT: bfi x8, x0, #2, #2
176+
; CHECK-NEXT: ldr s0, [x8]
177+
; CHECK-NEXT: scvtf s0, s0
178+
; CHECK-NEXT: add sp, sp, #16
179+
; CHECK-NEXT: ret
180+
%i = extractelement <4 x i32> %u, i64 %n
181+
%f = sitofp i32 %i to float
182+
ret float %f
183+
}
184+
185+
; cvtf to use dsub for bottom 64-bits from v2i64
186+
define double @l0_extract_d_v2s(<2 x i64> %u) {
187+
; CHECK-LABEL: l0_extract_d_v2s:
188+
; CHECK: // %bb.0:
189+
; CHECK-NEXT: scvtf d0, d0
190+
; CHECK-NEXT: ret
191+
%i = extractelement <2 x i64> %u, i64 0
192+
%f = sitofp i64 %i to double
193+
ret double %f
194+
}
195+
196+
define double @l0_extract_d_v2u(<2 x i64> %u) {
197+
; CHECK-LABEL: l0_extract_d_v2u:
198+
; CHECK: // %bb.0:
199+
; CHECK-NEXT: ucvtf d0, d0
200+
; CHECK-NEXT: ret
201+
%i = extractelement <2 x i64> %u, i64 0
202+
%f = uitofp i64 %i to double
203+
ret double %f
204+
}
205+
206+
define double @ln_extract_d_v2s(<2 x i64> %u, i64 %n) {
207+
; CHECK-LABEL: ln_extract_d_v2s:
208+
; CHECK: // %bb.0:
209+
; CHECK-NEXT: sub sp, sp, #16
210+
; CHECK-NEXT: .cfi_def_cfa_offset 16
211+
; CHECK-NEXT: mov x8, sp
212+
; CHECK-NEXT: str q0, [sp]
213+
; CHECK-NEXT: bfi x8, x0, #3, #1
214+
; CHECK-NEXT: ldr d0, [x8]
215+
; CHECK-NEXT: scvtf d0, d0
216+
; CHECK-NEXT: add sp, sp, #16
217+
; CHECK-NEXT: ret
218+
%i = extractelement <2 x i64> %u, i64 %n
219+
%f = sitofp i64 %i to double
220+
ret double %f
221+
}
222+
223+
; (fullfp16) cvtf to use hsub for bottom 16-bits from v8i16
224+
define half @l0_extract_h_v8s(<8 x i16> %u) #0 {
225+
; CHECK-LABEL: l0_extract_h_v8s:
226+
; CHECK: // %bb.0:
227+
; CHECK-NEXT: scvtf h0, h0
228+
; CHECK-NEXT: ret
229+
%i = extractelement <8 x i16> %u, i32 0
230+
%f = sitofp i16 %i to half
231+
ret half %f
232+
}
233+
234+
define half @l0_extract_h_v8u(<8 x i16> %u) #0 {
235+
; CHECK-LABEL: l0_extract_h_v8u:
236+
; CHECK: // %bb.0:
237+
; CHECK-NEXT: ucvtf h0, h0
238+
; CHECK-NEXT: ret
239+
%i = extractelement <8 x i16> %u, i32 0
240+
%f = uitofp i16 %i to half
241+
ret half %f
242+
}
243+
244+
define half @ln_extract_h_v8u(<8 x i16> %u, i32 %n) #0 {
245+
; CHECK-LABEL: ln_extract_h_v8u:
246+
; CHECK: // %bb.0:
247+
; CHECK-NEXT: sub sp, sp, #16
248+
; CHECK-NEXT: .cfi_def_cfa_offset 16
249+
; CHECK-NEXT: mov x8, sp
250+
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
251+
; CHECK-NEXT: str q0, [sp]
252+
; CHECK-NEXT: bfi x8, x0, #1, #3
253+
; CHECK-NEXT: ldrh w8, [x8]
254+
; CHECK-NEXT: ucvtf h0, w8
255+
; CHECK-NEXT: add sp, sp, #16
256+
; CHECK-NEXT: ret
257+
%i = extractelement <8 x i16> %u, i32 %n
258+
%f = uitofp i16 %i to half
259+
ret half %f
260+
}
261+
104262
attributes #0 = { "target-features"="+fullfp16"}

llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -827,8 +827,7 @@ define <1 x double> @ucvtf_v1i64_v1f64(<1 x i64> %op1) vscale_range(2,0) #0 {
827827
; CHECK-LABEL: ucvtf_v1i64_v1f64:
828828
; CHECK: // %bb.0:
829829
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
830-
; CHECK-NEXT: fmov x8, d0
831-
; CHECK-NEXT: ucvtf d0, x8
830+
; CHECK-NEXT: ucvtf d0, d0
832831
; CHECK-NEXT: ret
833832
%res = uitofp <1 x i64> %op1 to <1 x double>
834833
ret <1 x double> %res
@@ -1752,8 +1751,7 @@ define <1 x double> @scvtf_v1i64_v1f64(<1 x i64> %op1) vscale_range(2,0) #0 {
17521751
; CHECK-LABEL: scvtf_v1i64_v1f64:
17531752
; CHECK: // %bb.0:
17541753
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1755-
; CHECK-NEXT: fmov x8, d0
1756-
; CHECK-NEXT: scvtf d0, x8
1754+
; CHECK-NEXT: scvtf d0, d0
17571755
; CHECK-NEXT: ret
17581756
%res = sitofp <1 x i64> %op1 to <1 x double>
17591757
ret <1 x double> %res

0 commit comments

Comments
 (0)