[AArch64] Remove redundant instructions in int-to-fp of lowest vector… #98602

Merged: 3 commits, Jul 19, 2024
28 changes: 28 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6079,6 +6079,34 @@ def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint f16:$Rn)))),
def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
(UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>;
}

// int -> float conversion of value in lane 0 of simd vector should use
// correct cvtf variant to avoid costly fpr <-> gpr register transfers.
def : Pat<(f32 (sint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))),

Collaborator:
Do these need a HasNEONandIsStreamingSafe predicate?

Contributor Author:
I believe so - the entire section these patterns sit in is predicated by HasNEONandIsStreamingSafe slightly further up, though admittedly this is not that clear.

Contributor:
I agree with @davemgreen. It looks like these patterns need guarding with let Predicates = [HasNEONandIsStreamingSafe] similar to the two fp16 patterns below.

Contributor Author:
Apologies if my comment was unclear: these patterns already fall within the scope of a previous let Predicates = [HasNEONandIsStreamingSafe] binding. I can always add another guard above these patterns if we'd like this to be clearer.

Collaborator:
Ah, I see. I misread where the above predicate got closed.

(SCVTFv1i32 (i32 (EXTRACT_SUBREG (v4i32 FPR128:$Rn), ssub)))>;

def : Pat<(f32 (uint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))),
(UCVTFv1i32 (i32 (EXTRACT_SUBREG (v4i32 FPR128:$Rn), ssub)))>;

def : Pat<(f64 (sint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
(SCVTFv1i64 (i64 (EXTRACT_SUBREG (v2i64 FPR128:$Rn), dsub)))>;

def : Pat<(f64 (uint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
(UCVTFv1i64 (i64 (EXTRACT_SUBREG (v2i64 FPR128:$Rn), dsub)))>;
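
As a hedged source-level illustration of what the four patterns above target (the function names and intrinsic usage are mine, not part of the patch), C/NEON code that extracts lane 0 and converts it should now keep the value on the FP/SIMD side:

#include <arm_neon.h>

/* Hypothetical examples, not from the patch: with the patterns above, the
   conversion can read the low element of the SIMD register directly
   (e.g. scvtf s0, s0 / ucvtf d0, d0) instead of first moving the element
   to a general-purpose register with fmov/mov. */
float lane0_s32_to_f32(int32x4_t v) {
    return (float)vgetq_lane_s32(v, 0);
}

double lane0_u64_to_f64(uint64x2_t v) {
    return (double)vgetq_lane_u64(v, 0);
}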

// fp16: integer extraction from vector must be at least 32-bits to be legal.
// Actual extraction result is then an in-reg sign-extension of lower 16-bits.
let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
def : Pat<(f16 (sint_to_fp (i32 (sext_inreg (i32 (vector_extract
(v8i16 FPR128:$Rn), (i64 0))), i16)))),
(SCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>;

// unsigned 32-bit extracted element is truncated to 16-bits using AND
def : Pat<(f16 (uint_to_fp (i32 (and (i32 (vector_extract
(v8i16 FPR128:$Rn), (i64 0))), (i32 65535))))),
(UCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>;
}
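
A similar hedged C/NEON sketch for the fullfp16 half of the change (names and exact codegen are assumptions, not taken from the patch); the 16-bit lane is extracted as a 32-bit integer and then sign- or zero-extended, which is the sext_inreg / and form matched above:

#include <arm_neon.h>

/* Hypothetical examples, not from the patch: assuming the target has
   +fullfp16, the conversion should be able to stay in the FP/SIMD
   register file (roughly scvtf h0, h0 / ucvtf h0, h0). */
float16_t lane0_s16_to_f16(int16x8_t v) {
    return (float16_t)vgetq_lane_s16(v, 0);
}

float16_t lane0_u16_to_f16(uint16x8_t v) {
    return (float16_t)vgetq_lane_u16(v, 0);
}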

// If an integer is about to be converted to a floating point value,
// just load it on the floating point unit.
// Here are the patterns for 8 and 16-bits to float.
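
As a hedged aside on this pre-existing context (the example is mine, not part of the patch), the comment above describes the case where a small integer that only feeds a conversion is loaded straight into an FP/SIMD register:

#include <stdint.h>

/* Hypothetical example: the byte load can land directly in an FP/SIMD
   register and be converted there (something like ldr b0, [x0] followed
   by ucvtf s0, s0), avoiding a GPR load plus a register transfer. */
float load_u8_to_f32(const uint8_t *p) {
    return (float)*p;
}
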
15 changes: 5 additions & 10 deletions llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1127,8 +1127,7 @@ define <8 x i8> @test_bitcastv1f64tov8i8(<1 x i64> %a) #0 {
; CHECK-SD-LABEL: test_bitcastv1f64tov8i8:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: fmov x8, d0
-; CHECK-SD-NEXT: scvtf d0, x8
+; CHECK-SD-NEXT: scvtf d0, d0
; CHECK-SD-NEXT: neg v0.8b, v0.8b
; CHECK-SD-NEXT: ret
;
@@ -1147,8 +1146,7 @@ define <4 x i16> @test_bitcastv1f64tov4i16(<1 x i64> %a) #0 {
; CHECK-SD-LABEL: test_bitcastv1f64tov4i16:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: fmov x8, d0
-; CHECK-SD-NEXT: scvtf d0, x8
+; CHECK-SD-NEXT: scvtf d0, d0
; CHECK-SD-NEXT: neg v0.4h, v0.4h
; CHECK-SD-NEXT: ret
;
@@ -1167,8 +1165,7 @@ define <2 x i32> @test_bitcastv1f64tov2i32(<1 x i64> %a) #0 {
; CHECK-SD-LABEL: test_bitcastv1f64tov2i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: fmov x8, d0
-; CHECK-SD-NEXT: scvtf d0, x8
+; CHECK-SD-NEXT: scvtf d0, d0
; CHECK-SD-NEXT: neg v0.2s, v0.2s
; CHECK-SD-NEXT: ret
;
@@ -1187,8 +1184,7 @@ define <1 x i64> @test_bitcastv1f64tov1i64(<1 x i64> %a) #0 {
; CHECK-SD-LABEL: test_bitcastv1f64tov1i64:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: fmov x8, d0
-; CHECK-SD-NEXT: scvtf d0, x8
+; CHECK-SD-NEXT: scvtf d0, d0
; CHECK-SD-NEXT: neg d0, d0
; CHECK-SD-NEXT: ret
;
@@ -1209,8 +1205,7 @@ define <2 x float> @test_bitcastv1f64tov2f32(<1 x i64> %a) #0 {
; CHECK-LABEL: test_bitcastv1f64tov2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: scvtf d0, x8
+; CHECK-NEXT: scvtf d0, d0
; CHECK-NEXT: fneg v0.2s, v0.2s
; CHECK-NEXT: ret
%vcvt.i = sitofp <1 x i64> %a to <1 x double>
158 changes: 158 additions & 0 deletions llvm/test/CodeGen/AArch64/fixed-point-conv-vec-pat.ll
@@ -101,4 +101,162 @@ define <8 x half> @h_v8_s8(<8 x i16> %u) #0 {
ret <8 x half> %v
}

; int-to-fp conversion of element in lane 0 should apply
; cvtf on vector subregister to avoid fpr->gpr trip
define float @l0_extract_f_v2s(<2 x i32> %u) {
; CHECK-LABEL: l0_extract_f_v2s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: scvtf s0, s0
; CHECK-NEXT: ret
%i = extractelement <2 x i32> %u, i64 0
%f = sitofp i32 %i to float
ret float %f
}

; cvtf to use ssub for bottom 32-bits from v2i32
define float @l0_extract_f_v2u(<2 x i32> %u) {
; CHECK-LABEL: l0_extract_f_v2u:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: ucvtf s0, s0
; CHECK-NEXT: ret
%i = extractelement <2 x i32> %u, i64 0
%f = uitofp i32 %i to float
ret float %f
}

; Pattern should only apply when it is known to be lane 0
define float @ln_extract_f_v2s(<2 x i32> %u, i64 %n) {
; CHECK-LABEL: ln_extract_f_v2s:
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: add x8, sp, #8
; CHECK-NEXT: str d0, [sp, #8]
; CHECK-NEXT: bfi x8, x0, #2, #1
; CHECK-NEXT: ldr s0, [x8]
; CHECK-NEXT: scvtf s0, s0
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
%i = extractelement <2 x i32> %u, i64 %n
%f = sitofp i32 %i to float
ret float %f
}
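
For contrast, a hedged C-level sketch of the non-constant-lane case this test covers (it uses the Clang/GCC vector extension; the function is mine, not part of the patch):

typedef int v2i32 __attribute__((vector_size(8)));

/* Hypothetical example: the lane index is only known at run time, so the
   lane-0 patterns cannot apply and the element is extracted the generic
   way (here via a stack slot, as the CHECK lines above show) before the
   scvtf. */
float lane_n_to_f32(v2i32 v, int n) {
    return (float)v[n];
}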

; cvtf to use ssub for bottom 32-bits from v4i32
define float @l0_extract_f_v4s(<4 x i32> %u) {
; CHECK-LABEL: l0_extract_f_v4s:
; CHECK: // %bb.0:
; CHECK-NEXT: scvtf s0, s0
; CHECK-NEXT: ret
%i = extractelement <4 x i32> %u, i64 0
%f = sitofp i32 %i to float
ret float %f
}

define float @l0_extract_f_v4u(<4 x i32> %u) {
; CHECK-LABEL: l0_extract_f_v4u:
; CHECK: // %bb.0:
; CHECK-NEXT: ucvtf s0, s0
; CHECK-NEXT: ret
%i = extractelement <4 x i32> %u, i64 0
%f = uitofp i32 %i to float
ret float %f
}

define float @ln_extract_f_v4s(<4 x i32> %u, i64 %n) {
; CHECK-LABEL: ln_extract_f_v4s:
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: str q0, [sp]
; CHECK-NEXT: bfi x8, x0, #2, #2
; CHECK-NEXT: ldr s0, [x8]
; CHECK-NEXT: scvtf s0, s0
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
%i = extractelement <4 x i32> %u, i64 %n
%f = sitofp i32 %i to float
ret float %f
}

; cvtf to use dsub for bottom 64-bits from v2i64
define double @l0_extract_d_v2s(<2 x i64> %u) {
; CHECK-LABEL: l0_extract_d_v2s:
; CHECK: // %bb.0:
; CHECK-NEXT: scvtf d0, d0
; CHECK-NEXT: ret
%i = extractelement <2 x i64> %u, i64 0
%f = sitofp i64 %i to double
ret double %f
}

define double @l0_extract_d_v2u(<2 x i64> %u) {
; CHECK-LABEL: l0_extract_d_v2u:
; CHECK: // %bb.0:
; CHECK-NEXT: ucvtf d0, d0
; CHECK-NEXT: ret
%i = extractelement <2 x i64> %u, i64 0
%f = uitofp i64 %i to double
ret double %f
}

define double @ln_extract_d_v2s(<2 x i64> %u, i64 %n) {
; CHECK-LABEL: ln_extract_d_v2s:
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: str q0, [sp]
; CHECK-NEXT: bfi x8, x0, #3, #1
; CHECK-NEXT: ldr d0, [x8]
; CHECK-NEXT: scvtf d0, d0
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
%i = extractelement <2 x i64> %u, i64 %n
%f = sitofp i64 %i to double
ret double %f
}

; (fullfp16) cvtf to use hsub for bottom 16-bits from v8i16
define half @l0_extract_h_v8s(<8 x i16> %u) #0 {
; CHECK-LABEL: l0_extract_h_v8s:
; CHECK: // %bb.0:
; CHECK-NEXT: scvtf h0, h0
; CHECK-NEXT: ret
%i = extractelement <8 x i16> %u, i32 0
%f = sitofp i16 %i to half
ret half %f
}

define half @l0_extract_h_v8u(<8 x i16> %u) #0 {
; CHECK-LABEL: l0_extract_h_v8u:
; CHECK: // %bb.0:
; CHECK-NEXT: ucvtf h0, h0
; CHECK-NEXT: ret
%i = extractelement <8 x i16> %u, i32 0
%f = uitofp i16 %i to half
ret half %f
}

define half @ln_extract_h_v8u(<8 x i16> %u, i32 %n) #0 {
; CHECK-LABEL: ln_extract_h_v8u:
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-NEXT: str q0, [sp]
; CHECK-NEXT: bfi x8, x0, #1, #3
; CHECK-NEXT: ldrh w8, [x8]
; CHECK-NEXT: ucvtf h0, w8
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
%i = extractelement <8 x i16> %u, i32 %n
%f = uitofp i16 %i to half
ret half %f
}

attributes #0 = { "target-features"="+fullfp16"}
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
@@ -827,8 +827,7 @@ define <1 x double> @ucvtf_v1i64_v1f64(<1 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v1i64_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: ucvtf d0, x8
+; CHECK-NEXT: ucvtf d0, d0
; CHECK-NEXT: ret
%res = uitofp <1 x i64> %op1 to <1 x double>
ret <1 x double> %res
@@ -1752,8 +1751,7 @@ define <1 x double> @scvtf_v1i64_v1f64(<1 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v1i64_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: scvtf d0, x8
+; CHECK-NEXT: scvtf d0, d0
; CHECK-NEXT: ret
%res = sitofp <1 x i64> %op1 to <1 x double>
ret <1 x double> %res