[AArch64][SVE] Use SVE for scalar FP converts in streaming[-compatible] functions #112564


Open · wants to merge 9 commits into main

Conversation

@MacDue (Member) commented Oct 16, 2024

In streaming[-compatible] functions, use SVE for scalar FP conversions to/from integer types. This can help avoid moves between FPRs and GPRs, which could be costly.

This patch also updates definitions of SCVTF_ZPmZ_StoD and UCVTF_ZPmZ_StoD to disallow lowering to them from ISD nodes, as doing so requires creating a [U|S]INT_TO_FP_MERGE_PASSTHRU node with inconsistent types.

Follow up to #112213.

@llvmbot (Member) commented Oct 16, 2024

@llvm/pr-subscribers-backend-aarch64

Author: Benjamin Maxwell (MacDue)

Changes

When Neon is not available use SVE variants of FCVTZS, FCVTZU, UCVTF, and SCVTF for fp -> int -> fp conversions to avoid moving values to/from GPRs which may be expensive.

Note: With +sme2p2 the single-element vector Neon variants of these instructions could be used instead (but that feature is not implemented yet).

Follow up to #112213.


Full diff: https://github.com/llvm/llvm-project/pull/112564.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td (+35)
  • (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll (+72-17)
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 2a857234c7d745..19dc2016f9fcf7 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2421,6 +2421,41 @@ let Predicates = [HasSVEorSME] in {
   defm FSQRT_ZPmZ  : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt",  AArch64fsqrt_mt>;
 } // End HasSVEorSME
 
+// Helper for creating fp -> int -> fp conversions using SVE.
+class sve_fp_int_fp_cvt<Instruction PTRUE, Instruction FROM_INT, Instruction TO_INT, SubRegIndex sub>
+  : OutPatFrag<(ops node: $Rn),
+    (EXTRACT_SUBREG
+      (FROM_INT (IMPLICIT_DEF), (PTRUE 1),
+        (TO_INT (IMPLICIT_DEF), (PTRUE 1),
+          (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub))), sub)>;
+
+// Some float -> int -> float conversion patterns where we want to keep the int
+// values in FP registers using the SVE instructions to avoid costly GPR <-> FPR
+// register transfers. Only used when NEON is not available (e.g. in streaming
+// functions).
+// TODO: When +sme2p2 is available single-element vectors should be preferred.
+def HasNoNEON : Predicate<"!Subtarget->isNeonAvailable()">;
+let Predicates = [HasSVEorSME, HasNoNEON] in {
+def : Pat<
+  (f64 (sint_to_fp (i64 (fp_to_sint f64:$Rn)))),
+  (sve_fp_int_fp_cvt<PTRUE_D, SCVTF_ZPmZ_DtoD, FCVTZS_ZPmZ_DtoD, dsub> $Rn)>;
+def : Pat<
+  (f64 (uint_to_fp (i64 (fp_to_uint f64:$Rn)))),
+  (sve_fp_int_fp_cvt<PTRUE_D, UCVTF_ZPmZ_DtoD, FCVTZU_ZPmZ_DtoD, dsub> $Rn)>;
+def : Pat<
+  (f32 (sint_to_fp (i32 (fp_to_sint f32:$Rn)))),
+  (sve_fp_int_fp_cvt<PTRUE_S, SCVTF_ZPmZ_StoS, FCVTZS_ZPmZ_StoS, ssub> $Rn)>;
+def : Pat<
+  (f32 (uint_to_fp (i32 (fp_to_uint f32:$Rn)))),
+  (sve_fp_int_fp_cvt<PTRUE_S, UCVTF_ZPmZ_StoS, FCVTZU_ZPmZ_StoS, ssub> $Rn)>;
+def : Pat<
+  (f16 (sint_to_fp (i32 (fp_to_sint f16:$Rn)))),
+  (sve_fp_int_fp_cvt<PTRUE_H, SCVTF_ZPmZ_HtoH, FCVTZS_ZPmZ_HtoH, hsub> $Rn)>;
+def : Pat<
+  (f16 (uint_to_fp (i32 (fp_to_uint f16:$Rn)))),
+  (sve_fp_int_fp_cvt<PTRUE_H, UCVTF_ZPmZ_HtoH, FCVTZU_ZPmZ_HtoH, hsub> $Rn)>;
+} // End HasSVEorSME, HasNoNEON
+
 let Predicates = [HasBF16, HasSVEorSME] in {
   defm BFDOT_ZZZ    : sve_float_dot<0b1, 0b0, ZPR32, ZPR16, "bfdot", nxv8bf16, int_aarch64_sve_bfdot>;
   defm BFDOT_ZZI    : sve_float_dot_indexed<0b1, 0b00, ZPR16, ZPR3b16, "bfdot", nxv8bf16, int_aarch64_sve_bfdot_lane_v2>;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
index 9aadf3133ba197..fbbe2cc64ad248 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible  < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 ; RUN: llc < %s | FileCheck %s --check-prefix=NON-STREAMING
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -7,10 +8,19 @@ target triple = "aarch64-unknown-linux-gnu"
 define double @t1(double %x) {
 ; CHECK-LABEL: t1:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcvtzs x8, d0
-; CHECK-NEXT:    scvtf d0, x8
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
+; NONEON-NOSVE-LABEL: t1:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvtzs x8, d0
+; NONEON-NOSVE-NEXT:    scvtf d0, x8
+; NONEON-NOSVE-NEXT:    ret
+;
 ; NON-STREAMING-LABEL: t1:
 ; NON-STREAMING:       // %bb.0: // %entry
 ; NON-STREAMING-NEXT:    fcvtzs d0, d0
@@ -25,10 +35,19 @@ entry:
 define float @t2(float %x) {
 ; CHECK-LABEL: t2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcvtzs w8, s0
-; CHECK-NEXT:    scvtf s0, w8
+; CHECK-NEXT:    ptrue p0.s, vl1
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
 ;
+; NONEON-NOSVE-LABEL: t2:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ret
+;
 ; NON-STREAMING-LABEL: t2:
 ; NON-STREAMING:       // %bb.0: // %entry
 ; NON-STREAMING-NEXT:    fcvtzs s0, s0
@@ -43,12 +62,21 @@ entry:
 define half @t3(half %x)  {
 ; CHECK-LABEL: t3:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcvt s0, h0
-; CHECK-NEXT:    fcvtzs w8, s0
-; CHECK-NEXT:    scvtf s0, w8
-; CHECK-NEXT:    fcvt h0, s0
+; CHECK-NEXT:    ptrue p0.h, vl1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    fcvtzs z0.h, p0/m, z0.h
+; CHECK-NEXT:    scvtf z0.h, p0/m, z0.h
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
 ;
+; NONEON-NOSVE-LABEL: t3:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
+;
 ; NON-STREAMING-LABEL: t3:
 ; NON-STREAMING:       // %bb.0: // %entry
 ; NON-STREAMING-NEXT:    fcvt s0, h0
@@ -65,10 +93,19 @@ entry:
 define double @t4(double %x) {
 ; CHECK-LABEL: t4:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcvtzu x8, d0
-; CHECK-NEXT:    ucvtf d0, x8
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
+; NONEON-NOSVE-LABEL: t4:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvtzu x8, d0
+; NONEON-NOSVE-NEXT:    ucvtf d0, x8
+; NONEON-NOSVE-NEXT:    ret
+;
 ; NON-STREAMING-LABEL: t4:
 ; NON-STREAMING:       // %bb.0: // %entry
 ; NON-STREAMING-NEXT:    fcvtzu d0, d0
@@ -83,10 +120,19 @@ entry:
 define float @t5(float %x) {
 ; CHECK-LABEL: t5:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcvtzu w8, s0
-; CHECK-NEXT:    ucvtf s0, w8
+; CHECK-NEXT:    ptrue p0.s, vl1
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
 ;
+; NONEON-NOSVE-LABEL: t5:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ret
+;
 ; NON-STREAMING-LABEL: t5:
 ; NON-STREAMING:       // %bb.0: // %entry
 ; NON-STREAMING-NEXT:    fcvtzu s0, s0
@@ -101,12 +147,21 @@ entry:
 define half @t6(half %x)  {
 ; CHECK-LABEL: t6:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcvt s0, h0
-; CHECK-NEXT:    fcvtzu w8, s0
-; CHECK-NEXT:    ucvtf s0, w8
-; CHECK-NEXT:    fcvt h0, s0
+; CHECK-NEXT:    ptrue p0.h, vl1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    fcvtzu z0.h, p0/m, z0.h
+; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.h
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
 ;
+; NONEON-NOSVE-LABEL: t6:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
+;
 ; NON-STREAMING-LABEL: t6:
 ; NON-STREAMING:       // %bb.0: // %entry
 ; NON-STREAMING-NEXT:    fcvt s0, h0

@MacDue MacDue changed the title [AArch64][SVE] Avoid transfer to GPRs for fp -> int -> fp conversions [AArch64][SVE] Lower scalar converts to SVE when Neon is unavailable Oct 22, 2024
Comment on lines 474 to 487
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov z1.h, z0.h[3]
; CHECK-NEXT: mov z2.h, z0.h[2]
; CHECK-NEXT: mov z3.h, z0.h[1]
; CHECK-NEXT: fcvtzu x10, h0
; CHECK-NEXT: fcvtzu x8, h1
; CHECK-NEXT: fcvtzu x9, h2
; CHECK-NEXT: fcvtzu x11, h3
; CHECK-NEXT: fmov d2, x10
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: fmov d1, x9
; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
; CHECK-NEXT: fmov d1, x11
; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.h
; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.h
; CHECK-NEXT: fcvtzu z2.d, p0/m, z2.h
; CHECK-NEXT: fcvtzu z3.d, p0/m, z3.h
; CHECK-NEXT: zip1 z1.d, z2.d, z1.d
; CHECK-NEXT: stp q1, q0, [x1]
; CHECK-NEXT: zip1 z0.d, z0.d, z3.d
; CHECK-NEXT: stp q0, q1, [x1]
; CHECK-NEXT: ret
@MacDue (Member, Author) commented Oct 22, 2024

In some cases, this now removes a decent amount of fmovs to/from GPRs. 🙂

Collaborator:

Very nice!

@MacDue MacDue changed the title [AArch64][SVE] Lower scalar converts to SVE when Neon is unavailable [AArch64][SVE] Lower scalar FP converts to SVE when Neon is unavailable Oct 22, 2024
@MacDue MacDue requested a review from paulwalker-arm October 22, 2024 16:55

@paulwalker-arm (Collaborator) left a comment

To me this looks more like a DAGCombine rather than a lowering problem. You're essentially saying fp_op->gpr->fpr should be rewritten as new_fp_op when possible. I'm assuming there are related combines for the cases where the only reason for the GPR was to either load the source data or store the result, which again feels like a DAG or preferably an IR combine.

@MacDue (Member, Author) commented Oct 24, 2024

To me this looks more like a DAGCombine rather than a lowering problem. You're essentially saying fp_op->gpr->fpr should be rewritten as new_fp_op when possible. I'm assuming there are related combines for the cases where the only reason for the GPR was to either load the source data or store the result, which again feels like a DAG or preferably an IR combine.

There was some discussion on that here: #112564 (comment). Originally this followed the Neon combine that matched (from_int (to_int)) -- 564d9a7, but it was suggested to make it into a more generally applied lowering.

@sdesmalen-arm (Collaborator)

To me this looks more like a DAGCombine rather than a lowering problem. You're essentially saying fp_op->gpr->fpr should be rewritten as new_fp_op when possible. I'm assuming there are related combines for the cases where the only reason for the GPR was to either load the source data or store the result, which again feels like a DAG or preferably an IR combine.

There was some discussion on that here: #112564 (comment). Originally this followed the Neon combine that matched (from_int (to_int)) -- 564d9a7, but it was suggested to make it into a more generally applied lowering.

I hope this wasn't a bum steer, but I thought there might be more general value in this for other cases than this particular fp -> int -> fp combine, e.g. when the int result is inserted back into a vector for further (vector) calculations, although I didn't have any particular case in mind.

@paulwalker-arm (Collaborator)

There was some discussion on that here: #112564 (comment). Originally this followed the Neon combine that matched (from_int (to_int)) -- 564d9a7, but it was suggested to make it into a more generally applied lowering.

I agree with the sentiments; this isn't a selection problem either, but lowering is not the right home for the transformation. DAGCombine is the place to optimise the DAG, whereas LowerOperation exists to legalise the DAG. I'm happy to be convinced otherwise, but given there is nothing illegal about the DAG, to me that suggests a DAGCombine.

@paulwalker-arm (Collaborator) commented Oct 24, 2024

To clarify my previous comment. I'm not suggesting you "must" implement a set of fp_op->gpr->fpr combines specific to the sequences you care about. I only used that to illustrate my view of this being a combine rather than a lowering problem. If splitting the operation in two (i.e. in_place_fp_cvt->transfer_to_gpr) is sufficient for your needs (because there is already code that'll remove transfer_to_gpr) then that is preferable as the simplest solution that has the widest coverage.

It is worth investigating a route where existing ISD nodes are used but with vector types. This could be the way to represent in_place_fp_cvt, which has the advantage of being generic, but it's possible existing DAGCombines would undo the transformation. If that happens then at least we've a good reason to go the target-specific ISD nodes route.

@MacDue (Member, Author) commented Oct 24, 2024

To clarify my previous comment. I'm not suggesting you "must" implement a set of fp_op->gpr->fpr combines specific to the sequences you care about. I only used that to illustrate my view of this being a combine rather than a lowering problem. If splitting the operation in two (i.e. in_place_fp_cvt->transfer_to_gpr) is sufficient for your needs (because there is already code that'll remove transfer_to_gpr) then that is preferable as the simplest solution that has the widest coverage.

Right, I've moved the logic here into performIntToFpCombine()/performFpToIntCombine() rather than having it in the lowerings, and that achieves the same goals (with the inserts/extracts generally folding away).

It is worth investigating a route where existing ISD nodes are used but with vector types. This could be the way to represent in_place_fp_cvt, which has the advantage of being generic, but it's possible existing DAGCombines would undo the transformation. If that happens then at least we've a good reason to go the target-specific ISD nodes route.

I'm not sure I follow 😅, this is using the existing ISD nodes + insert/extract element. The AArch64 target nodes only come in after LowerToPredicatedOp().

@MacDue MacDue changed the title [AArch64][SVE] Lower scalar FP converts to SVE when Neon is unavailable [AArch64][SVE] Use SVE for scalar FP converts in streaming[-compatible] functions Oct 24, 2024
Comment on lines 18956 to 18957
assert(!N->isStrictFPOpcode() && "strict fp ops not supported");
SDValue SrcVal = N->getOperand(0);
Collaborator:

I'd recommend delaying the combine until at least after type legalisation (e.g. isBeforeLegalizeOps()) because that gives time to remove the original code entirely and limits the combination of types you need to worry about, which might remove the need for, or at least simplify, isSupportedType.

Member Author:

Waiting till after legalization has the side-effect of promoting f16 to f32, which leads to extra unnecessary fcvts being emitted.

Collaborator:

It looks like I picked the wrong name when randomly giving an example. isBeforeLegalize() looks to be more relevant for delaying a combine until after type legalisation.

@MacDue (Member, Author) commented Oct 24, 2024

I have this same issue with both isBeforeLegalizeOps() and isBeforeLegalize(), but it's limited to streaming functions +sme (not +sve). Those functions should still be able to use the f16 SVE instructions though.

@MacDue MacDue marked this pull request as draft October 24, 2024 15:16
github-actions bot commented Oct 24, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

@MacDue (Member, Author) left a comment

Drafting this for now as there are some issues that need to be resolved.

Comment on lines 3073 to 3078
define double @ucvtf_i32_f64(ptr %0) {
; CHECK-LABEL: ucvtf_i32_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr w8, [x0]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ucvtf z0.d, p0/m, z0.s
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
Member Author:

Now loads to GPR instead of FPR (possible hazard + extra fmov)

Comment on lines 47 to 51
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-NEXT: sxtw x8, w0
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: scvtf z0.d, p0/m, z0.s
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
Member Author:

Extra sxtws emitted

Member Author:

This is quite sad in cases like:

  %conv = fptosi double %x to i32
  %conv1 = sitofp i32 %conv to double

which now become:

	fcvtzs	z0.d, p0/m, z0.d
	fmov	x8, d0
	sxtw	x8, w8
	fmov	d0, x8
	scvtf	z0.d, p0/m, z0.d

Though maybe some more folds could fix that.

Comment on lines 18991 to 19002
if (IsI32ToF64 || isF64ToI32) {
unsigned IntrinsicOpc;
if (IsI32ToF64)
IntrinsicOpc = IsSigned ? Intrinsic::aarch64_sve_scvtf_f64i32
: Intrinsic::aarch64_sve_ucvtf_f64i32;
else
IntrinsicOpc = IsSigned ? Intrinsic::aarch64_sve_fcvtzs_i32f64
: Intrinsic::aarch64_sve_fcvtzu_i32f64;
SDValue PTrue = getPredicateForVector(DAG, DL, MVT::nxv2f64);
Vec = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, DestVecTy,
{DAG.getConstant(IntrinsicOpc, DL, MVT::i32),
DAG.getUNDEF(DestTy), PTrue, Vec});
@MacDue (Member, Author) commented Oct 24, 2024

Not sure about this, but the easiest way I could think of to get the desired codegen was to directly use intrinsics for the f64 <-> i32 cases (it avoids any sign/zero extension issues).

When Neon is not available use SVE variants of FCVTZS, FCVTZU, UCVTF,
and  SCVTF for fp -> int -> fp conversions to avoid moving values
to/from GPRs which may be expensive.

Note: With +sme2p2 the single-element vector Neon variants of these
instructions could be used instead (but that feature is not implemented
yet).

Follow up to llvm#112213.
@MacDue MacDue marked this pull request as ready for review October 28, 2024 11:15
Comment on lines +19024 to +19031
IntrinsicOpc = IsSigned ? Intrinsic::aarch64_sve_scvtf_f64i32
: Intrinsic::aarch64_sve_ucvtf_f64i32;
else
IntrinsicOpc = IsSigned ? Intrinsic::aarch64_sve_fcvtzs_i32f64
: Intrinsic::aarch64_sve_fcvtzu_i32f64;
SDValue PTrue = getPredicateForVector(DAG, DL, MVT::nxv2f64);
Vec = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, DestVecTy,
{DAG.getConstant(IntrinsicOpc, DL, MVT::i32),
Collaborator:

This is not much better than the original isel route. Code generation is rarely about implementing a single function that emits exactly what you need, but rather a sequence of smaller cuts that each move the DAG to its final destination.

You mention using 64-bit converts did not result in the best code generation. What are the resulting deficiencies? Is it possible for them to be resolved using isolated combines or perhaps better isel? If the deficiencies are integer scalar instructions, then doesn't that mean the GPR<->FPR transitions have not been removed, and thus the rationale for the combine has not been satisfied anyway?

@MacDue (Member, Author) commented Oct 29, 2024

This is not much better than the original isel route. Code generation is rarely about implementing a single function that emits exactly what you need, but rather a sequence of smaller cuts that each move the DAG to its final destination.

I get that, but this is not entirely the standard case. This is emulating Neon/FP, which has f64 <-> i32 conversions, SVE also has these conversions, but they're currently unused (outside of intrinsics), as nxv2i32 is an illegal type that is promoted to nxv2i64. So you can't rely on the standard lowerings/patterns being in place.

You mention using 64-bit converts did not result in the best code generation. What are the resulting deficiencies?

I've shown a few such cases in my comments above. The issues were:

  • Loads not being promoted to FP registers (likely due to the sign/zero extensions in the way)
  • Pointless fmovs between GPRs + sign/zero extends for values that should stay within FP/vector registers
  • Additional unavoidable sign/zero extends in some cases
    • If you can only do i64 -> f64 conversions, and you have an i32, you'll need to extend it

You could probably write a bunch of folds to eventually tidy this up, but given that this will never be used for the general SVE path (since these converts are illegal there), the direct approach made more sense to me.

The inspiration for using the intrinsics directly partly came from the existing performFpToIntCombine(), which sometimes uses the Neon intrinsics directly, so it didn't seem too unreasonable.

MacDue added a commit to MacDue/llvm-project that referenced this pull request Dec 3, 2024
…e] functions (1/n)

In streaming[-compatible] functions, use SVE for scalar FP conversions
to/from integer types. This can help avoid moves between FPRs and GPRs,
which could be costly.

This patch also updates definitions of SCVTF_ZPmZ_StoD and
UCVTF_ZPmZ_StoD to disallow lowering to them from ISD nodes, as doing
so requires creating a [U|S]INT_TO_FP_MERGE_PASSTHRU node with
inconsistent types.

Follow up to llvm#112213.

Note: This PR does not include support for f64 <-> i32 conversions
(like llvm#112564), which needs a bit more work to support.
MacDue added a commit to MacDue/llvm-project that referenced this pull request Dec 16, 2024
…e] functions (1/n)

In streaming[-compatible] functions, use SVE for scalar FP conversions
to/from integer types. This can help avoid moves between FPRs and GPRs,
which could be costly.

This patch also updates definitions of SCVTF_ZPmZ_StoD and
UCVTF_ZPmZ_StoD to disallow lowering to them from ISD nodes, as doing
so requires creating a [U|S]INT_TO_FP_MERGE_PASSTHRU node with
inconsistent types.

Follow up to llvm#112213.

Note: This PR does not include support for f64 <-> i32 conversions
(like llvm#112564), which needs a bit more work to support.
MacDue added a commit that referenced this pull request Dec 19, 2024
…e] functions (1/n) (#118505)

In streaming[-compatible] functions, use SVE for scalar FP conversions
to/from integer types. This can help avoid moves between FPRs and GPRs,
which could be costly.

This patch also updates definitions of SCVTF_ZPmZ_StoD and
UCVTF_ZPmZ_StoD to disallow lowering to them from ISD nodes, as doing so
requires creating a [U|S]INT_TO_FP_MERGE_PASSTHRU node with inconsistent
types.

Follow up to #112213.

Note: This PR does not include support for f64 <-> i32 conversions (like
#112564), which needs a bit more work to support.