-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AArch64][SVE] Use SVE for scalar FP converts in streaming[-compatible] functions #112564
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-backend-aarch64 Author: Benjamin Maxwell (MacDue) Changes: When Neon is not available use SVE variants of FCVTZS, FCVTZU, UCVTF, and SCVTF for fp -> int -> fp conversions to avoid moving values to/from GPRs which may be expensive. Note: With +sme2p2 the single-element vector Neon variants of these instructions could be used instead (but that feature is not implemented yet). Follow up to #112213. Full diff: https://github.com/llvm/llvm-project/pull/112564.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 2a857234c7d745..19dc2016f9fcf7 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2421,6 +2421,41 @@ let Predicates = [HasSVEorSME] in {
defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", AArch64fsqrt_mt>;
} // End HasSVEorSME
+// Helper for creating fp -> int -> fp conversions using SVE.
+class sve_fp_int_fp_cvt<Instruction PTRUE, Instruction FROM_INT, Instruction TO_INT, SubRegIndex sub>
+ : OutPatFrag<(ops node: $Rn),
+ (EXTRACT_SUBREG
+ (FROM_INT (IMPLICIT_DEF), (PTRUE 1),
+ (TO_INT (IMPLICIT_DEF), (PTRUE 1),
+ (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub))), sub)>;
+
+// Some float -> int -> float conversion patterns where we want to keep the int
+// values in FP registers using the SVE instructions to avoid costly GPR <-> FPR
+// register transfers. Only used when NEON is not available (e.g. in streaming
+// functions).
+// TODO: When +sme2p2 is available single-element vectors should be preferred.
+def HasNoNEON : Predicate<"!Subtarget->isNeonAvailable()">;
+let Predicates = [HasSVEorSME, HasNoNEON] in {
+def : Pat<
+ (f64 (sint_to_fp (i64 (fp_to_sint f64:$Rn)))),
+ (sve_fp_int_fp_cvt<PTRUE_D, SCVTF_ZPmZ_DtoD, FCVTZS_ZPmZ_DtoD, dsub> $Rn)>;
+def : Pat<
+ (f64 (uint_to_fp (i64 (fp_to_uint f64:$Rn)))),
+ (sve_fp_int_fp_cvt<PTRUE_D, UCVTF_ZPmZ_DtoD, FCVTZU_ZPmZ_DtoD, dsub> $Rn)>;
+def : Pat<
+ (f32 (sint_to_fp (i32 (fp_to_sint f32:$Rn)))),
+ (sve_fp_int_fp_cvt<PTRUE_S, SCVTF_ZPmZ_StoS, FCVTZS_ZPmZ_StoS, ssub> $Rn)>;
+def : Pat<
+ (f32 (uint_to_fp (i32 (fp_to_uint f32:$Rn)))),
+ (sve_fp_int_fp_cvt<PTRUE_S, UCVTF_ZPmZ_StoS, FCVTZU_ZPmZ_StoS, ssub> $Rn)>;
+def : Pat<
+ (f16 (sint_to_fp (i32 (fp_to_sint f16:$Rn)))),
+ (sve_fp_int_fp_cvt<PTRUE_H, SCVTF_ZPmZ_HtoH, FCVTZS_ZPmZ_HtoH, hsub> $Rn)>;
+def : Pat<
+ (f16 (uint_to_fp (i32 (fp_to_uint f16:$Rn)))),
+ (sve_fp_int_fp_cvt<PTRUE_H, UCVTF_ZPmZ_HtoH, FCVTZU_ZPmZ_HtoH, hsub> $Rn)>;
+} // End HasSVEorSME, HasNoNEON
+
let Predicates = [HasBF16, HasSVEorSME] in {
defm BFDOT_ZZZ : sve_float_dot<0b1, 0b0, ZPR32, ZPR16, "bfdot", nxv8bf16, int_aarch64_sve_bfdot>;
defm BFDOT_ZZI : sve_float_dot_indexed<0b1, 0b00, ZPR16, ZPR3b16, "bfdot", nxv8bf16, int_aarch64_sve_bfdot_lane_v2>;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
index 9aadf3133ba197..fbbe2cc64ad248 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
; RUN: llc < %s | FileCheck %s --check-prefix=NON-STREAMING
target triple = "aarch64-unknown-linux-gnu"
@@ -7,10 +8,19 @@ target triple = "aarch64-unknown-linux-gnu"
define double @t1(double %x) {
; CHECK-LABEL: t1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzs x8, d0
-; CHECK-NEXT: scvtf d0, x8
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
+; NONEON-NOSVE-LABEL: t1:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvtzs x8, d0
+; NONEON-NOSVE-NEXT: scvtf d0, x8
+; NONEON-NOSVE-NEXT: ret
+;
; NON-STREAMING-LABEL: t1:
; NON-STREAMING: // %bb.0: // %entry
; NON-STREAMING-NEXT: fcvtzs d0, d0
@@ -25,10 +35,19 @@ entry:
define float @t2(float %x) {
; CHECK-LABEL: t2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzs w8, s0
-; CHECK-NEXT: scvtf s0, w8
+; CHECK-NEXT: ptrue p0.s, vl1
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT: ret
;
+; NONEON-NOSVE-LABEL: t2:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvtzs w8, s0
+; NONEON-NOSVE-NEXT: scvtf s0, w8
+; NONEON-NOSVE-NEXT: ret
+;
; NON-STREAMING-LABEL: t2:
; NON-STREAMING: // %bb.0: // %entry
; NON-STREAMING-NEXT: fcvtzs s0, s0
@@ -43,12 +62,21 @@ entry:
define half @t3(half %x) {
; CHECK-LABEL: t3:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvt s0, h0
-; CHECK-NEXT: fcvtzs w8, s0
-; CHECK-NEXT: scvtf s0, w8
-; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ptrue p0.h, vl1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: fcvtzs z0.h, p0/m, z0.h
+; CHECK-NEXT: scvtf z0.h, p0/m, z0.h
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT: ret
;
+; NONEON-NOSVE-LABEL: t3:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fcvtzs w8, s0
+; NONEON-NOSVE-NEXT: scvtf s0, w8
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: ret
+;
; NON-STREAMING-LABEL: t3:
; NON-STREAMING: // %bb.0: // %entry
; NON-STREAMING-NEXT: fcvt s0, h0
@@ -65,10 +93,19 @@ entry:
define double @t4(double %x) {
; CHECK-LABEL: t4:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzu x8, d0
-; CHECK-NEXT: ucvtf d0, x8
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
+; NONEON-NOSVE-LABEL: t4:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvtzu x8, d0
+; NONEON-NOSVE-NEXT: ucvtf d0, x8
+; NONEON-NOSVE-NEXT: ret
+;
; NON-STREAMING-LABEL: t4:
; NON-STREAMING: // %bb.0: // %entry
; NON-STREAMING-NEXT: fcvtzu d0, d0
@@ -83,10 +120,19 @@ entry:
define float @t5(float %x) {
; CHECK-LABEL: t5:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzu w8, s0
-; CHECK-NEXT: ucvtf s0, w8
+; CHECK-NEXT: ptrue p0.s, vl1
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT: ret
;
+; NONEON-NOSVE-LABEL: t5:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvtzu w8, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: ret
+;
; NON-STREAMING-LABEL: t5:
; NON-STREAMING: // %bb.0: // %entry
; NON-STREAMING-NEXT: fcvtzu s0, s0
@@ -101,12 +147,21 @@ entry:
define half @t6(half %x) {
; CHECK-LABEL: t6:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvt s0, h0
-; CHECK-NEXT: fcvtzu w8, s0
-; CHECK-NEXT: ucvtf s0, w8
-; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ptrue p0.h, vl1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: fcvtzu z0.h, p0/m, z0.h
+; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT: ret
;
+; NONEON-NOSVE-LABEL: t6:
+; NONEON-NOSVE: // %bb.0: // %entry
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: fcvtzu w8, s0
+; NONEON-NOSVE-NEXT: ucvtf s0, w8
+; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: ret
+;
; NON-STREAMING-LABEL: t6:
; NON-STREAMING: // %bb.0: // %entry
; NON-STREAMING-NEXT: fcvt s0, h0
|
60c48e6
to
aea6409
Compare
; CHECK: // %bb.0: | ||
; CHECK-NEXT: ldr d0, [x0] | ||
; CHECK-NEXT: ptrue p0.d | ||
; CHECK-NEXT: mov z1.h, z0.h[3] | ||
; CHECK-NEXT: mov z2.h, z0.h[2] | ||
; CHECK-NEXT: mov z3.h, z0.h[1] | ||
; CHECK-NEXT: fcvtzu x10, h0 | ||
; CHECK-NEXT: fcvtzu x8, h1 | ||
; CHECK-NEXT: fcvtzu x9, h2 | ||
; CHECK-NEXT: fcvtzu x11, h3 | ||
; CHECK-NEXT: fmov d2, x10 | ||
; CHECK-NEXT: fmov d0, x8 | ||
; CHECK-NEXT: fmov d1, x9 | ||
; CHECK-NEXT: zip1 z0.d, z1.d, z0.d | ||
; CHECK-NEXT: fmov d1, x11 | ||
; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.h | ||
; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.h | ||
; CHECK-NEXT: fcvtzu z2.d, p0/m, z2.h | ||
; CHECK-NEXT: fcvtzu z3.d, p0/m, z3.h | ||
; CHECK-NEXT: zip1 z1.d, z2.d, z1.d | ||
; CHECK-NEXT: stp q1, q0, [x1] | ||
; CHECK-NEXT: zip1 z0.d, z0.d, z3.d | ||
; CHECK-NEXT: stp q0, q1, [x1] | ||
; CHECK-NEXT: ret |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In some cases, this now removes a decent amount of fmovs
to/from GPRs. 🙂
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Very nice!
; CHECK: // %bb.0: | ||
; CHECK-NEXT: ldr d0, [x0] | ||
; CHECK-NEXT: ptrue p0.d | ||
; CHECK-NEXT: mov z1.h, z0.h[3] | ||
; CHECK-NEXT: mov z2.h, z0.h[2] | ||
; CHECK-NEXT: mov z3.h, z0.h[1] | ||
; CHECK-NEXT: fcvtzu x10, h0 | ||
; CHECK-NEXT: fcvtzu x8, h1 | ||
; CHECK-NEXT: fcvtzu x9, h2 | ||
; CHECK-NEXT: fcvtzu x11, h3 | ||
; CHECK-NEXT: fmov d2, x10 | ||
; CHECK-NEXT: fmov d0, x8 | ||
; CHECK-NEXT: fmov d1, x9 | ||
; CHECK-NEXT: zip1 z0.d, z1.d, z0.d | ||
; CHECK-NEXT: fmov d1, x11 | ||
; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.h | ||
; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.h | ||
; CHECK-NEXT: fcvtzu z2.d, p0/m, z2.h | ||
; CHECK-NEXT: fcvtzu z3.d, p0/m, z3.h | ||
; CHECK-NEXT: zip1 z1.d, z2.d, z1.d | ||
; CHECK-NEXT: stp q1, q0, [x1] | ||
; CHECK-NEXT: zip1 z0.d, z0.d, z3.d | ||
; CHECK-NEXT: stp q0, q1, [x1] | ||
; CHECK-NEXT: ret |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Very nice!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To me this looks more like a DAGCombine rather than a lowering problem. You're essentially saying fp_op->gpr->fpr
should be rewritten as new_fp_op
when possible. I'm assuming there are related combines for the cases where the only reason for the GPR was to either load the source data or store the result, which again feels like a DAG or preferable an IR combine.
There was some discussion on that here #112564 (comment), originally this followed the neon combine that matched |
I hope this wasn't a bum steer, but I thought there might be more general value in this for other cases than this particular fp -> int -> fp combine, e.g. when |
I agree with the sentiments, this isn't a selection problem either, but not the new home of the transformation. DAGCombine is the place to optimise the DAG whereas LowerOperation exists to legalise the DAG. I'm happy to be convinced otherwise but given there is nothing illegal about the DAG to me that suggests a DAGCombine. |
To clarify my previous comment. I'm not suggesting you "must" implement a set of It is worth investigating a route where existing ISD nodes are used but with vector types. This could be the way to represent in_place_fp_cvt, which has the advantage of being generic but it's possible existing DACombines would undo the transformation. If that happens then at least we've a good reason to go the target specific ISD nodes route. |
b92240f
to
e09d40d
Compare
Right, I've moved the logic here into
I'm not sure I follow 😅, this is using the existing ISD nodes + insert/extract element. The AArch64 target nodes only come in after |
assert(!N->isStrictFPOpcode() && "strict fp ops not supported"); | ||
SDValue SrcVal = N->getOperand(0); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd recommend delaying the combine until at least after type legalisation (e.g. isBeforeLegalizeOps()) because that gives time to remove the original code entirely and limits the combination of types you need to worry about, which might remove the need for, or at least simplify, isSupportedType
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Waiting till after legalization has the side-effect of promoting `f16` to `f32`, which leads to extra unnecessary `fcvt`s being emitted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like I picked the wrong name when randomly giving an example. `isBeforeLegalize()` looks to be more relevant for delaying a combine until after type legalisation.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have this same issue with both `isBeforeLegalizeOps()` and `isBeforeLegalize()`, but it's limited to streaming functions with `+sme` (not `+sve`). Those functions should still be able to use the `f16` SVE instructions though.
✅ With the latest revision this PR passed the C/C++ code formatter. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Drafting this for now as there's some issues that need to be resolved.
define double @ucvtf_i32_f64(ptr %0) { | ||
; CHECK-LABEL: ucvtf_i32_f64: | ||
; CHECK: // %bb.0: | ||
; CHECK-NEXT: ldr w8, [x0] | ||
; CHECK-NEXT: ptrue p0.d | ||
; CHECK-NEXT: ldr s0, [x0] | ||
; CHECK-NEXT: ucvtf z0.d, p0/m, z0.s | ||
; CHECK-NEXT: fmov d0, x8 | ||
; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d | ||
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Now loads to GPR instead of FPR (possible hazard + extra fmov)
; CHECK: // %bb.0: // %entry | ||
; CHECK-NEXT: fmov s0, w0 | ||
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 | ||
; CHECK-NEXT: sxtw x8, w0 | ||
; CHECK-NEXT: ptrue p0.d | ||
; CHECK-NEXT: scvtf z0.d, p0/m, z0.s | ||
; CHECK-NEXT: fmov d0, x8 | ||
; CHECK-NEXT: scvtf z0.d, p0/m, z0.d | ||
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Extra `sxtw`s emitted
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is quite sad in cases like:
%conv = fptosi double %x to i32
%conv1 = sitofp i32 %conv to double
which now become:
fcvtzs z0.d, p0/m, z0.d
fmov x8, d0
sxtw x8, w8
fmov d0, x8
scvtf z0.d, p0/m, z0.d
Though maybe some more folds could fix that.
3376b10
to
839d4fc
Compare
if (IsI32ToF64 || isF64ToI32) { | ||
unsigned IntrinsicOpc; | ||
if (IsI32ToF64) | ||
IntrinsicOpc = IsSigned ? Intrinsic::aarch64_sve_scvtf_f64i32 | ||
: Intrinsic::aarch64_sve_ucvtf_f64i32; | ||
else | ||
IntrinsicOpc = IsSigned ? Intrinsic::aarch64_sve_fcvtzs_i32f64 | ||
: Intrinsic::aarch64_sve_fcvtzu_i32f64; | ||
SDValue PTrue = getPredicateForVector(DAG, DL, MVT::nxv2f64); | ||
Vec = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, DestVecTy, | ||
{DAG.getConstant(IntrinsicOpc, DL, MVT::i32), | ||
DAG.getUNDEF(DestTy), PTrue, Vec}); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure about this, but the easiest way I could think of to get the desired codegen was to directly use intrinsics for the `f64` <-> `i32` cases (it avoids any sign/zero extension issues).
When Neon is not available use SVE variants of FCVTZS, FCVTZU, UCVTF, and SCVTF for fp -> int -> fp conversions to avoid moving values to/from GPRs which may be expensive. Note: With +sme2p2 the single-element vector Neon variants of these instructions could be used instead (but that feature is not implemented yet). Follow up to llvm#112213.
91368f0
to
b81e8db
Compare
IntrinsicOpc = IsSigned ? Intrinsic::aarch64_sve_scvtf_f64i32 | ||
: Intrinsic::aarch64_sve_ucvtf_f64i32; | ||
else | ||
IntrinsicOpc = IsSigned ? Intrinsic::aarch64_sve_fcvtzs_i32f64 | ||
: Intrinsic::aarch64_sve_fcvtzu_i32f64; | ||
SDValue PTrue = getPredicateForVector(DAG, DL, MVT::nxv2f64); | ||
Vec = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, DestVecTy, | ||
{DAG.getConstant(IntrinsicOpc, DL, MVT::i32), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is not much better than the original isel route. Code generation is rarely about implementing a single function that emits exactly what you need, but rather a sequence of smaller cuts that each move the DAG to its final destination.
You mention using 64-bit converts did not result in the best code generation. What are the resulting deficiencies? Is it possible for them to be resolved using isolated combines or perhaps better isel? If the deficiencies are integer scalar instructions then doesn't that mean the GPR<->FPR transitions have not been removed and thus the rational for the combine has not been achieved anyway?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is not much better than the original isel route. Code generation is rarely about implementing a single function that emits exactly what you need, but rather a sequence of smaller cuts that each move the DAG to its final destination.
I get that, but this is not entirely the standard case. This is emulating Neon/FP, which has `f64` <-> `i32` conversions. SVE also has these conversions, but they're currently unused (outside of intrinsics), as `nxv2i32` is an illegal type that is promoted to `nxv2i64`. So you can't rely on the standard lowerings/patterns being in place.
You mention using 64-bit converts did not result in the best code generation. What are the resulting deficiencies?
I've shown a few such cases in my comments above. The issues were:
- Loads not being promoted to FP registers (likely due to the sign/zero extensions in the way)
- Pointless fmovs between GPRs + sign/zero extends for values that should stay within FP/vector registers
- Additional unavoidable sign/zero extends in some cases
- If you can only do i64 -> f64 conversions, and you have an i32, you'll need to extend it
You could probably write a bunch of folds to eventually tidy this up, but given that this will never be used for the general SVE path (since these converts are illegal there), the direct approach made more sense to me.
The inspiration for using the intrinsics directly partly came from the existing performFpToIntCombine()
, which sometimes uses the Neon intrinsics directly, so it didn't seem too unreasonable.
…e] functions (1/n) In streaming[-compatible] functions, use SVE for scalar FP conversions to/from integer types. This can help avoid moves between FPRs and GPRs, which could be costly. This patch also updates definitions of SCVTF_ZPmZ_StoD and UCVTF_ZPmZ_StoD to disallow lowering to them from ISD nodes, as doing so requires creating a [U|S]INT_TO_FP_MERGE_PASSTHRU node with inconsistent types. Follow up to llvm#112213. Note: This PR does not include support for f64 <-> i32 conversions (like llvm#112564), which needs a bit more work to support.
…e] functions (1/n) In streaming[-compatible] functions, use SVE for scalar FP conversions to/from integer types. This can help avoid moves between FPRs and GPRs, which could be costly. This patch also updates definitions of SCVTF_ZPmZ_StoD and UCVTF_ZPmZ_StoD to disallow lowering to them from ISD nodes, as doing so requires creating a [U|S]INT_TO_FP_MERGE_PASSTHRU node with inconsistent types. Follow up to llvm#112213. Note: This PR does not include support for f64 <-> i32 conversions (like llvm#112564), which needs a bit more work to support.
…e] functions (1/n) (#118505) In streaming[-compatible] functions, use SVE for scalar FP conversions to/from integer types. This can help avoid moves between FPRs and GPRs, which could be costly. This patch also updates definitions of SCVTF_ZPmZ_StoD and UCVTF_ZPmZ_StoD to disallow lowering to them from ISD nodes, as doing so requires creating a [U|S]INT_TO_FP_MERGE_PASSTHRU node with inconsistent types. Follow up to #112213. Note: This PR does not include support for f64 <-> i32 conversions (like #112564), which needs a bit more work to support.
In streaming[-compatible] functions, use SVE for scalar FP conversions to/from integer types. This can help avoid moves between FPRs and GPRs, which could be costly.
This patch also updates definitions of `SCVTF_ZPmZ_StoD` and `UCVTF_ZPmZ_StoD` to disallow lowering to them from ISD nodes, as doing so requires creating a `[U|S]INT_TO_FP_MERGE_PASSTHRU` node with inconsistent types. Follow up to #112213.