Skip to content

Commit a9374dd

Browse files
committed
[AArch64][SVE] Use SVE for scalar FP converts in streaming[-compatible] functions (1/n)
In streaming[-compatible] functions, use SVE for scalar FP conversions to/from integer types. This can help avoid moves between FPRs and GPRs, which could be costly. This patch also updates definitions of SCVTF_ZPmZ_StoD and UCVTF_ZPmZ_StoD to disallow lowering to them from ISD nodes, as doing so requires creating a [U|S]INT_TO_FP_MERGE_PASSTHRU node with inconsistent types. Follow up to llvm#112213. Note: This PR does not include support for f64 <-> i32 conversions (like llvm#112564), which needs a bit more work to support.
1 parent 4a6ecd3 commit a9374dd

7 files changed

+864
-253
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18989,13 +18989,75 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
1898918989
return SDValue();
1899018990
}
1899118991

18992+
/// Tries to replace scalar FP <-> INT conversions with SVE in streaming
18993+
/// functions, this can help to reduce the number of fmovs to/from GPRs.
18994+
static SDValue
18995+
tryToReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG,
18996+
const AArch64Subtarget *Subtarget) {
18997+
if (N->isStrictFPOpcode())
18998+
return SDValue();
18999+
19000+
if (!Subtarget->isSVEorStreamingSVEAvailable() ||
19001+
(!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
19002+
return SDValue();
19003+
19004+
auto isSupportedType = [](EVT VT) {
19005+
if (!VT.isSimple())
19006+
return false;
19007+
// There are SVE instructions that can convert to/from all pairs of these
19008+
// int and float types. Note: We don't bother with i8 or i16 as those are
19009+
// illegal types for scalars.
19010+
return is_contained({MVT::i32, MVT::i64, MVT::f16, MVT::f32, MVT::f64},
19011+
VT.getSimpleVT().SimpleTy);
19012+
};
19013+
19014+
if (!isSupportedType(N->getValueType(0)) ||
19015+
!isSupportedType(N->getOperand(0).getValueType()))
19016+
return SDValue();
19017+
19018+
SDValue SrcVal = N->getOperand(0);
19019+
EVT SrcTy = SrcVal.getValueType();
19020+
EVT DestTy = N->getValueType(0);
19021+
19022+
bool IsI32ToF64 = SrcTy == MVT::i32 && DestTy == MVT::f64;
19023+
bool isF64ToI32 = SrcTy == MVT::f64 && DestTy == MVT::i32;
19024+
19025+
// Conversions between f64 and i32 are a special case as nxv2i32 is an illegal
19026+
// type (unlike the equivalent nxv2f32 for floating-point types).
19027+
// TODO: Support these conversions.
19028+
if (IsI32ToF64 || isF64ToI32)
19029+
return SDValue();
19030+
19031+
EVT SrcVecTy;
19032+
EVT DestVecTy;
19033+
if (DestTy.bitsGT(SrcTy)) {
19034+
DestVecTy = getPackedSVEVectorVT(DestTy);
19035+
SrcVecTy = SrcTy == MVT::i32 ? getPackedSVEVectorVT(SrcTy)
19036+
: DestVecTy.changeVectorElementType(SrcTy);
19037+
} else {
19038+
SrcVecTy = getPackedSVEVectorVT(SrcTy);
19039+
DestVecTy = DestTy == MVT::i32 ? getPackedSVEVectorVT(DestTy)
19040+
: SrcVecTy.changeVectorElementType(DestTy);
19041+
}
19042+
19043+
SDLoc DL(N);
19044+
SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19045+
SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
19046+
DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
19047+
SDValue Convert = DAG.getNode(N->getOpcode(), DL, DestVecTy, Vec);
19048+
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Convert, ZeroIdx);
19049+
}
19050+
1899219051
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
1899319052
const AArch64Subtarget *Subtarget) {
1899419053
// First try to optimize away the conversion when it's conditionally from
1899519054
// a constant. Vectors only.
1899619055
if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
1899719056
return Res;
1899819057

19058+
if (SDValue Res = tryToReplaceScalarFPConversionWithSVE(N, DAG, Subtarget))
19059+
return Res;
19060+
1899919061
EVT VT = N->getValueType(0);
1900019062
if (VT != MVT::f32 && VT != MVT::f64)
1900119063
return SDValue();
@@ -19034,6 +19096,9 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
1903419096
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
1903519097
TargetLowering::DAGCombinerInfo &DCI,
1903619098
const AArch64Subtarget *Subtarget) {
19099+
if (SDValue Res = tryToReplaceScalarFPConversionWithSVE(N, DAG, Subtarget))
19100+
return Res;
19101+
1903719102
if (!Subtarget->isNeonAvailable())
1903819103
return SDValue();
1903919104

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2348,8 +2348,8 @@ let Predicates = [HasSVEorSME] in {
23482348
defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, AArch64fcvte_mt, nxv2f64, nxv2i1, nxv2f16, ElementSizeD>;
23492349
defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zdr<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, AArch64fcvtr_mt, nxv2f32, nxv2i1, nxv2f64, ElementSizeD>;
23502350
defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, AArch64fcvte_mt, nxv2f64, nxv2i1, nxv2f32, ElementSizeD>;
2351-
defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, AArch64scvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
2352-
defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, AArch64ucvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
2351+
defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, null_frag, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
2352+
defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, null_frag, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
23532353
defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd< 0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, AArch64ucvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>;
23542354
defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, AArch64scvtf_mt, nxv2f32, nxv2i1, nxv2i64, ElementSizeD>;
23552355
defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd< 0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, AArch64scvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>;

llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll

Lines changed: 75 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,33 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -force-streaming-compatible < %s | FileCheck %s
3-
; RUN: llc -force-streaming-compatible -mattr=+sme2p2 < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS
4-
; RUN: llc < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS
2+
; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
3+
; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s
4+
; RUN: llc -mattr=+sme2p2 -force-streaming-compatible < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS
5+
; RUN: llc -mattr=+neon < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS
6+
; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
57

68
target triple = "aarch64-unknown-linux-gnu"
79

810
define double @t1(double %x) {
911
; CHECK-LABEL: t1:
1012
; CHECK: // %bb.0: // %entry
11-
; CHECK-NEXT: fcvtzs x8, d0
12-
; CHECK-NEXT: scvtf d0, x8
13+
; CHECK-NEXT: ptrue p0.d
14+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
15+
; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
16+
; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
17+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
1318
; CHECK-NEXT: ret
1419
;
1520
; USE-NEON-NO-GPRS-LABEL: t1:
1621
; USE-NEON-NO-GPRS: // %bb.0: // %entry
1722
; USE-NEON-NO-GPRS-NEXT: fcvtzs d0, d0
1823
; USE-NEON-NO-GPRS-NEXT: scvtf d0, d0
1924
; USE-NEON-NO-GPRS-NEXT: ret
25+
;
26+
; NONEON-NOSVE-LABEL: t1:
27+
; NONEON-NOSVE: // %bb.0: // %entry
28+
; NONEON-NOSVE-NEXT: fcvtzs x8, d0
29+
; NONEON-NOSVE-NEXT: scvtf d0, x8
30+
; NONEON-NOSVE-NEXT: ret
2031
entry:
2132
%conv = fptosi double %x to i64
2233
%conv1 = sitofp i64 %conv to double
@@ -26,15 +37,24 @@ entry:
2637
define float @t2(float %x) {
2738
; CHECK-LABEL: t2:
2839
; CHECK: // %bb.0: // %entry
29-
; CHECK-NEXT: fcvtzs w8, s0
30-
; CHECK-NEXT: scvtf s0, w8
40+
; CHECK-NEXT: ptrue p0.s
41+
; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
42+
; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
43+
; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
44+
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
3145
; CHECK-NEXT: ret
3246
;
3347
; USE-NEON-NO-GPRS-LABEL: t2:
3448
; USE-NEON-NO-GPRS: // %bb.0: // %entry
3549
; USE-NEON-NO-GPRS-NEXT: fcvtzs s0, s0
3650
; USE-NEON-NO-GPRS-NEXT: scvtf s0, s0
3751
; USE-NEON-NO-GPRS-NEXT: ret
52+
;
53+
; NONEON-NOSVE-LABEL: t2:
54+
; NONEON-NOSVE: // %bb.0: // %entry
55+
; NONEON-NOSVE-NEXT: fcvtzs w8, s0
56+
; NONEON-NOSVE-NEXT: scvtf s0, w8
57+
; NONEON-NOSVE-NEXT: ret
3858
entry:
3959
%conv = fptosi float %x to i32
4060
%conv1 = sitofp i32 %conv to float
@@ -44,10 +64,11 @@ entry:
4464
define half @t3(half %x) {
4565
; CHECK-LABEL: t3:
4666
; CHECK: // %bb.0: // %entry
47-
; CHECK-NEXT: fcvt s0, h0
48-
; CHECK-NEXT: fcvtzs w8, s0
49-
; CHECK-NEXT: scvtf s0, w8
50-
; CHECK-NEXT: fcvt h0, s0
67+
; CHECK-NEXT: ptrue p0.s
68+
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
69+
; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h
70+
; CHECK-NEXT: scvtf z0.h, p0/m, z0.s
71+
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
5172
; CHECK-NEXT: ret
5273
;
5374
; USE-NEON-NO-GPRS-LABEL: t3:
@@ -57,6 +78,14 @@ define half @t3(half %x) {
5778
; USE-NEON-NO-GPRS-NEXT: scvtf s0, s0
5879
; USE-NEON-NO-GPRS-NEXT: fcvt h0, s0
5980
; USE-NEON-NO-GPRS-NEXT: ret
81+
;
82+
; NONEON-NOSVE-LABEL: t3:
83+
; NONEON-NOSVE: // %bb.0: // %entry
84+
; NONEON-NOSVE-NEXT: fcvt s0, h0
85+
; NONEON-NOSVE-NEXT: fcvtzs w8, s0
86+
; NONEON-NOSVE-NEXT: scvtf s0, w8
87+
; NONEON-NOSVE-NEXT: fcvt h0, s0
88+
; NONEON-NOSVE-NEXT: ret
6089
entry:
6190
%conv = fptosi half %x to i32
6291
%conv1 = sitofp i32 %conv to half
@@ -66,15 +95,24 @@ entry:
6695
define double @t4(double %x) {
6796
; CHECK-LABEL: t4:
6897
; CHECK: // %bb.0: // %entry
69-
; CHECK-NEXT: fcvtzu x8, d0
70-
; CHECK-NEXT: ucvtf d0, x8
98+
; CHECK-NEXT: ptrue p0.d
99+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
100+
; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
101+
; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
102+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
71103
; CHECK-NEXT: ret
72104
;
73105
; USE-NEON-NO-GPRS-LABEL: t4:
74106
; USE-NEON-NO-GPRS: // %bb.0: // %entry
75107
; USE-NEON-NO-GPRS-NEXT: fcvtzu d0, d0
76108
; USE-NEON-NO-GPRS-NEXT: ucvtf d0, d0
77109
; USE-NEON-NO-GPRS-NEXT: ret
110+
;
111+
; NONEON-NOSVE-LABEL: t4:
112+
; NONEON-NOSVE: // %bb.0: // %entry
113+
; NONEON-NOSVE-NEXT: fcvtzu x8, d0
114+
; NONEON-NOSVE-NEXT: ucvtf d0, x8
115+
; NONEON-NOSVE-NEXT: ret
78116
entry:
79117
%conv = fptoui double %x to i64
80118
%conv1 = uitofp i64 %conv to double
@@ -84,15 +122,24 @@ entry:
84122
define float @t5(float %x) {
85123
; CHECK-LABEL: t5:
86124
; CHECK: // %bb.0: // %entry
87-
; CHECK-NEXT: fcvtzu w8, s0
88-
; CHECK-NEXT: ucvtf s0, w8
125+
; CHECK-NEXT: ptrue p0.s
126+
; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
127+
; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s
128+
; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
129+
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
89130
; CHECK-NEXT: ret
90131
;
91132
; USE-NEON-NO-GPRS-LABEL: t5:
92133
; USE-NEON-NO-GPRS: // %bb.0: // %entry
93134
; USE-NEON-NO-GPRS-NEXT: fcvtzu s0, s0
94135
; USE-NEON-NO-GPRS-NEXT: ucvtf s0, s0
95136
; USE-NEON-NO-GPRS-NEXT: ret
137+
;
138+
; NONEON-NOSVE-LABEL: t5:
139+
; NONEON-NOSVE: // %bb.0: // %entry
140+
; NONEON-NOSVE-NEXT: fcvtzu w8, s0
141+
; NONEON-NOSVE-NEXT: ucvtf s0, w8
142+
; NONEON-NOSVE-NEXT: ret
96143
entry:
97144
%conv = fptoui float %x to i32
98145
%conv1 = uitofp i32 %conv to float
@@ -102,10 +149,11 @@ entry:
102149
define half @t6(half %x) {
103150
; CHECK-LABEL: t6:
104151
; CHECK: // %bb.0: // %entry
105-
; CHECK-NEXT: fcvt s0, h0
106-
; CHECK-NEXT: fcvtzu w8, s0
107-
; CHECK-NEXT: ucvtf s0, w8
108-
; CHECK-NEXT: fcvt h0, s0
152+
; CHECK-NEXT: ptrue p0.s
153+
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
154+
; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h
155+
; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s
156+
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
109157
; CHECK-NEXT: ret
110158
;
111159
; USE-NEON-NO-GPRS-LABEL: t6:
@@ -115,6 +163,14 @@ define half @t6(half %x) {
115163
; USE-NEON-NO-GPRS-NEXT: ucvtf s0, s0
116164
; USE-NEON-NO-GPRS-NEXT: fcvt h0, s0
117165
; USE-NEON-NO-GPRS-NEXT: ret
166+
;
167+
; NONEON-NOSVE-LABEL: t6:
168+
; NONEON-NOSVE: // %bb.0: // %entry
169+
; NONEON-NOSVE-NEXT: fcvt s0, h0
170+
; NONEON-NOSVE-NEXT: fcvtzu w8, s0
171+
; NONEON-NOSVE-NEXT: ucvtf s0, w8
172+
; NONEON-NOSVE-NEXT: fcvt h0, s0
173+
; NONEON-NOSVE-NEXT: ret
118174
entry:
119175
%conv = fptoui half %x to i32
120176
%conv1 = uitofp i32 %conv to half

0 commit comments

Comments
 (0)