Skip to content

Commit 9441ab2

Browse files
committed
[AArch64][SVE] Use SVE for scalar FP converts in streaming[-compatible] functions (1/n)
In streaming[-compatible] functions, use SVE for scalar FP conversions to/from integer types. This can help avoid moves between FPRs and GPRs, which could be costly. This patch also updates definitions of SCVTF_ZPmZ_StoD and UCVTF_ZPmZ_StoD to disallow lowering to them from ISD nodes, as doing so requires creating a [U|S]INT_TO_FP_MERGE_PASSTHRU node with inconsistent types. Follow up to #112213. Note: This PR does not include support for f64 <-> i32 conversions (like #112564), which needs a bit more work to support.
1 parent d866005 commit 9441ab2

7 files changed

+864
-253
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19083,13 +19083,75 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
1908319083
return SDValue();
1908419084
}
1908519085

19086+
/// Tries to replace scalar FP <-> INT conversions with SVE in streaming
19087+
/// functions; this can help to reduce the number of fmovs to/from GPRs.
19088+
static SDValue
19089+
tryToReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG,
19090+
const AArch64Subtarget *Subtarget) {
19091+
if (N->isStrictFPOpcode())
19092+
return SDValue();
19093+
19094+
if (!Subtarget->isSVEorStreamingSVEAvailable() ||
19095+
(!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
19096+
return SDValue();
19097+
19098+
auto isSupportedType = [](EVT VT) {
19099+
if (!VT.isSimple())
19100+
return false;
19101+
// There are SVE instructions that can convert to/from all pairs of these
19102+
// int and float types. Note: We don't bother with i8 or i16 as those are
19103+
// illegal types for scalars.
19104+
return is_contained({MVT::i32, MVT::i64, MVT::f16, MVT::f32, MVT::f64},
19105+
VT.getSimpleVT().SimpleTy);
19106+
};
19107+
19108+
if (!isSupportedType(N->getValueType(0)) ||
19109+
!isSupportedType(N->getOperand(0).getValueType()))
19110+
return SDValue();
19111+
19112+
SDValue SrcVal = N->getOperand(0);
19113+
EVT SrcTy = SrcVal.getValueType();
19114+
EVT DestTy = N->getValueType(0);
19115+
19116+
bool IsI32ToF64 = SrcTy == MVT::i32 && DestTy == MVT::f64;
19117+
bool isF64ToI32 = SrcTy == MVT::f64 && DestTy == MVT::i32;
19118+
19119+
// Conversions between f64 and i32 are a special case as nxv2i32 is an illegal
19120+
// type (unlike the equivalent nxv2f32 for floating-point types).
19121+
// TODO: Support these conversions.
19122+
if (IsI32ToF64 || isF64ToI32)
19123+
return SDValue();
19124+
19125+
EVT SrcVecTy;
19126+
EVT DestVecTy;
19127+
if (DestTy.bitsGT(SrcTy)) {
19128+
DestVecTy = getPackedSVEVectorVT(DestTy);
19129+
SrcVecTy = SrcTy == MVT::i32 ? getPackedSVEVectorVT(SrcTy)
19130+
: DestVecTy.changeVectorElementType(SrcTy);
19131+
} else {
19132+
SrcVecTy = getPackedSVEVectorVT(SrcTy);
19133+
DestVecTy = DestTy == MVT::i32 ? getPackedSVEVectorVT(DestTy)
19134+
: SrcVecTy.changeVectorElementType(DestTy);
19135+
}
19136+
19137+
SDLoc DL(N);
19138+
SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19139+
SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
19140+
DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
19141+
SDValue Convert = DAG.getNode(N->getOpcode(), DL, DestVecTy, Vec);
19142+
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Convert, ZeroIdx);
19143+
}
19144+
1908619145
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
1908719146
const AArch64Subtarget *Subtarget) {
1908819147
// First try to optimize away the conversion when it's conditionally from
1908919148
// a constant. Vectors only.
1909019149
if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
1909119150
return Res;
1909219151

19152+
if (SDValue Res = tryToReplaceScalarFPConversionWithSVE(N, DAG, Subtarget))
19153+
return Res;
19154+
1909319155
EVT VT = N->getValueType(0);
1909419156
if (VT != MVT::f32 && VT != MVT::f64)
1909519157
return SDValue();
@@ -19128,6 +19190,9 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
1912819190
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
1912919191
TargetLowering::DAGCombinerInfo &DCI,
1913019192
const AArch64Subtarget *Subtarget) {
19193+
if (SDValue Res = tryToReplaceScalarFPConversionWithSVE(N, DAG, Subtarget))
19194+
return Res;
19195+
1913119196
if (!Subtarget->isNeonAvailable())
1913219197
return SDValue();
1913319198

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2348,8 +2348,8 @@ let Predicates = [HasSVEorSME] in {
23482348
defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, AArch64fcvte_mt, nxv2f64, nxv2i1, nxv2f16, ElementSizeD>;
23492349
defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zdr<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, AArch64fcvtr_mt, nxv2f32, nxv2i1, nxv2f64, ElementSizeD>;
23502350
defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, AArch64fcvte_mt, nxv2f64, nxv2i1, nxv2f32, ElementSizeD>;
2351-
defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, AArch64scvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
2352-
defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, AArch64ucvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
2351+
defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, null_frag, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
2352+
defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, null_frag, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
23532353
defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd< 0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, AArch64ucvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>;
23542354
defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, AArch64scvtf_mt, nxv2f32, nxv2i1, nxv2i64, ElementSizeD>;
23552355
defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd< 0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, AArch64scvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>;

llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll

Lines changed: 75 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,33 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -force-streaming-compatible < %s | FileCheck %s
3-
; RUN: llc -force-streaming-compatible -mattr=+sme2p2 < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS
4-
; RUN: llc < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS
2+
; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
3+
; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s
4+
; RUN: llc -mattr=+sme2p2 -force-streaming-compatible < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS
5+
; RUN: llc -mattr=+neon < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS
6+
; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
57

68
target triple = "aarch64-unknown-linux-gnu"
79

810
define double @t1(double %x) {
911
; CHECK-LABEL: t1:
1012
; CHECK: // %bb.0: // %entry
11-
; CHECK-NEXT: fcvtzs x8, d0
12-
; CHECK-NEXT: scvtf d0, x8
13+
; CHECK-NEXT: ptrue p0.d
14+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
15+
; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
16+
; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
17+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
1318
; CHECK-NEXT: ret
1419
;
1520
; USE-NEON-NO-GPRS-LABEL: t1:
1621
; USE-NEON-NO-GPRS: // %bb.0: // %entry
1722
; USE-NEON-NO-GPRS-NEXT: fcvtzs d0, d0
1823
; USE-NEON-NO-GPRS-NEXT: scvtf d0, d0
1924
; USE-NEON-NO-GPRS-NEXT: ret
25+
;
26+
; NONEON-NOSVE-LABEL: t1:
27+
; NONEON-NOSVE: // %bb.0: // %entry
28+
; NONEON-NOSVE-NEXT: fcvtzs x8, d0
29+
; NONEON-NOSVE-NEXT: scvtf d0, x8
30+
; NONEON-NOSVE-NEXT: ret
2031
entry:
2132
%conv = fptosi double %x to i64
2233
%conv1 = sitofp i64 %conv to double
@@ -26,15 +37,24 @@ entry:
2637
define float @t2(float %x) {
2738
; CHECK-LABEL: t2:
2839
; CHECK: // %bb.0: // %entry
29-
; CHECK-NEXT: fcvtzs w8, s0
30-
; CHECK-NEXT: scvtf s0, w8
40+
; CHECK-NEXT: ptrue p0.s
41+
; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
42+
; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
43+
; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
44+
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
3145
; CHECK-NEXT: ret
3246
;
3347
; USE-NEON-NO-GPRS-LABEL: t2:
3448
; USE-NEON-NO-GPRS: // %bb.0: // %entry
3549
; USE-NEON-NO-GPRS-NEXT: fcvtzs s0, s0
3650
; USE-NEON-NO-GPRS-NEXT: scvtf s0, s0
3751
; USE-NEON-NO-GPRS-NEXT: ret
52+
;
53+
; NONEON-NOSVE-LABEL: t2:
54+
; NONEON-NOSVE: // %bb.0: // %entry
55+
; NONEON-NOSVE-NEXT: fcvtzs w8, s0
56+
; NONEON-NOSVE-NEXT: scvtf s0, w8
57+
; NONEON-NOSVE-NEXT: ret
3858
entry:
3959
%conv = fptosi float %x to i32
4060
%conv1 = sitofp i32 %conv to float
@@ -44,11 +64,20 @@ entry:
4464
define half @t3(half %x) {
4565
; CHECK-LABEL: t3:
4666
; CHECK: // %bb.0: // %entry
47-
; CHECK-NEXT: fcvt s0, h0
48-
; CHECK-NEXT: fcvtzs w8, s0
49-
; CHECK-NEXT: scvtf s0, w8
50-
; CHECK-NEXT: fcvt h0, s0
67+
; CHECK-NEXT: ptrue p0.s
68+
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
69+
; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h
70+
; CHECK-NEXT: scvtf z0.h, p0/m, z0.s
71+
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
5172
; CHECK-NEXT: ret
73+
;
74+
; NONEON-NOSVE-LABEL: t3:
75+
; NONEON-NOSVE: // %bb.0: // %entry
76+
; NONEON-NOSVE-NEXT: fcvt s0, h0
77+
; NONEON-NOSVE-NEXT: fcvtzs w8, s0
78+
; NONEON-NOSVE-NEXT: scvtf s0, w8
79+
; NONEON-NOSVE-NEXT: fcvt h0, s0
80+
; NONEON-NOSVE-NEXT: ret
5281
entry:
5382
%conv = fptosi half %x to i32
5483
%conv1 = sitofp i32 %conv to half
@@ -58,15 +87,24 @@ entry:
5887
define double @t4(double %x) {
5988
; CHECK-LABEL: t4:
6089
; CHECK: // %bb.0: // %entry
61-
; CHECK-NEXT: fcvtzu x8, d0
62-
; CHECK-NEXT: ucvtf d0, x8
90+
; CHECK-NEXT: ptrue p0.d
91+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
92+
; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
93+
; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
94+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
6395
; CHECK-NEXT: ret
6496
;
6597
; USE-NEON-NO-GPRS-LABEL: t4:
6698
; USE-NEON-NO-GPRS: // %bb.0: // %entry
6799
; USE-NEON-NO-GPRS-NEXT: fcvtzu d0, d0
68100
; USE-NEON-NO-GPRS-NEXT: ucvtf d0, d0
69101
; USE-NEON-NO-GPRS-NEXT: ret
102+
;
103+
; NONEON-NOSVE-LABEL: t4:
104+
; NONEON-NOSVE: // %bb.0: // %entry
105+
; NONEON-NOSVE-NEXT: fcvtzu x8, d0
106+
; NONEON-NOSVE-NEXT: ucvtf d0, x8
107+
; NONEON-NOSVE-NEXT: ret
70108
entry:
71109
%conv = fptoui double %x to i64
72110
%conv1 = uitofp i64 %conv to double
@@ -76,15 +114,24 @@ entry:
76114
define float @t5(float %x) {
77115
; CHECK-LABEL: t5:
78116
; CHECK: // %bb.0: // %entry
79-
; CHECK-NEXT: fcvtzu w8, s0
80-
; CHECK-NEXT: ucvtf s0, w8
117+
; CHECK-NEXT: ptrue p0.s
118+
; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
119+
; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s
120+
; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
121+
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
81122
; CHECK-NEXT: ret
82123
;
83124
; USE-NEON-NO-GPRS-LABEL: t5:
84125
; USE-NEON-NO-GPRS: // %bb.0: // %entry
85126
; USE-NEON-NO-GPRS-NEXT: fcvtzu s0, s0
86127
; USE-NEON-NO-GPRS-NEXT: ucvtf s0, s0
87128
; USE-NEON-NO-GPRS-NEXT: ret
129+
;
130+
; NONEON-NOSVE-LABEL: t5:
131+
; NONEON-NOSVE: // %bb.0: // %entry
132+
; NONEON-NOSVE-NEXT: fcvtzu w8, s0
133+
; NONEON-NOSVE-NEXT: ucvtf s0, w8
134+
; NONEON-NOSVE-NEXT: ret
88135
entry:
89136
%conv = fptoui float %x to i32
90137
%conv1 = uitofp i32 %conv to float
@@ -94,11 +141,20 @@ entry:
94141
define half @t6(half %x) {
95142
; CHECK-LABEL: t6:
96143
; CHECK: // %bb.0: // %entry
97-
; CHECK-NEXT: fcvt s0, h0
98-
; CHECK-NEXT: fcvtzu w8, s0
99-
; CHECK-NEXT: ucvtf s0, w8
100-
; CHECK-NEXT: fcvt h0, s0
144+
; CHECK-NEXT: ptrue p0.s
145+
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
146+
; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h
147+
; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s
148+
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
101149
; CHECK-NEXT: ret
150+
;
151+
; NONEON-NOSVE-LABEL: t6:
152+
; NONEON-NOSVE: // %bb.0: // %entry
153+
; NONEON-NOSVE-NEXT: fcvt s0, h0
154+
; NONEON-NOSVE-NEXT: fcvtzu w8, s0
155+
; NONEON-NOSVE-NEXT: ucvtf s0, w8
156+
; NONEON-NOSVE-NEXT: fcvt h0, s0
157+
; NONEON-NOSVE-NEXT: ret
102158
entry:
103159
%conv = fptoui half %x to i32
104160
%conv1 = uitofp i32 %conv to half

0 commit comments

Comments
 (0)