Skip to content

Commit 5f935e9

Browse files
committed
[AArch64] Optimize fp64 <-> fp16 SIMD conversions
Legalization of vector fp64 <-> fp16/bf16 conversions would result in needless scalarization. Add some DAGCombines to fix this up.
1 parent e963d07 commit 5f935e9

File tree

9 files changed

+252
-170
lines changed

9 files changed

+252
-170
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 93 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4507,13 +4507,16 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
45074507
};
45084508

45094509
if (Op.getValueType() == MVT::bf16) {
4510+
unsigned MaxWidth = IsSigned
4511+
? DAG.ComputeMaxSignificantBits(SrcVal)
4512+
: DAG.computeKnownBits(SrcVal).countMaxActiveBits();
45104513
// bf16 conversions are promoted to f32 when converting from i16.
4511-
if (DAG.ComputeMaxSignificantBits(SrcVal) <= 24) {
4514+
if (MaxWidth <= 24) {
45124515
return IntToFpViaPromotion(MVT::f32);
45134516
}
45144517

45154518
// bf16 conversions are promoted to f64 when converting from i32.
4516-
if (DAG.ComputeMaxSignificantBits(SrcVal) <= 53) {
4519+
if (MaxWidth <= 53) {
45174520
return IntToFpViaPromotion(MVT::f64);
45184521
}
45194522

@@ -19376,6 +19379,94 @@ static SDValue performBuildVectorCombine(SDNode *N,
1937619379
SDLoc DL(N);
1937719380
EVT VT = N->getValueType(0);
1937819381

19382+
if (VT == MVT::v4f16 || VT == MVT::v4bf16) {
19383+
SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
19384+
Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
19385+
if (Elt0->getOpcode() == ISD::FP_ROUND &&
19386+
Elt1->getOpcode() == ISD::FP_ROUND &&
19387+
isa<ConstantSDNode>(Elt0->getOperand(1)) &&
19388+
isa<ConstantSDNode>(Elt1->getOperand(1)) &&
19389+
Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
19390+
Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19391+
Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19392+
// Constant index.
19393+
isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
19394+
isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
19395+
Elt0->getOperand(0)->getOperand(0) ==
19396+
Elt1->getOperand(0)->getOperand(0) &&
19397+
Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
19398+
Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
19399+
SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
19400+
if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
19401+
SDValue HighLanes;
19402+
if (Elt2->getOpcode() == ISD::UNDEF &&
19403+
Elt3->getOpcode() == ISD::UNDEF) {
19404+
HighLanes = DAG.getUNDEF(MVT::v2f32);
19405+
} else if (Elt2->getOpcode() == ISD::FP_ROUND &&
19406+
Elt3->getOpcode() == ISD::FP_ROUND &&
19407+
isa<ConstantSDNode>(Elt2->getOperand(1)) &&
19408+
isa<ConstantSDNode>(Elt3->getOperand(1)) &&
19409+
Elt2->getConstantOperandVal(1) ==
19410+
Elt3->getConstantOperandVal(1) &&
19411+
Elt2->getOperand(0)->getOpcode() ==
19412+
ISD::EXTRACT_VECTOR_ELT &&
19413+
Elt3->getOperand(0)->getOpcode() ==
19414+
ISD::EXTRACT_VECTOR_ELT &&
19415+
// Constant index.
19416+
isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
19417+
isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
19418+
Elt2->getOperand(0)->getOperand(0) ==
19419+
Elt3->getOperand(0)->getOperand(0) &&
19420+
Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
19421+
Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
19422+
SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
19423+
HighLanes =
19424+
DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
19425+
}
19426+
if (HighLanes) {
19427+
SDValue DoubleToSingleSticky =
19428+
DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
19429+
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
19430+
DoubleToSingleSticky, HighLanes);
19431+
return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
19432+
Elt0->getOperand(1));
19433+
}
19434+
}
19435+
}
19436+
}
19437+
19438+
if (VT == MVT::v2f64) {
19439+
SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
19440+
if (Elt0->getOpcode() == ISD::FP_EXTEND &&
19441+
Elt1->getOpcode() == ISD::FP_EXTEND &&
19442+
Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19443+
Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19444+
Elt0->getOperand(0)->getOperand(0) ==
19445+
Elt1->getOperand(0)->getOperand(0) &&
19446+
// Constant index.
19447+
isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
19448+
isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
19449+
Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
19450+
Elt1->getOperand(0)->getConstantOperandVal(1) &&
19451+
// EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19452+
// ResultType's known minimum vector length.
19453+
Elt0->getOperand(0)->getConstantOperandVal(1) %
19454+
VT.getVectorMinNumElements() ==
19455+
0) {
19456+
SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
19457+
if (SrcVec.getValueType() == MVT::v4f16 ||
19458+
SrcVec.getValueType() == MVT::v4bf16) {
19459+
SDValue HalfToSingle =
19460+
DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
19461+
SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
19462+
SDValue Extract = DAG.getNode(
19463+
ISD::EXTRACT_SUBVECTOR, DL, VT.changeVectorElementType(MVT::f32),
19464+
HalfToSingle, SubvectorIdx);
19465+
return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
19466+
}
19467+
}
19468+
}
19469+
1937919470
// A build vector of two extracted elements is equivalent to an
1938019471
// extract subvector where the inner vector is any-extended to the
1938119472
// extract_vector_elt VT.

llvm/lib/Target/AArch64/AArch64InstrFormats.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6832,7 +6832,7 @@ multiclass SIMDFPNarrowTwoVector<bit U, bit S, bits<5> opc, string asm> {
68326832
}
68336833

68346834
multiclass SIMDFPInexactCvtTwoVector<bit U, bit S, bits<5> opc, string asm,
6835-
Intrinsic OpNode> {
6835+
SDPatternOperator OpNode> {
68366836
def v2f32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128,
68376837
asm, ".2s", ".2d",
68386838
[(set (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
@@ -7547,7 +7547,7 @@ class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode,
75477547
let mayRaiseFPException = 1, Uses = [FPCR] in
75487548
class SIMDInexactCvtTwoScalar<bits<5> opcode, string asm>
75497549
: I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "",
7550-
[(set (f32 FPR32:$Rd), (AArch64fcvtxn (f64 FPR64:$Rn)))]>,
7550+
[(set (f32 FPR32:$Rd), (AArch64fcvtxnsdr (f64 FPR64:$Rn)))]>,
75517551
Sched<[WriteVd]> {
75527552
bits<5> Rd;
75537553
bits<5> Rn;

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -757,9 +757,12 @@ def AArch64fcmlez: SDNode<"AArch64ISD::FCMLEz", SDT_AArch64fcmpz>;
757757
def AArch64fcmltz: SDNode<"AArch64ISD::FCMLTz", SDT_AArch64fcmpz>;
758758

759759
def AArch64fcvtxn_n: SDNode<"AArch64ISD::FCVTXN", SDTFPRoundOp>;
760-
def AArch64fcvtxn: PatFrags<(ops node:$Rn),
761-
[(f32 (int_aarch64_sisd_fcvtxn (f64 node:$Rn))),
762-
(f32 (AArch64fcvtxn_n (f64 node:$Rn)))]>;
760+
def AArch64fcvtxnsdr: PatFrags<(ops node:$Rn),
761+
[(f32 (int_aarch64_sisd_fcvtxn (f64 node:$Rn))),
762+
(f32 (AArch64fcvtxn_n (f64 node:$Rn)))]>;
763+
def AArch64fcvtxnv: PatFrags<(ops node:$Rn),
764+
[(int_aarch64_neon_fcvtxn node:$Rn),
765+
(AArch64fcvtxn_n node:$Rn)]>;
763766

764767
def AArch64bici: SDNode<"AArch64ISD::BICi", SDT_AArch64vecimm>;
765768
def AArch64orri: SDNode<"AArch64ISD::ORRi", SDT_AArch64vecimm>;
@@ -5042,7 +5045,7 @@ def : Pat<(concat_vectors V64:$Rd, (v4f16 (any_fpround (v4f32 V128:$Rn)))),
50425045
defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>;
50435046
defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>;
50445047
defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn",
5045-
int_aarch64_neon_fcvtxn>;
5048+
AArch64fcvtxnv>;
50465049
defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", any_fp_to_sint>;
50475050
defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", any_fp_to_uint>;
50485051

llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,60 @@ define <2 x float> @test_vcvt_f32_f64(<2 x double> %v) nounwind readnone ssp {
199199
ret <2 x float> %vcvt1.i
200200
}
201201

202+
; FALLBACK-NOT: remark{{.*}}G_FPEXT{{.*}}(in function: test_vcvt_bf16_f64)
203+
; FALLBACK-NOT: remark{{.*}}fpext{{.*}}(in function: test_vcvt_bf16_f64)
204+
define <2 x bfloat> @test_vcvt_bf16_f64(<2 x double> %v) nounwind readnone ssp {
205+
; GENERIC-LABEL: test_vcvt_bf16_f64:
206+
; GENERIC: // %bb.0:
207+
; GENERIC-NEXT: fcvtxn v0.2s, v0.2d
208+
; GENERIC-NEXT: movi.4s v1, #127, msl #8
209+
; GENERIC-NEXT: movi.4s v2, #1
210+
; GENERIC-NEXT: ushr.4s v3, v0, #16
211+
; GENERIC-NEXT: add.4s v1, v0, v1
212+
; GENERIC-NEXT: and.16b v2, v3, v2
213+
; GENERIC-NEXT: add.4s v1, v2, v1
214+
; GENERIC-NEXT: fcmeq.4s v2, v0, v0
215+
; GENERIC-NEXT: orr.4s v0, #64, lsl #16
216+
; GENERIC-NEXT: bit.16b v0, v1, v2
217+
; GENERIC-NEXT: shrn.4h v0, v0, #16
218+
; GENERIC-NEXT: ret
219+
;
220+
; FAST-LABEL: test_vcvt_bf16_f64:
221+
; FAST: // %bb.0:
222+
; FAST-NEXT: fcvtxn v1.2s, v0.2d
223+
; FAST-NEXT: // implicit-def: $q0
224+
; FAST-NEXT: fmov d0, d1
225+
; FAST-NEXT: ushr.4s v1, v0, #16
226+
; FAST-NEXT: movi.4s v2, #1
227+
; FAST-NEXT: and.16b v1, v1, v2
228+
; FAST-NEXT: add.4s v1, v1, v0
229+
; FAST-NEXT: movi.4s v2, #127, msl #8
230+
; FAST-NEXT: add.4s v1, v1, v2
231+
; FAST-NEXT: mov.16b v2, v0
232+
; FAST-NEXT: orr.4s v2, #64, lsl #16
233+
; FAST-NEXT: fcmeq.4s v0, v0, v0
234+
; FAST-NEXT: bsl.16b v0, v1, v2
235+
; FAST-NEXT: shrn.4h v0, v0, #16
236+
; FAST-NEXT: ret
237+
;
238+
; GISEL-LABEL: test_vcvt_bf16_f64:
239+
; GISEL: // %bb.0:
240+
; GISEL-NEXT: fcvtxn v0.2s, v0.2d
241+
; GISEL-NEXT: movi.4s v1, #127, msl #8
242+
; GISEL-NEXT: movi.4s v2, #1
243+
; GISEL-NEXT: ushr.4s v3, v0, #16
244+
; GISEL-NEXT: add.4s v1, v0, v1
245+
; GISEL-NEXT: and.16b v2, v3, v2
246+
; GISEL-NEXT: add.4s v1, v2, v1
247+
; GISEL-NEXT: fcmeq.4s v2, v0, v0
248+
; GISEL-NEXT: orr.4s v0, #64, lsl #16
249+
; GISEL-NEXT: bit.16b v0, v1, v2
250+
; GISEL-NEXT: shrn.4h v0, v0, #16
251+
; GISEL-NEXT: ret
252+
%vcvt1.i = fptrunc <2 x double> %v to <2 x bfloat>
253+
ret <2 x bfloat> %vcvt1.i
254+
}
255+
202256
define half @test_vcvt_f16_f32(<1 x float> %x) {
203257
; GENERIC-LABEL: test_vcvt_f16_f32:
204258
; GENERIC: // %bb.0:
@@ -350,3 +404,5 @@ define float @from_half(i16 %in) {
350404

351405
declare float @llvm.convert.from.fp16.f32(i16) #1
352406
declare i16 @llvm.convert.to.fp16.f32(float) #1
407+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
408+
; FALLBACK: {{.*}}

llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll

Lines changed: 12 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -312,25 +312,12 @@ define <8 x half> @s_to_h(<8 x float> %a) {
312312
define <8 x half> @d_to_h(<8 x double> %a) {
313313
; CHECK-LABEL: d_to_h:
314314
; CHECK: // %bb.0:
315-
; CHECK-NEXT: mov d5, v0.d[1]
316-
; CHECK-NEXT: fcvt h0, d0
317-
; CHECK-NEXT: fcvt h4, d1
318-
; CHECK-NEXT: mov d1, v1.d[1]
319-
; CHECK-NEXT: fcvt h5, d5
320-
; CHECK-NEXT: fcvt h1, d1
321-
; CHECK-NEXT: mov v0.h[1], v5.h[0]
322-
; CHECK-NEXT: mov v0.h[2], v4.h[0]
323-
; CHECK-NEXT: mov v0.h[3], v1.h[0]
324-
; CHECK-NEXT: fcvt h1, d2
325-
; CHECK-NEXT: mov d2, v2.d[1]
326-
; CHECK-NEXT: mov v0.h[4], v1.h[0]
327-
; CHECK-NEXT: fcvt h1, d2
328-
; CHECK-NEXT: mov d2, v3.d[1]
329-
; CHECK-NEXT: mov v0.h[5], v1.h[0]
330-
; CHECK-NEXT: fcvt h1, d3
331-
; CHECK-NEXT: mov v0.h[6], v1.h[0]
332-
; CHECK-NEXT: fcvt h1, d2
333-
; CHECK-NEXT: mov v0.h[7], v1.h[0]
315+
; CHECK-NEXT: fcvtxn v0.2s, v0.2d
316+
; CHECK-NEXT: fcvtxn v2.2s, v2.2d
317+
; CHECK-NEXT: fcvtxn2 v0.4s, v1.2d
318+
; CHECK-NEXT: fcvtxn2 v2.4s, v3.2d
319+
; CHECK-NEXT: fcvtn v0.4h, v0.4s
320+
; CHECK-NEXT: fcvtn2 v0.8h, v2.4s
334321
; CHECK-NEXT: ret
335322
%1 = fptrunc <8 x double> %a to <8 x half>
336323
ret <8 x half> %1
@@ -349,25 +336,12 @@ define <8 x float> @h_to_s(<8 x half> %a) {
349336
define <8 x double> @h_to_d(<8 x half> %a) {
350337
; CHECK-LABEL: h_to_d:
351338
; CHECK: // %bb.0:
352-
; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8
353-
; CHECK-NEXT: mov h1, v0.h[1]
354-
; CHECK-NEXT: mov h3, v0.h[3]
355-
; CHECK-NEXT: mov h4, v0.h[2]
356-
; CHECK-NEXT: fcvt d0, h0
357-
; CHECK-NEXT: mov h5, v2.h[1]
358-
; CHECK-NEXT: mov h6, v2.h[3]
359-
; CHECK-NEXT: mov h7, v2.h[2]
360-
; CHECK-NEXT: fcvt d16, h1
361-
; CHECK-NEXT: fcvt d17, h3
362-
; CHECK-NEXT: fcvt d1, h4
363-
; CHECK-NEXT: fcvt d2, h2
364-
; CHECK-NEXT: fcvt d4, h5
365-
; CHECK-NEXT: fcvt d5, h6
366-
; CHECK-NEXT: fcvt d3, h7
367-
; CHECK-NEXT: mov v0.d[1], v16.d[0]
368-
; CHECK-NEXT: mov v1.d[1], v17.d[0]
369-
; CHECK-NEXT: mov v2.d[1], v4.d[0]
370-
; CHECK-NEXT: mov v3.d[1], v5.d[0]
339+
; CHECK-NEXT: fcvtl v1.4s, v0.4h
340+
; CHECK-NEXT: fcvtl2 v2.4s, v0.8h
341+
; CHECK-NEXT: fcvtl v0.2d, v1.2s
342+
; CHECK-NEXT: fcvtl2 v3.2d, v2.4s
343+
; CHECK-NEXT: fcvtl2 v1.2d, v1.4s
344+
; CHECK-NEXT: fcvtl v2.2d, v2.2s
371345
; CHECK-NEXT: ret
372346
%1 = fpext <8 x half> %a to <8 x double>
373347
ret <8 x double> %1

llvm/test/CodeGen/AArch64/fpext.ll

Lines changed: 37 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -85,29 +85,46 @@ entry:
8585
}
8686

8787
define <2 x double> @fpext_v2f16_v2f64(<2 x half> %a) {
88-
; CHECK-LABEL: fpext_v2f16_v2f64:
89-
; CHECK: // %bb.0: // %entry
90-
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
91-
; CHECK-NEXT: mov h1, v0.h[1]
92-
; CHECK-NEXT: fcvt d0, h0
93-
; CHECK-NEXT: fcvt d1, h1
94-
; CHECK-NEXT: mov v0.d[1], v1.d[0]
95-
; CHECK-NEXT: ret
88+
; CHECK-SD-LABEL: fpext_v2f16_v2f64:
89+
; CHECK-SD: // %bb.0: // %entry
90+
; CHECK-SD-NEXT: fcvtl v0.4s, v0.4h
91+
; CHECK-SD-NEXT: fcvtl v0.2d, v0.2s
92+
; CHECK-SD-NEXT: ret
93+
;
94+
; CHECK-GI-LABEL: fpext_v2f16_v2f64:
95+
; CHECK-GI: // %bb.0: // %entry
96+
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
97+
; CHECK-GI-NEXT: mov h1, v0.h[1]
98+
; CHECK-GI-NEXT: fcvt d0, h0
99+
; CHECK-GI-NEXT: fcvt d1, h1
100+
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
101+
; CHECK-GI-NEXT: ret
96102
entry:
97103
%c = fpext <2 x half> %a to <2 x double>
98104
ret <2 x double> %c
99105
}
100106

101107
define <3 x double> @fpext_v3f16_v3f64(<3 x half> %a) {
102-
; CHECK-LABEL: fpext_v3f16_v3f64:
103-
; CHECK: // %bb.0: // %entry
104-
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
105-
; CHECK-NEXT: mov h1, v0.h[1]
106-
; CHECK-NEXT: mov h2, v0.h[2]
107-
; CHECK-NEXT: fcvt d0, h0
108-
; CHECK-NEXT: fcvt d1, h1
109-
; CHECK-NEXT: fcvt d2, h2
110-
; CHECK-NEXT: ret
108+
; CHECK-SD-LABEL: fpext_v3f16_v3f64:
109+
; CHECK-SD: // %bb.0: // %entry
110+
; CHECK-SD-NEXT: fcvtl v1.4s, v0.4h
111+
; CHECK-SD-NEXT: fcvtl v0.2d, v1.2s
112+
; CHECK-SD-NEXT: fcvtl2 v2.2d, v1.4s
113+
; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2
114+
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
115+
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
116+
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1
117+
; CHECK-SD-NEXT: ret
118+
;
119+
; CHECK-GI-LABEL: fpext_v3f16_v3f64:
120+
; CHECK-GI: // %bb.0: // %entry
121+
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
122+
; CHECK-GI-NEXT: mov h1, v0.h[1]
123+
; CHECK-GI-NEXT: mov h2, v0.h[2]
124+
; CHECK-GI-NEXT: fcvt d0, h0
125+
; CHECK-GI-NEXT: fcvt d1, h1
126+
; CHECK-GI-NEXT: fcvt d2, h2
127+
; CHECK-GI-NEXT: ret
111128
entry:
112129
%c = fpext <3 x half> %a to <3 x double>
113130
ret <3 x double> %c
@@ -116,16 +133,9 @@ entry:
116133
define <4 x double> @fpext_v4f16_v4f64(<4 x half> %a) {
117134
; CHECK-SD-LABEL: fpext_v4f16_v4f64:
118135
; CHECK-SD: // %bb.0: // %entry
119-
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
120-
; CHECK-SD-NEXT: mov h1, v0.h[1]
121-
; CHECK-SD-NEXT: mov h2, v0.h[3]
122-
; CHECK-SD-NEXT: mov h3, v0.h[2]
123-
; CHECK-SD-NEXT: fcvt d0, h0
124-
; CHECK-SD-NEXT: fcvt d4, h1
125-
; CHECK-SD-NEXT: fcvt d2, h2
126-
; CHECK-SD-NEXT: fcvt d1, h3
127-
; CHECK-SD-NEXT: mov v0.d[1], v4.d[0]
128-
; CHECK-SD-NEXT: mov v1.d[1], v2.d[0]
136+
; CHECK-SD-NEXT: fcvtl v0.4s, v0.4h
137+
; CHECK-SD-NEXT: fcvtl2 v1.2d, v0.4s
138+
; CHECK-SD-NEXT: fcvtl v0.2d, v0.2s
129139
; CHECK-SD-NEXT: ret
130140
;
131141
; CHECK-GI-LABEL: fpext_v4f16_v4f64:

0 commit comments

Comments
 (0)