WIP

MacDue · MacDue · commit 64335db5b4c7 · 2024-10-24T17:57:04.000Z
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18961,9 +18961,39 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
   return SDValue();
 }
 
-static bool
-shouldUseSVEForScalarFPConversion(SDNode *N,
-                                  const AArch64Subtarget *Subtarget) {
+/// Emits a scalar FP <-> INT conversion as a scalable (SVE) one, wrapped with
+/// an insert and extract.
+static SDValue createScalarSVEFPConversion(SelectionDAG &DAG, unsigned Opc,
+                                           SDLoc DL, SDValue SrcVal, EVT SrcTy,
+                                           EVT DestTy) {
+  EVT SrcVecTy;
+  EVT DestVecTy;
+  if (DestTy.bitsGT(SrcTy)) {
+    DestVecTy = getPackedSVEVectorVT(DestTy);
+    SrcVecTy = DestVecTy.changeVectorElementType(SrcTy);
+  } else {
+    SrcVecTy = getPackedSVEVectorVT(SrcTy);
+    DestVecTy = SrcVecTy.changeVectorElementType(DestTy);
+  }
+  SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
+  SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
+                            DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
+  Vec = DAG.getNode(Opc, DL, DestVecTy, Vec);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Vec, ZeroIdx);
+}
+
+/// Tries to use SVE for scalar FP <-> INT conversions in streaming functions.
+static SDValue
+tryReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    const AArch64Subtarget *Subtarget) {
+  // Uncomment to introduce extra fcvts.
+  // if (DCI.isBeforeLegalizeOps())
+  //   return SDValue();
+
+  if (N->isStrictFPOpcode())
+    return SDValue();
+
   auto isSupportedType = [](EVT VT) {
     if (!VT.isSimple())
       return false;
@@ -18973,54 +19003,52 @@ shouldUseSVEForScalarFPConversion(SDNode *N,
     return is_contained({MVT::i32, MVT::i64, MVT::f16, MVT::f32, MVT::f64},
                         VT.getSimpleVT().SimpleTy);
   };
+
+  if (!isSupportedType(N->getValueType(0)) ||
+      !isSupportedType(N->getOperand(0).getValueType()))
+    return SDValue();
+
   // If we are in a streaming[-compatible] function, use SVE for scalar FP <->
-  // INT conversions as this can help avoid movs between GPRs and FPRs, which
+  // INT conversions as this can help avoid moves between GPRs and FPRs, which
   // could be quite expensive.
-  return !N->isStrictFPOpcode() && Subtarget->isSVEorStreamingSVEAvailable() &&
-         (Subtarget->isStreaming() || Subtarget->isStreamingCompatible()) &&
-         isSupportedType(N->getValueType(0)) &&
-         isSupportedType(N->getOperand(0).getValueType());
-}
+  if (!Subtarget->isSVEorStreamingSVEAvailable() ||
+      (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
+    return SDValue();
 
-/// Replaces a scalar FP <-> INT conversion with an SVE (scalable) one, wrapped
-/// with an insert and extract.
-static SDValue replaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG) {
-  assert(!N->isStrictFPOpcode() && "strict fp ops not supported");
+  SDLoc DL(N);
+  unsigned Opc = N->getOpcode();
   SDValue SrcVal = N->getOperand(0);
   EVT SrcTy = SrcVal.getValueType();
   EVT DestTy = N->getValueType(0);
-  EVT SrcVecTy;
-  EVT DestVecTy;
-  // Use a packed vector for the larger type.
-  // Note: For conversions such as FCVTZS_ZPmZ_DtoS, and UCVTF_ZPmZ_StoD that
-  // notionally take or return a nxv2i32 type we must instead use a nxv4i32, as
-  // (unlike floats) nxv2i32 is an illegal unpacked type.
-  if (DestTy.bitsGT(SrcTy)) {
-    DestVecTy = getPackedSVEVectorVT(DestTy);
-    SrcVecTy = SrcTy == MVT::i32 ? getPackedSVEVectorVT(SrcTy)
-                                 : DestVecTy.changeVectorElementType(SrcTy);
-  } else {
-    SrcVecTy = getPackedSVEVectorVT(SrcTy);
-    DestVecTy = DestTy == MVT::i32 ? getPackedSVEVectorVT(DestTy)
-                                   : SrcVecTy.changeVectorElementType(DestTy);
+
+  // Conversions between f64 and i32 are a special case as nxv2i32 is an illegal
+  // type (unlike the equivalent nxv2f32 for floating-point types).
+  // Going via i64 instead may materialize extra extend/truncate instructions.
+  if (SrcTy == MVT::i32 && DestTy == MVT::f64) {
+    SDValue ExtSrc = DAG.getNode(Opc == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND
+                                                        : ISD::ZERO_EXTEND,
+                                 DL, MVT::i64, SrcVal);
+    return createScalarSVEFPConversion(DAG, Opc, DL, ExtSrc, MVT::i64,
+                                       MVT::f64);
   }
-  SDLoc dl(N);
-  SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
-  SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SrcVecTy,
-                            DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
-  Vec = DAG.getNode(N->getOpcode(), dl, DestVecTy, Vec);
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DestTy, Vec, ZeroIdx);
+  if (SrcTy == MVT::f64 && DestTy == MVT::i32) {
+    SDValue ExtDest =
+        createScalarSVEFPConversion(DAG, Opc, DL, SrcVal, MVT::f64, MVT::i64);
+    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, ExtDest);
+  }
+  return createScalarSVEFPConversion(DAG, Opc, DL, SrcVal, SrcTy, DestTy);
 }
 
 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
+                                     TargetLowering::DAGCombinerInfo &DCI,
                                      const AArch64Subtarget *Subtarget) {
   // First try to optimize away the conversion when it's conditionally from
   // a constant. Vectors only.
   if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
     return Res;
 
-  if (shouldUseSVEForScalarFPConversion(N, Subtarget))
-    return replaceScalarFPConversionWithSVE(N, DAG);
+  if (SDValue Res = tryReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
+    return Res;
 
   EVT VT = N->getValueType(0);
   if (VT != MVT::f32 && VT != MVT::f64)
@@ -19060,8 +19088,8 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const AArch64Subtarget *Subtarget) {
-  if (shouldUseSVEForScalarFPConversion(N, Subtarget))
-    return replaceScalarFPConversionWithSVE(N, DAG);
+  if (SDValue Res = tryReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
+    return Res;
 
   if (!Subtarget->isNeonAvailable())
     return SDValue();
@@ -26082,7 +26110,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performMulCombine(N, DAG, DCI, Subtarget);
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:
-    return performIntToFpCombine(N, DAG, Subtarget);
+    return performIntToFpCombine(N, DAG, DCI, Subtarget);
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:
   case ISD::FP_TO_SINT_SAT:
@@ -28384,21 +28412,7 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
                                                    unsigned NewOp) const {
   EVT VT = Op.getValueType();
   SDLoc DL(Op);
-  SDValue Pg;
-
-  // FCVTZS_ZPmZ_DtoS and FCVTZU_ZPmZ_DtoS are special cases. These operations
-  // return nxv4i32 rather than the correct nxv2i32, as nxv2i32 is an illegal
-  // unpacked type. So, in this case, we take the predicate size from the
-  // operand.
-  SDValue LastOp{};
-  if ((NewOp == AArch64ISD::FCVTZU_MERGE_PASSTHRU ||
-       NewOp == AArch64ISD::FCVTZS_MERGE_PASSTHRU) &&
-      VT == MVT::nxv4i32 &&
-      (LastOp = Op->ops().back().get()).getValueType() == MVT::nxv2f64) {
-    Pg = getPredicateForVector(DAG, DL, LastOp.getValueType());
-  } else {
-    Pg = getPredicateForVector(DAG, DL, VT);
-  }
+  auto Pg = getPredicateForVector(DAG, DL, VT);
 
   if (VT.isFixedLengthVector()) {
     assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2328,8 +2328,8 @@ let Predicates = [HasSVEorSME] in {
   defm FCVT_ZPmZ_HtoD   : sve_fp_2op_p_zd< 0b1101001, "fcvt",   ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16,   AArch64fcvte_mt,  nxv2f64, nxv2i1, nxv2f16, ElementSizeD>;
   defm FCVT_ZPmZ_DtoS   : sve_fp_2op_p_zdr<0b1101010, "fcvt",   ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64,   AArch64fcvtr_mt,  nxv2f32, nxv2i1, nxv2f64, ElementSizeD>;
   defm FCVT_ZPmZ_StoD   : sve_fp_2op_p_zd< 0b1101011, "fcvt",   ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32,   AArch64fcvte_mt,  nxv2f64, nxv2i1, nxv2f32, ElementSizeD>;
-  defm SCVTF_ZPmZ_StoD  : sve_fp_2op_p_zd< 0b1110000, "scvtf",  ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32,  AArch64scvtf_mt,  nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
-  defm UCVTF_ZPmZ_StoD  : sve_fp_2op_p_zd< 0b1110001, "ucvtf",  ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32,  AArch64ucvtf_mt,  nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
+  defm SCVTF_ZPmZ_StoD  : sve_fp_2op_p_zd< 0b1110000, "scvtf",  ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32,  null_frag,        nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
+  defm UCVTF_ZPmZ_StoD  : sve_fp_2op_p_zd< 0b1110001, "ucvtf",  ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32,  null_frag,        nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
   defm UCVTF_ZPmZ_StoH  : sve_fp_2op_p_zd< 0b0110101, "ucvtf",  ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32,  AArch64ucvtf_mt,  nxv4f16, nxv4i1, nxv4i32, ElementSizeS>;
   defm SCVTF_ZPmZ_DtoS  : sve_fp_2op_p_zd< 0b1110100, "scvtf",  ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64,  AArch64scvtf_mt,  nxv2f32, nxv2i1, nxv2i64, ElementSizeD>;
   defm SCVTF_ZPmZ_StoH  : sve_fp_2op_p_zd< 0b0110100, "scvtf",  ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32,  AArch64scvtf_mt,  nxv4f16, nxv4i1, nxv4i32, ElementSizeS>;
@@ -2338,8 +2338,8 @@ let Predicates = [HasSVEorSME] in {
   defm UCVTF_ZPmZ_DtoH  : sve_fp_2op_p_zd< 0b0110111, "ucvtf",  ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64,  AArch64ucvtf_mt,  nxv2f16, nxv2i1, nxv2i64, ElementSizeD>;
   defm SCVTF_ZPmZ_DtoD  : sve_fp_2op_p_zd< 0b1110110, "scvtf",  ZPR64, ZPR64, null_frag,                     AArch64scvtf_mt,  nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
   defm UCVTF_ZPmZ_DtoD  : sve_fp_2op_p_zd< 0b1110111, "ucvtf",  ZPR64, ZPR64, null_frag,                     AArch64ucvtf_mt,  nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
-  defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, AArch64fcvtzs_mt, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
-  defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, AArch64fcvtzu_mt, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
+  defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, null_frag,        nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
+  defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, null_frag,        nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
   defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f32, ElementSizeD>;
   defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd< 0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, AArch64fcvtzs_mt, nxv4i32, nxv4i1, nxv4f16, ElementSizeS>;
   defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f16, ElementSizeD>;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll
@@ -84,8 +84,9 @@ define i32 @f64_to_s32(double %x) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.d
-; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: f64_to_s32:
@@ -194,8 +195,9 @@ define i32 @f64_to_u32(double %x) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.d
-; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: f64_to_u32:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll
@@ -45,9 +45,11 @@ entry:
 define double @s32_to_f64(i32 %x) {
 ; CHECK-LABEL: s32_to_f64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxtw x8, w0
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    scvtf z0.d, p0/m, z0.s
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
@@ -100,9 +102,10 @@ entry:
 define double @u32_to_f64(i32 %x) {
 ; CHECK-LABEL: u32_to_f64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.s
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
@@ -1166,7 +1166,7 @@ define <1 x i16> @fcvtzu_v1f64_v1i16(<1 x double> %op1) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    mov z0.h, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -2867,7 +2867,7 @@ define <1 x i16> @fcvtzs_v1f64_v1i16(<1 x double> %op1) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    mov z0.h, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -392,8 +392,8 @@ define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    and w8, w8, #0xffff
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.s
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
@@ -2836,10 +2836,10 @@ define float @scvtf_i16_f32(ptr %0) {
 define double @scvtf_i16_f64(ptr %0) {
 ; CHECK-LABEL: scvtf_i16_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrsh w8, [x0]
+; CHECK-NEXT:    ldrsh x8, [x0]
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    scvtf z0.d, p0/m, z0.s
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
@@ -2895,9 +2895,10 @@ define float @scvtf_i32_f32(ptr %0) {
 define double @scvtf_i32_f64(ptr %0) {
 ; CHECK-LABEL: scvtf_i32_f64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldrsw x8, [x0]
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    scvtf z0.d, p0/m, z0.s
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
@@ -3015,8 +3016,8 @@ define double @ucvtf_i16_f64(ptr %0) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w8, [x0]
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.s
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;
@@ -3072,9 +3073,10 @@ define float @ucvtf_i32_f32(ptr %0) {
 define double @ucvtf_i32_f64(ptr %0) {
 ; CHECK-LABEL: ucvtf_i32_f64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.s
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
 ;