Move to DAGCombine + fixups

MacDue · MacDue · commit 3a5683a9d37f · 2024-10-24T17:57:03.000Z
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4550,10 +4550,9 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
   EVT VT = Op.getValueType();
 
   if (VT.isScalableVector()) {
-    unsigned Opc = Op.getOpcode();
-    bool IsSigned = Opc == ISD::FP_TO_SINT || Opc == ISD::STRICT_FP_TO_SINT;
-    unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
-                               : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
+    unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
+                          ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
+                          : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
     return LowerToPredicatedOp(Op, DAG, Opcode);
   }
 
@@ -4629,46 +4628,6 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
   return Op;
 }
 
-static bool CanLowerToScalarSVEFPIntConversion(EVT VT) {
-  if (!VT.isSimple())
-    return false;
-  // There are SVE instructions that can convert to/from all pairs of these int
-  // and float types. Note: We don't bother with i8 or i16 as those are illegal
-  // types for scalars.
-  return is_contained({MVT::i32, MVT::i64, MVT::f16, MVT::f32, MVT::f64},
-                      VT.getSimpleVT().SimpleTy);
-}
-
-/// Lowers a scalar FP conversion (to/from) int to SVE.
-static SDValue LowerScalarFPConversionToSVE(SDValue Op, SelectionDAG &DAG) {
-  assert(!Op->isStrictFPOpcode() && "strict fp ops not supported");
-  SDValue SrcVal = Op.getOperand(0);
-  EVT SrcTy = SrcVal.getValueType();
-  EVT DestTy = Op.getValueType();
-  EVT SrcVecTy;
-  EVT DestVecTy;
-  // Use a packed vector for the larger type.
-  // Note: For conversions such as FCVTZS_ZPmZ_DtoS, and UCVTF_ZPmZ_StoD that
-  // notionally take or return a nxv2i32 type we must instead use a nxv4i32, as
-  // (unlike floats) nxv2i32 is an illegal unpacked type.
-  if (DestTy.bitsGT(SrcTy)) {
-    DestVecTy = getPackedSVEVectorVT(DestTy);
-    SrcVecTy = SrcTy == MVT::i32 ? getPackedSVEVectorVT(SrcTy)
-                                 : DestVecTy.changeVectorElementType(SrcTy);
-  } else {
-    SrcVecTy = getPackedSVEVectorVT(SrcTy);
-    DestVecTy = DestTy == MVT::i32 ? getPackedSVEVectorVT(DestTy)
-                                   : SrcVecTy.changeVectorElementType(DestTy);
-  }
-  SDLoc dl(Op);
-  SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
-  SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SrcVecTy,
-                            DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
-  Vec = DAG.getNode(Op.getOpcode(), dl, DestVecTy, Vec);
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
-                     ZeroIdx);
-}
-
 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
                                               SelectionDAG &DAG) const {
   bool IsStrict = Op->isStrictFPOpcode();
@@ -4677,12 +4636,6 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
   if (SrcVal.getValueType().isVector())
     return LowerVectorFP_TO_INT(Op, DAG);
 
-  if (!IsStrict && !Subtarget->isNeonAvailable() &&
-      Subtarget->isSVEorStreamingSVEAvailable() &&
-      CanLowerToScalarSVEFPIntConversion(SrcVal.getValueType()) &&
-      CanLowerToScalarSVEFPIntConversion(Op.getValueType()))
-    return LowerScalarFPConversionToSVE(Op, DAG);
-
   // f16 conversions are promoted to f32 when full fp16 is not supported.
   if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
       SrcVal.getValueType() == MVT::bf16) {
@@ -4986,12 +4939,6 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
   bool IsStrict = Op->isStrictFPOpcode();
   SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
 
-  if (!IsStrict && !Subtarget->isNeonAvailable() &&
-      Subtarget->isSVEorStreamingSVEAvailable() &&
-      CanLowerToScalarSVEFPIntConversion(SrcVal.getValueType()) &&
-      CanLowerToScalarSVEFPIntConversion(Op.getValueType()))
-    return LowerScalarFPConversionToSVE(Op, DAG);
-
   bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
                   Op->getOpcode() == ISD::SINT_TO_FP;
 
@@ -19014,13 +18961,67 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
   return SDValue();
 }
 
+/// Returns true if scalar FP <-> INT conversion \p N should use SVE: non-strict
+/// op, SVE usable, streaming[-compatible] function, and supported types.
+static bool
+shouldUseSVEForScalarFPConversion(SDNode *N,
+                                  const AArch64Subtarget *Subtarget) {
+  auto isSupportedType = [](EVT VT) {
+    if (!VT.isSimple())
+      return false;
+    // SVE can convert between every pair of these int and float types. i8 and
+    // i16 are left out as they are illegal types for scalars.
+    return is_contained({MVT::i32, MVT::i64, MVT::f16, MVT::f32, MVT::f64},
+                        VT.getSimpleVT().SimpleTy);
+  };
+  // SVE is used in streaming[-compatible] functions as it can help avoid movs
+  // between GPRs and FPRs, which could be quite expensive.
+  return !N->isStrictFPOpcode() && Subtarget->isSVEorStreamingSVEAvailable() &&
+         (Subtarget->isStreaming() || Subtarget->isStreamingCompatible()) &&
+         isSupportedType(N->getValueType(0)) &&
+         isSupportedType(N->getOperand(0).getValueType());
+}
+
+/// Replaces the scalar FP <-> INT conversion \p N with the same opcode applied
+/// to an SVE vector: insert into lane 0 of undef, convert, extract lane 0.
+static SDValue replaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG) {
+  assert(!N->isStrictFPOpcode() && "strict fp ops not supported");
+  SDValue SrcVal = N->getOperand(0);
+  EVT SrcTy = SrcVal.getValueType();
+  EVT DestTy = N->getValueType(0);
+  EVT SrcVecTy;
+  EVT DestVecTy;
+  // The larger type uses a packed vector and the smaller type a vector with the
+  // same element count. The exception is i32: conversions such as
+  // FCVTZS_ZPmZ_DtoS and UCVTF_ZPmZ_StoD notionally take or return nxv2i32, but
+  // (unlike for floats) that unpacked type is illegal, so nxv4i32 is used.
+  if (DestTy.bitsGT(SrcTy)) {
+    DestVecTy = getPackedSVEVectorVT(DestTy);
+    SrcVecTy = SrcTy == MVT::i32 ? getPackedSVEVectorVT(SrcTy)
+                                 : DestVecTy.changeVectorElementType(SrcTy);
+  } else {
+    SrcVecTy = getPackedSVEVectorVT(SrcTy);
+    DestVecTy = DestTy == MVT::i32 ? getPackedSVEVectorVT(DestTy)
+                                   : SrcVecTy.changeVectorElementType(DestTy);
+  }
+  SDLoc dl(N);
+  SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
+  SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SrcVecTy,
+                            DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
+  Vec = DAG.getNode(N->getOpcode(), dl, DestVecTy, Vec);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DestTy, Vec, ZeroIdx);
+}
+
 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
                                      const AArch64Subtarget *Subtarget) {
   // First try to optimize away the conversion when it's conditionally from
   // a constant. Vectors only.
   if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
     return Res;
 
+  if (shouldUseSVEForScalarFPConversion(N, Subtarget))
+    return replaceScalarFPConversionWithSVE(N, DAG);
+
   EVT VT = N->getValueType(0);
   if (VT != MVT::f32 && VT != MVT::f64)
     return SDValue();
@@ -19059,6 +19060,9 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const AArch64Subtarget *Subtarget) {
+  if (shouldUseSVEForScalarFPConversion(N, Subtarget))
+    return replaceScalarFPConversionWithSVE(N, DAG);
+
   if (!Subtarget->isNeonAvailable())
     return SDValue();
 
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
@@ -1,7 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -force-streaming-compatible  < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible -mattr=+sme2p2  < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS
-; RUN: llc < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s
+; RUN: llc -mattr=+sme2p2 -force-streaming-compatible < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS
+; RUN: llc -mattr=+neon < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -20,6 +22,12 @@ define double @t1(double %x) {
 ; USE-NEON-NO-GPRS-NEXT:    fcvtzs d0, d0
 ; USE-NEON-NO-GPRS-NEXT:    scvtf d0, d0
 ; USE-NEON-NO-GPRS-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: t1:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvtzs x8, d0
+; NONEON-NOSVE-NEXT:    scvtf d0, x8
+; NONEON-NOSVE-NEXT:    ret
 entry:
   %conv = fptosi double %x to i64
   %conv1 = sitofp i64 %conv to double
@@ -41,6 +49,12 @@ define float @t2(float %x) {
 ; USE-NEON-NO-GPRS-NEXT:    fcvtzs s0, s0
 ; USE-NEON-NO-GPRS-NEXT:    scvtf s0, s0
 ; USE-NEON-NO-GPRS-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: t2:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ret
 entry:
   %conv = fptosi float %x to i32
   %conv1 = sitofp i32 %conv to float
@@ -64,6 +78,14 @@ define half @t3(half %x)  {
 ; USE-NEON-NO-GPRS-NEXT:    scvtf s0, s0
 ; USE-NEON-NO-GPRS-NEXT:    fcvt h0, s0
 ; USE-NEON-NO-GPRS-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: t3:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
 entry:
   %conv = fptosi half %x to i32
   %conv1 = sitofp i32 %conv to half
@@ -85,6 +107,12 @@ define double @t4(double %x) {
 ; USE-NEON-NO-GPRS-NEXT:    fcvtzu d0, d0
 ; USE-NEON-NO-GPRS-NEXT:    ucvtf d0, d0
 ; USE-NEON-NO-GPRS-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: t4:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvtzu x8, d0
+; NONEON-NOSVE-NEXT:    ucvtf d0, x8
+; NONEON-NOSVE-NEXT:    ret
 entry:
   %conv = fptoui double %x to i64
   %conv1 = uitofp i64 %conv to double
@@ -106,6 +134,12 @@ define float @t5(float %x) {
 ; USE-NEON-NO-GPRS-NEXT:    fcvtzu s0, s0
 ; USE-NEON-NO-GPRS-NEXT:    ucvtf s0, s0
 ; USE-NEON-NO-GPRS-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: t5:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ret
 entry:
   %conv = fptoui float %x to i32
   %conv1 = uitofp i32 %conv to float
@@ -129,6 +163,14 @@ define half @t6(half %x)  {
 ; USE-NEON-NO-GPRS-NEXT:    ucvtf s0, s0
 ; USE-NEON-NO-GPRS-NEXT:    fcvt h0, s0
 ; USE-NEON-NO-GPRS-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: t6:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
 entry:
   %conv = fptoui half %x to i32
   %conv1 = uitofp i32 %conv to half
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
-; RUN: llc -mattr=+sve,+sme -force-streaming < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,FORCE-STREAMING
 ; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -226,10 +226,11 @@ entry:
 }
 
 define half @strict_convert_signed(i32 %x) {
-; CHECK-LABEL: strict_convert_signed:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    scvtf h0, w0
-; CHECK-NEXT:    ret
+; FORCE-STREAMING-LABEL: strict_convert_signed:
+; FORCE-STREAMING:       // %bb.0: // %entry
+; FORCE-STREAMING-NEXT:    scvtf s0, w0
+; FORCE-STREAMING-NEXT:    fcvt h0, s0
+; FORCE-STREAMING-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: strict_convert_signed:
 ; NONEON-NOSVE:       // %bb.0: // %entry