llvm · changpeng · Apr 7, 2025 · Apr 8, 2025 · Apr 11, 2025 · arsenm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1053,10 +1053,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
   }
 
   auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
-  if (ST.hasCvtPkF16F32Inst())
-    FPTruncActions.legalFor(
-        {{S32, S64}, {S16, S32}, {V2S16, V2S32}, {V2S16, V2S64}});
-  else
+  if (ST.hasCvtPkF16F32Inst()) {
+    FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}})
+                  .customFor({V2S16, V2S64});
+  } else
     FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
   FPTruncActions.scalarize(0).lower();
 
@@ -2155,6 +2155,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(
   case TargetOpcode::G_FMINNUM_IEEE:
   case TargetOpcode::G_FMAXNUM_IEEE:
     return legalizeMinNumMaxNum(Helper, MI);
+  case TargetOpcode::G_FPTRUNC:
+    return legalizeFPTrunc(Helper, MI, MRI);
   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
     return legalizeExtractVectorElt(MI, MRI, B);
   case TargetOpcode::G_INSERT_VECTOR_ELT:
@@ -2741,6 +2743,29 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
 }
 
+bool AMDGPULegalizerInfo::legalizeFPTrunc(LegalizerHelper &Helper,
+                                          MachineInstr &MI,
+                                          MachineRegisterInfo &MRI) const {
+  // TODO: We should only use fast math flag. But the global option is
+  // still used here to be consistent, especially when the fast math flag is
+  // not working for FP_ROUND on the SelectDAG path at this moment.
+  MachineFunction &MF = Helper.MIRBuilder.getMF();
+  bool AllowInaccurateFPTRUNC = MI.getFlag(MachineInstr::FmAfn) ||
+                                MF.getTarget().Options.UnsafeFPMath;
+
+  if (AllowInaccurateFPTRUNC) {
+    // Use the tablegen pattern to select native instructions.
+    return true;
+  }
+
+  Register DstReg = MI.getOperand(0).getReg();
+  LLT DstTy = MRI.getType(DstReg);
+
+  // Scalarize the vector and fall through to lower f64 -> f16.
+  return Helper.fewerElementsVector(MI, 0, DstTy.getElementType()) ==
+         LegalizerHelper::Legalized;
+}
+
 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
   MachineInstr &MI, MachineRegisterInfo &MRI,
   MachineIRBuilder &B) const {

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -56,6 +56,8 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
   bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI,
                      MachineIRBuilder &B, bool Signed) const;
   bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const;
+  bool legalizeFPTrunc(LegalizerHelper &Helper, MachineInstr &MI,
+                       MachineRegisterInfo &MRI) const;
   bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI,
                                 MachineIRBuilder &B) const;
   bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI,

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -916,7 +916,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   }
 
   if (Subtarget->hasCvtPkF16F32Inst()) {
-    setOperationAction(ISD::FP_ROUND, MVT::v2f16, Legal);
+    setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
   }
 
   setTargetDAGCombine({ISD::ADD,
@@ -6893,6 +6893,19 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
     return Op;
 
   EVT DstVT = Op.getValueType();
+
+  if (DstVT == MVT::v2f16) {
+    // FIXME: We should only use fast math flag here. However, the fast math
+    // flag is lost during fptrunc to fp_round lowering. In addition, the flag
+    // is not propagated during subsequent lowering.
+    bool AllowInaccurateFP_ROUND = Op->getFlags().hasApproximateFuncs() ||
+                                   DAG.getTarget().Options.UnsafeFPMath;
+    // With fast math, the tablegen pattern is used to select native
+    // instructions. Otherwise, the vector will be scalarized and custom lowered
+    // to preserve the precision.
+    return AllowInaccurateFP_ROUND ? Op : SDValue();
+  }
+
   SDLoc DL(Op);
   if (DstVT == MVT::f16) {
     // TODO: Handle strictfp