-
Notifications
You must be signed in to change notification settings - Fork 14.3k
AMDGPU: Directly select minimumnum/maximumnum with ieee_mode=0 #141903
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU: Directly select minimumnum/maximumnum with ieee_mode=0 #141903
Conversation
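@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes: The hardware min/max follow the IR rules with IEEE mode disabled, so we can avoid canonicalizing the inputs. We lose the quieting of a signaling NaN if both inputs are NaNs, but that is only required with strictfp. Patch is 230.56 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/141903.diff 9 Files Affected: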
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 18a948d68e97b..7a50923ffedc6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -92,6 +92,8 @@ def FP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().F
def NoFP16Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()">;
def NoFP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals == DenormalMode::getPreserveSign()">;
def NoFP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()">;
+def IEEEModeEnabled : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().IEEE">;
+def IEEEModeDisabled : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().IEEE">;
def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 177750b639c67..ae530d35eff00 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -957,12 +957,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
}
- auto &MinNumMaxNum = getActionDefinitionsBuilder({
- G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
-
- // TODO: These should be custom lowered and are directly legal with IEEE=0
- auto &MinimumNumMaximumNum =
- getActionDefinitionsBuilder({G_FMINIMUMNUM, G_FMAXIMUMNUM});
+ auto &MinNumMaxNum = getActionDefinitionsBuilder(
+ {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE,
+ G_FMAXNUM_IEEE});
if (ST.hasVOP3PInsts()) {
MinNumMaxNum.customFor(FPTypesPK16)
@@ -980,8 +977,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0);
}
- MinimumNumMaximumNum.lower();
-
if (ST.hasVOP3PInsts())
FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
@@ -2160,6 +2155,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(
return legalizeFPTOI(MI, MRI, B, false);
case TargetOpcode::G_FMINNUM:
case TargetOpcode::G_FMAXNUM:
+ case TargetOpcode::G_FMINIMUMNUM:
+ case TargetOpcode::G_FMAXIMUMNUM:
case TargetOpcode::G_FMINNUM_IEEE:
case TargetOpcode::G_FMAXNUM_IEEE:
return legalizeMinNumMaxNum(Helper, MI);
@@ -2739,9 +2736,17 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
// With ieee_mode disabled, the instructions have the correct behavior
- // already for G_FMINNUM/G_FMAXNUM
- if (!MFI->getMode().IEEE)
+ // already for G_FMINIMUMNUM/G_FMAXIMUMNUM.
+ //
+ // FIXME: G_FMINNUM/G_FMAXNUM should match the behavior with ieee_mode
+ // enabled.
+ if (!MFI->getMode().IEEE) {
+ if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM ||
+ MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM)
+ return true;
+
return !IsIEEEOp;
+ }
if (IsIEEEOp)
return true;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index dd7aef8f0c583..4391a48ff2b68 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4009,6 +4009,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINIMUM:
case AMDGPU::G_FMAXIMUM:
+ case AMDGPU::G_FMINIMUMNUM:
+ case AMDGPU::G_FMAXIMUMNUM:
case AMDGPU::G_INTRINSIC_TRUNC:
case AMDGPU::G_STRICT_FADD:
case AMDGPU::G_STRICT_FSUB:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 74ca3e43fce3a..f161e5185e196 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -523,8 +523,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
Legal);
- setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
- Custom);
+ setOperationAction(
+ {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
+ {MVT::f32, MVT::f64}, Custom);
// These are really only legal for ieee_mode functions. We should be avoiding
// them for functions that don't have ieee_mode enabled, so just say they are
@@ -756,7 +757,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// allows matching fneg (fabs x) patterns)
setOperationAction(ISD::FABS, MVT::v2f16, Legal);
- setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
+ setOperationAction(
+ {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
+ MVT::f16, Custom);
setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
@@ -810,8 +813,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
VT, Custom);
- setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
- Custom);
+ setOperationAction(
+ {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
+ {MVT::v2f16, MVT::v4f16}, Custom);
setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
@@ -6057,6 +6061,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FMINNUM:
case ISD::FMAXNUM:
return lowerFMINNUM_FMAXNUM(Op, DAG);
+ case ISD::FMINIMUMNUM:
+ case ISD::FMAXIMUMNUM:
+ return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
return lowerFMINIMUM_FMAXIMUM(Op, DAG);
@@ -6081,8 +6088,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FMUL:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
- case ISD::FMINIMUMNUM:
- case ISD::FMAXIMUMNUM:
case ISD::UADDSAT:
case ISD::USUBSAT:
case ISD::SADDSAT:
@@ -6967,6 +6972,23 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
return Op;
}
+SDValue
+SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ bool IsIEEEMode = Info->getMode().IEEE;
+
+ if (IsIEEEMode)
+ return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
+
+ if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
+ VT == MVT::v16bf16)
+ return splitBinaryVectorOp(Op, DAG);
+ return Op;
+}
+
SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index c42366a1c04c8..532d1e46714e6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -146,6 +146,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
/// Custom lowering for ISD::FP_ROUND for MVT::f16.
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 2e2913d88cc54..0cb3ba38e8016 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1376,6 +1376,52 @@ def : GCNPat <
(i32 (V_MOV_B32_e32 (i32 0))), sub1)
>;
+
+
+class FPBinOpPat <SDPatternOperator node, ValueType vt, Instruction inst>
+ : GCNPat <(vt (node (vt (VOP3Mods vt:$src0, i32:$src0_mods)),
+ (vt (VOP3Mods vt:$src1, i32:$src1_mods)))),
+ (inst $src0_mods, $src0, $src1_mods, $src1, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+class FPPkBinOpPat <SDPatternOperator node, ValueType vt, Instruction inst>
+ : GCNPat <(vt (node (VOP3PMods v2f16:$src0, i32:$src0_mods),
+ (VOP3PMods v2f16:$src1, i32:$src1_mods))),
+ (inst $src0_mods, $src0, $src1_mods, $src1, DSTCLAMP.NONE)
+>;
+
+/// With IEEE=0, signalingness is ignored and the non-nan input will
+/// be directly returned.
+let OtherPredicates = [IEEEModeDisabled] in {
+ def : FPBinOpPat<fminimumnum, f32, V_MIN_F32_e64>;
+ def : FPBinOpPat<fmaximumnum, f32, V_MAX_F32_e64>;
+ def : FPBinOpPat<fminimumnum, f64, V_MIN_F64_e64>;
+ def : FPBinOpPat<fmaximumnum, f64, V_MAX_F64_e64>;
+
+ let SubtargetPredicate = Has16BitInsts,
+ True16Predicate = NotHasTrue16BitInsts in {
+ def : FPBinOpPat<fminimumnum, f16, V_MIN_F16_e64>;
+ def : FPBinOpPat<fmaximumnum, f16, V_MAX_F16_e64>;
+ }
+
+ let SubtargetPredicate = Has16BitInsts,
+ True16Predicate = UseRealTrue16Insts in {
+ def : FPBinOpPat<fminimumnum, f16, V_MIN_F16_t16_e64>;
+ def : FPBinOpPat<fmaximumnum, f16, V_MAX_F16_t16_e64>;
+ }
+
+ let SubtargetPredicate = Has16BitInsts,
+ True16Predicate = UseFakeTrue16Insts in {
+ def : FPBinOpPat<fminimumnum, f16, V_MIN_F16_fake16_e64>;
+ def : FPBinOpPat<fmaximumnum, f16, V_MAX_F16_fake16_e64>;
+ }
+
+ let SubtargetPredicate = HasVOP3PInsts in {
+ def : FPPkBinOpPat<fminimumnum, v2f16, V_PK_MIN_F16>;
+ def : FPPkBinOpPat<fmaximumnum, v2f16, V_PK_MAX_F16>;
+ }
+}
+
/********** ============================================ **********/
/********** Extraction, Insertion, Building and Casting **********/
/********** ============================================ **********/
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 46da9d33639b6..86e73ed03f187 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -2019,9 +2019,7 @@ define float @v_fneg_minimumnum_f32_no_ieee(float %a, float %b) #4 {
; GCN-LABEL: v_fneg_minimumnum_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v1, -1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT: v_max_f32_e32 v0, v0, v1
+; GCN-NEXT: v_max_f32_e64 v0, -v0, -v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%min = call float @llvm.minimumnum.f32(float %a, float %b)
%fneg = fneg float %min
@@ -2044,8 +2042,7 @@ define float @v_fneg_self_minimumnum_f32_no_ieee(float %a) #4 {
; GCN-LABEL: v_fneg_self_minimumnum_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT: v_max_f32_e32 v0, v0, v0
+; GCN-NEXT: v_max_f32_e64 v0, -v0, -v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%min = call float @llvm.minimumnum.f32(float %a, float %a)
%min.fneg = fneg float %min
@@ -2068,8 +2065,7 @@ define float @v_fneg_posk_minimumnum_f32_no_ieee(float %a) #4 {
; GCN-LABEL: v_fneg_posk_minimumnum_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT: v_max_f32_e32 v0, -4.0, v0
+; GCN-NEXT: v_max_f32_e64 v0, -v0, -4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%min = call float @llvm.minimumnum.f32(float 4.0, float %a)
%fneg = fneg float %min
@@ -2092,8 +2088,7 @@ define float @v_fneg_negk_minimumnum_f32_no_ieee(float %a) #4 {
; GCN-LABEL: v_fneg_negk_minimumnum_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT: v_max_f32_e32 v0, 4.0, v0
+; GCN-NEXT: v_max_f32_e64 v0, -v0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%min = call float @llvm.minimumnum.f32(float -4.0, float %a)
%fneg = fneg float %min
@@ -2251,8 +2246,7 @@ define float @v_fneg_neg0_minimumnum_f32_no_ieee(float %a) #4 {
; GCN-LABEL: v_fneg_neg0_minimumnum_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT: v_max_f32_e32 v0, 0, v0
+; GCN-NEXT: v_max_f32_e64 v0, -v0, 0
; GCN-NEXT: s_setpc_b64 s[30:31]
%min = call float @llvm.minimumnum.f32(float -0.0, float %a)
%fneg = fneg float %min
@@ -2299,7 +2293,6 @@ define float @v_fneg_0_minimumnum_foldable_use_f32_no_ieee(float %a, float %b) #
; GCN-LABEL: v_fneg_0_minimumnum_foldable_use_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v0, 0, v0
; GCN-NEXT: v_mul_f32_e64 v0, -v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -2330,9 +2323,7 @@ define <2 x float> @v_fneg_minimumnum_multi_use_minimumnum_f32_no_ieee(float %a,
; GCN-LABEL: v_fneg_minimumnum_multi_use_minimumnum_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v1, -1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT: v_max_f32_e32 v0, v0, v1
+; GCN-NEXT: v_max_f32_e64 v0, -v0, -v1
; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%min = call float @llvm.minimumnum.f32(float %a, float %b)
@@ -2364,9 +2355,7 @@ define float @v_fneg_maximumnum_f32_no_ieee(float %a, float %b) #4 {
; GCN-LABEL: v_fneg_maximumnum_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v1, -1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: v_min_f32_e64 v0, -v0, -v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%max = call float @llvm.maximumnum.f32(float %a, float %b)
%fneg = fneg float %max
@@ -2389,8 +2378,7 @@ define float @v_fneg_self_maximumnum_f32_no_ieee(float %a) #4 {
; GCN-LABEL: v_fneg_self_maximumnum_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT: v_min_f32_e32 v0, v0, v0
+; GCN-NEXT: v_min_f32_e64 v0, -v0, -v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%max = call float @llvm.maximumnum.f32(float %a, float %a)
%max.fneg = fneg float %max
@@ -2413,8 +2401,7 @@ define float @v_fneg_posk_maximumnum_f32_no_ieee(float %a) #4 {
; GCN-LABEL: v_fneg_posk_maximumnum_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT: v_min_f32_e32 v0, -4.0, v0
+; GCN-NEXT: v_min_f32_e64 v0, -v0, -4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%max = call float @llvm.maximumnum.f32(float 4.0, float %a)
%fneg = fneg float %max
@@ -2437,8 +2424,7 @@ define float @v_fneg_negk_maximumnum_f32_no_ieee(float %a) #4 {
; GCN-LABEL: v_fneg_negk_maximumnum_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT: v_min_f32_e64 v0, -v0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%max = call float @llvm.maximumnum.f32(float -4.0, float %a)
%fneg = fneg float %max
@@ -2473,8 +2459,7 @@ define float @v_fneg_neg0_maximumnum_f32_no_ieee(float %a) #4 {
; GCN-LABEL: v_fneg_neg0_maximumnum_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT: v_min_f32_e32 v0, 0, v0
+; GCN-NEXT: v_min_f32_e64 v0, -v0, 0
; GCN-NEXT: s_setpc_b64 s[30:31]
%max = call float @llvm.maximumnum.f32(float -0.0, float %a)
%fneg = fneg float %max
@@ -2499,7 +2484,6 @@ define float @v_fneg_0_maximumnum_foldable_use_f32_no_ieee(float %a, float %b) #
; GCN-LABEL: v_fneg_0_maximumnum_foldable_use_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_max_f32_e32 v0, 0, v0
; GCN-NEXT: v_mul_f32_e64 v0, -v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -2530,9 +2514,7 @@ define <2 x float> @v_fneg_maximumnum_multi_use_maximumnum_f32_no_ieee(float %a,
; GCN-LABEL: v_fneg_maximumnum_multi_use_maximumnum_f32_no_ieee:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v1, -1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: v_min_f32_e64 v0, -v0, -v1
; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%max = call float @llvm.maximumnum.f32(float %a, float %b)
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
index c45d86ce306e7..4f73e8e9c1883 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
@@ -3414,8 +3414,8 @@ define <2 x half> @v_maximumnum_v2f16(<2 x half> %x, <2 x half> %y) {
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT: v_max_f16_e32 v2, v0, v0
-; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-GISEL-NEXT: v_max_f16_e32 v3, v1, v1
+; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-GISEL-NEXT: v_max_f16_e32 v2, v2, v3
; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3652,57 +3652,57 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) {
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT: v_max_f16_e32 v4, v0, v0
-; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-GISEL-NEXT: v_max_f16_e32 v5, v2, v2
+; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-GISEL-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-GISEL-NEXT: v_max_f16_e32 v3, v3, v3
; GFX8-GISEL-NEXT: v_max_f16_e32 v4, v4, v5
; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v1, v3
+; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v1, v2
; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v4, v0
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-SDAG-LABEL: v_maximumnum_v3f16:
-; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_pk_max_f16 v2, v2, v2
-; GFX900-SDAG-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX900-SDAG-NEXT: v_pk_max_f16 v0, v0, v2
-; GFX900-SDAG-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX900-SDAG-NEXT: v_pk_max_f16 v1, v1, v1
-; GFX900-SDAG-NEXT: v_pk_max_f16 v1, v1, v2
-; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: v_maximumnum_v3f16:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX9-SDAG-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX9-SDAG-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX9-SDAG-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX9-SDAG-NEXT: v_pk_max_f16 v1, v1, v3
+; GFX9-SDAG-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX9-SDAG-NEXT: s_setpc_b64 s[...
[truncated]
|
ping |
1 similar comment
ping |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ping
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ping
The hardware min/max follow the IR rules with IEEE mode disabled, so we can avoid canonicalizing the inputs. We lose the quieting of a signaling NaN if both inputs are NaNs, but that is only required with strictfp.
740cd37
to
13d4fd4
Compare
…141903) The hardware min/max follow the IR rules with IEEE mode disabled, so we can avoid canonicalizing the inputs. We lose the quieting of a signaling NaN if both inputs are NaNs, but that is only required with strictfp.
…141903) The hardware min/max follow the IR rules with IEEE mode disabled, so we can avoid canonicalizing the inputs. We lose the quieting of a signaling NaN if both inputs are NaNs, but that is only required with strictfp.
The hardware min/max follow the IR rules with IEEE mode disabled,
so we can avoid canonicalizing the inputs. We lose the quieting
of a signaling NaN if both inputs are NaNs, but that is only required
with strictfp.