Skip to content

[AMDGPU] Min/max changes for GFX12 #75214

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Dec 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ static bool fnegFoldsIntoMI(const MachineInstr &MI) {
case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINNUM_IEEE:
case AMDGPU::G_FMAXNUM_IEEE:
case AMDGPU::G_FMINIMUM:
case AMDGPU::G_FMAXIMUM:
case AMDGPU::G_FSIN:
case AMDGPU::G_FPEXT:
case AMDGPU::G_INTRINSIC_TRUNC:
Expand Down Expand Up @@ -174,6 +176,10 @@ static unsigned inverseMinMax(unsigned Opc) {
return AMDGPU::G_FMINNUM_IEEE;
case AMDGPU::G_FMINNUM_IEEE:
return AMDGPU::G_FMAXNUM_IEEE;
case AMDGPU::G_FMAXIMUM:
return AMDGPU::G_FMINIMUM;
case AMDGPU::G_FMINIMUM:
return AMDGPU::G_FMAXIMUM;
case AMDGPU::G_AMDGPU_FMAX_LEGACY:
return AMDGPU::G_AMDGPU_FMIN_LEGACY;
case AMDGPU::G_AMDGPU_FMIN_LEGACY:
Expand Down Expand Up @@ -207,6 +213,8 @@ bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,
case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINNUM_IEEE:
case AMDGPU::G_FMAXNUM_IEEE:
case AMDGPU::G_FMINIMUM:
case AMDGPU::G_FMAXIMUM:
case AMDGPU::G_AMDGPU_FMIN_LEGACY:
case AMDGPU::G_AMDGPU_FMAX_LEGACY:
// 0 doesn't have a negated inline immediate.
Expand Down Expand Up @@ -304,6 +312,8 @@ void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINNUM_IEEE:
case AMDGPU::G_FMAXNUM_IEEE:
case AMDGPU::G_FMINIMUM:
case AMDGPU::G_FMAXIMUM:
case AMDGPU::G_AMDGPU_FMIN_LEGACY:
case AMDGPU::G_AMDGPU_FMAX_LEGACY: {
NegateOperand(MatchInfo->getOperand(1));
Expand Down
12 changes: 12 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -585,6 +585,8 @@ static bool fnegFoldsIntoOpcode(unsigned Opc) {
case ISD::FMAXNUM:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
case ISD::SELECT:
case ISD::FSIN:
case ISD::FTRUNC:
Expand Down Expand Up @@ -4572,6 +4574,10 @@ static unsigned inverseMinMax(unsigned Opc) {
return ISD::FMINNUM_IEEE;
case ISD::FMINNUM_IEEE:
return ISD::FMAXNUM_IEEE;
case ISD::FMAXIMUM:
return ISD::FMINIMUM;
case ISD::FMINIMUM:
return ISD::FMAXIMUM;
case AMDGPUISD::FMAX_LEGACY:
return AMDGPUISD::FMIN_LEGACY;
case AMDGPUISD::FMIN_LEGACY:
Expand Down Expand Up @@ -4695,6 +4701,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
case ISD::FMINNUM:
case ISD::FMAXNUM_IEEE:
case ISD::FMINNUM_IEEE:
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
case AMDGPUISD::FMAX_LEGACY:
case AMDGPUISD::FMIN_LEGACY: {
// fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
Expand Down Expand Up @@ -5305,6 +5313,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FMED3)
NODE_NAME_CASE(SMED3)
NODE_NAME_CASE(UMED3)
NODE_NAME_CASE(FMAXIMUM3)
NODE_NAME_CASE(FMINIMUM3)
NODE_NAME_CASE(FDOT2)
NODE_NAME_CASE(URECIP)
NODE_NAME_CASE(DIV_SCALE)
Expand Down Expand Up @@ -5759,6 +5769,8 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
case AMDGPUISD::FMED3:
case AMDGPUISD::FMIN3:
case AMDGPUISD::FMAX3:
case AMDGPUISD::FMINIMUM3:
case AMDGPUISD::FMAXIMUM3:
case AMDGPUISD::FMAD_FTZ: {
if (SNaN)
return true;
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,8 @@ enum NodeType : unsigned {
FMED3,
SMED3,
UMED3,
FMAXIMUM3,
FMINIMUM3,
FDOT2,
URECIP,
DIV_SCALE,
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,11 @@ def AMDGPUfmax3 : SDNode<"AMDGPUISD::FMAX3", SDTFPTernaryOp,
[/*SDNPCommutative, SDNPAssociative*/]
>;

// out = max(a, b, c) a, b and c are floats. Operation is IEEE2019 compliant.
def AMDGPUfmaximum3 : SDNode<"AMDGPUISD::FMAXIMUM3", SDTFPTernaryOp,
[/*SDNPCommutative, SDNPAssociative*/]
>;

// out = max(a, b, c) a, b, and c are signed ints
def AMDGPUsmax3 : SDNode<"AMDGPUISD::SMAX3", AMDGPUDTIntTernaryOp,
[/*SDNPCommutative, SDNPAssociative*/]
Expand All @@ -185,6 +190,11 @@ def AMDGPUfmin3 : SDNode<"AMDGPUISD::FMIN3", SDTFPTernaryOp,
[/*SDNPCommutative, SDNPAssociative*/]
>;

// out = min(a, b, c) a, b and c are floats. Operation is IEEE2019 compliant.
def AMDGPUfminimum3 : SDNode<"AMDGPUISD::FMINIMUM3", SDTFPTernaryOp,
[/*SDNPCommutative, SDNPAssociative*/]
>;

// out = min(a, b, c) a, b and c are signed ints
def AMDGPUsmin3 : SDNode<"AMDGPUISD::SMIN3", AMDGPUDTIntTernaryOp,
[/*SDNPCommutative, SDNPAssociative*/]
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,8 @@ def umin_oneuse : HasOneUseBinOp<umin>;

def fminnum_oneuse : HasOneUseBinOp<fminnum>;
def fmaxnum_oneuse : HasOneUseBinOp<fmaxnum>;
def fminimum_oneuse : HasOneUseBinOp<fminimum>;
def fmaximum_oneuse : HasOneUseBinOp<fmaximum>;

def fminnum_ieee_oneuse : HasOneUseBinOp<fminnum_ieee>;
def fmaxnum_ieee_oneuse : HasOneUseBinOp<fmaxnum_ieee>;
Expand Down
27 changes: 16 additions & 11 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1959,20 +1959,25 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.widenScalarToNextPow2(0)
.scalarize(0);

getActionDefinitionsBuilder({
// TODO: Verify V_BFI_B32 is generated from expanded bit ops
G_FCOPYSIGN,
getActionDefinitionsBuilder(
{// TODO: Verify V_BFI_B32 is generated from expanded bit ops
G_FCOPYSIGN,

G_ATOMIC_CMPXCHG_WITH_SUCCESS,
G_ATOMICRMW_NAND,
G_ATOMICRMW_FSUB,
G_READ_REGISTER,
G_WRITE_REGISTER,
G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
G_READ_REGISTER, G_WRITE_REGISTER,

G_SADDO, G_SSUBO,
G_SADDO, G_SSUBO})
.lower();

// TODO: Implement
G_FMINIMUM, G_FMAXIMUM}).lower();
if (ST.hasIEEEMinMax()) {
getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
.legalFor(FPTypesPK16)
.clampMaxNumElements(0, S16, 2)
.scalarize(0);
} else {
// TODO: Implement
getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
}

getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
.lower();
Expand Down
9 changes: 6 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3727,14 +3727,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_INTRINSIC_ROUNDEVEN:
case AMDGPU::G_FMINNUM:
case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINIMUM:
case AMDGPU::G_FMAXIMUM:
case AMDGPU::G_INTRINSIC_TRUNC:
case AMDGPU::G_STRICT_FADD:
case AMDGPU::G_STRICT_FSUB:
case AMDGPU::G_STRICT_FMUL:
case AMDGPU::G_STRICT_FMA: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
if (Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16) &&
isSALUMapping(MI))
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
unsigned Size = Ty.getSizeInBits();
if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() &&
(Size == 32 || Size == 16) && isSALUMapping(MI))
return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
}
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -1218,6 +1218,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
// \returns true if the target has IEEE kernel descriptor mode bit
bool hasIEEEMode() const { return getGeneration() < GFX12; }

// \returns true if the target has IEEE fminimum/fmaximum instructions
bool hasIEEEMinMax() const { return getGeneration() >= GFX12; }

// \returns true if the target has WG_RR_MODE kernel descriptor mode bit
bool hasRrWGMode() const { return getGeneration() >= GFX12; }

Expand Down
26 changes: 23 additions & 3 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -763,6 +763,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->hasMad64_32())
setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);

if (Subtarget->hasIEEEMinMax())
setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
{MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);

setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
MVT::v2i16, MVT::v2f16, MVT::i128},
Expand Down Expand Up @@ -800,6 +804,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::FMAXNUM,
ISD::FMINNUM_IEEE,
ISD::FMAXNUM_IEEE,
ISD::FMINIMUM,
ISD::FMAXIMUM,
ISD::FMA,
ISD::SMIN,
ISD::SMAX,
Expand Down Expand Up @@ -11786,10 +11792,14 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
case ISD::FMAXNUM:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
case AMDGPUISD::CLAMP:
case AMDGPUISD::FMED3:
case AMDGPUISD::FMAX3:
case AMDGPUISD::FMIN3: {
case AMDGPUISD::FMIN3:
case AMDGPUISD::FMAXIMUM3:
case AMDGPUISD::FMINIMUM3: {
// FIXME: Shouldn't treat the generic operations different based these.
// However, we aren't really required to flush the result from
// minnum/maxnum..
Expand Down Expand Up @@ -11943,7 +11953,9 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
case AMDGPU::G_FMINNUM:
case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINNUM_IEEE:
case AMDGPU::G_FMAXNUM_IEEE: {
case AMDGPU::G_FMAXNUM_IEEE:
case AMDGPU::G_FMINIMUM:
case AMDGPU::G_FMAXIMUM: {
if (Subtarget->supportsMinMaxDenormModes() ||
// FIXME: denormalsEnabledForType is broken for dynamic
denormalsEnabledForType(MRI.getType(Reg), MF))
Expand Down Expand Up @@ -12131,13 +12143,17 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
case ISD::FMAXNUM:
case ISD::FMAXNUM_IEEE:
return AMDGPUISD::FMAX3;
case ISD::FMAXIMUM:
return AMDGPUISD::FMAXIMUM3;
case ISD::SMAX:
return AMDGPUISD::SMAX3;
case ISD::UMAX:
return AMDGPUISD::UMAX3;
case ISD::FMINNUM:
case ISD::FMINNUM_IEEE:
return AMDGPUISD::FMIN3;
case ISD::FMINIMUM:
return AMDGPUISD::FMINIMUM3;
case ISD::SMIN:
return AMDGPUISD::SMIN3;
case ISD::UMIN:
Expand Down Expand Up @@ -12497,7 +12513,9 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
case ISD::FMAXNUM:
case ISD::FMINNUM:
case ISD::FMAXNUM_IEEE:
case ISD::FMINNUM_IEEE: {
case ISD::FMINNUM_IEEE:
case ISD::FMAXIMUM:
case ISD::FMINIMUM: {
SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
Vec.getOperand(0), Idx);
SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
Expand Down Expand Up @@ -13759,6 +13777,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FMINNUM:
case ISD::FMAXNUM_IEEE:
case ISD::FMINNUM_IEEE:
case ISD::FMAXIMUM:
case ISD::FMINIMUM:
case ISD::SMAX:
case ISD::SMIN:
case ISD::UMAX:
Expand Down
24 changes: 24 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5255,11 +5255,15 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64;
case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64;
case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64;
case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
Expand Down Expand Up @@ -7101,6 +7105,26 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
Inst.eraseFromParent();
return;
}
case AMDGPU::S_MINIMUM_F32:
case AMDGPU::S_MAXIMUM_F32:
case AMDGPU::S_MINIMUM_F16:
case AMDGPU::S_MAXIMUM_F16: {
const DebugLoc &DL = Inst.getDebugLoc();
Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
.addImm(0) // src0_modifiers
.add(Inst.getOperand(1))
.addImm(0) // src1_modifiers
.add(Inst.getOperand(2))
.addImm(0) // clamp
.addImm(0); // omod
MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);

legalizeOperands(*NewInstr, MDT);
addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
Inst.eraseFromParent();
return;
}
}

if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -3441,6 +3441,12 @@ defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax>;
defm : Int16Med3Pat<V_MED3_U16_e64, umin, umax>;
} // End Predicates = [isGFX9Plus]

let OtherPredicates = [isGFX12Plus] in {
def : FPMinMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
def : FPMinMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
}

// Convert a floating-point power of 2 to the integer exponent.
def FPPow2ToExponentXForm : SDNodeXForm<fpimm, [{
Expand Down
13 changes: 13 additions & 0 deletions llvm/lib/Target/AMDGPU/SOPInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -847,6 +847,15 @@ let SubtargetPredicate = HasSALUFloatInsts, mayRaiseFPException = 1,
} // End SubtargetPredicate = HasSALUFloatInsts, mayRaiseFPException = 1,
// Uses = [MODE], SchedRW = [WriteSFPU]

// On GFX12 MIN/MAX instructions do not read MODE register.
let SubtargetPredicate = isGFX12Plus, mayRaiseFPException = 1, isCommutable = 1,
isReMaterializable = 1, SchedRW = [WriteSFPU] in {
def S_MINIMUM_F32 : SOP2_F32_Inst<"s_minimum_f32", fminimum>;
def S_MAXIMUM_F32 : SOP2_F32_Inst<"s_maximum_f32", fmaximum>;
def S_MINIMUM_F16 : SOP2_F16_Inst<"s_minimum_f16", fminimum>;
def S_MAXIMUM_F16 : SOP2_F16_Inst<"s_maximum_f16", fmaximum>;
}

//===----------------------------------------------------------------------===//
// SOPK Instructions
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -2017,6 +2026,10 @@ defm S_MIN_NUM_F32 : SOP2_Real_Renamed_gfx12<0x042, S_MIN_F32, "s_min_num_f32">;
defm S_MAX_NUM_F32 : SOP2_Real_Renamed_gfx12<0x043, S_MAX_F32, "s_max_num_f32">;
defm S_MIN_NUM_F16 : SOP2_Real_Renamed_gfx12<0x04b, S_MIN_F16, "s_min_num_f16">;
defm S_MAX_NUM_F16 : SOP2_Real_Renamed_gfx12<0x04c, S_MAX_F16, "s_max_num_f16">;
defm S_MINIMUM_F32 : SOP2_Real_gfx12<0x04f>;
defm S_MAXIMUM_F32 : SOP2_Real_gfx12<0x050>;
defm S_MINIMUM_F16 : SOP2_Real_gfx12<0x051>;
defm S_MAXIMUM_F16 : SOP2_Real_gfx12<0x052>;

defm S_ADD_CO_U32 : SOP2_Real_Renamed_gfx12<0x000, S_ADD_U32, "s_add_co_u32">;
defm S_SUB_CO_U32 : SOP2_Real_Renamed_gfx12<0x001, S_SUB_U32, "s_sub_co_u32">;
Expand Down
Loading