Skip to content

[AMDGPU] Add pseudo scalar trans instructions for GFX12 #75204

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Dec 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -822,6 +822,12 @@ def FeatureVGPRSingleUseHintInsts : SubtargetFeature<"vgpr-singleuse-hint",
"Has single-use VGPR hint instructions"
>;

// Subtarget feature "pseudo-scalar-trans". The record ties the feature string
// to the HasPseudoScalarTrans subtarget attribute with the value "true", and
// describes targets that have pseudo scalar transcendental instructions.
def FeaturePseudoScalarTrans : SubtargetFeature<"pseudo-scalar-trans",
"HasPseudoScalarTrans",
"true",
"Has Pseudo Scalar Transcendental instructions"
>;

//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
Expand Down Expand Up @@ -1467,6 +1473,7 @@ def FeatureISAVersion12 : FeatureSet<
FeaturePackedTID,
FeatureVcmpxPermlaneHazard,
FeatureSALUFloatInsts,
FeaturePseudoScalarTrans,
FeatureVGPRSingleUseHintInsts,
FeatureMADIntraFwdBug,
FeatureScalarDwordx3Loads]>;
Expand Down Expand Up @@ -2009,6 +2016,9 @@ def HasSALUFloatInsts : Predicate<"Subtarget->hasSALUFloatInsts()">,
def HasVGPRSingleUseHintInsts : Predicate<"Subtarget->hasVGPRSingleUseHintInsts()">,
AssemblerPredicate<(all_of FeatureVGPRSingleUseHintInsts)>;

// Predicate for pseudo scalar transcendental instructions: evaluates
// Subtarget->hasPseudoScalarTrans() in generated code, and requires
// FeaturePseudoScalarTrans on the assembler side.
def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">,
AssemblerPredicate<(all_of FeaturePseudoScalarTrans)>;

def HasGDS : Predicate<"Subtarget->hasGDS()">;

def HasGWS : Predicate<"Subtarget->hasGWS()">;
Expand Down
28 changes: 20 additions & 8 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3781,14 +3781,20 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
}
case AMDGPU::G_FSQRT:
case AMDGPU::G_FEXP2:
case AMDGPU::G_FLOG2: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
isSALUMapping(MI))
return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
}
case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
case AMDGPU::G_SSUBSAT:
case AMDGPU::G_UADDSAT:
case AMDGPU::G_USUBSAT:
case AMDGPU::G_FMAD:
case AMDGPU::G_FSQRT:
case AMDGPU::G_FEXP2:
case AMDGPU::G_FLOG2:
case AMDGPU::G_FLDEXP:
case AMDGPU::G_FMINNUM_IEEE:
case AMDGPU::G_FMAXNUM_IEEE:
Expand Down Expand Up @@ -4253,12 +4259,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_sin:
case Intrinsic::amdgcn_cos:
case Intrinsic::amdgcn_log_clamp:
case Intrinsic::amdgcn_log:
case Intrinsic::amdgcn_exp2:
case Intrinsic::amdgcn_rcp:
case Intrinsic::amdgcn_rcp_legacy:
case Intrinsic::amdgcn_sqrt:
case Intrinsic::amdgcn_rsq:
case Intrinsic::amdgcn_rsq_legacy:
case Intrinsic::amdgcn_rsq_clamp:
case Intrinsic::amdgcn_fmul_legacy:
Expand Down Expand Up @@ -4315,6 +4316,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_log:
case Intrinsic::amdgcn_exp2:
case Intrinsic::amdgcn_rcp:
case Intrinsic::amdgcn_rsq:
case Intrinsic::amdgcn_sqrt: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
isSALUMapping(MI))
return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
}
case Intrinsic::amdgcn_sbfe:
case Intrinsic::amdgcn_ubfe:
if (isSALUMapping(MI))
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ DECODE_OPERAND_REG_8(VReg_512)
DECODE_OPERAND_REG_8(VReg_1024)

DECODE_OPERAND_REG_7(SReg_32, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XEXEC, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XM0_XEXEC, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XEXEC_HI, OPW32)
DECODE_OPERAND_REG_7(SReg_64, OPW64)
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/GCNProcessors.td
Original file line number Diff line number Diff line change
Expand Up @@ -284,10 +284,10 @@ def : ProcessorModel<"gfx1151", GFX11SpeedModel,
// GCN GFX12.
//===----------------------------------------------------------------------===//

def : ProcessorModel<"gfx1200", GFX11SpeedModel,
def : ProcessorModel<"gfx1200", GFX12SpeedModel,
FeatureISAVersion12.Features
>;

def : ProcessorModel<"gfx1201", GFX11SpeedModel,
def : ProcessorModel<"gfx1201", GFX12SpeedModel,
FeatureISAVersion12.Features
>;
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool ScalarizeGlobal = false;
bool HasSALUFloatInsts = false;
bool HasVGPRSingleUseHintInsts = false;
bool HasPseudoScalarTrans = false;

bool HasVcmpxPermlaneHazard = false;
bool HasVMEMtoScalarWriteHazard = false;
Expand Down Expand Up @@ -1160,6 +1161,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,

bool hasVGPRSingleUseHintInsts() const { return HasVGPRSingleUseHintInsts; }

/// \returns the value of the HasPseudoScalarTrans flag, i.e. whether this
/// subtarget has pseudo scalar transcendental instructions.
bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }

/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
/// SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
Expand Down
12 changes: 11 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5305,6 +5305,16 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_CMP_NLE_F16: return AMDGPU::V_CMP_NLE_F16_t16_e64;
case AMDGPU::S_CMP_NEQ_F16: return AMDGPU::V_CMP_NEQ_F16_t16_e64;
case AMDGPU::S_CMP_NLT_F16: return AMDGPU::V_CMP_NLT_F16_t16_e64;
case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_t16_e64;
case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_t16_e64;
case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_t16_e64;
case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_t16_e64;
case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_t16_e64;
}
llvm_unreachable(
"Unexpected scalar opcode without corresponding vector one!");
Expand Down Expand Up @@ -7189,7 +7199,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
// Use the new VALU Opcode.
auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
.setMIFlags(Inst.getFlags());
if (isVOP3(NewOpcode)) {
if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
// Intersperse VOP3 modifiers among the SALU operands.
NewInstr->addOperand(Inst.getOperand(0));
if (AMDGPU::getNamedOperandIdx(NewOpcode,
Expand Down
36 changes: 36 additions & 0 deletions llvm/lib/Target/AMDGPU/SISchedule.td
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ def Write8PassDGEMM : SchedWrite;
// Scalar float instructions
def WriteSFPU : SchedWrite;

// F16 or F32 pseudo scalar transcendental instructions
def WritePseudoScalarTrans : SchedWrite;

// FIXME: Should there be a class for instructions which are VALU
// instructions and have VALU rates, but write to the SALU (i.e. VOPC
// instructions)
Expand All @@ -93,6 +96,7 @@ def SIDPFullSpeedModel : SISchedMachineModel;
def SIDPGFX940FullSpeedModel : SISchedMachineModel;
def GFX10SpeedModel : SISchedMachineModel;
def GFX11SpeedModel : SISchedMachineModel;
def GFX12SpeedModel : SISchedMachineModel;

// XXX: Are the resource counts correct?
def HWBranch : ProcResource<1> {
Expand Down Expand Up @@ -174,6 +178,7 @@ multiclass SICommonWriteRes {
def : HWWriteRes<Write16PassMAI, [HWXDL], 16>;

def : UnsupportedWriteRes<WriteSFPU>;
def : UnsupportedWriteRes<WritePseudoScalarTrans>;
} // End RetireOOO = 1

def : ReadAdvance<MIVGPRRead, -2>;
Expand Down Expand Up @@ -318,6 +323,7 @@ def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;

def : UnsupportedWriteRes<WriteSFPU>;
def : UnsupportedWriteRes<WritePseudoScalarTrans>;
} // End RetireOOO = 1

def : InstRW<[WriteCopy], (instrs COPY)>;
Expand Down Expand Up @@ -351,6 +357,36 @@ def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
} // End RetireOOO = 1

def : UnsupportedWriteRes<WritePseudoScalarTrans>;

def : InstRW<[WriteCopy], (instrs COPY)>;

} // End SchedModel = GFX11SpeedModel

// Write resources for GFX12SpeedModel.
//
// Each HWWriteRes entry binds a SchedWrite to the listed processor resources
// and a numeric value (presumably the latency in cycles -- confirm against the
// HWWriteRes definition, which is not visible here).
//
// Unlike the other scheduling models in this file, which mark
// WritePseudoScalarTrans as UnsupportedWriteRes, this model gives it a real
// entry on the VALU resource.
let SchedModel = GFX12SpeedModel in {

// VALU writes.
def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>;
def : HWWriteRes<WriteFloatCvt, [HWVALU, HWRC], 5>;
def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 6>;
def : HWWriteRes<WriteTrans32, [HWVALU, HWRC], 10>;
def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC], 8>;
def : HWWriteRes<WriteFloatFMA, [HWVALU, HWRC], 5>;
def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 38>;
def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 38>;
def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 38>;
def : HWWriteRes<WriteIntMul, [HWVALU, HWRC], 8>;
def : HWWriteRes<WriteTrans64, [HWVALU, HWRC], 40>;
// F16/F32 pseudo scalar transcendental instructions use the VALU resource.
def : HWWriteRes<WritePseudoScalarTrans, [HWVALU, HWRC], 7>;

// Branch, export, LDS, scalar ALU/FPU, memory and barrier writes.
def : HWWriteRes<WriteBranch, [HWBranch], 32>;
def : HWWriteRes<WriteExport, [HWExport, HWRC], 16>;
def : HWWriteRes<WriteLDS, [HWLGKM, HWRC], 20>;
def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 2>;
def : HWWriteRes<WriteSFPU, [HWSALU, HWRC], 4>;
def : HWWriteRes<WriteSMEM, [HWLGKM, HWRC], 20>;
def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;

// COPY is scheduled with the WriteCopy write type.
def : InstRW<[WriteCopy], (instrs COPY)>;

} // End SchedModel = GFX12SpeedModel
34 changes: 22 additions & 12 deletions llvm/lib/Target/AMDGPU/SOPInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -675,19 +675,8 @@ let SubtargetPredicate = isGFX12Plus in {

} // End SubtargetPredicate = isGFX12Plus

def SelectPat : PatFrag <
(ops node:$src1, node:$src2),
(select SCC, $src1, $src2),
[{ return !N->isDivergent(); }]
>;

let Uses = [SCC] in {
let AddedComplexity = 20 in {
def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32",
[(set i32:$sdst, (SelectPat i32:$src0, i32:$src1))]
>;
}

def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32">;
def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64">;
} // End Uses = [SCC]

Expand Down Expand Up @@ -1808,6 +1797,27 @@ def : GetFPModePat<fpmode_mask_gfx6plus>;
// SOP2 Patterns
//===----------------------------------------------------------------------===//

// Matches a `select` whose condition is SCC and whose node is not divergent
// (the predicate rejects nodes for which N->isDivergent() is true).
def UniformSelect : PatFrag<
(ops node:$src0, node:$src1),
(select SCC, $src0, $src1),
[{ return !N->isDivergent(); }]
>;

// Selection patterns that map a UniformSelect to S_CSELECT_B32. Both patterns
// are given AddedComplexity = 20.
let AddedComplexity = 20 in {
// i32 UniformSelect -> S_CSELECT_B32.
def : GCNPat<
(i32 (UniformSelect i32:$src0, i32:$src1)),
(S_CSELECT_B32 SSrc_b32:$src0, SSrc_b32:$src1)
>;

// f32 UniformSelect -> S_CSELECT_B32, restricted to subtargets with
// HasPseudoScalarTrans.
// TODO: The predicate should not be necessary, but enabling this pattern for
// all subtargets generates worse code in some cases.
let OtherPredicates = [HasPseudoScalarTrans] in
def : GCNPat<
(f32 (UniformSelect f32:$src0, f32:$src1)),
(S_CSELECT_B32 SSrc_b32:$src0, SSrc_b32:$src1)
>;
}

// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector
// case, the sgpr-copies pass will fix this to use the vector version.
def : GCNPat <
Expand Down
53 changes: 53 additions & 0 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -840,6 +840,49 @@ let SubtargetPredicate = HasDot9Insts, IsDOT=1 in {
defm V_DOT2_BF16_BF16 : VOP3Inst<"v_dot2_bf16_bf16", VOP3_DOT_Profile<VOP_I16_V2I16_V2I16_I16>, int_amdgcn_fdot2_bf16_bf16>;
}

// VOP profile for the pseudo scalar instructions.
//   Dst    - register class wrapped in VOPDstOperand for the destination.
//   SrcOp  - register operand assigned to Src0RC64.
//   dstVt  - first entry of the VOPProfile type list.
//   srcVt  - second entry of the VOPProfile type list; defaults to dstVt.
// The remaining two type-list entries are `untyped`. The profile enables
// HasOMod and HasModifiers.
class VOP_Pseudo_Scalar<RegisterClass Dst, RegisterOperand SrcOp,
ValueType dstVt, ValueType srcVt = dstVt>
: VOPProfile<[dstVt, srcVt, untyped, untyped]> {
let DstRC = VOPDstOperand<Dst>;
let Src0RC64 = SrcOp;

let HasOMod = 1;
let HasModifiers = 1;
}

// Both profiles write to SReg_32_XEXEC. Note that the F16 profile passes f32
// as dstVt and f16 as srcVt, so its declared result type is f32 even though
// the source is f16.
def VOP_Pseudo_Scalar_F32 : VOP_Pseudo_Scalar<SReg_32_XEXEC, SSrc_f32, f32>;
def VOP_Pseudo_Scalar_F16 : VOP_Pseudo_Scalar<SReg_32_XEXEC, SSrc_f16, f32, f16>;

// Pseudo scalar transcendental instruction definitions (exp, log, rcp, rsq,
// sqrt in F32 and F16 forms). All are gated on HasPseudoScalarTrans, flagged
// TRANS and isReMaterializable, and scheduled as WritePseudoScalarTrans.
//
// The F32 variants pass a selection node to VOP3PseudoScalarInst. The F16
// variants pass none, so they use that multiclass's default `null_frag`; they
// are selected by the separate PseudoScalarPatF16 patterns instead.
let SubtargetPredicate = HasPseudoScalarTrans, TRANS = 1,
isReMaterializable = 1, SchedRW = [WritePseudoScalarTrans] in {
defm V_S_EXP_F32 : VOP3PseudoScalarInst<"v_s_exp_f32", VOP_Pseudo_Scalar_F32, AMDGPUexp>;
defm V_S_EXP_F16 : VOP3PseudoScalarInst<"v_s_exp_f16", VOP_Pseudo_Scalar_F16>;
defm V_S_LOG_F32 : VOP3PseudoScalarInst<"v_s_log_f32", VOP_Pseudo_Scalar_F32, AMDGPUlog>;
defm V_S_LOG_F16 : VOP3PseudoScalarInst<"v_s_log_f16", VOP_Pseudo_Scalar_F16>;
defm V_S_RCP_F32 : VOP3PseudoScalarInst<"v_s_rcp_f32", VOP_Pseudo_Scalar_F32, AMDGPUrcp>;
defm V_S_RCP_F16 : VOP3PseudoScalarInst<"v_s_rcp_f16", VOP_Pseudo_Scalar_F16>;
defm V_S_RSQ_F32 : VOP3PseudoScalarInst<"v_s_rsq_f32", VOP_Pseudo_Scalar_F32, AMDGPUrsq>;
defm V_S_RSQ_F16 : VOP3PseudoScalarInst<"v_s_rsq_f16", VOP_Pseudo_Scalar_F16>;
defm V_S_SQRT_F32 : VOP3PseudoScalarInst<"v_s_sqrt_f32", VOP_Pseudo_Scalar_F32, any_amdgcn_sqrt>;
defm V_S_SQRT_F16 : VOP3PseudoScalarInst<"v_s_sqrt_f16", VOP_Pseudo_Scalar_F16>;
}

// Selection pattern for the F16 pseudo scalar instructions.
//   node - operator to match; it is wrapped in UniformUnaryFrag (defined
//          elsewhere; presumably restricts the match to uniform nodes).
//   inst - the VOP3 pseudo to emit.
// The source goes through VOP3Mods0, which supplies $src0, $src0_modifiers,
// $clamp and $omod. The instruction result is typed f32 in the output
// pattern, so it is wrapped in COPY_TO_REGCLASS to SReg_32_XEXEC to produce
// the f16 result.
class PseudoScalarPatF16<SDPatternOperator node, VOP3_Pseudo inst> : GCNPat <
(f16 (UniformUnaryFrag<node> (f16 (VOP3Mods0 f16:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod)))),
(f16 (COPY_TO_REGCLASS (f32 (inst i32:$src0_modifiers, f16:$src0, i1:$clamp,
i32:$omod)),
SReg_32_XEXEC))
>;

// F16 selection patterns, one per V_S_*_F16_e64 pseudo, enabled only when
// HasPseudoScalarTrans holds. exp and log use the f16-specific operators
// (AMDGPUexpf16, AMDGPUlogf16); rcp, rsq and sqrt use the same operators as
// the F32 instruction definitions.
let SubtargetPredicate = HasPseudoScalarTrans in {
def : PseudoScalarPatF16<AMDGPUexpf16, V_S_EXP_F16_e64>;
def : PseudoScalarPatF16<AMDGPUlogf16, V_S_LOG_F16_e64>;
def : PseudoScalarPatF16<AMDGPUrcp, V_S_RCP_F16_e64>;
def : PseudoScalarPatF16<AMDGPUrsq, V_S_RSQ_F16_e64>;
def : PseudoScalarPatF16<any_amdgcn_sqrt, V_S_SQRT_F16_e64>;
}

//===----------------------------------------------------------------------===//
// Integer Clamp Patterns
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -906,6 +949,16 @@ defm V_MINIMUMMAXIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26c>;
defm V_MAXIMUMMINIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26d>;
defm V_MINIMUMMAXIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x26e>;
defm V_MAXIMUMMINIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x26f>;
defm V_S_EXP_F32 : VOP3Only_Real_Base_gfx12<0x280>;
defm V_S_EXP_F16 : VOP3Only_Real_Base_gfx12<0x281>;
defm V_S_LOG_F32 : VOP3Only_Real_Base_gfx12<0x282>;
defm V_S_LOG_F16 : VOP3Only_Real_Base_gfx12<0x283>;
defm V_S_RCP_F32 : VOP3Only_Real_Base_gfx12<0x284>;
defm V_S_RCP_F16 : VOP3Only_Real_Base_gfx12<0x285>;
defm V_S_RSQ_F32 : VOP3Only_Real_Base_gfx12<0x286>;
defm V_S_RSQ_F16 : VOP3Only_Real_Base_gfx12<0x287>;
defm V_S_SQRT_F32 : VOP3Only_Real_Base_gfx12<0x288>;
defm V_S_SQRT_F16 : VOP3Only_Real_Base_gfx12<0x289>;
defm V_MAD_CO_U64_U32 : VOP3be_Real_with_name_gfx12<0x2fe, "V_MAD_U64_U32", "v_mad_co_u64_u32">;
defm V_MAD_CO_I64_I32 : VOP3be_Real_with_name_gfx12<0x2ff, "V_MAD_I64_I32", "v_mad_co_i64_i32">;
defm V_MINIMUM_F64 : VOP3Only_Real_Base_gfx12<0x341>;
Expand Down
13 changes: 13 additions & 0 deletions llvm/lib/Target/AMDGPU/VOPInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1303,6 +1303,19 @@ multiclass VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_f
} // end SubtargetPredicate = isGFX11Plus
}

// Helper that picks the operator to use in a selection pattern: `ret` is
// UniformUnaryFrag<Op> when Op is an SDNode or a PatFrags record, and Op
// itself otherwise.
class UniformUnaryFragOrOp<SDPatternOperator Op> {
SDPatternOperator ret = !if(!or(!isa<SDNode>(Op), !isa<PatFrags>(Op)),
UniformUnaryFrag<Op>, Op);
}

// Defines the `_e64` VOP3 pseudo for a pseudo scalar instruction.
//   OpName - instruction mnemonic.
//   P      - VOP profile supplying DstVT and Src0VT.
//   node   - operator to select from; defaults to null_frag. It is passed
//            through UniformUnaryFragOrOp before being used in the pattern.
// The pattern sets $vdst from the operator applied to a VOP3Mods0 source,
// which supplies $src0, $src0_modifiers, $clamp and $omod.
multiclass VOP3PseudoScalarInst<string OpName, VOPProfile P,
SDPatternOperator node = null_frag> {
def _e64 : VOP3_Pseudo<OpName, P, [(set P.DstVT:$vdst,
(UniformUnaryFragOrOp<node>.ret
(P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp,
i32:$omod))))]>;
}

//===----------------------------------------------------------------------===//
// VOP3 DPP
//===----------------------------------------------------------------------===//
Expand Down
Loading