Skip to content

Commit 5d1c596

Browse files
authored
[AMDGPU][True16][MC] true16 for minimummaximum/max/min/max3/min3 (#124184)
Add true16 support for the following GFX12 instructions: v_minimummaximum_f16, v_maximumminimum_f16, v_maximum_f16, v_minimum_f16, v_maximum3_f16, and v_minimum3_f16.
1 parent b707d52 commit 5d1c596

File tree

10 files changed

+1988
-755
lines changed

10 files changed

+1988
-755
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5573,8 +5573,12 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
55735573
case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
55745574
case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
55755575
case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
5576-
case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64;
5577-
case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64;
5576+
case AMDGPU::S_MINIMUM_F16:
5577+
return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
5578+
: AMDGPU::V_MINIMUM_F16_fake16_e64;
5579+
case AMDGPU::S_MAXIMUM_F16:
5580+
return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
5581+
: AMDGPU::V_MAXIMUM_F16_fake16_e64;
55785582
case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
55795583
case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
55805584
case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
@@ -7547,9 +7551,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
75477551
return;
75487552
}
75497553
case AMDGPU::S_MINIMUM_F32:
7550-
case AMDGPU::S_MAXIMUM_F32:
7551-
case AMDGPU::S_MINIMUM_F16:
7552-
case AMDGPU::S_MAXIMUM_F16: {
7554+
case AMDGPU::S_MAXIMUM_F32: {
75537555
const DebugLoc &DL = Inst.getDebugLoc();
75547556
Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
75557557
MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
@@ -7566,6 +7568,26 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
75667568
Inst.eraseFromParent();
75677569
return;
75687570
}
7571+
case AMDGPU::S_MINIMUM_F16:
7572+
case AMDGPU::S_MAXIMUM_F16: {
7573+
const DebugLoc &DL = Inst.getDebugLoc();
7574+
Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
7575+
? &AMDGPU::VGPR_16RegClass
7576+
: &AMDGPU::VGPR_32RegClass);
7577+
MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7578+
.addImm(0) // src0_modifiers
7579+
.add(Inst.getOperand(1))
7580+
.addImm(0) // src1_modifiers
7581+
.add(Inst.getOperand(2))
7582+
.addImm(0) // clamp
7583+
.addImm(0) // omod
7584+
.addImm(0); // opsel0
7585+
MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7586+
legalizeOperands(*NewInstr, MDT);
7587+
addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7588+
Inst.eraseFromParent();
7589+
return;
7590+
}
75697591
}
75707592

75717593
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3788,15 +3788,18 @@ let True16Predicate = UseFakeTrue16Insts in {
37883788
}
37893789
} // End SubtargetPredicate = [isGFX9Plus]
37903790

3791-
let OtherPredicates = [isGFX12Plus] in {
3791+
let SubtargetPredicate = isGFX12Plus in {
37923792
def : FPMinMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
37933793
def : FPMinMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
3794-
def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
3795-
def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
37963794
def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
37973795
def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
3798-
def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
3799-
def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
3796+
}
3797+
3798+
let True16Predicate = UseFakeTrue16Insts, SubtargetPredicate = isGFX12Plus in {
3799+
def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_fake16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
3800+
def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_fake16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
3801+
def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F16_fake16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
3802+
def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F16_fake16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
38003803
}
38013804

38023805
// Convert a floating-point power of 2 to the integer exponent.

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -170,8 +170,8 @@ defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", V_MUL_PROF<VOP_I32_I32_I32>, mulhs
170170
let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0, AddedComplexity = 1 in {
171171
defm V_MINIMUM_F32 : VOP3Inst <"v_minimum_f32", VOP3_Profile<VOP_F32_F32_F32>, DivergentBinFrag<fminimum>>;
172172
defm V_MAXIMUM_F32 : VOP3Inst <"v_maximum_f32", VOP3_Profile<VOP_F32_F32_F32>, DivergentBinFrag<fmaximum>>;
173-
defm V_MINIMUM_F16 : VOP3Inst <"v_minimum_f16", VOP3_Profile<VOP_F16_F16_F16>, DivergentBinFrag<fminimum>>;
174-
defm V_MAXIMUM_F16 : VOP3Inst <"v_maximum_f16", VOP3_Profile<VOP_F16_F16_F16>, DivergentBinFrag<fmaximum>>;
173+
defm V_MINIMUM_F16 : VOP3Inst_t16 <"v_minimum_f16", VOP_F16_F16_F16, DivergentBinFrag<fminimum>>;
174+
defm V_MAXIMUM_F16 : VOP3Inst_t16 <"v_maximum_f16", VOP_F16_F16_F16, DivergentBinFrag<fmaximum>>;
175175

176176
let SchedRW = [WriteDoubleAdd] in {
177177
defm V_MINIMUM_F64 : VOP3Inst <"v_minimum_f64", VOP3_Profile<VOP_F64_F64_F64>, fminimum>;
@@ -637,8 +637,8 @@ defm V_MAX3_I16 : VOP3Inst_t16 <"v_max3_i16", VOP_I16_I16_I16_I16, AMDGPUsmax3>;
637637
defm V_MAX3_U16 : VOP3Inst_t16 <"v_max3_u16", VOP_I16_I16_I16_I16, AMDGPUumax3>;
638638

639639
let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in {
640-
defm V_MINIMUM3_F16 : VOP3Inst <"v_minimum3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfminimum3>;
641-
defm V_MAXIMUM3_F16 : VOP3Inst <"v_maximum3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmaximum3>;
640+
defm V_MINIMUM3_F16 : VOP3Inst_t16 <"v_minimum3_f16", VOP_F16_F16_F16_F16, AMDGPUfminimum3>;
641+
defm V_MAXIMUM3_F16 : VOP3Inst_t16 <"v_maximum3_f16", VOP_F16_F16_F16_F16, AMDGPUfmaximum3>;
642642
} // End SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0
643643

644644
defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>;
@@ -1443,8 +1443,8 @@ let SubtargetPredicate = HasF32ToF16BF16ConversionSRInsts in {
14431443
let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
14441444
defm V_MAXIMUMMINIMUM_F32 : VOP3Inst<"v_maximumminimum_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
14451445
defm V_MINIMUMMAXIMUM_F32 : VOP3Inst<"v_minimummaximum_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
1446-
defm V_MAXIMUMMINIMUM_F16 : VOP3Inst<"v_maximumminimum_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
1447-
defm V_MINIMUMMAXIMUM_F16 : VOP3Inst<"v_minimummaximum_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
1446+
defm V_MAXIMUMMINIMUM_F16 : VOP3Inst_t16<"v_maximumminimum_f16", VOP_F16_F16_F16_F16>;
1447+
defm V_MINIMUMMAXIMUM_F16 : VOP3Inst_t16<"v_minimummaximum_f16", VOP_F16_F16_F16_F16>;
14481448
} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
14491449

14501450
let SubtargetPredicate = HasDot9Insts, IsDOT=1 in {
@@ -1594,8 +1594,8 @@ defm V_MIN3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x22b, "v_min3_
15941594
defm V_MAX3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x22c, "v_max3_num_f16", "V_MAX3_F16", "v_max3_f16">;
15951595
defm V_MINIMUM3_F32 : VOP3Only_Realtriple_gfx12<0x22d>;
15961596
defm V_MAXIMUM3_F32 : VOP3Only_Realtriple_gfx12<0x22e>;
1597-
defm V_MINIMUM3_F16 : VOP3Only_Realtriple_t16_gfx12<0x22f>;
1598-
defm V_MAXIMUM3_F16 : VOP3Only_Realtriple_t16_gfx12<0x230>;
1597+
defm V_MINIMUM3_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x22f, "v_minimum3_f16">;
1598+
defm V_MAXIMUM3_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x230, "v_maximum3_f16">;
15991599
defm V_MED3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x231, "V_MED3_F32", "v_med3_num_f32">;
16001600
defm V_MED3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x232, "v_med3_num_f16", "V_MED3_F16", "v_med3_f16">;
16011601
defm V_MINMAX_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x268, "V_MINMAX_F32", "v_minmax_num_f32">;
@@ -1604,8 +1604,8 @@ defm V_MINMAX_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x26a, "v_minma
16041604
defm V_MAXMIN_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x26b, "v_maxmin_num_f16", "V_MAXMIN_F16", "v_maxmin_f16">;
16051605
defm V_MINIMUMMAXIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26c>;
16061606
defm V_MAXIMUMMINIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26d>;
1607-
defm V_MINIMUMMAXIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x26e>;
1608-
defm V_MAXIMUMMINIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x26f>;
1607+
defm V_MINIMUMMAXIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x26e, "v_minimummaximum_f16">;
1608+
defm V_MAXIMUMMINIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x26f, "v_maximumminimum_f16">;
16091609
defm V_S_EXP_F32 : VOP3Only_Real_Base_gfx12<0x280>;
16101610
defm V_S_EXP_F16 : VOP3Only_Real_Base_gfx12<0x281>;
16111611
defm V_S_LOG_F32 : VOP3Only_Real_Base_gfx12<0x282>;
@@ -1622,8 +1622,8 @@ defm V_MINIMUM_F64 : VOP3Only_Real_Base_gfx12<0x341>;
16221622
defm V_MAXIMUM_F64 : VOP3Only_Real_Base_gfx12<0x342>;
16231623
defm V_MINIMUM_F32 : VOP3Only_Realtriple_gfx12<0x365>;
16241624
defm V_MAXIMUM_F32 : VOP3Only_Realtriple_gfx12<0x366>;
1625-
defm V_MINIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x367>;
1626-
defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x368>;
1625+
defm V_MINIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x367, "v_minimum_f16">;
1626+
defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x368, "v_maximum_f16">;
16271627

16281628
defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>;
16291629
defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>;

llvm/lib/Target/AMDGPU/VOPInstructions.td

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1947,9 +1947,6 @@ multiclass VOP3Only_Realtriple_gfx12<bits<10> op, bit isSingle = 0> :
19471947
multiclass VOP3Only_Real_Base_gfx12<bits<10> op> :
19481948
VOP3_Real_Base<GFX12Gen, op, NAME, 1/*IsSingle*/>;
19491949

1950-
multiclass VOP3Only_Realtriple_t16_gfx12<bits<10> op> :
1951-
VOP3Only_Realtriple<GFX12Gen, op>;
1952-
19531950
multiclass VOP3_Realtriple_t16_gfx12<bits<10> op, string asmName, string opName = NAME,
19541951
string pseudo_mnemonic = "", bit isSingle = 0> :
19551952
VOP3_Realtriple_with_name<GFX12Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
@@ -1960,6 +1957,16 @@ multiclass VOP3_Realtriple_t16_and_fake16_gfx12<bits<10> op, string asmName, str
19601957
defm _fake16:VOP3_Realtriple_t16_gfx12<op, asmName, opName#"_fake16", pseudo_mnemonic, isSingle>;
19611958
}
19621959

1960+
multiclass VOP3Only_Realtriple_t16_gfx12<bits<10> op, string asmName,
1961+
string opName = NAME, string pseudo_mnemonic = "">
1962+
: VOP3_Realtriple_t16_gfx12<op, asmName, opName, pseudo_mnemonic, 1>;
1963+
1964+
multiclass VOP3Only_Realtriple_t16_and_fake16_gfx12<bits<10> op, string asmName,
1965+
string opName = NAME, string pseudo_mnemonic = ""> {
1966+
defm _t16 : VOP3Only_Realtriple_t16_gfx12<op, asmName, opName#"_t16", pseudo_mnemonic>;
1967+
defm _fake16 : VOP3Only_Realtriple_t16_gfx12<op, asmName, opName#"_fake16", pseudo_mnemonic>;
1968+
}
1969+
19631970
multiclass VOP3be_Real_with_name_gfx12<bits<10> op, string opName,
19641971
string asmName, bit isSingle = 0> {
19651972
defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");

0 commit comments

Comments
 (0)