Skip to content

Commit 538f29c

Browse files
committed
true16 for minimummaximum/maximumminimum/maximum/minimum
1 parent 1a8f49f commit 538f29c

File tree

10 files changed

+1987
-754
lines changed

10 files changed

+1987
-754
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5573,8 +5573,12 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
55735573
case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
55745574
case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
55755575
case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
5576-
case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64;
5577-
case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64;
5576+
case AMDGPU::S_MINIMUM_F16:
5577+
return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
5578+
: AMDGPU::V_MINIMUM_F16_fake16_e64;
5579+
case AMDGPU::S_MAXIMUM_F16:
5580+
return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
5581+
: AMDGPU::V_MAXIMUM_F16_fake16_e64;
55785582
case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
55795583
case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
55805584
case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
@@ -7547,9 +7551,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
75477551
return;
75487552
}
75497553
case AMDGPU::S_MINIMUM_F32:
7550-
case AMDGPU::S_MAXIMUM_F32:
7551-
case AMDGPU::S_MINIMUM_F16:
7552-
case AMDGPU::S_MAXIMUM_F16: {
7554+
case AMDGPU::S_MAXIMUM_F32: {
75537555
const DebugLoc &DL = Inst.getDebugLoc();
75547556
Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
75557557
MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
@@ -7566,6 +7568,26 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
75667568
Inst.eraseFromParent();
75677569
return;
75687570
}
7571+
case AMDGPU::S_MINIMUM_F16:
7572+
case AMDGPU::S_MAXIMUM_F16: {
7573+
const DebugLoc &DL = Inst.getDebugLoc();
7574+
Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
7575+
? &AMDGPU::VGPR_16RegClass
7576+
: &AMDGPU::VGPR_32RegClass);
7577+
MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7578+
.addImm(0) // src0_modifiers
7579+
.add(Inst.getOperand(1))
7580+
.addImm(0) // src1_modifiers
7581+
.add(Inst.getOperand(2))
7582+
.addImm(0) // clamp
7583+
.addImm(0) // omod
7584+
.addImm(0); // opsel0
7585+
MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7586+
legalizeOperands(*NewInstr, MDT);
7587+
addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7588+
Inst.eraseFromParent();
7589+
return;
7590+
}
75697591
}
75707592

75717593
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3770,12 +3770,15 @@ let True16Predicate = UseFakeTrue16Insts in {
37703770
let OtherPredicates = [isGFX12Plus] in {
37713771
def : FPMinMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
37723772
def : FPMinMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
3773-
def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
3774-
def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
37753773
def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
37763774
def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
3777-
def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
3778-
def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
3775+
}
3776+
3777+
// GFX12+ f16 minimummaximum/maximumminimum selection patterns that target the
// fake16 pseudos. SubtargetPredicate is bound to a single predicate here, as in
// the other `let SubtargetPredicate = ...` scopes; the bracketed list form is
// only used with OtherPredicates.
let True16Predicate = UseFakeTrue16Insts, SubtargetPredicate = isGFX12Plus in {
3778+
def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_fake16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
3779+
def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_fake16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
3780+
def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F16_fake16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
3781+
def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F16_fake16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
37793782
} // End True16Predicate = UseFakeTrue16Insts, SubtargetPredicate = isGFX12Plus
37803783

37813784
// Convert a floating-point power of 2 to the integer exponent.

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -170,8 +170,8 @@ defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", V_MUL_PROF<VOP_I32_I32_I32>, mulhs
170170
let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0, AddedComplexity = 1 in {
171171
defm V_MINIMUM_F32 : VOP3Inst <"v_minimum_f32", VOP3_Profile<VOP_F32_F32_F32>, DivergentBinFrag<fminimum>>;
172172
defm V_MAXIMUM_F32 : VOP3Inst <"v_maximum_f32", VOP3_Profile<VOP_F32_F32_F32>, DivergentBinFrag<fmaximum>>;
173-
defm V_MINIMUM_F16 : VOP3Inst <"v_minimum_f16", VOP3_Profile<VOP_F16_F16_F16>, DivergentBinFrag<fminimum>>;
174-
defm V_MAXIMUM_F16 : VOP3Inst <"v_maximum_f16", VOP3_Profile<VOP_F16_F16_F16>, DivergentBinFrag<fmaximum>>;
173+
defm V_MINIMUM_F16 : VOP3Inst_t16 <"v_minimum_f16", VOP_F16_F16_F16, DivergentBinFrag<fminimum>>;
174+
defm V_MAXIMUM_F16 : VOP3Inst_t16 <"v_maximum_f16", VOP_F16_F16_F16, DivergentBinFrag<fmaximum>>;
175175

176176
let SchedRW = [WriteDoubleAdd] in {
177177
defm V_MINIMUM_F64 : VOP3Inst <"v_minimum_f64", VOP3_Profile<VOP_F64_F64_F64>, fminimum>;
@@ -634,8 +634,8 @@ defm V_MAX3_I16 : VOP3Inst_t16 <"v_max3_i16", VOP_I16_I16_I16_I16, AMDGPUsmax3>;
634634
defm V_MAX3_U16 : VOP3Inst_t16 <"v_max3_u16", VOP_I16_I16_I16_I16, AMDGPUumax3>;
635635

636636
let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in {
637-
defm V_MINIMUM3_F16 : VOP3Inst <"v_minimum3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfminimum3>;
638-
defm V_MAXIMUM3_F16 : VOP3Inst <"v_maximum3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmaximum3>;
637+
defm V_MINIMUM3_F16 : VOP3Inst_t16 <"v_minimum3_f16", VOP_F16_F16_F16_F16, AMDGPUfminimum3>;
638+
defm V_MAXIMUM3_F16 : VOP3Inst_t16 <"v_maximum3_f16", VOP_F16_F16_F16_F16, AMDGPUfmaximum3>;
639639
} // End SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0
640640

641641
defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>;
@@ -1440,8 +1440,8 @@ let SubtargetPredicate = HasF32ToF16BF16ConversionSRInsts in {
14401440
let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
14411441
defm V_MAXIMUMMINIMUM_F32 : VOP3Inst<"v_maximumminimum_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
14421442
defm V_MINIMUMMAXIMUM_F32 : VOP3Inst<"v_minimummaximum_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
1443-
defm V_MAXIMUMMINIMUM_F16 : VOP3Inst<"v_maximumminimum_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
1444-
defm V_MINIMUMMAXIMUM_F16 : VOP3Inst<"v_minimummaximum_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
1443+
defm V_MAXIMUMMINIMUM_F16 : VOP3Inst_t16<"v_maximumminimum_f16", VOP_F16_F16_F16_F16>;
1444+
defm V_MINIMUMMAXIMUM_F16 : VOP3Inst_t16<"v_minimummaximum_f16", VOP_F16_F16_F16_F16>;
14451445
} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
14461446

14471447
let SubtargetPredicate = HasDot9Insts, IsDOT=1 in {
@@ -1591,8 +1591,8 @@ defm V_MIN3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x22b, "v_min3_
15911591
defm V_MAX3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x22c, "v_max3_num_f16", "V_MAX3_F16", "v_max3_f16">;
15921592
defm V_MINIMUM3_F32 : VOP3Only_Realtriple_gfx12<0x22d>;
15931593
defm V_MAXIMUM3_F32 : VOP3Only_Realtriple_gfx12<0x22e>;
1594-
defm V_MINIMUM3_F16 : VOP3Only_Realtriple_t16_gfx12<0x22f>;
1595-
defm V_MAXIMUM3_F16 : VOP3Only_Realtriple_t16_gfx12<0x230>;
1594+
defm V_MINIMUM3_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x22f, "v_minimum3_f16">;
1595+
defm V_MAXIMUM3_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x230, "v_maximum3_f16">;
15961596
defm V_MED3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x231, "V_MED3_F32", "v_med3_num_f32">;
15971597
defm V_MED3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x232, "v_med3_num_f16", "V_MED3_F16", "v_med3_f16">;
15981598
defm V_MINMAX_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x268, "V_MINMAX_F32", "v_minmax_num_f32">;
@@ -1601,8 +1601,8 @@ defm V_MINMAX_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x26a, "v_minma
16011601
defm V_MAXMIN_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x26b, "v_maxmin_num_f16", "V_MAXMIN_F16", "v_maxmin_f16">;
16021602
defm V_MINIMUMMAXIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26c>;
16031603
defm V_MAXIMUMMINIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26d>;
1604-
defm V_MINIMUMMAXIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x26e>;
1605-
defm V_MAXIMUMMINIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x26f>;
1604+
defm V_MINIMUMMAXIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x26e, "v_minimummaximum_f16">;
1605+
defm V_MAXIMUMMINIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x26f, "v_maximumminimum_f16">;
16061606
defm V_S_EXP_F32 : VOP3Only_Real_Base_gfx12<0x280>;
16071607
defm V_S_EXP_F16 : VOP3Only_Real_Base_gfx12<0x281>;
16081608
defm V_S_LOG_F32 : VOP3Only_Real_Base_gfx12<0x282>;
@@ -1619,8 +1619,8 @@ defm V_MINIMUM_F64 : VOP3Only_Real_Base_gfx12<0x341>;
16191619
defm V_MAXIMUM_F64 : VOP3Only_Real_Base_gfx12<0x342>;
16201620
defm V_MINIMUM_F32 : VOP3Only_Realtriple_gfx12<0x365>;
16211621
defm V_MAXIMUM_F32 : VOP3Only_Realtriple_gfx12<0x366>;
1622-
defm V_MINIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x367>;
1623-
defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x368>;
1622+
defm V_MINIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x367, "v_minimum_f16">;
1623+
defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x368, "v_maximum_f16">;
16241624

16251625
defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>;
16261626
defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>;

llvm/lib/Target/AMDGPU/VOPInstructions.td

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1947,9 +1947,6 @@ multiclass VOP3Only_Realtriple_gfx12<bits<10> op, bit isSingle = 0> :
19471947
multiclass VOP3Only_Real_Base_gfx12<bits<10> op> :
19481948
VOP3_Real_Base<GFX12Gen, op, NAME, 1/*IsSingle*/>;
19491949

1950-
multiclass VOP3Only_Realtriple_t16_gfx12<bits<10> op> :
1951-
VOP3Only_Realtriple<GFX12Gen, op>;
1952-
19531950
multiclass VOP3_Realtriple_t16_gfx12<bits<10> op, string asmName, string opName = NAME,
19541951
string pseudo_mnemonic = "", bit isSingle = 0> :
19551952
VOP3_Realtriple_with_name<GFX12Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
@@ -1960,6 +1957,16 @@ multiclass VOP3_Realtriple_t16_and_fake16_gfx12<bits<10> op, string asmName, str
19601957
defm _fake16:VOP3_Realtriple_t16_gfx12<op, asmName, opName#"_fake16", pseudo_mnemonic, isSingle>;
19611958
}
19621959

1960+
multiclass VOP3Only_Realtriple_t16_gfx12<bits<10> op, string asmName,
1961+
string opName = NAME, string pseudo_mnemonic = "">
1962+
: VOP3_Realtriple_t16_gfx12<op, asmName, opName, pseudo_mnemonic, 1>;
1963+
1964+
multiclass VOP3Only_Realtriple_t16_and_fake16_gfx12<bits<10> op, string asmName,
1965+
string opName = NAME, string pseudo_mnemonic = ""> {
1966+
defm _t16 : VOP3Only_Realtriple_t16_gfx12<op, asmName, opName#"_t16", pseudo_mnemonic>;
1967+
defm _fake16 : VOP3Only_Realtriple_t16_gfx12<op, asmName, opName#"_fake16", pseudo_mnemonic>;
1968+
}
1969+
19631970
multiclass VOP3be_Real_with_name_gfx12<bits<10> op, string opName,
19641971
string asmName, bit isSingle = 0> {
19651972
defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");

0 commit comments

Comments
 (0)