@@ -3533,6 +3533,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
         Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
         Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
         Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+        Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
         Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
       // Don't fold if we are using source or output modifiers. The new VOP2
       // instructions don't have them.
@@ -3555,6 +3556,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
     bool IsFMA =
         Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
         Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+        Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
         Opc == AMDGPU::V_FMAC_F16_fake16_e64;
     MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
     MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
@@ -3588,16 +3590,19 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,

     unsigned NewOpc =
         IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
-                : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
+                : ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
+                                               ? AMDGPU::V_FMAMK_F16_t16
+                                               : AMDGPU::V_FMAMK_F16_fake16
                                          : AMDGPU::V_FMAMK_F16)
               : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
     if (pseudoToMCOpcode(NewOpc) == -1)
       return false;

-    // V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
-    // would also require restricting their register classes. For now
-    // just bail out.
-    if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
+    // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
+    // takes VGPR_32_Lo128 operands, so the rewrite would also require
+    // restricting their register classes. For now just bail out.
+    if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
+        NewOpc == AMDGPU::V_FMAMK_F16_fake16)
       return false;

     const std::optional<int64_t> SubRegImm = extractSubregFromImm(
@@ -3613,7 +3618,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
     Src0->setIsKill(RegSrc->isKill());

     if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
-        Opc == AMDGPU::V_FMAC_F32_e64 ||
+        Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
         Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
       UseMI.untieRegOperand(
           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3668,23 +3673,26 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,

     unsigned NewOpc =
         IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
-                : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
+                : ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
+                                               ? AMDGPU::V_FMAAK_F16_t16
+                                               : AMDGPU::V_FMAAK_F16_fake16
                                          : AMDGPU::V_FMAAK_F16)
               : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
     if (pseudoToMCOpcode(NewOpc) == -1)
       return false;

-    // V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
-    // would also require restricting their register classes. For now
-    // just bail out.
-    if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
+    // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
+    // takes VGPR_32_Lo128 operands, so the rewrite would also require
+    // restricting their register classes. For now just bail out.
+    if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
+        NewOpc == AMDGPU::V_FMAAK_F16_fake16)
       return false;

     // FIXME: This would be a lot easier if we could return a new instruction
     // instead of having to modify in place.

     if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
-        Opc == AMDGPU::V_FMAC_F32_e64 ||
+        Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
         Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
       UseMI.untieRegOperand(
           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3874,8 +3882,11 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
     return AMDGPU::V_FMA_LEGACY_F32_e64;
   case AMDGPU::V_FMAC_F16_e32:
   case AMDGPU::V_FMAC_F16_e64:
+  case AMDGPU::V_FMAC_F16_t16_e64:
   case AMDGPU::V_FMAC_F16_fake16_e64:
-    return ST.hasTrue16BitInsts() ? AMDGPU::V_FMA_F16_gfx9_fake16_e64
+    return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
+                                        ? AMDGPU::V_FMA_F16_gfx9_t16_e64
+                                        : AMDGPU::V_FMA_F16_gfx9_fake16_e64
                                   : AMDGPU::V_FMA_F16_gfx9_e64;
   case AMDGPU::V_FMAC_F32_e32:
   case AMDGPU::V_FMAC_F32_e64:
@@ -3941,19 +3952,22 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
     return MIB;
   }

-  assert(
-      Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
-      "V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
-      "pre-RA");
+  assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
+         Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
+         "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
+         "present "
+         "pre-RA");

   // Handle MAC/FMAC.
   bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
                Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+               Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
                Opc == AMDGPU::V_FMAC_F16_fake16_e64;
   bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
                Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
                Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
                Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+               Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
                Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
                Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
   bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
@@ -3968,6 +3982,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
     return nullptr;
   case AMDGPU::V_MAC_F16_e64:
   case AMDGPU::V_FMAC_F16_e64:
+  case AMDGPU::V_FMAC_F16_t16_e64:
   case AMDGPU::V_FMAC_F16_fake16_e64:
   case AMDGPU::V_MAC_F32_e64:
   case AMDGPU::V_MAC_LEGACY_F32_e64:
@@ -4053,8 +4068,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
   int64_t Imm;
   if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
     unsigned NewOpc =
-        IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
-                                                 : AMDGPU::V_FMAAK_F16)
+        IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts()
+                              ? ST.useRealTrue16Insts()
+                                    ? AMDGPU::V_FMAAK_F16_t16
+                                    : AMDGPU::V_FMAAK_F16_fake16
+                              : AMDGPU::V_FMAAK_F16)
                        : AMDGPU::V_FMAAK_F32)
               : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
     if (pseudoToMCOpcode(NewOpc) != -1) {
@@ -4071,11 +4089,14 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
       return MIB;
     }
   }
-  unsigned NewOpc =
-      IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
-                                               : AMDGPU::V_FMAMK_F16)
-             : AMDGPU::V_FMAMK_F32)
-            : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
+  unsigned NewOpc = IsFMA
+                        ? (IsF16 ? (ST.hasTrue16BitInsts()
+                                        ? ST.useRealTrue16Insts()
+                                              ? AMDGPU::V_FMAMK_F16_t16
+                                              : AMDGPU::V_FMAMK_F16_fake16
+                                        : AMDGPU::V_FMAMK_F16)
+                                 : AMDGPU::V_FMAMK_F32)
+                        : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
   if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
     if (pseudoToMCOpcode(NewOpc) != -1) {
       MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
@@ -4513,6 +4534,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
   case AMDGPU::V_MAC_F32_e64:
   case AMDGPU::V_MAC_LEGACY_F32_e64:
   case AMDGPU::V_FMAC_F16_e64:
+  case AMDGPU::V_FMAC_F16_t16_e64:
   case AMDGPU::V_FMAC_F16_fake16_e64:
   case AMDGPU::V_FMAC_F32_e64:
   case AMDGPU::V_FMAC_F64_e64:
@@ -5569,7 +5591,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
   case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
   case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
   case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
-  case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
+  case AMDGPU::S_FMAC_F16:
+    return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
+                                   : AMDGPU::V_FMAC_F16_fake16_e64;
   case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
   case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
   case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
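
The recurring pattern in this patch is a three-way opcode choice: targets without true16 instructions keep the plain f16 opcode, while true16 targets pick either the _t16 variant (real 16-bit registers) or the _fake16 variant. Below is a minimal standalone C++ sketch of that selection, not part of the patch: GCNSubtarget is stubbed to the two queries the diff relies on, and selectFmamkF16 is a hypothetical helper introduced only for illustration.

// Minimal sketch; assumptions: GCNSubtarget is stubbed and selectFmamkF16
// is a hypothetical helper, not an SIInstrInfo API.
#include <cstdio>

struct GCNSubtarget {
  bool HasTrue16;  // target has true16 instructions
  bool RealTrue16; // true16 encodings use real 16-bit registers
  bool hasTrue16BitInsts() const { return HasTrue16; }
  bool useRealTrue16Insts() const { return RealTrue16; }
};

enum FmamkF16Opc { V_FMAMK_F16, V_FMAMK_F16_fake16, V_FMAMK_F16_t16 };

// Mirrors the nested ternary in foldImmediate(): no true16 support keeps the
// plain opcode; true16 targets use _t16 (VGPR_16_Lo128 operands) when real
// true16 instructions are enabled, else _fake16 (VGPR_32_Lo128 operands).
FmamkF16Opc selectFmamkF16(const GCNSubtarget &ST) {
  return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
                                      ? V_FMAMK_F16_t16
                                      : V_FMAMK_F16_fake16
                                : V_FMAMK_F16;
}

int main() {
  GCNSubtarget NoTrue16{false, false}, Fake16{true, false}, Real16{true, true};
  std::printf("%d %d %d\n", selectFmamkF16(NoTrue16), selectFmamkF16(Fake16),
              selectFmamkF16(Real16)); // prints 0 1 2
}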