Skip to content

[AMDGPU] Handle hazard in v_scalef32_sr_fp4_* conversions #118589

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 24 additions & 15 deletions llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -916,21 +916,30 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
if (SIInstrInfo::isSDWA(MI)) {
// Type 1: SDWA with dst_sel != DWORD
if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
return nullptr;
} else {
// Type 2 && Type 3: (VOP3 which write the hi bits) || (FP8DstSelInst
// with op_sel[3:2] != 0)
if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
!(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
SISrcMods::DST_OP_SEL ||
(AMDGPU::isFP8DstSelInst(Opcode) &&
(TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
SISrcMods::OP_SEL_0))))
return nullptr;
}

return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
}

AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
// Type 2: VOP3 which write the hi bits
if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
SISrcMods::DST_OP_SEL)
return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

// Type 3: FP8DstSelInst with op_sel[3:2] != 0
if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
(TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
SISrcMods::OP_SEL_0))
return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
}

// Special case: nop is required for all the opsel values for fp4 sr variant
// cvt scale instructions
if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

return nullptr;
}

/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
Expand Down
9 changes: 5 additions & 4 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -2567,6 +2567,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field bit IsFP8SrcByteSel = 0;
field bit IsFP8DstByteSel = 0;
field bit HasFP8DstByteSel = 0;
field bit HasFP4DstByteSel = 0;
field bit IsFP8ByteSel = !or(IsFP8SrcByteSel, IsFP8DstByteSel);

field bit HasDst = !ne(DstVT.Value, untyped.Value);
Expand Down Expand Up @@ -3249,13 +3250,13 @@ def isMFMA_F8F6F4Table : GenericTable {
let PrimaryKeyName = "isMFMA_F8F6F4" ;
}

def FP8DstByteSelTable : GenericTable {
def FP4FP8DstByteSelTable : GenericTable {
let FilterClass = "VOP3_Pseudo";
let CppTypeName = "FP8DstByteSelInfo";
let Fields = ["Opcode", "HasFP8DstByteSel"];
let CppTypeName = "FP4FP8DstByteSelInfo";
let Fields = ["Opcode", "HasFP8DstByteSel", "HasFP4DstByteSel"];

let PrimaryKey = ["Opcode"];
let PrimaryKeyName = "getFP8DstByteSelHelper";
let PrimaryKeyName = "getFP4FP8DstByteSelHelper";
}

def VOPDComponentTable : GenericTable {
Expand Down
20 changes: 14 additions & 6 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -378,17 +378,18 @@ struct VOPTrue16Info {
bool IsTrue16;
};

#define GET_FP8DstByteSelTable_DECL
#define GET_FP8DstByteSelTable_IMPL
#define GET_FP4FP8DstByteSelTable_DECL
#define GET_FP4FP8DstByteSelTable_IMPL

// Pairs an opcode with a single boolean flag.
// NOTE(review): presumably the row type of a generated opcode lookup table,
// like the other *Info structs in this file - confirm against the matching
// GET_*_DECL / GET_*_IMPL table before relying on that.
struct DPMACCInstructionInfo {
// Opcode this entry describes.
uint16_t Opcode;
// Flag recorded for the opcode; exact semantics are not visible here.
bool IsDPMACCInstruction;
};

struct FP8DstByteSelInfo {
// Entry type returned (by pointer) from getFP4FP8DstByteSelHelper() and
// consumed by getFPDstSelType().
// NOTE(review): the field order presumably has to match the "Fields" list of
// the TableGen table that generates the lookup - confirm before reordering.
struct FP4FP8DstByteSelInfo {
// Opcode used as the lookup key.
uint16_t Opcode;
// When set, getFPDstSelType() reports FPType::FP8 for this opcode.
bool HasFP8DstByteSel;
// When set (and HasFP8DstByteSel is not), getFPDstSelType() reports
// FPType::FP4 for this opcode.
bool HasFP4DstByteSel;
};

#define GET_FP8DstByteSelTable_DECL
Expand Down Expand Up @@ -657,9 +658,16 @@ bool isTrue16Inst(unsigned Opc) {
return Info ? Info->IsTrue16 : false;
}

bool isFP8DstSelInst(unsigned Opc) {
const FP8DstByteSelInfo *Info = getFP8DstByteSelHelper(Opc);
return Info ? Info->HasFP8DstByteSel : false;
/// Classifies \p Opc by the destination byte-select flags recorded for it.
///
/// \returns FPType::FP8 if the table entry has HasFP8DstByteSel set,
/// FPType::FP4 if it has HasFP4DstByteSel set, and FPType::None when the
/// opcode has no entry or neither flag is set.
FPType getFPDstSelType(unsigned Opc) {
  if (const FP4FP8DstByteSelInfo *Entry = getFP4FP8DstByteSelHelper(Opc)) {
    // FP8 is tested first, so it takes precedence if both flags are set.
    if (Entry->HasFP8DstByteSel)
      return FPType::FP8;
    if (Entry->HasFP4DstByteSel)
      return FPType::FP4;
  }

  // No table entry, or an entry with neither flag set.
  return FPType::None;
}

unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) {
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ static constexpr unsigned GFX12 = 1;

enum { AMDHSA_COV4 = 4, AMDHSA_COV5 = 5, AMDHSA_COV6 = 6 };

/// Floating-point format classification returned by getFPDstSelType().
enum class FPType {
  None, ///< Neither FP4 nor FP8.
  FP4,
  FP8
};

/// \returns True if \p STI is AMDHSA.
bool isHsaAbi(const MCSubtargetInfo &STI);

Expand Down Expand Up @@ -885,7 +887,7 @@ LLVM_READONLY
bool isTrue16Inst(unsigned Opc);

LLVM_READONLY
bool isFP8DstSelInst(unsigned Opc);
FPType getFPDstSelType(unsigned Opc);

LLVM_READONLY
bool isInvalidSingleUseConsumerInst(unsigned Opc);
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1014,7 +1014,7 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
let HasExtVOP3DPP = 0;
let HasOpSel = 1;
let HasOMod = 0;
let HasFP8DstByteSel = 1;
let HasFP4DstByteSel = 1;
}

def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32, v2f32, i32, f32]>, VOP3_OPSEL> {
Expand All @@ -1026,7 +1026,7 @@ def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32
let HasExtVOP3DPP = 0;
let HasOpSel = 1;
let HasOMod = 0;
let HasFP8DstByteSel = 1;
let HasFP4DstByteSel = 1;
}

class VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<ValueType DstTy> : VOP3_Profile<VOPProfile<[DstTy, i32, f32, untyped]>,
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/VOPInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
let IsSWMMAC = P.IsSWMMAC;

bit HasFP8DstByteSel = P.HasFP8DstByteSel;
bit HasFP4DstByteSel = P.HasFP4DstByteSel;

let AsmOperands = !if(!and(!not(P.IsTrue16), isVop3OpSel),
P.AsmVOP3OpSel,
Expand Down
15 changes: 9 additions & 6 deletions llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
Original file line number Diff line number Diff line change
Expand Up @@ -642,17 +642,18 @@ body: |
...

---
name: test_scalef32_sr_pk_fp4_bf16_neg_opsel0_hazard
name: test_scalef32_sr_pk_fp4_bf16_opsel0_hazard
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
; GCN-LABEL: name: test_scalef32_sr_pk_fp4_bf16_neg_opsel0_hazard
; GCN-LABEL: name: test_scalef32_sr_pk_fp4_bf16_opsel0_hazard
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
; GCN-NEXT: {{ $}}
; GCN-NEXT: S_WAITCNT 0
; GCN-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_BF16_e64 0, killed $vgpr2, 0, killed $vgpr3, 0, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec
; GCN-NEXT: S_NOP 0
; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
S_WAITCNT 0
Expand Down Expand Up @@ -731,17 +732,18 @@ body: |
...

---
name: test_scalef32_sr_pk_fp4_f32_neg_opsel0_hazard
name: test_scalef32_sr_pk_fp4_f32_opsel0_hazard
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why rename here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change now adds a nop for opsel==0 as well, so it's not a negative test anymore.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was never a negative test, though; it was pre-committed for this change.

body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GCN-LABEL: name: test_scalef32_sr_pk_fp4_f32_neg_opsel0_hazard
; GCN-LABEL: name: test_scalef32_sr_pk_fp4_f32_opsel0_hazard
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GCN-NEXT: {{ $}}
; GCN-NEXT: S_WAITCNT 0
; GCN-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_F32_e64 0, killed $vgpr2_vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, killed $vgpr0, 0, implicit $mode, implicit $exec
; GCN-NEXT: S_NOP 0
; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
S_WAITCNT 0
Expand Down Expand Up @@ -1119,17 +1121,18 @@ body: |
...

---
name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_neg_opsel0_hazard
name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_opsel0_hazard
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GCN-LABEL: name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_neg_opsel0_hazard
; GCN-LABEL: name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_opsel0_hazard
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GCN-NEXT: {{ $}}
; GCN-NEXT: S_WAITCNT 0
; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, $vgpr0, 0, $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec
; GCN-NEXT: S_NOP 0
; GCN-NEXT: early-clobber renamable $vgpr4 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 0, killed $vgpr0, 0, killed $vgpr3, 0, killed $vgpr1, killed $vgpr2, 0, implicit $mode, implicit $exec
; GCN-NEXT: S_NOP 0
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
S_WAITCNT 0
Expand Down
Loading