Skip to content

Commit 5e007af

Browse files
authored
[AMDGPU] Handle hazard in v_scalef32_sr_fp4_* conversions (#118589)
Presently, the compiler selectively adds a nop only when opsel != 0, i.e. only when partially writing to the high bytes. Experiments in SWDEV-499733 and SWDEV-501347 suggest that a nop is needed for the v_scalef32_sr_fp4_* conversions irrespective of the opsel value. Note: we might need to add a few other instructions to the same table.
1 parent 04379c9 commit 5e007af

File tree

7 files changed

+58
-34
lines changed

7 files changed

+58
-34
lines changed

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -916,21 +916,30 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
916916
if (SIInstrInfo::isSDWA(MI)) {
917917
// Type 1: SDWA with dst_sel != DWORD
918918
if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
919-
if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
920-
return nullptr;
921-
} else {
922-
// Type 2 && Type 3: (VOP3 which write the hi bits) || (FP8DstSelInst
923-
// with op_sel[3:2] != 0)
924-
if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
925-
!(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
926-
SISrcMods::DST_OP_SEL ||
927-
(AMDGPU::isFP8DstSelInst(Opcode) &&
928-
(TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
929-
SISrcMods::OP_SEL_0))))
930-
return nullptr;
931-
}
932-
933-
return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
919+
if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
920+
return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
921+
}
922+
923+
AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
924+
if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
925+
// Type 2: VOP3 which write the hi bits
926+
if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
927+
SISrcMods::DST_OP_SEL)
928+
return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
929+
930+
// Type 3: FP8DstSelInst with op_sel[3:2] != 0)
931+
if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
932+
(TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
933+
SISrcMods::OP_SEL_0))
934+
return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
935+
}
936+
937+
// Special case: nop is required for all the opsel values for fp4 sr variant
938+
// cvt scale instructions
939+
if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
940+
return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
941+
942+
return nullptr;
934943
}
935944

936945
/// Checks whether the provided \p MI "consumes" the operand with a Dest sel

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2567,6 +2567,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
25672567
field bit IsFP8SrcByteSel = 0;
25682568
field bit IsFP8DstByteSel = 0;
25692569
field bit HasFP8DstByteSel = 0;
2570+
field bit HasFP4DstByteSel = 0;
25702571
field bit IsFP8ByteSel = !or(IsFP8SrcByteSel, IsFP8DstByteSel);
25712572

25722573
field bit HasDst = !ne(DstVT.Value, untyped.Value);
@@ -3249,13 +3250,13 @@ def isMFMA_F8F6F4Table : GenericTable {
32493250
let PrimaryKeyName = "isMFMA_F8F6F4" ;
32503251
}
32513252

3252-
def FP8DstByteSelTable : GenericTable {
3253+
def FP4FP8DstByteSelTable : GenericTable {
32533254
let FilterClass = "VOP3_Pseudo";
3254-
let CppTypeName = "FP8DstByteSelInfo";
3255-
let Fields = ["Opcode", "HasFP8DstByteSel"];
3255+
let CppTypeName = "FP4FP8DstByteSelInfo";
3256+
let Fields = ["Opcode", "HasFP8DstByteSel", "HasFP4DstByteSel"];
32563257

32573258
let PrimaryKey = ["Opcode"];
3258-
let PrimaryKeyName = "getFP8DstByteSelHelper";
3259+
let PrimaryKeyName = "getFP4FP8DstByteSelHelper";
32593260
}
32603261

32613262
def VOPDComponentTable : GenericTable {

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -378,17 +378,18 @@ struct VOPTrue16Info {
378378
bool IsTrue16;
379379
};
380380

381-
#define GET_FP8DstByteSelTable_DECL
382-
#define GET_FP8DstByteSelTable_IMPL
381+
#define GET_FP4FP8DstByteSelTable_DECL
382+
#define GET_FP4FP8DstByteSelTable_IMPL
383383

384384
struct DPMACCInstructionInfo {
385385
uint16_t Opcode;
386386
bool IsDPMACCInstruction;
387387
};
388388

389-
struct FP8DstByteSelInfo {
389+
struct FP4FP8DstByteSelInfo {
390390
uint16_t Opcode;
391391
bool HasFP8DstByteSel;
392+
bool HasFP4DstByteSel;
392393
};
393394

394395
#define GET_MTBUFInfoTable_DECL
@@ -655,9 +656,16 @@ bool isTrue16Inst(unsigned Opc) {
655656
return Info ? Info->IsTrue16 : false;
656657
}
657658

658-
bool isFP8DstSelInst(unsigned Opc) {
659-
const FP8DstByteSelInfo *Info = getFP8DstByteSelHelper(Opc);
660-
return Info ? Info->HasFP8DstByteSel : false;
659+
FPType getFPDstSelType(unsigned Opc) {
660+
const FP4FP8DstByteSelInfo *Info = getFP4FP8DstByteSelHelper(Opc);
661+
if (!Info)
662+
return FPType::None;
663+
if (Info->HasFP8DstByteSel)
664+
return FPType::FP8;
665+
if (Info->HasFP4DstByteSel)
666+
return FPType::FP4;
667+
668+
return FPType::None;
661669
}
662670

663671
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) {

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ static constexpr unsigned GFX12 = 1;
5555

5656
enum { AMDHSA_COV4 = 4, AMDHSA_COV5 = 5, AMDHSA_COV6 = 6 };
5757

58+
enum class FPType { None, FP4, FP8 };
59+
5860
/// \returns True if \p STI is AMDHSA.
5961
bool isHsaAbi(const MCSubtargetInfo &STI);
6062

@@ -885,7 +887,7 @@ LLVM_READONLY
885887
bool isTrue16Inst(unsigned Opc);
886888

887889
LLVM_READONLY
888-
bool isFP8DstSelInst(unsigned Opc);
890+
FPType getFPDstSelType(unsigned Opc);
889891

890892
LLVM_READONLY
891893
bool isInvalidSingleUseConsumerInst(unsigned Opc);

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1014,7 +1014,7 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
10141014
let HasExtVOP3DPP = 0;
10151015
let HasOpSel = 1;
10161016
let HasOMod = 0;
1017-
let HasFP8DstByteSel = 1;
1017+
let HasFP4DstByteSel = 1;
10181018
}
10191019

10201020
def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32, v2f32, i32, f32]>, VOP3_OPSEL> {
@@ -1026,7 +1026,7 @@ def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32
10261026
let HasExtVOP3DPP = 0;
10271027
let HasOpSel = 1;
10281028
let HasOMod = 0;
1029-
let HasFP8DstByteSel = 1;
1029+
let HasFP4DstByteSel = 1;
10301030
}
10311031

10321032
class VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<ValueType DstTy> : VOP3_Profile<VOPProfile<[DstTy, i32, f32, untyped]>,

llvm/lib/Target/AMDGPU/VOPInstructions.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
110110
let IsSWMMAC = P.IsSWMMAC;
111111

112112
bit HasFP8DstByteSel = P.HasFP8DstByteSel;
113+
bit HasFP4DstByteSel = P.HasFP4DstByteSel;
113114

114115
let AsmOperands = !if(!and(!not(P.IsTrue16), isVop3OpSel),
115116
P.AsmVOP3OpSel,

llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -642,17 +642,18 @@ body: |
642642
...
643643

644644
---
645-
name: test_scalef32_sr_pk_fp4_bf16_neg_opsel0_hazard
645+
name: test_scalef32_sr_pk_fp4_bf16_opsel0_hazard
646646
body: |
647647
bb.0:
648648
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
649-
; GCN-LABEL: name: test_scalef32_sr_pk_fp4_bf16_neg_opsel0_hazard
649+
; GCN-LABEL: name: test_scalef32_sr_pk_fp4_bf16_opsel0_hazard
650650
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
651651
; GCN-NEXT: {{ $}}
652652
; GCN-NEXT: S_WAITCNT 0
653653
; GCN-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
654654
; GCN-NEXT: S_WAITCNT 3952
655655
; GCN-NEXT: early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_BF16_e64 0, killed $vgpr2, 0, killed $vgpr3, 0, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec
656+
; GCN-NEXT: S_NOP 0
656657
; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec
657658
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
658659
S_WAITCNT 0
@@ -731,17 +732,18 @@ body: |
731732
...
732733

733734
---
734-
name: test_scalef32_sr_pk_fp4_f32_neg_opsel0_hazard
735+
name: test_scalef32_sr_pk_fp4_f32_opsel0_hazard
735736
body: |
736737
bb.0:
737738
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
738-
; GCN-LABEL: name: test_scalef32_sr_pk_fp4_f32_neg_opsel0_hazard
739+
; GCN-LABEL: name: test_scalef32_sr_pk_fp4_f32_opsel0_hazard
739740
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
740741
; GCN-NEXT: {{ $}}
741742
; GCN-NEXT: S_WAITCNT 0
742743
; GCN-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
743744
; GCN-NEXT: S_WAITCNT 3952
744745
; GCN-NEXT: early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_F32_e64 0, killed $vgpr2_vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, killed $vgpr0, 0, implicit $mode, implicit $exec
746+
; GCN-NEXT: S_NOP 0
745747
; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec
746748
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
747749
S_WAITCNT 0
@@ -1119,17 +1121,18 @@ body: |
11191121
...
11201122

11211123
---
1122-
name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_neg_opsel0_hazard
1124+
name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_opsel0_hazard
11231125
body: |
11241126
bb.0:
11251127
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
1126-
; GCN-LABEL: name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_neg_opsel0_hazard
1128+
; GCN-LABEL: name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_opsel0_hazard
11271129
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
11281130
; GCN-NEXT: {{ $}}
11291131
; GCN-NEXT: S_WAITCNT 0
11301132
; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, $vgpr0, 0, $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec
11311133
; GCN-NEXT: S_NOP 0
11321134
; GCN-NEXT: early-clobber renamable $vgpr4 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 0, killed $vgpr0, 0, killed $vgpr3, 0, killed $vgpr1, killed $vgpr2, 0, implicit $mode, implicit $exec
1135+
; GCN-NEXT: S_NOP 0
11331136
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec
11341137
; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
11351138
S_WAITCNT 0

0 commit comments

Comments
 (0)