Skip to content

Commit 6ee546d

Browse files
committed
[AMDGPU] Handle CvtScaleForwardingHazard using HasFP4DstByteSel for:
cvt_scalef32_sr_pk_fp4_f32, cvt_scalef32_sr_pk_fp4_[f|bf]16 of gfx950. Presently, the compiler selectively adds a nop only when opsel != 0, i.e. only when partially writing to the high bytes. Experiments in SWDEV-499733 and SWDEV-501347 suggest that we need a nop for the above cases irrespective of opsel values. Note: We might need to add a few others to the same table.
1 parent f947d5a commit 6ee546d

File tree

7 files changed

+103
-18
lines changed

7 files changed

+103
-18
lines changed

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 23 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -913,24 +913,31 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
913913
// (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) and
914914
// op_sel[3:2]
915915
// != 0
916-
if (SIInstrInfo::isSDWA(MI)) {
916+
if (SIInstrInfo::isSDWA(MI))
917917
// Type 1: SDWA with dst_sel != DWORD
918918
if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
919-
if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
920-
return nullptr;
921-
} else {
922-
// Type 2 && Type 3: (VOP3 which write the hi bits) || (FP8DstSelInst
923-
// with op_sel[3:2] != 0)
924-
if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
925-
!(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
926-
SISrcMods::DST_OP_SEL ||
927-
(AMDGPU::isFP8DstSelInst(Opcode) &&
928-
(TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
929-
SISrcMods::OP_SEL_0))))
930-
return nullptr;
931-
}
932-
933-
return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
919+
if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
920+
return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
921+
922+
if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
923+
// Type 2: VOP3 which write the hi bits
924+
if (TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
925+
SISrcMods::DST_OP_SEL)
926+
return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
927+
928+
// Type 3: FP8DstSelInst with op_sel[3:2] != 0
929+
if (AMDGPU::isFP8DstSelInst(Opcode) &&
930+
(TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
931+
SISrcMods::OP_SEL_0))
932+
return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
933+
}
934+
935+
// Special case: nop is required for all the opsel values for fp4 sr variant
936+
// cvt scale instructions
937+
if (AMDGPU::isFP4DstSelInst(Opcode))
938+
return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
939+
940+
return nullptr;
934941
}
935942

936943
/// Checks whether the provided \p MI "consumes" the operand with a Dest sel

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2566,6 +2566,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
25662566
field bit IsFP8SrcByteSel = 0;
25672567
field bit IsFP8DstByteSel = 0;
25682568
field bit HasFP8DstByteSel = 0;
2569+
field bit HasFP4DstByteSel = 0;
25692570
field bit IsFP8ByteSel = !or(IsFP8SrcByteSel, IsFP8DstByteSel);
25702571

25712572
field bit HasDst = !ne(DstVT.Value, untyped.Value);
@@ -3257,6 +3258,15 @@ def FP8DstByteSelTable : GenericTable {
32573258
let PrimaryKeyName = "getFP8DstByteSelHelper";
32583259
}
32593260

3261+
// Searchable table keyed by opcode that records the HasFP4DstByteSel bit of
// each VOP3_Pseudo record. Rows are emitted as FP4DstByteSelInfo and are
// looked up through getFP4DstByteSelHelper.
def FP4DstByteSelTable : GenericTable {
3262+
let FilterClass = "VOP3_Pseudo";
3263+
let CppTypeName = "FP4DstByteSelInfo";
3264+
let Fields = ["Opcode", "HasFP4DstByteSel"];
3265+
3266+
// The opcode is the only lookup key.
let PrimaryKey = ["Opcode"];
3267+
let PrimaryKeyName = "getFP4DstByteSelHelper";
3268+
}
3269+
32603270
def VOPDComponentTable : GenericTable {
32613271
let FilterClass = "VOPD_Component";
32623272
let CppTypeName = "VOPDComponentInfo";

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,8 @@ struct VOPTrue16Info {
380380

381381
#define GET_FP8DstByteSelTable_DECL
382382
#define GET_FP8DstByteSelTable_IMPL
383+
#define GET_FP4DstByteSelTable_DECL
384+
#define GET_FP4DstByteSelTable_IMPL
383385

384386
struct DPMACCInstructionInfo {
385387
uint16_t Opcode;
@@ -391,6 +393,11 @@ struct FP8DstByteSelInfo {
391393
bool HasFP8DstByteSel;
392394
};
393395

396+
/// Row type for the FP4 dst-byte-sel table: pairs an opcode with its
/// HasFP4DstByteSel flag. Field names match the columns listed in the
/// FP4DstByteSelTable TableGen definition.
struct FP4DstByteSelInfo {
397+
uint16_t Opcode; // Lookup key.
398+
bool HasFP4DstByteSel; // Value of the profile's HasFP4DstByteSel bit.
399+
};
400+
394401
#define GET_FP8DstByteSelTable_DECL
395402
#define GET_FP8DstByteSelTable_IMPL
396403
#define GET_MTBUFInfoTable_DECL
@@ -662,6 +669,11 @@ bool isFP8DstSelInst(unsigned Opc) {
662669
return Info ? Info->HasFP8DstByteSel : false;
663670
}
664671

672+
/// Returns the HasFP4DstByteSel flag recorded for \p Opc in the FP4
/// dst-byte-sel table, or false when the table has no entry for \p Opc.
bool isFP4DstSelInst(unsigned Opc) {
673+
const FP4DstByteSelInfo *Info = getFP4DstByteSelHelper(Opc);
674+
// A null Info means the opcode is not in the table; treat it as "not FP4".
return Info ? Info->HasFP4DstByteSel : false;
675+
}
676+
665677
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) {
666678
const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc);
667679
return Info ? Info->Opcode3Addr : ~0u;

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -887,6 +887,9 @@ bool isTrue16Inst(unsigned Opc);
887887
LLVM_READONLY
888888
bool isFP8DstSelInst(unsigned Opc);
889889

890+
LLVM_READONLY
891+
bool isFP4DstSelInst(unsigned Opc);
892+
890893
LLVM_READONLY
891894
bool isInvalidSingleUseConsumerInst(unsigned Opc);
892895

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1014,7 +1014,7 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
10141014
let HasExtVOP3DPP = 0;
10151015
let HasOpSel = 1;
10161016
let HasOMod = 0;
1017-
let HasFP8DstByteSel = 1;
1017+
let HasFP4DstByteSel = 1;
10181018
}
10191019

10201020
def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32, v2f32, i32, f32]>, VOP3_OPSEL> {
@@ -1026,7 +1026,7 @@ def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32
10261026
let HasExtVOP3DPP = 0;
10271027
let HasOpSel = 1;
10281028
let HasOMod = 0;
1029-
let HasFP8DstByteSel = 1;
1029+
let HasFP4DstByteSel = 1;
10301030
}
10311031

10321032
class VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<ValueType DstTy> : VOP3_Profile<VOPProfile<[DstTy, i32, f32, untyped]>,

llvm/lib/Target/AMDGPU/VOPInstructions.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
110110
let IsSWMMAC = P.IsSWMMAC;
111111

112112
bit HasFP8DstByteSel = P.HasFP8DstByteSel;
113+
bit HasFP4DstByteSel = P.HasFP4DstByteSel;
113114

114115
let AsmOperands = !if(!and(!not(P.IsTrue16), isVop3OpSel),
115116
P.AsmVOP3OpSel,

llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -485,6 +485,23 @@ body: |
485485
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
486486
...
487487

488+
---
489+
# GCN-LABEL: test_scalef32_sr_pk_fp4_bf16_opsel0_hazard
490+
# GCN: V_CVT_SCALEF32_SR_PK_FP4_BF16_e64
491+
# GCN: S_NOP 0
492+
# GCN: V_ADD_U32_e32
493+
name: test_scalef32_sr_pk_fp4_bf16_opsel0_hazard
494+
body: |
495+
bb.0:
496+
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
497+
S_WAITCNT 0
498+
renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
499+
S_WAITCNT 3952
500+
early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_BF16_e64 0, killed $vgpr2, 0, killed $vgpr3, 0, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec
501+
renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec
502+
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
503+
...
504+
488505
---
489506
# GCN-LABEL: test_scalef32_sr_pk_fp4_f32_hazard
490507
# GCN: V_CVT_SCALEF32_SR_PK_FP4_F32_e64
@@ -502,6 +519,23 @@ body: |
502519
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
503520
...
504521

522+
---
523+
# GCN-LABEL: test_scalef32_sr_pk_fp4_f32_opsel0_hazard
524+
# GCN: V_CVT_SCALEF32_SR_PK_FP4_F32_e64
525+
# GCN: S_NOP 0
526+
# GCN: V_ADD_U32_e32
527+
name: test_scalef32_sr_pk_fp4_f32_opsel0_hazard
528+
body: |
529+
bb.0:
530+
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
531+
S_WAITCNT 0
532+
renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
533+
S_WAITCNT 3952
534+
early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_F32_e64 0, killed $vgpr2_vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, killed $vgpr0, 0, implicit $mode, implicit $exec
535+
renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec
536+
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
537+
...
538+
505539
---
506540
# GCN-LABEL: test_cvt_scalef32_fp4_f16_hazard
507541
# GCN: V_CVT_SCALEF32_PK_FP4_F16_e64
@@ -636,6 +670,24 @@ body: |
636670
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
637671
...
638672

673+
---
674+
# GCN-LABEL: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_opsel0_hazard
675+
# GCN: V_CVT_SCALEF32_PK_FP4_F16_e64
676+
# GCN: S_NOP 0
677+
# GCN: V_CVT_SCALEF32_SR_PK_FP4_F16_e64
678+
# GCN: S_NOP 0
679+
# GCN: S_SETPC_B64_return
680+
name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_opsel0_hazard
681+
body: |
682+
bb.0:
683+
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
684+
S_WAITCNT 0
685+
renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, $vgpr0, 0, $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec
686+
early-clobber renamable $vgpr4 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 0, killed $vgpr0, 0, killed $vgpr3, 0, killed $vgpr1, killed $vgpr2, 0, implicit $mode, implicit $exec
687+
$vgpr0 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec
688+
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
689+
...
690+
639691
---
640692
# GCN-LABEL: test_cvt_scale_cvt_scale_waw_hazard
641693
# GCN: V_CVT_SCALEF32_PK_FP4_F16_e64

0 commit comments

Comments
 (0)