Skip to content

[AMDGPU] Make maximum hard clause size a subtarget feature #81287

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Feb 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 21 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,21 @@ def FeatureLdsBranchVmemWARHazard : SubtargetFeature<"lds-branch-vmem-war-hazard
"Switching between LDS and VMEM-tex not waiting VM_VSRC=0"
>;

/// Subtarget feature template for the hard clause length limit. Each
/// instantiation defines a feature named "max-hard-clause-length-<size>" that
/// assigns \p size to the subtarget's MaxHardClauseLength field, i.e. the
/// maximum number of instructions that may appear in one explicit S_CLAUSE.
class FeatureMaxHardClauseLength<int size> : SubtargetFeature<
"max-hard-clause-length-"#size,
"MaxHardClauseLength",
!cast<string>(size),
"Maximum number of instructions in an explicit S_CLAUSE is "#size
>;

/// Work around a hardware bug on some chips that can be triggered
/// under certain circumstances when clauses are longer than 32 operations.
def FeatureMaxHardClauseLength32 : FeatureMaxHardClauseLength<32>;
/// While the S_CLAUSE instruction permits encoding clause lengths up to 64,
/// hardware documentation for gfx10+ indicates that 63 is the maximum
/// permitted clause length.
def FeatureMaxHardClauseLength63 : FeatureMaxHardClauseLength<63>;

def FeatureNSAtoVMEMBug : SubtargetFeature<"nsa-to-vmem-bug",
"HasNSAtoVMEMBug",
"true",
Expand Down Expand Up @@ -1092,7 +1107,8 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts,
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
FeatureMaxHardClauseLength63
]
>;

Expand All @@ -1112,7 +1128,8 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
FeatureA16, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS,
FeatureGWS, FeatureDefaultComponentZero
FeatureGWS, FeatureDefaultComponentZero,
FeatureMaxHardClauseLength32
]
>;

Expand All @@ -1132,7 +1149,8 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
FeatureA16, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast
FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast,
FeatureMaxHardClauseLength32
]
>;

Expand Down
11 changes: 10 additions & 1 deletion llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasFlatAtomicFaddF32Inst = false;
bool HasDefaultComponentZero = false;
bool HasDefaultComponentBroadcast = false;
/// The maximum number of instructions that may be placed within an S_CLAUSE,
/// which is one greater than the maximum argument to S_CLAUSE. A value of 0
/// indicates a lack of S_CLAUSE support.
unsigned MaxHardClauseLength = 0;
bool SupportsSRAMECC = false;

// This should not be used directly. 'TargetID' tracks the dynamic settings
Expand Down Expand Up @@ -1145,7 +1149,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,

bool hasNSAClauseBug() const { return HasNSAClauseBug; }

bool hasHardClauses() const { return getGeneration() >= GFX10; }
bool hasHardClauses() const { return MaxHardClauseLength > 0; }

bool hasGFX90AInsts() const { return GFX90AInsts; }

Expand Down Expand Up @@ -1212,6 +1216,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
/// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }

/// \returns The maximum number of instructions that can be enclosed in an
/// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
/// instruction.
///
/// Note that this is an instruction count, not an S_CLAUSE operand value: the
/// limit is one greater than the largest operand that may be emitted (e.g. a
/// limit of 32 corresponds to a maximum operand of 0x1f).
unsigned maxHardClauseLength() const { return MaxHardClauseLength; }

/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
/// SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
Expand Down
10 changes: 3 additions & 7 deletions llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,6 @@ using namespace llvm;

namespace {

// A clause length of 64 instructions could be encoded in the s_clause
// instruction, but the hardware documentation (at least for GFX11) says that
// 63 is the maximum allowed.
constexpr unsigned MaxInstructionsInClause = 63;

enum HardClauseType {
// For GFX10:

Expand Down Expand Up @@ -182,7 +177,8 @@ class SIInsertHardClauses : public MachineFunctionPass {
bool emitClause(const ClauseInfo &CI, const SIInstrInfo *SII) {
if (CI.First == CI.Last)
return false;
assert(CI.Length <= MaxInstructionsInClause && "Hard clause is too long!");
assert(CI.Length <= ST->maxHardClauseLength() &&
"Hard clause is too long!");

auto &MBB = *CI.First->getParent();
auto ClauseMI =
Expand Down Expand Up @@ -223,7 +219,7 @@ class SIInsertHardClauses : public MachineFunctionPass {
}
}

if (CI.Length == MaxInstructionsInClause ||
if (CI.Length == ST->maxHardClauseLength() ||
(CI.Length && Type != HARDCLAUSE_INTERNAL &&
Type != HARDCLAUSE_IGNORE &&
(Type != CI.Type ||
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/bf16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -28974,7 +28974,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX11-LABEL: v_vselect_v32bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x20
; GFX11-NEXT: s_clause 0x1f
; GFX11-NEXT: scratch_load_u16 v31, off, s32
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:128
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:64
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/function-args.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4037,7 +4037,7 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
; GFX11-LABEL: void_func_v32i32_v16i32_v16f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x20
; GFX11-NEXT: s_clause 0x1f
; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:80
; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:76
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:72
Expand Down
Loading