Skip to content

Commit b497234

Browse files
authored
[AMDGPU] Make maximum hard clause size a subtarget feature (#81287)
gfx11 chips may, in some conditions, behave incorrectly with S_CLAUSE instructions (hard clauses) containing more than 32 instructions (that is, whose argument, which encodes the clause length minus one, exceeds 0x1f). However, gfx10 targets work correctly with clauses of up to 63 instructions. Therefore, define the MaxHardClauseLength property on GCNSubtarget and make it a subtarget feature via tablegen, so that the maximum viable clause size for each piece of hardware can be specified, both now and in the future, in the tablegen definitions. In this change, gfx10 is given a maximum of 63 instructions, while gfx11 and gfx12 are given a maximum of 32. If MaxHardClauseLength is 0, which is the default, the hardware does not support hard clauses.
1 parent 82a4a41 commit b497234

File tree

8 files changed: 1478 additions and 19 deletions.

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,21 @@ def FeatureLdsBranchVmemWARHazard : SubtargetFeature<"lds-branch-vmem-war-hazard
227227
"Switching between LDS and VMEM-tex not waiting VM_VSRC=0"
228228
>;
229229

230+
class FeatureMaxHardClauseLength<int size> : SubtargetFeature<
231+
"max-hard-clause-length-"#size,
232+
"MaxHardClauseLength",
233+
!cast<string>(size),
234+
"Maximum number of instructions in an explicit S_CLAUSE is "#size
235+
>;
236+
237+
/// Work around a hardware bug on some chips that can be triggered
238+
/// under certain circumstances when clauses are longer than 32 operations.
239+
def FeatureMaxHardClauseLength32 : FeatureMaxHardClauseLength<32>;
240+
/// While the S_CLAUSE instruction permits encoding clause lengths up to 64,
241+
/// hardware documentation for gfx10+ indicates that 63 is the maximum
242+
/// permitted clause length.
243+
def FeatureMaxHardClauseLength63 : FeatureMaxHardClauseLength<63>;
244+
230245
def FeatureNSAtoVMEMBug : SubtargetFeature<"nsa-to-vmem-bug",
231246
"HasNSAtoVMEMBug",
232247
"true",
@@ -1092,7 +1107,8 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
10921107
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
10931108
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16,
10941109
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts,
1095-
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero
1110+
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
1111+
FeatureMaxHardClauseLength63
10961112
]
10971113
>;
10981114

@@ -1112,7 +1128,8 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
11121128
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
11131129
FeatureA16, FeatureFastDenormalF32, FeatureG16,
11141130
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS,
1115-
FeatureGWS, FeatureDefaultComponentZero
1131+
FeatureGWS, FeatureDefaultComponentZero,
1132+
FeatureMaxHardClauseLength32
11161133
]
11171134
>;
11181135

@@ -1132,7 +1149,8 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
11321149
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
11331150
FeatureA16, FeatureFastDenormalF32, FeatureG16,
11341151
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
1135-
FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast
1152+
FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast,
1153+
FeatureMaxHardClauseLength32
11361154
]
11371155
>;
11381156

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
168168
bool HasFlatAtomicFaddF32Inst = false;
169169
bool HasDefaultComponentZero = false;
170170
bool HasDefaultComponentBroadcast = false;
171+
/// The maximum number of instructions that may be placed within an S_CLAUSE,
172+
/// which is one greater than the maximum argument to S_CLAUSE. A value of 0
173+
/// indicates a lack of S_CLAUSE support.
174+
unsigned MaxHardClauseLength = 0;
171175
bool SupportsSRAMECC = false;
172176

173177
// This should not be used directly. 'TargetID' tracks the dynamic settings
@@ -1145,7 +1149,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
11451149

11461150
bool hasNSAClauseBug() const { return HasNSAClauseBug; }
11471151

1148-
bool hasHardClauses() const { return getGeneration() >= GFX10; }
1152+
bool hasHardClauses() const { return MaxHardClauseLength > 0; }
11491153

11501154
bool hasGFX90AInsts() const { return GFX90AInsts; }
11511155

@@ -1212,6 +1216,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
12121216
/// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
12131217
bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
12141218

1219+
/// \returns The maximum number of instructions that can be enclosed in an
1220+
/// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
1221+
/// instruction.
1222+
unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
1223+
12151224
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
12161225
/// SGPRs
12171226
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;

llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,6 @@ using namespace llvm;
4343

4444
namespace {
4545

46-
// A clause length of 64 instructions could be encoded in the s_clause
47-
// instruction, but the hardware documentation (at least for GFX11) says that
48-
// 63 is the maximum allowed.
49-
constexpr unsigned MaxInstructionsInClause = 63;
50-
5146
enum HardClauseType {
5247
// For GFX10:
5348

@@ -182,7 +177,8 @@ class SIInsertHardClauses : public MachineFunctionPass {
182177
bool emitClause(const ClauseInfo &CI, const SIInstrInfo *SII) {
183178
if (CI.First == CI.Last)
184179
return false;
185-
assert(CI.Length <= MaxInstructionsInClause && "Hard clause is too long!");
180+
assert(CI.Length <= ST->maxHardClauseLength() &&
181+
"Hard clause is too long!");
186182

187183
auto &MBB = *CI.First->getParent();
188184
auto ClauseMI =
@@ -223,7 +219,7 @@ class SIInsertHardClauses : public MachineFunctionPass {
223219
}
224220
}
225221

226-
if (CI.Length == MaxInstructionsInClause ||
222+
if (CI.Length == ST->maxHardClauseLength() ||
227223
(CI.Length && Type != HARDCLAUSE_INTERNAL &&
228224
Type != HARDCLAUSE_IGNORE &&
229225
(Type != CI.Type ||

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28974,7 +28974,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
2897428974
; GFX11-LABEL: v_vselect_v32bf16:
2897528975
; GFX11: ; %bb.0:
2897628976
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28977-
; GFX11-NEXT: s_clause 0x20
28977+
; GFX11-NEXT: s_clause 0x1f
2897828978
; GFX11-NEXT: scratch_load_u16 v31, off, s32
2897928979
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:128
2898028980
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:64

llvm/test/CodeGen/AMDGPU/function-args.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4037,7 +4037,7 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
40374037
; GFX11-LABEL: void_func_v32i32_v16i32_v16f32:
40384038
; GFX11: ; %bb.0:
40394039
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4040-
; GFX11-NEXT: s_clause 0x20
4040+
; GFX11-NEXT: s_clause 0x1f
40414041
; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:80
40424042
; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:76
40434043
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:72

0 commit comments

Comments
 (0)