Skip to content

Commit f53cd7e

Browse files
kzhuravl authored and yanyao-wang committed
[AMDGPU] Make maximum hard clause size a subtarget feature
gfx11 chips may, in some conditions, behave incorrectly with S_CLAUSE instructions (hard clauses) containing more than 32 operations (that is, whose argument exceeds 0x1f). However, gfx10 targets work correctly with clauses of up to 63 instructions.

Therefore, define the MaxHardClauseLength property on GCNSubtarget and make it a subtarget feature via tablegen, thus allowing us to specify, both now and in the future, the maximum viable size of clauses on various hardware from the tablegen definition.

If MaxHardClauseLength is 0, which is the default, the hardware does not support hard clauses.

Cherry-pick of llvm#81287

Change-Id: Id55b25ae70dd2dc521b45ff71058c5a183a97fda
1 parent 33efb91 commit f53cd7e

File tree

5 files changed

+56
-16
lines changed

5 files changed

+56
-16
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 19 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -227,6 +227,21 @@ def FeatureLdsBranchVmemWARHazard : SubtargetFeature<"lds-branch-vmem-war-hazard
227227
"Switching between LDS and VMEM-tex not waiting VM_VSRC=0"
228228
>;
229229

230+
class FeatureMaxHardClauseLength<int size> : SubtargetFeature<
231+
"max-hard-clause-length-"#size,
232+
"MaxHardClauseLength",
233+
!cast<string>(size),
234+
"Maximum number of instructions in an explicit S_CLAUSE is "#size
235+
>;
236+
237+
/// Work around a hardware bug on some chips that can be triggered
238+
/// under certain circumstances when clauses are longer than 32 operations.
239+
def FeatureMaxHardClauseLength32 : FeatureMaxHardClauseLength<32>;
240+
/// While the S_CLAUSE instruction permits encoding clause lengths up to 64,
241+
/// hardware documentation for gfx10+ indicates that 63 is the maximum
242+
/// permitted clause length.
243+
def FeatureMaxHardClauseLength63 : FeatureMaxHardClauseLength<63>;
244+
230245
def FeatureNSAtoVMEMBug : SubtargetFeature<"nsa-to-vmem-bug",
231246
"HasNSAtoVMEMBug",
232247
"true",
@@ -986,7 +1001,8 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
9861001
FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts,
9871002
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
9881003
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16,
989-
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts
1004+
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts,
1005+
FeatureMaxHardClauseLength63
9901006
]
9911007
>;
9921008

@@ -1005,7 +1021,8 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
10051021
FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts,
10061022
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
10071023
FeatureA16, FeatureFastDenormalF32, FeatureG16,
1008-
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess
1024+
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
1025+
FeatureMaxHardClauseLength32
10091026
]
10101027
>;
10111028

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -161,6 +161,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
161161
bool HasAtomicBufferGlobalPkAddF16Insts = false;
162162
bool HasAtomicGlobalPkAddBF16Inst = false;
163163
bool HasFlatAtomicFaddF32Inst = false;
164+
/// The maximum number of instructions that may be placed within an S_CLAUSE,
165+
/// which is one greater than the maximum argument to S_CLAUSE. A value of 0
166+
/// indicates a lack of S_CLAUSE support.
167+
unsigned MaxHardClauseLength = 0;
164168
bool SupportsSRAMECC = false;
165169

166170
// This should not be used directly. 'TargetID' tracks the dynamic settings
@@ -1078,7 +1082,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
10781082

10791083
bool hasNSAClauseBug() const { return HasNSAClauseBug; }
10801084

1081-
bool hasHardClauses() const { return getGeneration() >= GFX10; }
1085+
bool hasHardClauses() const { return MaxHardClauseLength > 0; }
10821086

10831087
bool hasGFX90AInsts() const { return GFX90AInsts; }
10841088

@@ -1129,6 +1133,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
11291133
// hasGFX90AInsts is also true.
11301134
bool hasGFX940Insts() const { return GFX940Insts; }
11311135

1136+
/// \returns The maximum number of instructions that can be enclosed in an
1137+
/// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
1138+
/// instruction.
1139+
unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
1140+
11321141
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
11331142
/// SGPRs
11341143
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;

llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp

Lines changed: 3 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -43,11 +43,6 @@ using namespace llvm;
4343

4444
namespace {
4545

46-
// A clause length of 64 instructions could be encoded in the s_clause
47-
// instruction, but the hardware documentation (at least for GFX11) says that
48-
// 63 is the maximum allowed.
49-
constexpr unsigned MaxInstructionsInClause = 63;
50-
5146
enum HardClauseType {
5247
// For GFX10:
5348

@@ -182,7 +177,8 @@ class SIInsertHardClauses : public MachineFunctionPass {
182177
bool emitClause(const ClauseInfo &CI, const SIInstrInfo *SII) {
183178
if (CI.First == CI.Last)
184179
return false;
185-
assert(CI.Length <= MaxInstructionsInClause && "Hard clause is too long!");
180+
assert(CI.Length <= ST->maxHardClauseLength() &&
181+
"Hard clause is too long!");
186182

187183
auto &MBB = *CI.First->getParent();
188184
auto ClauseMI =
@@ -223,7 +219,7 @@ class SIInsertHardClauses : public MachineFunctionPass {
223219
}
224220
}
225221

226-
if (CI.Length == MaxInstructionsInClause ||
222+
if (CI.Length == ST->maxHardClauseLength() ||
227223
(CI.Length && Type != HARDCLAUSE_INTERNAL &&
228224
Type != HARDCLAUSE_IGNORE &&
229225
(Type != CI.Type ||

llvm/test/CodeGen/AMDGPU/function-args.ll

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3309,7 +3309,7 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
33093309
; GFX11-LABEL: void_func_v32i32_v16i32_v16f32:
33103310
; GFX11: ; %bb.0:
33113311
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3312-
; GFX11-NEXT: s_clause 0x20
3312+
; GFX11-NEXT: s_clause 0x1f
33133313
; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:80
33143314
; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:76
33153315
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:72

0 commit comments

Comments
 (0)