[AMDGPU] Make maximum hard clause size a subtarget feature

kzhuravl · yanyao-wang · commit f53cd7e03908 · 2024-04-18T17:41:07.000-04:00
gfx11 chips may, in some conditions, behave incorrectly with S_CLAUSE instructions (hard clauses) containing more than 32 operations (that is, whose argument exceeds 0x1f, since the argument to S_CLAUSE is one less than the clause length). However, gfx10 targets work correctly with clauses of up to 63 instructions. Therefore, define the MaxHardClauseLength property on GCNSubtarget and make it a subtarget feature, allowing us to specify, both now and in the future, the maximum viable clause size for each kind of hardware directly in the tablegen definition. If MaxHardClauseLength is 0, which is the default, the hardware does not support hard clauses. Cherry-pick of llvm#81287. Change-Id: Id55b25ae70dd2dc521b45ff71058c5a183a97fda
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -227,6 +227,21 @@ def FeatureLdsBranchVmemWARHazard : SubtargetFeature<"lds-branch-vmem-war-hazard
   "Switching between LDS and VMEM-tex not waiting VM_VSRC=0"
 >;
 
+class FeatureMaxHardClauseLength<int size> : SubtargetFeature<
+  "max-hard-clause-length-"#size,
+  "MaxHardClauseLength",
+  !cast<string>(size),
+  "Maximum number of instructions in an explicit S_CLAUSE is "#size
+>;
+
+/// Work around a hardware bug on some chips that can be triggered
+/// under certain circumstances when clauses are longer than 32 operations.
+def FeatureMaxHardClauseLength32 : FeatureMaxHardClauseLength<32>;
+/// While the S_CLAUSE instruction permits encoding clause lengths up to 64,
+/// hardware documentation for gfx10+ indicates that 63 is the maximum
+/// permitted clause length.
+def FeatureMaxHardClauseLength63 : FeatureMaxHardClauseLength<63>;
+
 def FeatureNSAtoVMEMBug : SubtargetFeature<"nsa-to-vmem-bug",
   "HasNSAtoVMEMBug",
   "true",
@@ -986,7 +1001,8 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
    FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts,
    FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
    FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16,
-   FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts
+   FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts,
+   FeatureMaxHardClauseLength63
   ]
 >;
 
@@ -1005,7 +1021,8 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
    FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts,
    FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
    FeatureA16, FeatureFastDenormalF32, FeatureG16,
-   FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess
+   FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
+   FeatureMaxHardClauseLength32
   ]
 >;
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -161,6 +161,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasAtomicBufferGlobalPkAddF16Insts = false;
   bool HasAtomicGlobalPkAddBF16Inst = false;
   bool HasFlatAtomicFaddF32Inst = false;
+  /// The maximum number of instructions that may be placed within an S_CLAUSE,
+  /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
+  /// indicates a lack of S_CLAUSE support.
+  unsigned MaxHardClauseLength = 0;
   bool SupportsSRAMECC = false;
 
   // This should not be used directly. 'TargetID' tracks the dynamic settings
@@ -1078,7 +1082,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasNSAClauseBug() const { return HasNSAClauseBug; }
 
-  bool hasHardClauses() const { return getGeneration() >= GFX10; }
+  bool hasHardClauses() const { return MaxHardClauseLength > 0; }
 
   bool hasGFX90AInsts() const { return GFX90AInsts; }
 
@@ -1129,6 +1133,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   // hasGFX90AInsts is also true.
   bool hasGFX940Insts() const { return GFX940Insts; }
 
+  /// \returns The maximum number of instructions that can be enclosed in an
+  /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
+  /// instruction.
+  unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
+
   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
   /// SGPRs
   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -43,11 +43,6 @@ using namespace llvm;
 
 namespace {
 
-// A clause length of 64 instructions could be encoded in the s_clause
-// instruction, but the hardware documentation (at least for GFX11) says that
-// 63 is the maximum allowed.
-constexpr unsigned MaxInstructionsInClause = 63;
-
 enum HardClauseType {
   // For GFX10:
 
@@ -182,7 +177,8 @@ class SIInsertHardClauses : public MachineFunctionPass {
   bool emitClause(const ClauseInfo &CI, const SIInstrInfo *SII) {
     if (CI.First == CI.Last)
       return false;
-    assert(CI.Length <= MaxInstructionsInClause && "Hard clause is too long!");
+    assert(CI.Length <= ST->maxHardClauseLength() &&
+           "Hard clause is too long!");
 
     auto &MBB = *CI.First->getParent();
     auto ClauseMI =
@@ -223,7 +219,7 @@ class SIInsertHardClauses : public MachineFunctionPass {
           }
         }
 
-        if (CI.Length == MaxInstructionsInClause ||
+        if (CI.Length == ST->maxHardClauseLength() ||
             (CI.Length && Type != HARDCLAUSE_INTERNAL &&
              Type != HARDCLAUSE_IGNORE &&
              (Type != CI.Type ||
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -3309,7 +3309,7 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
 ; GFX11-LABEL: void_func_v32i32_v16i32_v16f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x20
+; GFX11-NEXT:    s_clause 0x1f
 ; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:80
 ; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:76
 ; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:72
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses.mir