[AMDGPU][GFX1250] Insert S_WAIT_XCNT for SMEM and VMEM load-stores #145566


Merged
merged 4 commits into main from users/cdevadas/implement_s_wait_xcnt_insertion_gfx1250 on Jun 25, 2025

Conversation

cdevadas
Collaborator

This patch tracks the register operands of both VMEM (FLAT, MUBUF,
MTBUF) and SMEM load-store operations and inserts an S_WAIT_XCNT
instruction with a sufficient wait-count before those operands are
potentially redefined. For VMEM instructions, XNACKs are returned in
the same order the instructions were issued, so non-zero counter values
can be inserted. SMEM execution, however, is out of order, and so is
its XNACK reception; thus only a zero counter value can be inserted to
capture SMEM dependencies.
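
In effect the rule is: a dependency on an in-flight VMEM access can be satisfied with a partial (non-zero) s_wait_xcnt, while a dependency on any in-flight SMEM access forces s_wait_xcnt 0. Below is a minimal sketch of that bookkeeping; the names and the flat register array are illustrative assumptions, not the patch itself, which folds this into WaitcntBrackets' per-counter score ranges.

struct XcntSketch {
  // Every tracked VMEM/SMEM access bumps a running score; RegScore
  // remembers the last score at which each register was used as an
  // address/data operand.
  unsigned ScoreLB = 0, ScoreUB = 0; // window of still-pending accesses
  unsigned RegScore[1024] = {};      // assumed flat register numbering
  bool SmemPending = false;          // cleared only by a full drain

  void recordAccess(unsigned Reg, bool IsSmem) {
    RegScore[Reg] = ++ScoreUB;
    SmemPending |= IsSmem;
  }

  // Wait-count to emit before redefining Reg; ~0u means no wait needed.
  unsigned waitBeforeRedef(unsigned Reg) const {
    if (RegScore[Reg] <= ScoreLB)
      return ~0u;                    // access already acknowledged
    if (SmemPending)
      return 0;                      // SMEM XNACK arrives out of order
    return ScoreUB - RegScore[Reg];  // VMEM XNACK arrives in issue order
  }
};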

Collaborator Author

This stack of pull requests is managed by Graphite.


github-actions bot commented Jun 24, 2025

✅ With the latest revision this PR passed the undef deprecator.

@cdevadas marked this pull request as ready for review June 24, 2025 18:31
@llvmbot
Member

llvmbot commented Jun 24, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Christudasan Devadasan (cdevadas)

Changes

This patch tracks the register operands of both VMEM (FLAT, MUBUF,
MTBUF) and SMEM load-store operations and inserts an S_WAIT_XCNT
instruction with a sufficient wait-count before those operands are
potentially redefined. For VMEM instructions, XNACKs are returned in
the same order the instructions were issued, so non-zero counter values
can be inserted. SMEM execution, however, is out of order, and so is
its XNACK reception; thus only a zero counter value can be inserted to
capture SMEM dependencies.


Patch is 86.88 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/145566.diff

5 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+121-16)
  • (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (+9)
  • (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (+9-4)
  • (added) llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll (+396)
  • (added) llvm/test/CodeGen/AMDGPU/wait-xcnt.mir (+922)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 9a7dd3c31e498..f43831016952a 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -73,6 +73,7 @@ enum InstCounterType {
   SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
   BVH_CNT,                           // gfx12+ only.
   KM_CNT,                            // gfx12+ only.
+  X_CNT,                             // gfx1250.
   NUM_EXTENDED_INST_CNTS,
   NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
 };
@@ -102,6 +103,7 @@ struct HardwareLimits {
   unsigned SamplecntMax; // gfx12+ only.
   unsigned BvhcntMax;    // gfx12+ only.
   unsigned KmcntMax;     // gfx12+ only.
+  unsigned XcntMax;      // gfx1250.
 };
 
 #define AMDGPU_DECLARE_WAIT_EVENTS(DECL)                                       \
@@ -111,10 +113,12 @@ struct HardwareLimits {
   DECL(VMEM_BVH_READ_ACCESS)     /* vmem BVH read (gfx12+ only) */             \
   DECL(VMEM_WRITE_ACCESS)        /* vmem write that is not scratch */          \
   DECL(SCRATCH_WRITE_ACCESS)     /* vmem write that may be scratch */          \
+  DECL(VMEM_GROUP)               /* vmem group */                              \
   DECL(LDS_ACCESS)               /* lds read & write */                        \
   DECL(GDS_ACCESS)               /* gds read & write */                        \
   DECL(SQ_MESSAGE)               /* send message */                            \
   DECL(SMEM_ACCESS)              /* scalar-memory read & write */              \
+  DECL(SMEM_GROUP)               /* scalar-memory group */                     \
   DECL(EXP_GPR_LOCK)             /* export holding on its data src */          \
   DECL(GDS_GPR_LOCK)             /* GDS holding on its data and addr src */    \
   DECL(EXP_POS_ACCESS)           /* write to export position */                \
@@ -178,7 +182,7 @@ enum VmemType {
 static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
     AMDGPU::S_WAIT_LOADCNT,  AMDGPU::S_WAIT_DSCNT,     AMDGPU::S_WAIT_EXPCNT,
     AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
-    AMDGPU::S_WAIT_KMCNT};
+    AMDGPU::S_WAIT_KMCNT,    AMDGPU::S_WAIT_XCNT};
 
 static bool updateVMCntOnly(const MachineInstr &Inst) {
   return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
@@ -223,6 +227,8 @@ unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
     return Wait.BvhCnt;
   case KM_CNT:
     return Wait.KmCnt;
+  case X_CNT:
+    return Wait.XCnt;
   default:
     llvm_unreachable("bad InstCounterType");
   }
@@ -283,12 +289,27 @@ class WaitcntBrackets {
       return Limits.BvhcntMax;
     case KM_CNT:
       return Limits.KmcntMax;
+    case X_CNT:
+      return Limits.XcntMax;
     default:
       break;
     }
     return 0;
   }
 
+  bool isSmemCounter(InstCounterType T) const {
+    return T == SmemAccessCounter || T == X_CNT;
+  }
+
+  unsigned getSgprScoresIdx(InstCounterType T) const {
+    if (T == SmemAccessCounter)
+      return 0;
+    if (T == X_CNT)
+      return 1;
+
+    llvm_unreachable("Invalid SMEM counter");
+  }
+
   unsigned getScoreLB(InstCounterType T) const {
     assert(T < NUM_INST_CNTS);
     return ScoreLBs[T];
@@ -307,8 +328,8 @@ class WaitcntBrackets {
     if (GprNo < NUM_ALL_VGPRS) {
       return VgprScores[T][GprNo];
     }
-    assert(T == SmemAccessCounter);
-    return SgprScores[GprNo - NUM_ALL_VGPRS];
+    assert(isSmemCounter(T));
+    return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
   }
 
   bool merge(const WaitcntBrackets &Other);
@@ -331,6 +352,7 @@ class WaitcntBrackets {
 
   void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
   void applyWaitcnt(InstCounterType T, unsigned Count);
+  void applyXcnt(const AMDGPU::Waitcnt &Wait);
   void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                      const MachineRegisterInfo *MRI, WaitEventType E,
                      MachineInstr &MI);
@@ -462,9 +484,11 @@ class WaitcntBrackets {
   int VgprUB = -1;
   int SgprUB = -1;
   unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
-  // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
-  // pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
-  unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
+  // Wait cnt scores for every sgpr; only DS_CNT (corresponding to LGKMcnt
+  // pre-gfx12) or KM_CNT (gfx12+ only) and X_CNT (gfx1250) are relevant.
+  // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
+  // X_CNT score.
+  unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
   // Bitmask of the VmemTypes of VMEM instructions that might have a pending
   // write to each vgpr.
   unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
@@ -572,6 +596,7 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
         eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
         0,
         0,
+        0,
         0};
 
     return WaitEventMaskForInstPreGFX12;
@@ -607,7 +632,8 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
         eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
         eventMask({VMEM_SAMPLER_READ_ACCESS}),
         eventMask({VMEM_BVH_READ_ACCESS}),
-        eventMask({SMEM_ACCESS, SQ_MESSAGE})};
+        eventMask({SMEM_ACCESS, SQ_MESSAGE}),
+        eventMask({VMEM_GROUP, SMEM_GROUP})};
 
     return WaitEventMaskForInstGFX12Plus;
   }
@@ -743,9 +769,12 @@ class SIInsertWaitcnts {
     return VmemReadMapping[getVmemType(Inst)];
   }
 
+  bool hasXcnt() const { return ST->hasWaitXCnt(); }
+
   bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
   bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
+  bool isVmemAccess(const MachineInstr &MI) const;
   bool generateWaitcntInstBefore(MachineInstr &MI,
                                  WaitcntBrackets &ScoreBrackets,
                                  MachineInstr *OldWaitcntInstr,
@@ -837,9 +866,9 @@ void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
       VgprUB = std::max(VgprUB, RegNo);
       VgprScores[CntTy][RegNo] = Score;
     } else {
-      assert(CntTy == SmemAccessCounter);
+      assert(isSmemCounter(CntTy));
       SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);
-      SgprScores[RegNo - NUM_ALL_VGPRS] = Score;
+      SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score;
     }
   }
 }
@@ -976,6 +1005,13 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
           setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore);
       }
     }
+  } else if (T == X_CNT) {
+    for (const MachineOperand &Op : Inst.all_uses()) {
+      RegInterval Interval = getRegInterval(&Inst, MRI, TRI, Op);
+      for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+        setRegScore(RegNo, T, CurrScore);
+      }
+    }
   } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
     // Match the score to the destination registers.
     //
@@ -1080,6 +1116,9 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
     case KM_CNT:
       OS << "    KM_CNT(" << SR << "): ";
       break;
+    case X_CNT:
+      OS << "    X_CNT(" << SR << "): ";
+      break;
     default:
       OS << "    UNKNOWN(" << SR << "): ";
       break;
@@ -1100,8 +1139,8 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
           OS << RelScore << ":ds ";
         }
       }
-      // Also need to print sgpr scores for lgkm_cnt.
-      if (T == SmemAccessCounter) {
+      // Also need to print sgpr scores for lgkm_cnt or xcnt.
+      if (isSmemCounter(T)) {
         for (int J = 0; J <= SgprUB; J++) {
           unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
           if (RegScore <= LB)
@@ -1140,6 +1179,7 @@ void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
   simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
   simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
   simplifyWaitcnt(KM_CNT, Wait.KmCnt);
+  simplifyWaitcnt(X_CNT, Wait.XCnt);
 }
 
 void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
@@ -1191,6 +1231,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
   applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
   applyWaitcnt(BVH_CNT, Wait.BvhCnt);
   applyWaitcnt(KM_CNT, Wait.KmCnt);
+  applyXcnt(Wait);
 }
 
 void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
@@ -1207,11 +1248,29 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
   }
 }
 
+void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
+  // Wait on XCNT is redundant if we are already waiting for a load to complete.
+  // SMEM can return out of order, so only omit the XCNT wait if we are
+  // waiting until zero.
+  if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
+    return applyWaitcnt(X_CNT, 0);
+
+  // If we have a pending store we cannot optimize XCnt because we do not wait
+  // for stores. VMEM loads return in order, so if we only have loads, XCnt is
+  // decremented to the same number as LOADCnt.
+  if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
+      !hasPendingEvent(STORE_CNT))
+    return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+
+  applyWaitcnt(X_CNT, Wait.XCnt);
+}
+
 // Where there are multiple types of event in the bracket of a counter,
 // the decrement may go out of order.
 bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
   // Scalar memory read always can go out of order.
-  if (T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS))
+  if ((T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
+      (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
     return true;
   return hasMixedPendingEvents(T);
 }
@@ -1263,6 +1322,8 @@ static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
     return DS_CNT;
   case AMDGPU::S_WAIT_KMCNT:
     return KM_CNT;
+  case AMDGPU::S_WAIT_XCNT:
+    return X_CNT;
   default:
     return {};
   }
@@ -1427,7 +1488,8 @@ WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
 
 AMDGPU::Waitcnt
 WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
-  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0);
+  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
+                         ~0u /* XCNT */);
 }
 
 /// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
@@ -1909,6 +1971,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
             ScoreBrackets.determineWait(BVH_CNT, Interval, Wait);
             ScoreBrackets.clearVgprVmemTypes(Interval);
           }
+
           if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
             ScoreBrackets.determineWait(EXP_CNT, Interval, Wait);
           }
@@ -1916,6 +1979,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
         } else {
           ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
         }
+
+        if (hasXcnt() && Op.isDef())
+          ScoreBrackets.determineWait(X_CNT, Interval, Wait);
       }
     }
   }
@@ -1958,6 +2024,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
     Wait.BvhCnt = 0;
   if (ForceEmitWaitcnt[KM_CNT])
     Wait.KmCnt = 0;
+  if (ForceEmitWaitcnt[X_CNT])
+    Wait.XCnt = 0;
 
   if (FlushVmCnt) {
     if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
@@ -2007,6 +2075,21 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
                       << "Update Instr: " << *It);
   }
 
+  // XCnt may be already consumed by a load wait.
+  if (Wait.KmCnt == 0 && Wait.XCnt != ~0u &&
+      !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
+    Wait.XCnt = ~0u;
+
+  if (Wait.LoadCnt == 0 && Wait.XCnt != ~0u &&
+      !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
+    Wait.XCnt = ~0u;
+
+  // Since the translation of VMEM addresses occurs in order, we can skip the
+  // XCnt if the current instruction is of VMEM type and has a memory dependency
+  // with another VMEM instruction in flight.
+  if (Wait.XCnt != ~0u && isVmemAccess(*It))
+    Wait.XCnt = ~0u;
+
   if (WCG->createNewWaitcnt(Block, It, Wait))
     Modified = true;
 
@@ -2096,6 +2179,11 @@ bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
   });
 }
 
+bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
+  return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
+         (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
+}
+
 static bool isGFX12CacheInvOrWBInst(MachineInstr &Inst) {
   auto Opc = Inst.getOpcode();
   return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
@@ -2167,6 +2255,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
   // bracket and the destination operand scores.
   // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
 
+  bool IsVMEMAccess = false;
+  bool IsSMEMAccess = false;
   if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
     if (TII->isAlwaysGDS(Inst.getOpcode()) ||
         TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
@@ -2189,6 +2279,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
 
     if (mayAccessVMEMThroughFlat(Inst)) {
       ++FlatASCount;
+      IsVMEMAccess = true;
       ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
                                    Inst);
     }
@@ -2208,6 +2299,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
       ScoreBrackets->setPendingFlat();
   } else if (SIInstrInfo::isVMEM(Inst) &&
              !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
+    IsVMEMAccess = true;
     ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
                                  Inst);
 
@@ -2216,6 +2308,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
     }
   } else if (TII->isSMRD(Inst)) {
+    IsSMEMAccess = true;
     ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
   } else if (Inst.isCall()) {
     if (callWaitsOnFunctionReturn(Inst)) {
@@ -2258,6 +2351,15 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
       break;
     }
   }
+
+  if (!hasXcnt())
+    return;
+
+  if (IsVMEMAccess)
+    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_GROUP, Inst);
+
+  if (IsSMEMAccess)
+    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_GROUP, Inst);
 }
 
 bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
@@ -2311,9 +2413,11 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
     for (int J = 0; J <= VgprUB; J++)
       StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
 
-    if (T == SmemAccessCounter) {
+    if (isSmemCounter(T)) {
+      unsigned Idx = getSgprScoresIdx(T);
       for (int J = 0; J <= SgprUB; J++)
-        StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
+        StrictDom |=
+            mergeScore(M, SgprScores[Idx][J], Other.SgprScores[Idx][J]);
     }
   }
 
@@ -2651,6 +2755,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
   Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
   Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
   Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
+  Limits.XcntMax = AMDGPU::getXcntBitMask(IV);
 
   [[maybe_unused]] unsigned NumVGPRsMax =
       ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize());
@@ -2679,7 +2784,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
       BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
           .addImm(0);
       for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
-        if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
+        if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
           continue;
 
         if (!ST->hasImageInsts() &&
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 0e5493259edb9..13549e5c4e58b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -129,6 +129,11 @@ unsigned getKmcntBitWidth(unsigned VersionMajor) {
   return VersionMajor >= 12 ? 5 : 0;
 }
 
+/// \returns Xcnt bit width.
+unsigned getXcntBitWidth(unsigned VersionMajor, unsigned VersionMinor) {
+  return VersionMajor == 12 && VersionMinor == 5 ? 6 : 0;
+}
+
 /// \returns shift for Loadcnt/Storecnt in combined S_WAIT instructions.
 unsigned getLoadcntStorecntBitShift(unsigned VersionMajor) {
   return VersionMajor >= 12 ? 8 : 0;
@@ -1493,6 +1498,10 @@ unsigned getKmcntBitMask(const IsaVersion &Version) {
   return (1 << getKmcntBitWidth(Version.Major)) - 1;
 }
 
+unsigned getXcntBitMask(const IsaVersion &Version) {
+  return (1 << getXcntBitWidth(Version.Major, Version.Minor)) - 1;
+}
+
 unsigned getStorecntBitMask(const IsaVersion &Version) {
   return (1 << getStorecntBitWidth(Version.Major)) - 1;
 }
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index ac7c5100be3d4..e6078d6918ac2 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -980,6 +980,7 @@ struct Waitcnt {
   unsigned SampleCnt = ~0u; // gfx12+ only.
   unsigned BvhCnt = ~0u;    // gfx12+ only.
   unsigned KmCnt = ~0u;     // gfx12+ only.
+  unsigned XCnt = ~0u;      // gfx1250.
 
   Waitcnt() = default;
   // Pre-gfx12 constructor.
@@ -988,15 +989,15 @@ struct Waitcnt {
 
   // gfx12+ constructor.
   Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt,
-          unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt)
+          unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt, unsigned XCnt)
       : LoadCnt(LoadCnt), ExpCnt(ExpCnt), DsCnt(DsCnt), StoreCnt(StoreCnt),
-        SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt) {}
+        SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt), XCnt(XCnt) {}
 
   bool hasWait() const { return StoreCnt != ~0u || hasWaitExceptStoreCnt(); }
 
   bool hasWaitExceptStoreCnt() const {
     return LoadCnt != ~0u || ExpCnt != ~0u || DsCnt != ~0u ||
-           SampleCnt != ~0u || BvhCnt != ~0u || KmCnt != ~0u;
+           SampleCnt != ~0u || BvhCnt != ~0u || KmCnt != ~0u || XCnt != ~0u;
   }
 
   bool hasWaitStoreCnt() const { return StoreCnt != ~0u; }
@@ -1008,7 +1009,7 @@ struct Waitcnt {
         std::min(LoadCnt, Other.LoadCnt), std::min(ExpCnt, Other.ExpCnt),
         std::min(DsCnt, Other.DsCnt), std::min(StoreCnt, Other.StoreCnt),
         std::min(SampleCnt, Other.SampleCnt), std::min(BvhCnt, Other.BvhCnt),
-        std::min(KmCnt, Other.KmCnt));
+        std::min(KmCnt, Other.KmCnt), std::min(XCnt, Other.XCnt));
   }
 };
 
@@ -1114,6 +1115,10 @@ unsigned getDscntBitMask(const IsaVersion &Version);
 /// Returns 0 for versions that do not support KMcnt
 unsigned getKmcntBitMask(const IsaVersion &Version);
 
+/// \returns Xcnt bit mask for given isa \p Version.
+/// Returns 0 for versions that do not support Xcnt.
+unsigned getXcntBitMask(const IsaVersion &Version);
+
 /// \return STOREcnt or VScnt bit mask for given isa \p Version.
 /// returns 0 for versions that do not support STOREcnt or VScnt.
 /// STOREcnt and VScnt are the same counter, the name used
diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
new file mode 100644
index 0000000000000..f86216837fe0e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
@@ -0,0 +1,396 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-GISEL %s
+
+; Test S_WAIT_XCNT insertion for global_load/store instructions.
+; Additional operations are introduced between the clauses to create register
+; dependencies between the operands of the VMEM operations and the defs of the
+; VALU instructions that follow.
+
+define void @test_i8load_v4i8store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %ptr_b, ptr addrspace(1) %ptr_c, ptr addrspace(1) %ptr_d, ptr addrspace(1) %out) {
+; GCN-SDAG-LABEL: test_i8load...
[truncated]

@cdevadas
Collaborator Author

I removed -verify-machineinstrs from the tests.

Contributor

@shiltian left a comment


This seems

Collaborator

@rampitec left a comment


Thanks!

}

;TODO: This test should be enabled in the upstream later. It currently causes a crash
; during branch relaxation as the gfx1250 real opcode definition for V_LSHL_ADD_U64
Collaborator


The instruction has just been added. Will the test pass now?

Collaborator Author


Yes, it did. I re-enabled the test.

cdevadas added 4 commits June 25, 2025 07:44
This patch tracks the register operands of both VMEM (FLAT, MUBUF,
MTBUF) and SMEM load-store operations and inserts an S_WAIT_XCNT
instruction with a sufficient wait-count before those operands are
potentially redefined. For VMEM instructions, XNACKs are returned in
the same order the instructions were issued, so non-zero counter values
can be inserted. SMEM execution, however, is out of order, and so is
its XNACK reception; thus only a zero counter value can be inserted to
capture SMEM dependencies.
@cdevadas force-pushed the users/cdevadas/implement_s_wait_xcnt_insertion_gfx1250 branch from d1f8c45 to 4169c88 on June 25, 2025 02:53
Collaborator Author

cdevadas commented Jun 25, 2025

Merge activity

  • Jun 25, 5:08 AM UTC: A user started a stack merge that includes this pull request via Graphite.
  • Jun 25, 5:10 AM UTC: @cdevadas merged this pull request with Graphite.

@cdevadas merged commit 08b8d46 into main Jun 25, 2025
7 checks passed
@cdevadas deleted the users/cdevadas/implement_s_wait_xcnt_insertion_gfx1250 branch June 25, 2025 05:10
Comment on lines +2183 to +2184
return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
(TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
Contributor


isVMEM now includes isFLAT, so I think this simplifies to:

Suggested change:
-  return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
-         (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
+  return TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode());

which is maybe not what you want because it has lost the mayAccessVMEMThroughFlat check. See #137148. Cc @ro-i.

Comment on lines +1255 to +1256
if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
return applyWaitcnt(X_CNT, 0);
Contributor


I don't understand the logic here. What if there is also a pending VMEM? Then it is not safe to assume that waiting for SMEM to complete will also imply a wait for XCNT. See #145681 for examples that I think are miscompiled because of this.

Maybe you intended that there would never be pending SMEM_GROUP and VMEM_GROUP at the same time?
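
For concreteness, the shape of sequence this concern describes might be the following; this is a hypothetical interleaving, with mnemonics and ordering assumed for illustration rather than taken from the PR or from #145681.

// Hypothetical event order (assumed mnemonics):
//   s_load_b32      s0, ...          ; SMEM_GROUP becomes pending
//   global_load_b32 v0, v[2:3], off  ; VMEM_GROUP also pending
//   s_wait_kmcnt 0                   ; drains the SMEM result...
//   ; ...but the VMEM access may still be translating its address, so
//   ; treating XCNT as satisfied here (as the KmCnt == 0 branch of
//   ; applyXcnt does) could let a redefinition of v[2:3] race the
//   ; in-flight translation.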

anthonyhatran pushed a commit to anthonyhatran/llvm-project that referenced this pull request Jun 26, 2025
…lvm#145566)

This patch tracks the register operands of both VMEM (FLAT, MUBUF,
MTBUF) and SMEM load-store operations and inserts an S_WAIT_XCNT
instruction with a sufficient wait-count before those operands are
potentially redefined. For VMEM instructions, XNACKs are returned in
the same order the instructions were issued, so non-zero counter values
can be inserted. SMEM execution, however, is out of order, and so is
its XNACK reception; thus only a zero counter value can be inserted to
capture SMEM dependencies.