[AMDGPU][GFX1250] Insert S_WAIT_XCNT for SMEM and VMEM load-stores #145566
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking.
✅ With the latest revision this PR passed the undef deprecator.
@llvm/pr-subscribers-backend-amdgpu

Author: Christudasan Devadasan (cdevadas)

Changes

This patch tracks the register operands of both VMEM (FLAT, MUBUF, MTBUF) and SMEM load-store operations and inserts a S_WAIT_XCNT instruction with sufficient wait-count before potentially redefining them. For VMEM instructions, XNACK is returned in the same order as they were issued and hence non-zero counter values can be inserted. However, SMEM execution is out-of-order and so is their XNACK reception. Thus, only zero counter value can be inserted to capture SMEM dependencies.

Patch is 86.88 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/145566.diff

5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 9a7dd3c31e498..f43831016952a 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -73,6 +73,7 @@ enum InstCounterType {
SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
BVH_CNT, // gfx12+ only.
KM_CNT, // gfx12+ only.
+ X_CNT, // gfx1250.
NUM_EXTENDED_INST_CNTS,
NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
};
@@ -102,6 +103,7 @@ struct HardwareLimits {
unsigned SamplecntMax; // gfx12+ only.
unsigned BvhcntMax; // gfx12+ only.
unsigned KmcntMax; // gfx12+ only.
+ unsigned XcntMax; // gfx1250.
};
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
@@ -111,10 +113,12 @@ struct HardwareLimits {
DECL(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */ \
DECL(VMEM_WRITE_ACCESS) /* vmem write that is not scratch */ \
DECL(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */ \
+ DECL(VMEM_GROUP) /* vmem group */ \
DECL(LDS_ACCESS) /* lds read & write */ \
DECL(GDS_ACCESS) /* gds read & write */ \
DECL(SQ_MESSAGE) /* send message */ \
DECL(SMEM_ACCESS) /* scalar-memory read & write */ \
+ DECL(SMEM_GROUP) /* scalar-memory group */ \
DECL(EXP_GPR_LOCK) /* export holding on its data src */ \
DECL(GDS_GPR_LOCK) /* GDS holding on its data and addr src */ \
DECL(EXP_POS_ACCESS) /* write to export position */ \
@@ -178,7 +182,7 @@ enum VmemType {
static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
- AMDGPU::S_WAIT_KMCNT};
+ AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
static bool updateVMCntOnly(const MachineInstr &Inst) {
return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
@@ -223,6 +227,8 @@ unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
return Wait.BvhCnt;
case KM_CNT:
return Wait.KmCnt;
+ case X_CNT:
+ return Wait.XCnt;
default:
llvm_unreachable("bad InstCounterType");
}
@@ -283,12 +289,27 @@ class WaitcntBrackets {
return Limits.BvhcntMax;
case KM_CNT:
return Limits.KmcntMax;
+ case X_CNT:
+ return Limits.XcntMax;
default:
break;
}
return 0;
}
+ bool isSmemCounter(InstCounterType T) const {
+ return T == SmemAccessCounter || T == X_CNT;
+ }
+
+ unsigned getSgprScoresIdx(InstCounterType T) const {
+ if (T == SmemAccessCounter)
+ return 0;
+ if (T == X_CNT)
+ return 1;
+
+ llvm_unreachable("Invalid SMEM counter");
+ }
+
unsigned getScoreLB(InstCounterType T) const {
assert(T < NUM_INST_CNTS);
return ScoreLBs[T];
@@ -307,8 +328,8 @@ class WaitcntBrackets {
if (GprNo < NUM_ALL_VGPRS) {
return VgprScores[T][GprNo];
}
- assert(T == SmemAccessCounter);
- return SgprScores[GprNo - NUM_ALL_VGPRS];
+ assert(isSmemCounter(T));
+ return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
}
bool merge(const WaitcntBrackets &Other);
@@ -331,6 +352,7 @@ class WaitcntBrackets {
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
void applyWaitcnt(InstCounterType T, unsigned Count);
+ void applyXcnt(const AMDGPU::Waitcnt &Wait);
void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
const MachineRegisterInfo *MRI, WaitEventType E,
MachineInstr &MI);
@@ -462,9 +484,11 @@ class WaitcntBrackets {
int VgprUB = -1;
int SgprUB = -1;
unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
- // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
- // pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
- unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
+ // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
+ // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
+ // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
+ // X_CNT score.
+ unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
// Bitmask of the VmemTypes of VMEM instructions that might have a pending
// write to each vgpr.
unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
@@ -572,6 +596,7 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
0,
0,
+ 0,
0};
return WaitEventMaskForInstPreGFX12;
@@ -607,7 +632,8 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
eventMask({VMEM_SAMPLER_READ_ACCESS}),
eventMask({VMEM_BVH_READ_ACCESS}),
- eventMask({SMEM_ACCESS, SQ_MESSAGE})};
+ eventMask({SMEM_ACCESS, SQ_MESSAGE}),
+ eventMask({VMEM_GROUP, SMEM_GROUP})};
return WaitEventMaskForInstGFX12Plus;
}
@@ -743,9 +769,12 @@ class SIInsertWaitcnts {
return VmemReadMapping[getVmemType(Inst)];
}
+ bool hasXcnt() const { return ST->hasWaitXCnt(); }
+
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
+ bool isVmemAccess(const MachineInstr &MI) const;
bool generateWaitcntInstBefore(MachineInstr &MI,
WaitcntBrackets &ScoreBrackets,
MachineInstr *OldWaitcntInstr,
@@ -837,9 +866,9 @@ void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
VgprUB = std::max(VgprUB, RegNo);
VgprScores[CntTy][RegNo] = Score;
} else {
- assert(CntTy == SmemAccessCounter);
+ assert(isSmemCounter(CntTy));
SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);
- SgprScores[RegNo - NUM_ALL_VGPRS] = Score;
+ SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score;
}
}
}
@@ -976,6 +1005,13 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore);
}
}
+ } else if (T == X_CNT) {
+ for (const MachineOperand &Op : Inst.all_uses()) {
+ RegInterval Interval = getRegInterval(&Inst, MRI, TRI, Op);
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ setRegScore(RegNo, T, CurrScore);
+ }
+ }
} else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
// Match the score to the destination registers.
//
@@ -1080,6 +1116,9 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
case KM_CNT:
OS << " KM_CNT(" << SR << "): ";
break;
+ case X_CNT:
+ OS << " X_CNT(" << SR << "): ";
+ break;
default:
OS << " UNKNOWN(" << SR << "): ";
break;
@@ -1100,8 +1139,8 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
OS << RelScore << ":ds ";
}
}
- // Also need to print sgpr scores for lgkm_cnt.
- if (T == SmemAccessCounter) {
+ // Also need to print sgpr scores for lgkm_cnt or xcnt.
+ if (isSmemCounter(T)) {
for (int J = 0; J <= SgprUB; J++) {
unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
if (RegScore <= LB)
@@ -1140,6 +1179,7 @@ void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
simplifyWaitcnt(KM_CNT, Wait.KmCnt);
+ simplifyWaitcnt(X_CNT, Wait.XCnt);
}
void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
@@ -1191,6 +1231,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
applyWaitcnt(BVH_CNT, Wait.BvhCnt);
applyWaitcnt(KM_CNT, Wait.KmCnt);
+ applyXcnt(Wait);
}
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
@@ -1207,11 +1248,29 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
}
}
+void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
+ // Wait on XCNT is redundant if we are already waiting for a load to complete.
+ // SMEM can return out of order, so only omit XCNT wait if we are waiting till
+ // zero.
+ if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
+ return applyWaitcnt(X_CNT, 0);
+
+ // If we have pending store we cannot optimize XCnt because we do not wait for
+ // stores. VMEM loads retun in order, so if we only have loads XCnt is
+ // decremented to the same number as LOADCnt.
+ if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
+ !hasPendingEvent(STORE_CNT))
+ return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+
+ applyWaitcnt(X_CNT, Wait.XCnt);
+}
+
// Where there are multiple types of event in the bracket of a counter,
// the decrement may go out of order.
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
// Scalar memory read always can go out of order.
- if (T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS))
+ if ((T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
+ (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
return true;
return hasMixedPendingEvents(T);
}
@@ -1263,6 +1322,8 @@ static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
return DS_CNT;
case AMDGPU::S_WAIT_KMCNT:
return KM_CNT;
+ case AMDGPU::S_WAIT_XCNT:
+ return X_CNT;
default:
return {};
}
@@ -1427,7 +1488,8 @@ WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
AMDGPU::Waitcnt
WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
- return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0);
+ return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
+ ~0u /* XCNT */);
}
/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
@@ -1909,6 +1971,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
ScoreBrackets.determineWait(BVH_CNT, Interval, Wait);
ScoreBrackets.clearVgprVmemTypes(Interval);
}
+
if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
ScoreBrackets.determineWait(EXP_CNT, Interval, Wait);
}
@@ -1916,6 +1979,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
} else {
ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
}
+
+ if (hasXcnt() && Op.isDef())
+ ScoreBrackets.determineWait(X_CNT, Interval, Wait);
}
}
}
@@ -1958,6 +2024,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
Wait.BvhCnt = 0;
if (ForceEmitWaitcnt[KM_CNT])
Wait.KmCnt = 0;
+ if (ForceEmitWaitcnt[X_CNT])
+ Wait.XCnt = 0;
if (FlushVmCnt) {
if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
@@ -2007,6 +2075,21 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
<< "Update Instr: " << *It);
}
+ // XCnt may be already consumed by a load wait.
+ if (Wait.KmCnt == 0 && Wait.XCnt != ~0u &&
+ !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
+ Wait.XCnt = ~0u;
+
+ if (Wait.LoadCnt == 0 && Wait.XCnt != ~0u &&
+ !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
+ Wait.XCnt = ~0u;
+
+ // Since the translation for VMEM addresses occur in-order, we can skip the
+ // XCnt if the current instruction is of VMEM type and has a memory dependency
+ // with another VMEM instruction in flight.
+ if (Wait.XCnt != ~0u && isVmemAccess(*It))
+ Wait.XCnt = ~0u;
+
if (WCG->createNewWaitcnt(Block, It, Wait))
Modified = true;
@@ -2096,6 +2179,11 @@ bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
});
}
+bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
+ return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
+ (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
+}
+
static bool isGFX12CacheInvOrWBInst(MachineInstr &Inst) {
auto Opc = Inst.getOpcode();
return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
@@ -2167,6 +2255,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
// bracket and the destination operand scores.
// TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
+ bool IsVMEMAccess = false;
+ bool IsSMEMAccess = false;
if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
if (TII->isAlwaysGDS(Inst.getOpcode()) ||
TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
@@ -2189,6 +2279,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
if (mayAccessVMEMThroughFlat(Inst)) {
++FlatASCount;
+ IsVMEMAccess = true;
ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
Inst);
}
@@ -2208,6 +2299,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->setPendingFlat();
} else if (SIInstrInfo::isVMEM(Inst) &&
!llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
+ IsVMEMAccess = true;
ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
Inst);
@@ -2216,6 +2308,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
}
} else if (TII->isSMRD(Inst)) {
+ IsSMEMAccess = true;
ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
} else if (Inst.isCall()) {
if (callWaitsOnFunctionReturn(Inst)) {
@@ -2258,6 +2351,15 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
break;
}
}
+
+ if (!hasXcnt())
+ return;
+
+ if (IsVMEMAccess)
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_GROUP, Inst);
+
+ if (IsSMEMAccess)
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_GROUP, Inst);
}
bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
@@ -2311,9 +2413,11 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
for (int J = 0; J <= VgprUB; J++)
StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
- if (T == SmemAccessCounter) {
+ if (isSmemCounter(T)) {
+ unsigned Idx = getSgprScoresIdx(T);
for (int J = 0; J <= SgprUB; J++)
- StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
+ StrictDom |=
+ mergeScore(M, SgprScores[Idx][J], Other.SgprScores[Idx][J]);
}
}
@@ -2651,6 +2755,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
+ Limits.XcntMax = AMDGPU::getXcntBitMask(IV);
[[maybe_unused]] unsigned NumVGPRsMax =
ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize());
@@ -2679,7 +2784,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
.addImm(0);
for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
- if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
+ if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
continue;
if (!ST->hasImageInsts() &&
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 0e5493259edb9..13549e5c4e58b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -129,6 +129,11 @@ unsigned getKmcntBitWidth(unsigned VersionMajor) {
return VersionMajor >= 12 ? 5 : 0;
}
+/// \returns Xcnt bit width.
+unsigned getXcntBitWidth(unsigned VersionMajor, unsigned VersionMinor) {
+ return VersionMajor == 12 && VersionMinor == 5 ? 6 : 0;
+}
+
/// \returns shift for Loadcnt/Storecnt in combined S_WAIT instructions.
unsigned getLoadcntStorecntBitShift(unsigned VersionMajor) {
return VersionMajor >= 12 ? 8 : 0;
@@ -1493,6 +1498,10 @@ unsigned getKmcntBitMask(const IsaVersion &Version) {
return (1 << getKmcntBitWidth(Version.Major)) - 1;
}
+unsigned getXcntBitMask(const IsaVersion &Version) {
+ return (1 << getXcntBitWidth(Version.Major, Version.Minor)) - 1;
+}
+
unsigned getStorecntBitMask(const IsaVersion &Version) {
return (1 << getStorecntBitWidth(Version.Major)) - 1;
}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index ac7c5100be3d4..e6078d6918ac2 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -980,6 +980,7 @@ struct Waitcnt {
unsigned SampleCnt = ~0u; // gfx12+ only.
unsigned BvhCnt = ~0u; // gfx12+ only.
unsigned KmCnt = ~0u; // gfx12+ only.
+ unsigned XCnt = ~0u; // gfx1250.
Waitcnt() = default;
// Pre-gfx12 constructor.
@@ -988,15 +989,15 @@ struct Waitcnt {
// gfx12+ constructor.
Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt,
- unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt)
+ unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt, unsigned XCnt)
: LoadCnt(LoadCnt), ExpCnt(ExpCnt), DsCnt(DsCnt), StoreCnt(StoreCnt),
- SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt) {}
+ SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt), XCnt(XCnt) {}
bool hasWait() const { return StoreCnt != ~0u || hasWaitExceptStoreCnt(); }
bool hasWaitExceptStoreCnt() const {
return LoadCnt != ~0u || ExpCnt != ~0u || DsCnt != ~0u ||
- SampleCnt != ~0u || BvhCnt != ~0u || KmCnt != ~0u;
+ SampleCnt != ~0u || BvhCnt != ~0u || KmCnt != ~0u || XCnt != ~0u;
}
bool hasWaitStoreCnt() const { return StoreCnt != ~0u; }
@@ -1008,7 +1009,7 @@ struct Waitcnt {
std::min(LoadCnt, Other.LoadCnt), std::min(ExpCnt, Other.ExpCnt),
std::min(DsCnt, Other.DsCnt), std::min(StoreCnt, Other.StoreCnt),
std::min(SampleCnt, Other.SampleCnt), std::min(BvhCnt, Other.BvhCnt),
- std::min(KmCnt, Other.KmCnt));
+ std::min(KmCnt, Other.KmCnt), std::min(XCnt, Other.XCnt));
}
};
@@ -1114,6 +1115,10 @@ unsigned getDscntBitMask(const IsaVersion &Version);
/// Returns 0 for versions that do not support KMcnt
unsigned getKmcntBitMask(const IsaVersion &Version);
+/// \returns Xcnt bit mask for given isa \p Version.
+/// Returns 0 for versions that do not support Xcnt.
+unsigned getXcntBitMask(const IsaVersion &Version);
+
/// \return STOREcnt or VScnt bit mask for given isa \p Version.
/// returns 0 for versions that do not support STOREcnt or VScnt.
/// STOREcnt and VScnt are the same counter, the name used
diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
new file mode 100644
index 0000000000000..f86216837fe0e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
@@ -0,0 +1,396 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-GISEL %s
+
+; Test S_WAIT_XCNT insertion for global_load/store instructions.
+; Introduced additional operations in between the clauses to have the register dependency
+; between the operands of VMEM operations and the def ops of VALU instructions that followed.
+
+define void @test_i8load_v4i8store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %ptr_b, ptr addrspace(1) %ptr_c, ptr addrspace(1) %ptr_d, ptr addrspace(1) %out) {
+; GCN-SDAG-LABEL: test_i8load...
[truncated]
I removed the verify-machineinstrs from the tests.
This seems
Thanks!
;TODO: This test should be enabled in the upstream later. It currently causes a crash
; during branch relaxation as the gfx1250 real opcode definition for V_LSHL_ADD_U64
The instruction has just been added. Will the test pass now?
Yes, it did. I re-enabled the test.
Force-pushed from d1f8c45 to 4169c88.
return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
       (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
isVMEM now includes isFLAT, so I think this simplifies to:

return TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode());

which is maybe not what you want because it has lost the mayAccessVMEMThroughFlat check. See #137148. Cc @ro-i.
if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
  return applyWaitcnt(X_CNT, 0);
I don't understand the logic here. What if there is also a pending VMEM? Then it is not safe to assume that waiting for SMEM to complete will also imply a wait for XCNT. See #145681 for examples that I think are miscompiled because of this.
Maybe you intended that there would never be pending SMEM_GROUP and VMEM_GROUP at the same time?
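For illustration, the case this comment describes might look roughly like the following hypothetical sketch (not taken from the PR or its tests; the exact instruction spellings are assumptions), where both a scalar and a vector load are in flight:

s_load_b32 s4, s[0:1], 0x0        ; pending SMEM access (SMEM_GROUP)
global_load_b32 v2, v[0:1], off   ; pending VMEM access (VMEM_GROUP)
s_wait_kmcnt 0x0                  ; KmCnt == 0, so applyXcnt treats X_CNT as satisfied
v_mov_b32 v0, 1                   ; redefines v0 while the global_load's address
                                  ; translation may still be outstanding, yet no
                                  ; s_wait_xcnt would be inserted before it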
This patch tracks the register operands of both VMEM (FLAT, MUBUF,
MTBUF) and SMEM load-store operations and inserts a S_WAIT_XCNT
instruction with sufficient wait-count before potentially redefining
them. For VMEM instructions, XNACK is returned in the same order as
they were issued and hence non-zero counter values can be inserted.
However, SMEM execution is out-of-order and so is their XNACK reception.
Thus, only zero counter value can be inserted to capture SMEM dependencies.
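As a concrete illustration of the scheme described above, here is a hand-written sketch (not taken from the PR's tests; the mnemonics and printed counter syntax are assumptions) of where the pass would place the new wait:

global_load_b32 v2, v[0:1], off   ; v[0:1] is the address of an in-flight VMEM load
; ... independent work ...
s_wait_xcnt 0x0                   ; inserted before the redefinition below
v_mov_b32 v0, 0                   ; would otherwise clobber v0 while the address
                                  ; translation for the pending load is incomplete

Because VMEM XNACKs return in issue order, the inserted count can be non-zero when only VMEM accesses are pending; any pending SMEM access forces a wait until the counter reaches zero.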