@@ -73,6 +73,7 @@ enum InstCounterType {
   SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
   BVH_CNT,                           // gfx12+ only.
   KM_CNT,                            // gfx12+ only.
+  X_CNT,                             // gfx1250.
   NUM_EXTENDED_INST_CNTS,
   NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
 };
@@ -102,6 +103,7 @@ struct HardwareLimits {
   unsigned SamplecntMax; // gfx12+ only.
   unsigned BvhcntMax;    // gfx12+ only.
   unsigned KmcntMax;     // gfx12+ only.
+  unsigned XcntMax;      // gfx1250.
 };
 
 #define AMDGPU_DECLARE_WAIT_EVENTS(DECL)                                       \
@@ -111,10 +113,12 @@ struct HardwareLimits {
   DECL(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */                 \
   DECL(VMEM_WRITE_ACCESS)    /* vmem write that is not scratch */              \
   DECL(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */              \
+  DECL(VMEM_GROUP)           /* vmem group */                                  \
   DECL(LDS_ACCESS)           /* lds read & write */                            \
   DECL(GDS_ACCESS)           /* gds read & write */                            \
   DECL(SQ_MESSAGE)           /* send message */                                \
   DECL(SMEM_ACCESS)          /* scalar-memory read & write */                  \
+  DECL(SMEM_GROUP)           /* scalar-memory group */                         \
   DECL(EXP_GPR_LOCK)         /* export holding on its data src */              \
   DECL(GDS_GPR_LOCK)         /* GDS holding on its data and addr src */        \
   DECL(EXP_POS_ACCESS)       /* write to export position */                    \
@@ -178,7 +182,7 @@ enum VmemType {
 static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
     AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
     AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
-    AMDGPU::S_WAIT_KMCNT};
+    AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
 
 static bool updateVMCntOnly(const MachineInstr &Inst) {
   return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
@@ -223,6 +227,8 @@ unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
     return Wait.BvhCnt;
   case KM_CNT:
     return Wait.KmCnt;
+  case X_CNT:
+    return Wait.XCnt;
   default:
     llvm_unreachable("bad InstCounterType");
   }
@@ -283,12 +289,27 @@ class WaitcntBrackets {
       return Limits.BvhcntMax;
     case KM_CNT:
       return Limits.KmcntMax;
+    case X_CNT:
+      return Limits.XcntMax;
     default:
       break;
     }
     return 0;
   }
 
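+  // Both the SMEM access counter (DS_CNT pre-gfx12, KM_CNT on gfx12+) and
+  // X_CNT keep per-SGPR scores; each owns one row of the SgprScores table.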
+  bool isSmemCounter(InstCounterType T) const {
+    return T == SmemAccessCounter || T == X_CNT;
+  }
+
+  unsigned getSgprScoresIdx(InstCounterType T) const {
+    if (T == SmemAccessCounter)
+      return 0;
+    if (T == X_CNT)
+      return 1;
+
+    llvm_unreachable("Invalid SMEM counter");
+  }
+
   unsigned getScoreLB(InstCounterType T) const {
     assert(T < NUM_INST_CNTS);
     return ScoreLBs[T];
@@ -307,8 +328,8 @@ class WaitcntBrackets {
     if (GprNo < NUM_ALL_VGPRS) {
       return VgprScores[T][GprNo];
     }
-    assert(T == SmemAccessCounter);
-    return SgprScores[GprNo - NUM_ALL_VGPRS];
+    assert(isSmemCounter(T));
+    return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
   }
 
   bool merge(const WaitcntBrackets &Other);
@@ -331,6 +352,7 @@ class WaitcntBrackets {
 
   void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
   void applyWaitcnt(InstCounterType T, unsigned Count);
+  void applyXcnt(const AMDGPU::Waitcnt &Wait);
   void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                      const MachineRegisterInfo *MRI, WaitEventType E,
                      MachineInstr &MI);
@@ -462,9 +484,11 @@ class WaitcntBrackets {
   int VgprUB = -1;
   int SgprUB = -1;
   unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
-  // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
-  // pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
-  unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
+  // Wait cnt scores for every sgpr; only DS_CNT (corresponding to LGKMcnt
+  // pre-gfx12), KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
+  // Row 0 holds the score for either DS_CNT or KM_CNT and row 1 keeps the
+  // X_CNT score.
+  unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
   // Bitmask of the VmemTypes of VMEM instructions that might have a pending
   // write to each vgpr.
   unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
@@ -572,6 +596,7 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
         eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
         0,
         0,
+        0,
         0};
 
     return WaitEventMaskForInstPreGFX12;
@@ -607,7 +632,8 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
         eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
         eventMask({VMEM_SAMPLER_READ_ACCESS}),
         eventMask({VMEM_BVH_READ_ACCESS}),
-        eventMask({SMEM_ACCESS, SQ_MESSAGE})};
+        eventMask({SMEM_ACCESS, SQ_MESSAGE}),
+        eventMask({VMEM_GROUP, SMEM_GROUP})};
 
     return WaitEventMaskForInstGFX12Plus;
   }
@@ -743,9 +769,12 @@ class SIInsertWaitcnts {
     return VmemReadMapping[getVmemType(Inst)];
   }
 
+  bool hasXcnt() const { return ST->hasWaitXCnt(); }
+
   bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
   bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
+  bool isVmemAccess(const MachineInstr &MI) const;
   bool generateWaitcntInstBefore(MachineInstr &MI,
                                  WaitcntBrackets &ScoreBrackets,
                                  MachineInstr *OldWaitcntInstr,
@@ -837,9 +866,9 @@ void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
       VgprUB = std::max(VgprUB, RegNo);
       VgprScores[CntTy][RegNo] = Score;
     } else {
-      assert(CntTy == SmemAccessCounter);
+      assert(isSmemCounter(CntTy));
       SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);
-      SgprScores[RegNo - NUM_ALL_VGPRS] = Score;
+      SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score;
     }
   }
 }
@@ -976,6 +1005,13 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
         setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore);
       }
     }
+  } else if (T == X_CNT) {
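+    // XCNT scores are kept on the source registers: a later write to any of
+    // them must first wait for the pending translation (see the X_CNT wait
+    // on defs in generateWaitcntInstBefore).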
+    for (const MachineOperand &Op : Inst.all_uses()) {
+      RegInterval Interval = getRegInterval(&Inst, MRI, TRI, Op);
+      for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+        setRegScore(RegNo, T, CurrScore);
+      }
+    }
   } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
     // Match the score to the destination registers.
     //
@@ -1080,6 +1116,9 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
     case KM_CNT:
       OS << " KM_CNT(" << SR << "): ";
       break;
+    case X_CNT:
+      OS << " X_CNT(" << SR << "): ";
+      break;
     default:
       OS << " UNKNOWN(" << SR << "): ";
       break;
@@ -1100,8 +1139,8 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
         OS << RelScore << ":ds ";
       }
     }
-    // Also need to print sgpr scores for lgkm_cnt.
-    if (T == SmemAccessCounter) {
+    // Also need to print sgpr scores for lgkm_cnt or xcnt.
+    if (isSmemCounter(T)) {
       for (int J = 0; J <= SgprUB; J++) {
         unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
         if (RegScore <= LB)
@@ -1140,6 +1179,7 @@ void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
   simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
   simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
   simplifyWaitcnt(KM_CNT, Wait.KmCnt);
+  simplifyWaitcnt(X_CNT, Wait.XCnt);
 }
 
 void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
@@ -1191,6 +1231,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
   applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
   applyWaitcnt(BVH_CNT, Wait.BvhCnt);
   applyWaitcnt(KM_CNT, Wait.KmCnt);
+  applyXcnt(Wait);
 }
 
 void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
@@ -1207,11 +1248,29 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
   }
 }
 
+void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
+  // A wait on XCNT is redundant if we are already waiting for a load to
+  // complete. SMEM can return out of order, so only omit the XCNT wait if we
+  // are waiting till zero.
+  if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
+    return applyWaitcnt(X_CNT, 0);
+
+  // If we have a pending store we cannot optimize XCnt because we do not wait
+  // for stores. VMEM loads return in order, so if we only have loads XCnt is
+  // decremented to the same number as LOADCnt.
+  if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
+      !hasPendingEvent(STORE_CNT))
+    return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+
+  applyWaitcnt(X_CNT, Wait.XCnt);
+}
+
 // Where there are multiple types of event in the bracket of a counter,
 // the decrement may go out of order.
 bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
   // Scalar memory read always can go out of order.
-  if (T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS))
+  if ((T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
+      (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
     return true;
   return hasMixedPendingEvents(T);
 }
@@ -1263,6 +1322,8 @@ static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
     return DS_CNT;
   case AMDGPU::S_WAIT_KMCNT:
     return KM_CNT;
+  case AMDGPU::S_WAIT_XCNT:
+    return X_CNT;
   default:
     return {};
   }
@@ -1427,7 +1488,8 @@ WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
 
 AMDGPU::Waitcnt
 WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
-  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0);
+  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
+                         ~0u /* XCNT */);
 }
 
 /// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
@@ -1909,13 +1971,17 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
         ScoreBrackets.determineWait(BVH_CNT, Interval, Wait);
         ScoreBrackets.clearVgprVmemTypes(Interval);
       }
+
       if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
         ScoreBrackets.determineWait(EXP_CNT, Interval, Wait);
       }
       ScoreBrackets.determineWait(DS_CNT, Interval, Wait);
     } else {
       ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
     }
+
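+    // Overwriting a register that a pending translation group may still read
+    // requires the def to wait on XCNT.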
+    if (hasXcnt() && Op.isDef())
+      ScoreBrackets.determineWait(X_CNT, Interval, Wait);
     }
   }
 }
@@ -1958,6 +2024,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
     Wait.BvhCnt = 0;
   if (ForceEmitWaitcnt[KM_CNT])
     Wait.KmCnt = 0;
+  if (ForceEmitWaitcnt[X_CNT])
+    Wait.XCnt = 0;
 
   if (FlushVmCnt) {
     if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
@@ -2007,6 +2075,21 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
                       << "Update Instr: " << *It);
   }
 
+  // XCnt may already have been consumed by a load wait.
+  if (Wait.KmCnt == 0 && Wait.XCnt != ~0u &&
+      !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
+    Wait.XCnt = ~0u;
+
+  if (Wait.LoadCnt == 0 && Wait.XCnt != ~0u &&
+      !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
+    Wait.XCnt = ~0u;
+
+  // Since the translation of VMEM addresses occurs in order, we can skip the
+  // XCnt if the current instruction is of VMEM type and has a memory
+  // dependency with another VMEM instruction in flight.
+  if (Wait.XCnt != ~0u && isVmemAccess(*It))
+    Wait.XCnt = ~0u;
+
   if (WCG->createNewWaitcnt(Block, It, Wait))
     Modified = true;
 
@@ -2096,6 +2179,11 @@ bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
   });
 }
 
+bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
+  return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
+         (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
+}
+
 static bool isGFX12CacheInvOrWBInst(MachineInstr &Inst) {
   auto Opc = Inst.getOpcode();
   return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
@@ -2167,6 +2255,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
   // bracket and the destination operand scores.
   // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
 
+  bool IsVMEMAccess = false;
+  bool IsSMEMAccess = false;
   if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
     if (TII->isAlwaysGDS(Inst.getOpcode()) ||
         TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
@@ -2189,6 +2279,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
 
     if (mayAccessVMEMThroughFlat(Inst)) {
       ++FlatASCount;
+      IsVMEMAccess = true;
       ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
                                    Inst);
     }
@@ -2208,6 +2299,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
     ScoreBrackets->setPendingFlat();
   } else if (SIInstrInfo::isVMEM(Inst) &&
              !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
+    IsVMEMAccess = true;
     ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
                                  Inst);
 
@@ -2216,6 +2308,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
       ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
     }
   } else if (TII->isSMRD(Inst)) {
+    IsSMEMAccess = true;
     ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
   } else if (Inst.isCall()) {
     if (callWaitsOnFunctionReturn(Inst)) {
@@ -2258,6 +2351,15 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
       break;
     }
   }
+
+  if (!hasXcnt())
+    return;
+
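+  // Each VMEM or SMEM access also forms a group event, which is what the
+  // XCNT counter tracks.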
+  if (IsVMEMAccess)
+    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_GROUP, Inst);
+
+  if (IsSMEMAccess)
+    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_GROUP, Inst);
 }
 
 bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
@@ -2311,9 +2413,11 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
     for (int J = 0; J <= VgprUB; J++)
       StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
 
-    if (T == SmemAccessCounter) {
+    if (isSmemCounter(T)) {
+      unsigned Idx = getSgprScoresIdx(T);
       for (int J = 0; J <= SgprUB; J++)
-        StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
+        StrictDom |=
+            mergeScore(M, SgprScores[Idx][J], Other.SgprScores[Idx][J]);
     }
   }
 
@@ -2651,6 +2755,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
   Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
   Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
   Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
+  Limits.XcntMax = AMDGPU::getXcntBitMask(IV);
 
   [[maybe_unused]] unsigned NumVGPRsMax =
       ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize());
@@ -2679,7 +2784,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
       BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
           .addImm(0);
     for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
-      if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
+      if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
         continue;
 
       if (!ST->hasImageInsts() &&