@@ -292,6 +292,13 @@ class WaitcntBrackets {
292
292
VgprVmemTypes[GprNo] = 0 ;
293
293
}
294
294
295
/// Puts these brackets into the state used as the incoming state of the entry
/// block of a non-kernel function: for every counter type, the score upper
/// bound is set to that counter's maximum wait count and the counter's wait
/// events are marked as pending.
// NOTE(review): presumably this models "anything may still be outstanding"
// at function entry -- confirm against the caller's calling convention.
+ void setNonKernelFunctionInitialState () {
296
+ for (InstCounterType Counter : inst_counter_types ()) {
297
// Upper bound becomes the largest count this counter can hold.
+ setScoreUB (Counter, getWaitCountMax (Counter));
298
// Accumulate every event associated with this counter into the pending set.
+ PendingEvents |= WaitEventMaskForInst[Counter];
299
+ }
300
+ }
301
+
295
302
void print (raw_ostream &);
296
303
void dump () { print (dbgs ()); }
297
304
@@ -364,7 +371,6 @@ class SIInsertWaitcnts : public MachineFunctionPass {
364
371
const MachineRegisterInfo *MRI = nullptr ;
365
372
AMDGPU::IsaVersion IV;
366
373
367
- DenseSet<MachineInstr *> TrackedWaitcntSet;
368
374
DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
369
375
DenseMap<MachineBasicBlock *, bool > PreheadersToFlush;
370
376
MachineLoopInfo *MLI;
@@ -486,6 +492,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
486
492
MachineInstr &OldWaitcntInstr,
487
493
AMDGPU::Waitcnt &Wait,
488
494
MachineBasicBlock::instr_iterator It) const ;
495
+ bool updateWaitcntIfSoft (MachineInstr *Waitcnt) const ;
489
496
};
490
497
491
498
} // end anonymous namespace
@@ -870,6 +877,15 @@ static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
870
877
return true ;
871
878
}
872
879
880
/// If \p Waitcnt has a soft waitcnt opcode (per SIInstrInfo::isSoftWaitcnt),
/// rewrite its instruction descriptor to the opcode returned by
/// SIInstrInfo::getNonSoftWaitcntOpcode.
///
/// \p Waitcnt is dereferenced unconditionally, so it must not be null.
/// \returns true if the descriptor was replaced, false if the instruction was
/// left untouched because its opcode is not a soft waitcnt.
+ bool SIInsertWaitcnts::updateWaitcntIfSoft (MachineInstr *Waitcnt) const {
881
+ unsigned Opcode = Waitcnt->getOpcode ();
882
// Nothing to do for opcodes that are not soft waitcnts.
+ if (!SIInstrInfo::isSoftWaitcnt (Opcode))
883
+ return false ;
884
+
885
// Only the descriptor changes; the operands of the instruction are kept.
+ Waitcnt->setDesc (TII->get (SIInstrInfo::getNonSoftWaitcntOpcode (Opcode)));
886
+ return true ;
887
+ }
888
+
873
889
/// Combine consecutive waitcnt instructions that precede \p It and follow
874
890
/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
875
891
/// by previous passes. Currently this pass conservatively assumes that these
@@ -886,34 +902,40 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
886
902
if (II.isMetaInstruction ())
887
903
continue ;
888
904
889
- if (II.getOpcode () == AMDGPU::S_WAITCNT) {
905
+ unsigned Opcode = II.getOpcode ();
906
+ bool CanFullyDiscardWaitcntSequence = SIInstrInfo::isSoftWaitcnt (Opcode);
907
+
908
+ if (SIInstrInfo::isWaitcnt (Opcode)) {
890
909
// Update the required wait with the wait encoded in this waitcnt. If it
891
910
// is a soft waitcnt, its wait is first simplified against the score
892
911
// brackets, and the waitcnt sequence may end up being fully discarded.
893
- if (!TrackedWaitcntSet. count (&II)) {
894
- unsigned IEnc = II. getOperand ( 0 ). getImm ( );
895
- AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt (IV, IEnc);
896
- Wait = Wait. combined (OldWait);
897
- }
912
+ unsigned IEnc = II. getOperand ( 0 ). getImm ();
913
+ AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt (IV, IEnc );
914
+ if (CanFullyDiscardWaitcntSequence)
915
+ ScoreBrackets. simplifyWaitcnt (OldWait);
916
+ Wait = Wait. combined (OldWait);
898
917
899
918
// Merge consecutive waitcnt of the same type by erasing multiples.
900
- if (!WaitcntInstr) {
919
+ if (!WaitcntInstr &&
920
+ (Wait.hasWaitExceptVsCnt () || !CanFullyDiscardWaitcntSequence)) {
901
921
WaitcntInstr = &II;
902
922
} else {
903
923
II.eraseFromParent ();
904
924
Modified = true ;
905
925
}
906
926
907
927
} else {
908
- assert (II. getOpcode () == AMDGPU::S_WAITCNT_VSCNT );
928
+ assert (SIInstrInfo::isWaitcntVsCnt (Opcode) );
909
929
assert (II.getOperand (0 ).getReg () == AMDGPU::SGPR_NULL);
910
- if (!TrackedWaitcntSet.count (&II)) {
911
- unsigned OldVSCnt =
912
- TII->getNamedOperand (II, AMDGPU::OpName::simm16)->getImm ();
913
- Wait.VsCnt = std::min (Wait.VsCnt , OldVSCnt);
914
- }
915
930
916
- if (!WaitcntVsCntInstr) {
931
+ unsigned OldVSCnt =
932
+ TII->getNamedOperand (II, AMDGPU::OpName::simm16)->getImm ();
933
+ if (CanFullyDiscardWaitcntSequence)
934
+ ScoreBrackets.simplifyWaitcnt (InstCounterType::VS_CNT, OldVSCnt);
935
+ Wait.VsCnt = std::min (Wait.VsCnt , OldVSCnt);
936
+
937
+ if (!WaitcntVsCntInstr &&
938
+ (Wait.hasWaitVsCnt () || !CanFullyDiscardWaitcntSequence)) {
917
939
WaitcntVsCntInstr = &II;
918
940
} else {
919
941
II.eraseFromParent ();
@@ -924,48 +946,38 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
924
946
925
947
// Updated encoding of merged waitcnt with the required wait.
926
948
if (WaitcntInstr) {
927
- if (Wait.hasWaitExceptVsCnt ()) {
928
- Modified |=
929
- updateOperandIfDifferent (*WaitcntInstr, AMDGPU::OpName::simm16,
930
- AMDGPU::encodeWaitcnt (IV, Wait));
931
- ScoreBrackets.applyWaitcnt (Wait);
932
- Wait.VmCnt = ~0u ;
933
- Wait.LgkmCnt = ~0u ;
934
- Wait.ExpCnt = ~0u ;
935
-
936
- LLVM_DEBUG (It == OldWaitcntInstr.getParent ()->end ()
937
- ? dbgs () << " applyPreexistingWaitcnt\n "
938
- << " New Instr at block end: " << *WaitcntInstr
939
- << ' \n '
940
- : dbgs () << " applyPreexistingWaitcnt\n "
941
- << " Old Instr: " << *It
942
- << " New Instr: " << *WaitcntInstr << ' \n ' );
949
+ Modified |= updateOperandIfDifferent (*WaitcntInstr, AMDGPU::OpName::simm16,
950
+ AMDGPU::encodeWaitcnt (IV, Wait));
951
+ Modified |= updateWaitcntIfSoft (WaitcntInstr);
943
952
944
- } else {
945
- WaitcntInstr->eraseFromParent ();
946
- Modified = true ;
947
- }
953
+ ScoreBrackets.applyWaitcnt (Wait);
954
+ Wait.VmCnt = ~0u ;
955
+ Wait.LgkmCnt = ~0u ;
956
+ Wait.ExpCnt = ~0u ;
957
+
958
+ LLVM_DEBUG (It == OldWaitcntInstr.getParent ()->end ()
959
+ ? dbgs ()
960
+ << " applyPreexistingWaitcnt\n "
961
+ << " New Instr at block end: " << *WaitcntInstr << ' \n '
962
+ : dbgs () << " applyPreexistingWaitcnt\n "
963
+ << " Old Instr: " << *It
964
+ << " New Instr: " << *WaitcntInstr << ' \n ' );
948
965
}
949
966
950
967
if (WaitcntVsCntInstr) {
951
- if (Wait.hasWaitVsCnt ()) {
952
- assert (ST->hasVscnt ());
953
- Modified |= updateOperandIfDifferent (*WaitcntVsCntInstr,
954
- AMDGPU::OpName::simm16, Wait.VsCnt );
955
- ScoreBrackets.applyWaitcnt (Wait);
956
- Wait.VsCnt = ~0u ;
957
-
958
- LLVM_DEBUG (It == OldWaitcntInstr.getParent ()->end ()
959
- ? dbgs () << " applyPreexistingWaitcnt\n "
960
- << " New Instr at block end: "
961
- << *WaitcntVsCntInstr << ' \n '
962
- : dbgs () << " applyPreexistingWaitcnt\n "
963
- << " Old Instr: " << *It
964
- << " New Instr: " << *WaitcntVsCntInstr << ' \n ' );
965
- } else {
966
- WaitcntVsCntInstr->eraseFromParent ();
967
- Modified = true ;
968
- }
968
+ Modified |= updateOperandIfDifferent (*WaitcntVsCntInstr,
969
+ AMDGPU::OpName::simm16, Wait.VsCnt );
970
+ Modified |= updateWaitcntIfSoft (WaitcntVsCntInstr);
971
+ ScoreBrackets.applyWaitcnt (Wait);
972
+ Wait.VsCnt = ~0u ;
973
+
974
+ LLVM_DEBUG (It == OldWaitcntInstr.getParent ()->end ()
975
+ ? dbgs () << " applyPreexistingWaitcnt\n "
976
+ << " New Instr at block end: " << *WaitcntVsCntInstr
977
+ << ' \n '
978
+ : dbgs () << " applyPreexistingWaitcnt\n "
979
+ << " Old Instr: " << *It
980
+ << " New Instr: " << *WaitcntVsCntInstr << ' \n ' );
969
981
}
970
982
971
983
return Modified;
@@ -1317,7 +1329,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
1317
1329
unsigned Enc = AMDGPU::encodeWaitcnt (IV, Wait);
1318
1330
auto SWaitInst =
1319
1331
BuildMI (Block, It, DL, TII->get (AMDGPU::S_WAITCNT)).addImm (Enc);
1320
- TrackedWaitcntSet.insert (SWaitInst);
1321
1332
Modified = true ;
1322
1333
1323
1334
LLVM_DEBUG (dbgs () << " generateWaitcnt\n " ;
@@ -1331,7 +1342,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
1331
1342
auto SWaitInst = BuildMI (Block, It, DL, TII->get (AMDGPU::S_WAITCNT_VSCNT))
1332
1343
.addReg (AMDGPU::SGPR_NULL, RegState::Undef)
1333
1344
.addImm (Wait.VsCnt );
1334
- TrackedWaitcntSet.insert (SWaitInst);
1335
1345
Modified = true ;
1336
1346
1337
1347
LLVM_DEBUG (dbgs () << " generateWaitcnt\n " ;
@@ -1574,9 +1584,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
1574
1584
}
1575
1585
1576
1586
/// Reports whether \p Inst is one of the wait-count instructions handled here:
/// either its opcode satisfies SIInstrInfo::isWaitcnt, or its opcode satisfies
/// SIInstrInfo::isWaitcntVsCnt and operand 0 is the SGPR_NULL register.
// The lines marked '-' are the replaced form, which compared the opcode
// directly against AMDGPU::S_WAITCNT and AMDGPU::S_WAITCNT_VSCNT.
static bool isWaitInstr (MachineInstr &Inst) {
1577
- return Inst.getOpcode () == AMDGPU::S_WAITCNT ||
1578
- (Inst. getOpcode () == AMDGPU::S_WAITCNT_VSCNT &&
1579
- Inst.getOperand (0 ).isReg () &&
1587
+ auto Opcode = Inst.getOpcode ();
1588
+ return SIInstrInfo::isWaitcnt (Opcode) ||
1589
+ ( SIInstrInfo::isWaitcntVsCnt (Opcode) && Inst.getOperand (0 ).isReg () &&
1580
1590
Inst.getOperand (0 ).getReg () == AMDGPU::SGPR_NULL);
1581
1591
}
1582
1592
@@ -1845,7 +1855,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1845
1855
TRI->getEncodingValue (AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
1846
1856
Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1 ;
1847
1857
1848
- TrackedWaitcntSet.clear ();
1849
1858
BlockInfos.clear ();
1850
1859
bool Modified = false ;
1851
1860
@@ -1863,6 +1872,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1863
1872
;
1864
1873
BuildMI (EntryBB, I, DebugLoc (), TII->get (AMDGPU::S_WAITCNT)).addImm (0 );
1865
1874
1875
+ auto NonKernelInitialState =
1876
+ std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
1877
+ NonKernelInitialState->setNonKernelFunctionInitialState ();
1878
+ BlockInfos[&EntryBB].Incoming = std::move (NonKernelInitialState);
1879
+
1866
1880
Modified = true ;
1867
1881
}
1868
1882
0 commit comments