Skip to content

Commit c6ecbcb

Browse files
authored
[AMDGPU] Fix no waitcnt produced between LDS DMA and ds_read on gfx10 (#75245)
BUFFER_LOAD_DWORD_LDS was incorrectly touching vscnt instead of the vmcnt. This is VMEM load and DS store, so it shall use vmcnt.
1 parent a5ffabc commit c6ecbcb

File tree

3 files changed

+23
-19
lines changed

3 files changed

+23
-19
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -452,7 +452,9 @@ class SIInsertWaitcnts : public MachineFunctionPass {
452452
// FLAT instruction.
453453
WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
454454
assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
455-
if (!ST->hasVscnt())
455+
// LDS DMA loads are also stores, but on the LDS side. On the VMEM side
456+
// these should use VM_CNT.
457+
if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
456458
return VMEM_ACCESS;
457459
if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) {
458460
// FLAT and SCRATCH instructions may access scratch. Other VMEM
@@ -544,14 +546,6 @@ void WaitcntBrackets::setExpScore(const MachineInstr *MI,
544546
}
545547
}
546548

547-
// MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS written
548-
// can be accessed. A load from LDS to VMEM does not need a wait.
549-
static bool mayWriteLDSThroughDMA(const MachineInstr &MI) {
550-
return SIInstrInfo::isVALU(MI) &&
551-
(SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI)) &&
552-
MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
553-
}
554-
555549
void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
556550
const SIRegisterInfo *TRI,
557551
const MachineRegisterInfo *MRI,
@@ -703,7 +697,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
703697
setRegScore(RegNo, T, CurrScore);
704698
}
705699
}
706-
if (Inst.mayStore() && (TII->isDS(Inst) || mayWriteLDSThroughDMA(Inst))) {
700+
if (Inst.mayStore() &&
701+
(TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
702+
// MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
703+
// written can be accessed. A load from LDS to VMEM does not need a wait.
707704
setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
708705
}
709706
}
@@ -1178,7 +1175,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
11781175
if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
11791176
continue;
11801177
// No need to wait before load from VMEM to LDS.
1181-
if (mayWriteLDSThroughDMA(MI))
1178+
if (TII->mayWriteLDSThroughDMA(MI))
11821179
continue;
11831180
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
11841181
// VM_CNT is only relevant to vgpr or LDS.

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -546,6 +546,14 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
546546
return get(Opcode).TSFlags & SIInstrFlags::DS;
547547
}
548548

549+
static bool isLDSDMA(const MachineInstr &MI) {
550+
return isVALU(MI) && (isMUBUF(MI) || isFLAT(MI));
551+
}
552+
553+
bool isLDSDMA(uint16_t Opcode) {
554+
return isVALU(Opcode) && (isMUBUF(Opcode) || isFLAT(Opcode));
555+
}
556+
549557
static bool isGWS(const MachineInstr &MI) {
550558
return MI.getDesc().TSFlags & SIInstrFlags::GWS;
551559
}
@@ -667,6 +675,10 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
667675
SIInstrFlags::IsAtomicNoRet);
668676
}
669677

678+
static bool mayWriteLDSThroughDMA(const MachineInstr &MI) {
679+
return isLDSDMA(MI) && MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
680+
}
681+
670682
static bool isWQM(const MachineInstr &MI) {
671683
return MI.getDesc().TSFlags & SIInstrFlags::WQM;
672684
}

llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,10 @@ declare void @llvm.amdgcn.global.load.lds(ptr addrspace(1) nocapture %gptr, ptr
99

1010
; FIXME: vmcnt(0) is too strong, it shall use vmcnt(2) before the first
1111
; ds_read_b32 and vmcnt(0) before the second.
12-
; FIXME: GFX10 does not get a waitcount at all.
1312

1413
; GCN-LABEL: {{^}}buffer_load_lds_dword_2_arrays:
1514
; GCN-COUNT-4: buffer_load_dword
16-
; GFX9: s_waitcnt vmcnt(0)
17-
18-
; FIXME:
19-
; GFX10-NOT: s_waitcnt
20-
15+
; GCN: s_waitcnt vmcnt(0)
2116
; GCN: ds_read_b32
2217

2318
; FIXME:
@@ -49,9 +44,9 @@ main_body:
4944
; GFX9: s_waitcnt vmcnt(0)
5045
; GFX9-COUNT-2: ds_read_b32
5146

52-
; FIXME:
53-
; GFX10-NOT: s_waitcnt
47+
; FIXME: can be vmcnt(2)
5448

49+
; GFX10: s_waitcnt vmcnt(0)
5550
; GFX10: ds_read_b32
5651

5752
; FIXME:

0 commit comments

Comments
 (0)