Skip to content

Commit e75f586

Browse files
authored
[AMDGPU] Relax lds dma waitcnt with no aliasing pair (#131842)
If we cannot find any LDS DMA instruction that is aliased by some load from LDS, we still insert vmcnt(0). This is overly cautious, since inter-thread dependences are normally handled by the memory model rather than by the waitcnt pass, so this change updates the behavior to be more in line with how other types of memory events are handled.
1 parent 061b1d1 commit e75f586

File tree

2 files changed

+28
-8
lines changed

2 files changed

+28
-8
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1768,7 +1768,6 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
17681768

17691769
// LOAD_CNT is only relevant to vgpr or LDS.
17701770
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1771-
bool FoundAliasingStore = false;
17721771
// Only objects with alias scope info were added to LDSDMAScopes array.
17731772
// In the absence of the scope info we will not be able to disambiguate
17741773
// aliasing here. There is no need to try searching for a corresponding
@@ -1778,14 +1777,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
17781777
if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
17791778
const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
17801779
for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
1781-
if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
1782-
FoundAliasingStore = true;
1780+
if (MI.mayAlias(AA, *LDSDMAStores[I], true))
17831781
ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
1784-
}
17851782
}
1786-
}
1787-
if (!FoundAliasingStore)
1783+
} else {
17881784
ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
1785+
}
17891786
if (Memop->isStore()) {
17901787
ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
17911788
}

llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,6 @@ main_body:
6767
}
6868

6969
; There are 8 pseudo registers defined to track LDS DMA dependencies.
70-
; When exhausted we default to vmcnt(0).
7170

7271
; GCN-LABEL: {{^}}buffer_load_lds_dword_10_arrays:
7372
; GCN-COUNT-10: buffer_load_dword
@@ -86,7 +85,6 @@ main_body:
8685
; GCN: s_waitcnt vmcnt(2)
8786
; GCN-NOT: s_waitcnt vmcnt
8887
; GCN: ds_read_b32
89-
; GCN: s_waitcnt vmcnt(0)
9088
; GCN: ds_read_b32
9189
define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, ptr addrspace(1) %out) {
9290
main_body:
@@ -151,4 +149,29 @@ main_body:
151149
ret void
152150
}
153151

152+
define amdgpu_kernel void @global_load_lds_no_alias_ds_read(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
153+
; GFX9-LABEL: global_load_lds_no_alias_ds_read:
154+
; GFX9: global_load_dword
155+
; GFX9: global_load_dword
156+
; GFX9: s_waitcnt vmcnt(1)
157+
; GFX9-NOT: s_waitcnt vmcnt(0)
158+
; GFX9: ds_read_b32
159+
; GFX9: s_waitcnt vmcnt(0)
160+
; GFX9: ds_read_b32
161+
; GFX9: s_endpgm
162+
body:
163+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0)
164+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 4, i32 0)
165+
call void @llvm.amdgcn.s.waitcnt(i32 3953)
166+
%gep.0 = getelementptr float, ptr addrspace(3) @lds.2, i32 %i1
167+
%val.0 = load float, ptr addrspace(3) %gep.0, align 4
168+
call void @llvm.amdgcn.s.waitcnt(i32 3952)
169+
%gep.1 = getelementptr float, ptr addrspace(3) @lds.3, i32 %i2
170+
%val.1 = load float, ptr addrspace(3) %gep.1, align 4
171+
%tmp = insertelement <2 x float> poison, float %val.0, i32 0
172+
%res = insertelement <2 x float> %tmp, float %val.1, i32 1
173+
store <2 x float> %res, ptr addrspace(1) %out
174+
ret void
175+
}
176+
154177
declare void @llvm.amdgcn.wave.barrier()

0 commit comments

Comments
 (0)