Skip to content

Commit 58faa78

Browse files
committed
[AMDGPU] Relax lds dma waitcnt with no aliasing pair
If we cannot find any LDS DMA instruction that is aliased by some load from LDS, we currently still insert vmcnt(0). This is overly cautious, since inter-thread dependences are normally handled by the memory model rather than by the waitcnt pass, so this change updates the behavior to be more in line with how other types of memory events are handled.
1 parent 5c02f1a commit 58faa78

File tree

2 files changed

+28
-8
lines changed

2 files changed

+28
-8
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1759,7 +1759,6 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
17591759

17601760
// LOAD_CNT is only relevant to vgpr or LDS.
17611761
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1762-
bool FoundAliasingStore = false;
17631762
// Only objects with alias scope info were added to LDSDMAScopes array.
17641763
// In the absence of the scope info we will not be able to disambiguate
17651764
// aliasing here. There is no need to try searching for a corresponding
@@ -1769,14 +1768,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
17691768
if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
17701769
const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
17711770
for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
1772-
if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
1773-
FoundAliasingStore = true;
1771+
if (MI.mayAlias(AA, *LDSDMAStores[I], true))
17741772
ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
1775-
}
17761773
}
1777-
}
1778-
if (!FoundAliasingStore)
1774+
} else {
17791775
ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
1776+
}
17801777
if (Memop->isStore()) {
17811778
ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
17821779
}

llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,6 @@ main_body:
6767
}
6868

6969
; There are 8 pseudo registers defined to track LDS DMA dependencies.
70-
; When exhausted we default to vmcnt(0).
7170

7271
; GCN-LABEL: {{^}}buffer_load_lds_dword_10_arrays:
7372
; GCN-COUNT-10: buffer_load_dword
@@ -86,7 +85,6 @@ main_body:
8685
; GCN: s_waitcnt vmcnt(2)
8786
; GCN-NOT: s_waitcnt vmcnt
8887
; GCN: ds_read_b32
89-
; GCN: s_waitcnt vmcnt(0)
9088
; GCN: ds_read_b32
9189
define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, ptr addrspace(1) %out) {
9290
main_body:
@@ -151,4 +149,29 @@ main_body:
151149
ret void
152150
}
153151

152+
define amdgpu_kernel void @global_load_lds_no_alias_ds_read(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
153+
; GFX9-LABEL: global_load_lds_no_alias_ds_read:
154+
; GFX9: global_load_dword
155+
; GFX9: global_load_dword
156+
; GFX9: s_waitcnt vmcnt(1)
157+
; GFX9-NOT: s_waitcnt vmcnt(0)
158+
; GFX9: ds_read_b32
159+
; GFX9: s_waitcnt vmcnt(0)
160+
; GFX9: ds_read_b32
161+
; GFX9: s_endpgm
162+
body:
163+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0)
164+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 4, i32 0)
165+
call void @llvm.amdgcn.s.waitcnt(i32 3953)
166+
%gep.0 = getelementptr float, ptr addrspace(3) @lds.2, i32 %i1
167+
%val.0 = load float, ptr addrspace(3) %gep.0, align 4
168+
call void @llvm.amdgcn.s.waitcnt(i32 3952)
169+
%gep.1 = getelementptr float, ptr addrspace(3) @lds.3, i32 %i2
170+
%val.1 = load float, ptr addrspace(3) %gep.1, align 4
171+
%tmp = insertelement <2 x float> poison, float %val.0, i32 0
172+
%res = insertelement <2 x float> %tmp, float %val.1, i32 1
173+
store <2 x float> %res, ptr addrspace(1) %out
174+
ret void
175+
}
176+
154177
declare void @llvm.amdgcn.wave.barrier()

0 commit comments

Comments
 (0)