Skip to content

Commit faf1611

Browse files
bcahoon and kerbowa authored
[AMDGPU] Relax lds dma waitcnt with no aliasing pair (llvm#131842) (llvm#2463)
If we cannot find any LDS DMA instruction that is aliased by some load from LDS, we will still insert vmcnt(0). This is overly cautious since handling inter-thread dependences is normally managed by the memory model instead of the waitcnt pass, so this change updates the behavior to be more in line with how other types of memory events are handled. cherry-pick: e75f586 to amd-mainline (cherry picked from commit 96fba51). Co-authored-by: Austin Kerbow <[email protected]>
1 parent a06c2c8 commit faf1611

File tree

2 files changed

+28
-8
lines changed

2 files changed

+28
-8
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1734,7 +1734,6 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
17341734

17351735
// LOAD_CNT is only relevant to vgpr or LDS.
17361736
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1737-
bool FoundAliasingStore = false;
17381737
// Only objects with alias scope info were added to LDSDMAScopes array.
17391738
// In the absence of the scope info we will not be able to disambiguate
17401739
// aliasing here. There is no need to try searching for a corresponding
@@ -1744,14 +1743,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
17441743
if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
17451744
const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
17461745
for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
1747-
if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
1748-
FoundAliasingStore = true;
1746+
if (MI.mayAlias(AA, *LDSDMAStores[I], true))
17491747
ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
1750-
}
17511748
}
1752-
}
1753-
if (!FoundAliasingStore)
1749+
} else {
17541750
ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
1751+
}
17551752
if (Memop->isStore()) {
17561753
ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
17571754
}

llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,6 @@ main_body:
6767
}
6868

6969
; There are 8 pseudo registers defined to track LDS DMA dependencies.
70-
; When exhausted we default to vmcnt(0).
7170

7271
; GCN-LABEL: {{^}}buffer_load_lds_dword_10_arrays:
7372
; GCN-COUNT-10: buffer_load_dword
@@ -86,7 +85,6 @@ main_body:
8685
; GCN: s_waitcnt vmcnt(2)
8786
; GCN-NOT: s_waitcnt vmcnt
8887
; GCN: ds_read_b32
89-
; GCN: s_waitcnt vmcnt(0)
9088
; GCN: ds_read_b32
9189
define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, ptr addrspace(1) %out) {
9290
main_body:
@@ -151,4 +149,29 @@ main_body:
151149
ret void
152150
}
153151

152+
; Test case: two global-to-LDS loads (targeting @lds.0 and @lds.1) are followed
; by two ordinary LDS reads based on @lds.2 and @lds.3 (presumably separate LDS
; globals declared elsewhere in this file). The expectations below require
; that no vmcnt(0) wait appears between the vmcnt(1) wait and the first
; ds_read_b32, and that a vmcnt(0) wait appears before the second one.
define amdgpu_kernel void @global_load_lds_no_alias_ds_read(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
153+
; GFX9-LABEL: global_load_lds_no_alias_ds_read:
154+
; GFX9: global_load_dword
155+
; GFX9: global_load_dword
156+
; GFX9: s_waitcnt vmcnt(1)
157+
; GFX9-NOT: s_waitcnt vmcnt(0)
158+
; GFX9: ds_read_b32
159+
; GFX9: s_waitcnt vmcnt(0)
160+
; GFX9: ds_read_b32
161+
; GFX9: s_endpgm
162+
body:
163+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0)
164+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 4, i32 0)
165+
; NOTE(review): immediate 3953 (0xF71) presumably encodes vmcnt(1), which would
; correspond to the first s_waitcnt expectation above; confirm against the
; s_waitcnt immediate encoding for the tested target.
call void @llvm.amdgcn.s.waitcnt(i32 3953)
166+
%gep.0 = getelementptr float, ptr addrspace(3) @lds.2, i32 %i1
167+
%val.0 = load float, ptr addrspace(3) %gep.0, align 4
168+
; NOTE(review): immediate 3952 (0xF70) presumably encodes vmcnt(0), which would
; correspond to the second s_waitcnt expectation above; confirm as for 3953.
call void @llvm.amdgcn.s.waitcnt(i32 3952)
169+
%gep.1 = getelementptr float, ptr addrspace(3) @lds.3, i32 %i2
170+
%val.1 = load float, ptr addrspace(3) %gep.1, align 4
171+
; Both loaded values are packed into one vector and stored to %out so that
; neither LDS read is unused.
%tmp = insertelement <2 x float> poison, float %val.0, i32 0
172+
%res = insertelement <2 x float> %tmp, float %val.1, i32 1
173+
store <2 x float> %res, ptr addrspace(1) %out
174+
ret void
175+
}
176+
154177
declare void @llvm.amdgcn.wave.barrier()

0 commit comments

Comments
 (0)