Skip to content

Commit cfd1d48

Browse files
committed
[AMDGPU] Relax lds dma waitcnt with no aliasing pair
If we cannot find any LDS DMA instruction that is aliased by some load from LDS, we will still insert vmcnt(0). This is overly cautious, since inter-thread dependences are normally handled by the memory model rather than by the waitcnt pass, so this change updates the behavior to be more in line with how other types of memory events are handled.
1 parent fe7776e commit cfd1d48

File tree

2 files changed

+27
-6
lines changed

2 files changed

+27
-6
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1757,7 +1757,6 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
17571757

17581758
// LOAD_CNT is only relevant to vgpr or LDS.
17591759
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1760-
bool FoundAliasingStore = false;
17611760
// Only objects with alias scope info were added to LDSDMAScopes array.
17621761
// In the absence of the scope info we will not be able to disambiguate
17631762
// aliasing here. There is no need to try searching for a corresponding
@@ -1768,13 +1767,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
17681767
const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
17691768
for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
17701769
if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
1771-
FoundAliasingStore = true;
17721770
ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
17731771
}
17741772
}
1775-
}
1776-
if (!FoundAliasingStore)
1773+
} else {
17771774
ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
1775+
}
17781776
if (Memop->isStore()) {
17791777
ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
17801778
}

llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,6 @@ main_body:
6767
}
6868

6969
; There are 8 pseudo registers defined to track LDS DMA dependencies.
70-
; When exhausted we default to vmcnt(0).
7170

7271
; GCN-LABEL: {{^}}buffer_load_lds_dword_10_arrays:
7372
; GCN-COUNT-10: buffer_load_dword
@@ -86,7 +85,6 @@ main_body:
8685
; GCN: s_waitcnt vmcnt(2)
8786
; GCN-NOT: s_waitcnt vmcnt
8887
; GCN: ds_read_b32
89-
; GCN: s_waitcnt vmcnt(0)
9088
; GCN: ds_read_b32
9189
define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, ptr addrspace(1) %out) {
9290
main_body:
@@ -151,4 +149,29 @@ main_body:
151149
ret void
152150
}
153151

152+
define amdgpu_kernel void @global_load_lds_no_alias_ds_read(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
153+
; GFX9-LABEL: global_load_lds_no_alias_ds_read:
154+
; GFX9: global_load_dword
155+
; GFX9: global_load_dword
156+
; GFX9: s_waitcnt vmcnt(1)
157+
; GFX9-NOT: s_waitcnt vmcnt(0)
158+
; GFX9: ds_read_b32
159+
; GFX9: s_waitcnt vmcnt(0)
160+
; GFX9: ds_read_b32
161+
; GFX9: s_endpgm
162+
body:
163+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0)
164+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 4, i32 0)
165+
call void @llvm.amdgcn.s.waitcnt(i32 3953)
166+
%gep.0 = getelementptr float, ptr addrspace(3) @lds.2, i32 %i1
167+
%val.0 = load float, ptr addrspace(3) %gep.0, align 4
168+
call void @llvm.amdgcn.s.waitcnt(i32 3952)
169+
%gep.1 = getelementptr float, ptr addrspace(3) @lds.3, i32 %i2
170+
%val.1 = load float, ptr addrspace(3) %gep.1, align 4
171+
%tmp = insertelement <2 x float> poison, float %val.0, i32 0
172+
%res = insertelement <2 x float> %tmp, float %val.1, i32 1
173+
store <2 x float> %res, ptr addrspace(1) %out
174+
ret void
175+
}
176+
154177
declare void @llvm.amdgcn.wave.barrier()

0 commit comments

Comments
 (0)