Skip to content

Commit 8c04208

Browse files
authored
cherry-pick: [AMDGPU] Relax lds dma waitcnt with no aliasing pair (llvm#131842) (llvm#2292)
2 parents 627f715 + 96fba51 commit 8c04208

File tree

5 files changed

+120
-11
lines changed

5 files changed

+120
-11
lines changed

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -853,9 +853,12 @@ int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
853853
}
854854

855855
if (TII->isFLAT(MI)) {
856-
int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
857-
if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
858-
return DataIdx;
856+
// There is no hazard if the instruction does not use vector regs
857+
if (VDataIdx == -1)
858+
return -1;
859+
860+
if (AMDGPU::getRegBitWidth(VDataRCID) > 64)
861+
return VDataIdx;
859862
}
860863

861864
return -1;

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1747,7 +1747,6 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
17471747

17481748
// LOAD_CNT is only relevant to vgpr or LDS.
17491749
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1750-
bool FoundAliasingStore = false;
17511750
// Only objects with alias scope info were added to LDSDMAScopes array.
17521751
// In the absence of the scope info we will not be able to disambiguate
17531752
// aliasing here. There is no need to try searching for a corresponding
@@ -1757,14 +1756,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
17571756
if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
17581757
const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
17591758
for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
1760-
if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
1761-
FoundAliasingStore = true;
1759+
if (MI.mayAlias(AA, *LDSDMAStores[I], true))
17621760
ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
1763-
}
17641761
}
1765-
}
1766-
if (!FoundAliasingStore)
1762+
} else {
17671763
ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
1764+
}
17681765
if (Memop->isStore()) {
17691766
ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
17701767
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
3+
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -O0 < %s | FileCheck -check-prefix=GFX90A %s
4+
5+
@G = addrspace(1) global <2 x i32> splat (i32 5)
6+
7+
define amdgpu_ps void @global_load_lds_dword_saddr(ptr addrspace(1) inreg nocapture %gptr, ptr addrspace(3) nocapture %lptr) {
8+
; GFX942-LABEL: global_load_lds_dword_saddr:
9+
; GFX942: ; %bb.0: ; %main_body
10+
; GFX942-NEXT: v_readfirstlane_b32 s2, v0
11+
; GFX942-NEXT: v_mov_b32_e32 v2, 0
12+
; GFX942-NEXT: s_mov_b32 m0, s2
13+
; GFX942-NEXT: s_nop 0
14+
; GFX942-NEXT: global_load_lds_dword v2, s[0:1] offset:32 nt
15+
; GFX942-NEXT: s_getpc_b64 s[0:1]
16+
; GFX942-NEXT: s_add_u32 s0, s0, G@gotpcrel32@lo+4
17+
; GFX942-NEXT: s_addc_u32 s1, s1, G@gotpcrel32@hi+12
18+
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
19+
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
20+
; GFX942-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
21+
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
22+
; GFX942-NEXT: s_mul_i32 s3, s3, 10
23+
; GFX942-NEXT: s_mul_i32 s2, s2, 10
24+
; GFX942-NEXT: v_mov_b32_e32 v0, s2
25+
; GFX942-NEXT: v_mov_b32_e32 v1, s3
26+
; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
27+
; GFX942-NEXT: s_endpgm
28+
;
29+
; GFX90A-LABEL: global_load_lds_dword_saddr:
30+
; GFX90A: ; %bb.0: ; %main_body
31+
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
32+
; GFX90A-NEXT: s_mov_b32 s2, s0
33+
; GFX90A-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
34+
; GFX90A-NEXT: s_mov_b32 s3, s1
35+
; GFX90A-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr2_sgpr3
36+
; GFX90A-NEXT: s_getpc_b64 s[0:1]
37+
; GFX90A-NEXT: s_add_u32 s0, s0, G@gotpcrel32@lo+4
38+
; GFX90A-NEXT: s_addc_u32 s1, s1, G@gotpcrel32@hi+12
39+
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
40+
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
41+
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
42+
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
43+
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
44+
; GFX90A-NEXT: s_mov_b32 s4, s9
45+
; GFX90A-NEXT: s_mov_b32 s6, 10
46+
; GFX90A-NEXT: s_mul_i32 s4, s4, s6
47+
; GFX90A-NEXT: s_mov_b32 s5, s8
48+
; GFX90A-NEXT: s_mul_i32 s5, s5, s6
49+
; GFX90A-NEXT: v_mov_b32_e32 v2, s5
50+
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
51+
; GFX90A-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
52+
; GFX90A-NEXT: v_mov_b32_e32 v3, v4
53+
; GFX90A-NEXT: ; implicit-def: $sgpr4
54+
; GFX90A-NEXT: v_readfirstlane_b32 s4, v1
55+
; GFX90A-NEXT: s_mov_b32 m0, s4
56+
; GFX90A-NEXT: s_nop 0
57+
; GFX90A-NEXT: global_load_dword v0, s[2:3] offset:32 slc lds
58+
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
59+
; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
60+
; GFX90A-NEXT: s_endpgm
61+
main_body:
62+
%LGV = load <2 x i32>, ptr addrspace(1) @G, align 8
63+
%B = mul <2 x i32> %LGV, splat (i32 10)
64+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 32, i32 2)
65+
store <2 x i32> %B, ptr addrspace(1) @G, align 8
66+
ret void
67+
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2+
# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN %s
3+
4+
---
5+
name: test_flat_valu_hazard
6+
body: |
7+
bb.0:
8+
liveins: $vgpr0, $vgpr1
9+
10+
; GCN-LABEL: name: test_flat_valu_hazard
11+
; GCN: liveins: $vgpr0, $vgpr1
12+
; GCN-NEXT: {{ $}}
13+
; GCN-NEXT: GLOBAL_LOAD_LDS_DWORD_SADDR killed $sgpr0_sgpr1, killed $vgpr0, 32, 2, implicit $m0, implicit $exec
14+
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec, implicit $exec
15+
; GCN-NEXT: FLAT_STORE_DWORDX2 killed renamable $vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
16+
GLOBAL_LOAD_LDS_DWORD_SADDR killed $sgpr0_sgpr1, killed $vgpr0, 32, 2, implicit $m0, implicit $exec
17+
$vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec, implicit $exec
18+
FLAT_STORE_DWORDX2 killed renamable $vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
19+
...

llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,6 @@ main_body:
6767
}
6868

6969
; There are 8 pseudo registers defined to track LDS DMA dependencies.
70-
; When exhausted we default to vmcnt(0).
7170

7271
; GCN-LABEL: {{^}}buffer_load_lds_dword_10_arrays:
7372
; GCN-COUNT-10: buffer_load_dword
@@ -86,7 +85,6 @@ main_body:
8685
; GCN: s_waitcnt vmcnt(2)
8786
; GCN-NOT: s_waitcnt vmcnt
8887
; GCN: ds_read_b32
89-
; GCN: s_waitcnt vmcnt(0)
9088
; GCN: ds_read_b32
9189
define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, ptr addrspace(1) %out) {
9290
main_body:
@@ -151,4 +149,29 @@ main_body:
151149
ret void
152150
}
153151

152+
define amdgpu_kernel void @global_load_lds_no_alias_ds_read(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
153+
; GFX9-LABEL: global_load_lds_no_alias_ds_read:
154+
; GFX9: global_load_dword
155+
; GFX9: global_load_dword
156+
; GFX9: s_waitcnt vmcnt(1)
157+
; GFX9-NOT: s_waitcnt vmcnt(0)
158+
; GFX9: ds_read_b32
159+
; GFX9: s_waitcnt vmcnt(0)
160+
; GFX9: ds_read_b32
161+
; GFX9: s_endpgm
162+
body:
163+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0)
164+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 4, i32 0)
165+
call void @llvm.amdgcn.s.waitcnt(i32 3953)
166+
%gep.0 = getelementptr float, ptr addrspace(3) @lds.2, i32 %i1
167+
%val.0 = load float, ptr addrspace(3) %gep.0, align 4
168+
call void @llvm.amdgcn.s.waitcnt(i32 3952)
169+
%gep.1 = getelementptr float, ptr addrspace(3) @lds.3, i32 %i2
170+
%val.1 = load float, ptr addrspace(3) %gep.1, align 4
171+
%tmp = insertelement <2 x float> poison, float %val.0, i32 0
172+
%res = insertelement <2 x float> %tmp, float %val.1, i32 1
173+
store <2 x float> %res, ptr addrspace(1) %out
174+
ret void
175+
}
176+
154177
declare void @llvm.amdgcn.wave.barrier()

0 commit comments

Comments
 (0)