-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU] Relax lds dma waitcnt with no aliasing pair #131842
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
d491d2b
to
ba44b44
Compare
I'm also debating on doing this by default instead of supplying a cl option. |
@llvm/pr-subscribers-backend-amdgpu Author: Austin Kerbow (kerbowa) ChangesFull diff: https://github.com/llvm/llvm-project/pull/131842.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 239f2664f59f3..51cfc3f005c19 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -58,6 +58,11 @@ static cl::opt<bool> ForceEmitZeroLoadFlag(
cl::desc("Force all waitcnt load counters to wait until 0"),
cl::init(false), cl::Hidden);
+static cl::opt<bool> RelaxLDSDMA(
+ "amdgpu-relax-lds-dma-waitcnt",
+ cl::desc("Relax the waitcnt for LDS DMA instructions that do not alias"),
+ cl::init(false), cl::ReallyHidden);
+
namespace {
// Class of object that encapsulates latest instruction counter score
// associated with the operand. Used for determining whether
@@ -1748,7 +1753,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
}
}
}
- if (!FoundAliasingStore)
+ if (!FoundAliasingStore && !RelaxLDSDMA)
ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
if (Memop->isStore()) {
ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
diff --git a/llvm/test/CodeGen/AMDGPU/relax-lds-dma-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/relax-lds-dma-waitcnt.ll
new file mode 100644
index 0000000000000..f3fe4946b26c7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/relax-lds-dma-waitcnt.ll
@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefix=DEFAULT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-relax-lds-dma-waitcnt < %s | FileCheck %s --check-prefix=RELAXED
+
+; In relaxed mode don't wait on vmcnt(0) if the global_laod_lds and ds_reads do not alias
+
+define amdgpu_kernel void @global_load_lds_no_alias_ds_read(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
+; DEFAULT-LABEL: global_load_lds_no_alias_ds_read:
+; DEFAULT: ; %bb.0: ; %main_body
+; DEFAULT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; DEFAULT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; DEFAULT-NEXT: v_mov_b32_e32 v2, 0
+; DEFAULT-NEXT: s_mov_b32 m0, 0
+; DEFAULT-NEXT: s_waitcnt lgkmcnt(0)
+; DEFAULT-NEXT: global_load_lds_dword v2, s[0:1]
+; DEFAULT-NEXT: s_movk_i32 m0, 0x100
+; DEFAULT-NEXT: s_nop 0
+; DEFAULT-NEXT: global_load_lds_dword v2, s[0:1] offset:4
+; DEFAULT-NEXT: s_lshl_b32 s0, s2, 2
+; DEFAULT-NEXT: v_mov_b32_e32 v0, s0
+; DEFAULT-NEXT: s_lshl_b32 s0, s3, 2
+; DEFAULT-NEXT: v_mov_b32_e32 v1, s0
+; DEFAULT-NEXT: s_waitcnt vmcnt(1)
+; DEFAULT-NEXT: s_barrier
+; DEFAULT-NEXT: s_waitcnt vmcnt(0)
+; DEFAULT-NEXT: ds_read_b32 v0, v0 offset:512
+; DEFAULT-NEXT: s_waitcnt vmcnt(0)
+; DEFAULT-NEXT: s_barrier
+; DEFAULT-NEXT: ds_read_b32 v1, v1 offset:768
+; DEFAULT-NEXT: s_waitcnt lgkmcnt(0)
+; DEFAULT-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; DEFAULT-NEXT: s_endpgm
+;
+; RELAXED-LABEL: global_load_lds_no_alias_ds_read:
+; RELAXED: ; %bb.0: ; %main_body
+; RELAXED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; RELAXED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; RELAXED-NEXT: v_mov_b32_e32 v2, 0
+; RELAXED-NEXT: s_mov_b32 m0, 0
+; RELAXED-NEXT: s_waitcnt lgkmcnt(0)
+; RELAXED-NEXT: global_load_lds_dword v2, s[0:1]
+; RELAXED-NEXT: s_movk_i32 m0, 0x100
+; RELAXED-NEXT: s_nop 0
+; RELAXED-NEXT: global_load_lds_dword v2, s[0:1] offset:4
+; RELAXED-NEXT: s_lshl_b32 s0, s2, 2
+; RELAXED-NEXT: v_mov_b32_e32 v0, s0
+; RELAXED-NEXT: s_lshl_b32 s0, s3, 2
+; RELAXED-NEXT: v_mov_b32_e32 v1, s0
+; RELAXED-NEXT: s_waitcnt vmcnt(1)
+; RELAXED-NEXT: s_barrier
+; RELAXED-NEXT: ds_read_b32 v0, v0 offset:512
+; RELAXED-NEXT: s_waitcnt vmcnt(0)
+; RELAXED-NEXT: s_barrier
+; RELAXED-NEXT: ds_read_b32 v1, v1 offset:768
+; RELAXED-NEXT: s_waitcnt lgkmcnt(0)
+; RELAXED-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; RELAXED-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 4, i32 0)
+ call void @llvm.amdgcn.s.waitcnt(i32 3953)
+ call void @llvm.amdgcn.s.barrier()
+ %gep.0 = getelementptr float, ptr addrspace(3) @lds.2, i32 %i1
+ %val.0 = load float, ptr addrspace(3) %gep.0, align 4
+ call void @llvm.amdgcn.s.waitcnt(i32 3952)
+ call void @llvm.amdgcn.s.barrier()
+ %gep.1 = getelementptr float, ptr addrspace(3) @lds.3, i32 %i2
+ %val.1 = load float, ptr addrspace(3) %gep.1, align 4
+ %tmp = insertelement <2 x float> poison, float %val.0, i32 0
+ %res = insertelement <2 x float> %tmp, float %val.1, i32 1
+ store <2 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; Always wait on vmcnt(0) if the global_laod_lds and ds_reads alias
+
+define amdgpu_kernel void @global_load_lds_dword_2_arrays(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
+; DEFAULT-LABEL: global_load_lds_dword_2_arrays:
+; DEFAULT: ; %bb.0: ; %main_body
+; DEFAULT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; DEFAULT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; DEFAULT-NEXT: v_mov_b32_e32 v2, 0
+; DEFAULT-NEXT: s_mov_b32 m0, 0
+; DEFAULT-NEXT: s_waitcnt lgkmcnt(0)
+; DEFAULT-NEXT: global_load_lds_dword v2, s[0:1]
+; DEFAULT-NEXT: global_load_lds_dword v2, s[0:1] offset:4
+; DEFAULT-NEXT: s_movk_i32 m0, 0x100
+; DEFAULT-NEXT: s_nop 0
+; DEFAULT-NEXT: global_load_lds_dword v2, s[0:1] offset:8
+; DEFAULT-NEXT: global_load_lds_dword v2, s[0:1] offset:12
+; DEFAULT-NEXT: s_lshl_b32 s0, s2, 2
+; DEFAULT-NEXT: s_lshl_b32 s1, s3, 2
+; DEFAULT-NEXT: v_mov_b32_e32 v0, s0
+; DEFAULT-NEXT: v_mov_b32_e32 v1, s1
+; DEFAULT-NEXT: s_waitcnt vmcnt(0)
+; DEFAULT-NEXT: ds_read_b32 v0, v0
+; DEFAULT-NEXT: ; wave barrier
+; DEFAULT-NEXT: ds_read_b32 v1, v1 offset:256
+; DEFAULT-NEXT: s_waitcnt lgkmcnt(0)
+; DEFAULT-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; DEFAULT-NEXT: s_endpgm
+;
+; RELAXED-LABEL: global_load_lds_dword_2_arrays:
+; RELAXED: ; %bb.0: ; %main_body
+; RELAXED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; RELAXED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; RELAXED-NEXT: v_mov_b32_e32 v2, 0
+; RELAXED-NEXT: s_mov_b32 m0, 0
+; RELAXED-NEXT: s_waitcnt lgkmcnt(0)
+; RELAXED-NEXT: global_load_lds_dword v2, s[0:1]
+; RELAXED-NEXT: global_load_lds_dword v2, s[0:1] offset:4
+; RELAXED-NEXT: s_movk_i32 m0, 0x100
+; RELAXED-NEXT: s_nop 0
+; RELAXED-NEXT: global_load_lds_dword v2, s[0:1] offset:8
+; RELAXED-NEXT: global_load_lds_dword v2, s[0:1] offset:12
+; RELAXED-NEXT: s_lshl_b32 s0, s2, 2
+; RELAXED-NEXT: s_lshl_b32 s1, s3, 2
+; RELAXED-NEXT: v_mov_b32_e32 v0, s0
+; RELAXED-NEXT: v_mov_b32_e32 v1, s1
+; RELAXED-NEXT: s_waitcnt vmcnt(0)
+; RELAXED-NEXT: ds_read_b32 v0, v0
+; RELAXED-NEXT: ; wave barrier
+; RELAXED-NEXT: ds_read_b32 v1, v1 offset:256
+; RELAXED-NEXT: s_waitcnt lgkmcnt(0)
+; RELAXED-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; RELAXED-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 4, i32 0)
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 8, i32 0)
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 12, i32 0)
+ %gep.0 = getelementptr float, ptr addrspace(3) @lds.0, i32 %i1
+ %gep.1 = getelementptr float, ptr addrspace(3) @lds.1, i32 %i2
+ %val.0 = load float, ptr addrspace(3) %gep.0, align 4
+ call void @llvm.amdgcn.wave.barrier()
+ %val.1 = load float, ptr addrspace(3) %gep.1, align 4
+ %tmp.0 = insertelement <2 x float> poison, float %val.0, i32 0
+ %res = insertelement <2 x float> %tmp.0, float %val.1, i32 1
+ store <2 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+@lds.0 = internal addrspace(3) global [64 x float] poison, align 16
+@lds.1 = internal addrspace(3) global [64 x float] poison, align 16
+@lds.2 = internal addrspace(3) global [64 x float] poison, align 16
+@lds.3 = internal addrspace(3) global [64 x float] poison, align 16
|
@@ -0,0 +1,146 @@ | |||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you pre-commit the test?
@@ -1748,7 +1753,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, | |||
} | |||
} | |||
} | |||
if (!FoundAliasingStore) | |||
if (!FoundAliasingStore && !RelaxLDSDMA) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this still correct w.r.t. the check above?
if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope)
If that is false, I assume we want the wait anyway because we couldn't do alias analysis. Should we still add the conservative wait in that case?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
After including this check, I am just enabling this relaxation by default instead of as a cl option.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add a testcase for it too, please?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The else case is tested in lds-dma-waitcnt.mir already.
ba44b44
to
cfd1d48
Compare
@@ -1768,13 +1767,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, | |||
const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores(); | |||
for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) { | |||
if (MI.mayAlias(AA, *LDSDMAStores[I], true)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: don't need the braces
If we cannot find any lds DMA instruction that is aliased by some load from lds, we will still insert vmcnt(0). This is overly cautious since handling inter-thread dependences is normally managed by the memory model instead of the waitcnt pass, so this change updates the behavior to be more inline with how other types of memory events are handled.
cfd1d48
to
58faa78
Compare
If we cannot find any lds DMA instruction that is aliased by some load from lds, we will still insert vmcnt(0). This is overly cautious since handling inter-thread dependences is normally managed by the memory model instead of the waitcnt pass, so this change updates the behavior to be more inline with how other types of memory events are handled.
If we cannot find any lds DMA instruction that is aliased by some load from lds, we will still insert vmcnt(0). This is overly cautious since handling inter-thread dependences is normally managed by the memory model instead of the waitcnt pass, so this change updates the behavior to be more inline with how other types of memory events are handled. cherry-pick: e75f586 to amd-mainline
…lvm#2463) If we cannot find any lds DMA instruction that is aliased by some load from lds, we will still insert vmcnt(0). This is overly cautious since handling inter-thread dependences is normally managed by the memory model instead of the waitcnt pass, so this change updates the behavior to be more inline with how other types of memory events are handled. cherry-pick: e75f586 to amd-mainline (cherry picked from commit 96fba51) Co-authored-by: Austin Kerbow <[email protected]>
If we cannot find any lds DMA instruction that is aliased by some load from lds, we will still insert vmcnt(0). This is overly cautious since handling inter-thread dependences is normally managed by the memory model instead of the waitcnt pass, so this change updates the behavior to be more inline with how other types of memory events are handled.