[AMDGPU] Relax lds dma waitcnt with no aliasing pair #131842

kerbowa · 2025-03-18T16:16:50Z

If we cannot find any lds DMA instruction that is aliased by some load from lds, we will still insert vmcnt(0). This is overly cautious since handling inter-thread dependences is normally managed by the memory model instead of the waitcnt pass, so this change updates the behavior to be more inline with how other types of memory events are handled.

kerbowa · 2025-03-18T16:17:05Z

[AMDGPU] Relax lds dma waitcnt with no aliasing pair #131842 👈 (View in Graphite)
main

This stack of pull requests is managed by Graphite. Learn more about stacking.

kerbowa · 2025-03-20T21:14:40Z

I'm also debating on doing this by default instead of supplying a cl option.

llvmbot · 2025-03-20T21:16:21Z

@llvm/pr-subscribers-backend-amdgpu

Author: Austin Kerbow (kerbowa)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/131842.diff

2 Files Affected:

(modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+6-1)
(added) llvm/test/CodeGen/AMDGPU/relax-lds-dma-waitcnt.ll (+146)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 239f2664f59f3..51cfc3f005c19 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -58,6 +58,11 @@ static cl::opt<bool> ForceEmitZeroLoadFlag(
     cl::desc("Force all waitcnt load counters to wait until 0"),
     cl::init(false), cl::Hidden);
 
+static cl::opt<bool> RelaxLDSDMA(
+    "amdgpu-relax-lds-dma-waitcnt",
+    cl::desc("Relax the waitcnt for LDS DMA instructions that do not alias"),
+    cl::init(false), cl::ReallyHidden);
+
 namespace {
 // Class of object that encapsulates latest instruction counter score
 // associated with the operand.  Used for determining whether
@@ -1748,7 +1753,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
             }
           }
         }
-        if (!FoundAliasingStore)
+        if (!FoundAliasingStore && !RelaxLDSDMA)
           ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
         if (Memop->isStore()) {
           ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
diff --git a/llvm/test/CodeGen/AMDGPU/relax-lds-dma-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/relax-lds-dma-waitcnt.ll
new file mode 100644
index 0000000000000..f3fe4946b26c7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/relax-lds-dma-waitcnt.ll
@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefix=DEFAULT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-relax-lds-dma-waitcnt < %s | FileCheck %s --check-prefix=RELAXED
+
+; In relaxed mode don't wait on vmcnt(0) if the global_laod_lds and ds_reads do not alias
+
+define amdgpu_kernel void @global_load_lds_no_alias_ds_read(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
+; DEFAULT-LABEL: global_load_lds_no_alias_ds_read:
+; DEFAULT:       ; %bb.0: ; %main_body
+; DEFAULT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; DEFAULT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; DEFAULT-NEXT:    v_mov_b32_e32 v2, 0
+; DEFAULT-NEXT:    s_mov_b32 m0, 0
+; DEFAULT-NEXT:    s_waitcnt lgkmcnt(0)
+; DEFAULT-NEXT:    global_load_lds_dword v2, s[0:1]
+; DEFAULT-NEXT:    s_movk_i32 m0, 0x100
+; DEFAULT-NEXT:    s_nop 0
+; DEFAULT-NEXT:    global_load_lds_dword v2, s[0:1] offset:4
+; DEFAULT-NEXT:    s_lshl_b32 s0, s2, 2
+; DEFAULT-NEXT:    v_mov_b32_e32 v0, s0
+; DEFAULT-NEXT:    s_lshl_b32 s0, s3, 2
+; DEFAULT-NEXT:    v_mov_b32_e32 v1, s0
+; DEFAULT-NEXT:    s_waitcnt vmcnt(1)
+; DEFAULT-NEXT:    s_barrier
+; DEFAULT-NEXT:    s_waitcnt vmcnt(0)
+; DEFAULT-NEXT:    ds_read_b32 v0, v0 offset:512
+; DEFAULT-NEXT:    s_waitcnt vmcnt(0)
+; DEFAULT-NEXT:    s_barrier
+; DEFAULT-NEXT:    ds_read_b32 v1, v1 offset:768
+; DEFAULT-NEXT:    s_waitcnt lgkmcnt(0)
+; DEFAULT-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; DEFAULT-NEXT:    s_endpgm
+;
+; RELAXED-LABEL: global_load_lds_no_alias_ds_read:
+; RELAXED:       ; %bb.0: ; %main_body
+; RELAXED-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; RELAXED-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; RELAXED-NEXT:    v_mov_b32_e32 v2, 0
+; RELAXED-NEXT:    s_mov_b32 m0, 0
+; RELAXED-NEXT:    s_waitcnt lgkmcnt(0)
+; RELAXED-NEXT:    global_load_lds_dword v2, s[0:1]
+; RELAXED-NEXT:    s_movk_i32 m0, 0x100
+; RELAXED-NEXT:    s_nop 0
+; RELAXED-NEXT:    global_load_lds_dword v2, s[0:1] offset:4
+; RELAXED-NEXT:    s_lshl_b32 s0, s2, 2
+; RELAXED-NEXT:    v_mov_b32_e32 v0, s0
+; RELAXED-NEXT:    s_lshl_b32 s0, s3, 2
+; RELAXED-NEXT:    v_mov_b32_e32 v1, s0
+; RELAXED-NEXT:    s_waitcnt vmcnt(1)
+; RELAXED-NEXT:    s_barrier
+; RELAXED-NEXT:    ds_read_b32 v0, v0 offset:512
+; RELAXED-NEXT:    s_waitcnt vmcnt(0)
+; RELAXED-NEXT:    s_barrier
+; RELAXED-NEXT:    ds_read_b32 v1, v1 offset:768
+; RELAXED-NEXT:    s_waitcnt lgkmcnt(0)
+; RELAXED-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; RELAXED-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0)
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 4, i32 0)
+  call void @llvm.amdgcn.s.waitcnt(i32 3953)
+  call void @llvm.amdgcn.s.barrier()
+  %gep.0 = getelementptr float, ptr addrspace(3) @lds.2, i32 %i1
+  %val.0 = load float, ptr addrspace(3) %gep.0, align 4
+  call void @llvm.amdgcn.s.waitcnt(i32 3952)
+  call void @llvm.amdgcn.s.barrier()
+  %gep.1 = getelementptr float, ptr addrspace(3) @lds.3, i32 %i2
+  %val.1 = load float, ptr addrspace(3) %gep.1, align 4
+  %tmp = insertelement <2 x float> poison, float %val.0, i32 0
+  %res = insertelement <2 x float> %tmp, float %val.1, i32 1
+  store <2 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+; Always wait on vmcnt(0) if the global_laod_lds and ds_reads alias
+
+define amdgpu_kernel void @global_load_lds_dword_2_arrays(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
+; DEFAULT-LABEL: global_load_lds_dword_2_arrays:
+; DEFAULT:       ; %bb.0: ; %main_body
+; DEFAULT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; DEFAULT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; DEFAULT-NEXT:    v_mov_b32_e32 v2, 0
+; DEFAULT-NEXT:    s_mov_b32 m0, 0
+; DEFAULT-NEXT:    s_waitcnt lgkmcnt(0)
+; DEFAULT-NEXT:    global_load_lds_dword v2, s[0:1]
+; DEFAULT-NEXT:    global_load_lds_dword v2, s[0:1] offset:4
+; DEFAULT-NEXT:    s_movk_i32 m0, 0x100
+; DEFAULT-NEXT:    s_nop 0
+; DEFAULT-NEXT:    global_load_lds_dword v2, s[0:1] offset:8
+; DEFAULT-NEXT:    global_load_lds_dword v2, s[0:1] offset:12
+; DEFAULT-NEXT:    s_lshl_b32 s0, s2, 2
+; DEFAULT-NEXT:    s_lshl_b32 s1, s3, 2
+; DEFAULT-NEXT:    v_mov_b32_e32 v0, s0
+; DEFAULT-NEXT:    v_mov_b32_e32 v1, s1
+; DEFAULT-NEXT:    s_waitcnt vmcnt(0)
+; DEFAULT-NEXT:    ds_read_b32 v0, v0
+; DEFAULT-NEXT:    ; wave barrier
+; DEFAULT-NEXT:    ds_read_b32 v1, v1 offset:256
+; DEFAULT-NEXT:    s_waitcnt lgkmcnt(0)
+; DEFAULT-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; DEFAULT-NEXT:    s_endpgm
+;
+; RELAXED-LABEL: global_load_lds_dword_2_arrays:
+; RELAXED:       ; %bb.0: ; %main_body
+; RELAXED-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; RELAXED-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; RELAXED-NEXT:    v_mov_b32_e32 v2, 0
+; RELAXED-NEXT:    s_mov_b32 m0, 0
+; RELAXED-NEXT:    s_waitcnt lgkmcnt(0)
+; RELAXED-NEXT:    global_load_lds_dword v2, s[0:1]
+; RELAXED-NEXT:    global_load_lds_dword v2, s[0:1] offset:4
+; RELAXED-NEXT:    s_movk_i32 m0, 0x100
+; RELAXED-NEXT:    s_nop 0
+; RELAXED-NEXT:    global_load_lds_dword v2, s[0:1] offset:8
+; RELAXED-NEXT:    global_load_lds_dword v2, s[0:1] offset:12
+; RELAXED-NEXT:    s_lshl_b32 s0, s2, 2
+; RELAXED-NEXT:    s_lshl_b32 s1, s3, 2
+; RELAXED-NEXT:    v_mov_b32_e32 v0, s0
+; RELAXED-NEXT:    v_mov_b32_e32 v1, s1
+; RELAXED-NEXT:    s_waitcnt vmcnt(0)
+; RELAXED-NEXT:    ds_read_b32 v0, v0
+; RELAXED-NEXT:    ; wave barrier
+; RELAXED-NEXT:    ds_read_b32 v1, v1 offset:256
+; RELAXED-NEXT:    s_waitcnt lgkmcnt(0)
+; RELAXED-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; RELAXED-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0)
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 4, i32 0)
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 8, i32 0)
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 12, i32 0)
+  %gep.0 = getelementptr float, ptr addrspace(3) @lds.0, i32 %i1
+  %gep.1 = getelementptr float, ptr addrspace(3) @lds.1, i32 %i2
+  %val.0 = load float, ptr addrspace(3) %gep.0, align 4
+  call void @llvm.amdgcn.wave.barrier()
+  %val.1 = load float, ptr addrspace(3) %gep.1, align 4
+  %tmp.0 = insertelement <2 x float> poison, float %val.0, i32 0
+  %res = insertelement <2 x float> %tmp.0, float %val.1, i32 1
+  store <2 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+@lds.0 = internal addrspace(3) global [64 x float] poison, align 16
+@lds.1 = internal addrspace(3) global [64 x float] poison, align 16
+@lds.2 = internal addrspace(3) global [64 x float] poison, align 16
+@lds.3 = internal addrspace(3) global [64 x float] poison, align 16

cdevadas · 2025-03-21T05:17:31Z

llvm/test/CodeGen/AMDGPU/relax-lds-dma-waitcnt.ll

@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5


Can you pre-commit the test?

Pierre-vh · 2025-03-21T14:05:19Z

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

@@ -1748,7 +1753,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
            }
          }
        }
-        if (!FoundAliasingStore)
+        if (!FoundAliasingStore && !RelaxLDSDMA)


Is this still correct w.r.t. the check above?

if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope)

If that is false, I assume we want the wait anyway because we couldn't do alias analysis. Should we still add the conservative wait in that case?

After including this check, I am just enabling this relaxation by default instead of as a cl option.

Can you add a testcase for it too, please?

The else case is tested in lds-dma-waitcnt.mir already.

jayfoad · 2025-03-24T09:51:27Z

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

@@ -1768,13 +1767,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
          const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
          for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
            if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {


Nit: don't need the braces

If we cannot find any lds DMA instruction that is aliased by some load from lds, we will still insert vmcnt(0). This is overly cautious since handling inter-thread dependences is normally managed by the memory model instead of the waitcnt pass, so this change updates the behavior to be more inline with how other types of memory events are handled.

If we cannot find any lds DMA instruction that is aliased by some load from lds, we will still insert vmcnt(0). This is overly cautious since handling inter-thread dependences is normally managed by the memory model instead of the waitcnt pass, so this change updates the behavior to be more inline with how other types of memory events are handled. cherry-pick: e75f586 to amd-mainline

…vm#131842) (llvm#2292)

…lvm#2463) If we cannot find any lds DMA instruction that is aliased by some load from lds, we will still insert vmcnt(0). This is overly cautious since handling inter-thread dependences is normally managed by the memory model instead of the waitcnt pass, so this change updates the behavior to be more inline with how other types of memory events are handled. cherry-pick: e75f586 to amd-mainline (cherry picked from commit 96fba51) Co-authored-by: Austin Kerbow <[email protected]>

kerbowa force-pushed the users/kerbowa/rexlax-lds-dma-wait branch from d491d2b to ba44b44 Compare March 20, 2025 21:12

kerbowa assigned bcahoon, rampitec, arsenm, ssahasra and Pierre-vh Mar 20, 2025

kerbowa marked this pull request as ready for review March 20, 2025 21:15

llvmbot added the backend:AMDGPU label Mar 20, 2025

kerbowa unassigned arsenm, rampitec, Pierre-vh, ssahasra and bcahoon Mar 20, 2025

kerbowa requested review from arsenm, rampitec, Pierre-vh, ssahasra and bcahoon March 20, 2025 21:16

kerbowa changed the title ~~[AMDGPU] Add cl option to relax lds dma waitcnt~~ [AMDGPU] Relax lds dma waitcnt with no aliasing pair Mar 20, 2025

cdevadas reviewed Mar 21, 2025

View reviewed changes

Pierre-vh reviewed Mar 21, 2025

View reviewed changes

kerbowa force-pushed the users/kerbowa/rexlax-lds-dma-wait branch from ba44b44 to cfd1d48 Compare March 22, 2025 03:46

arsenm approved these changes Mar 22, 2025

View reviewed changes

jayfoad reviewed Mar 24, 2025

View reviewed changes

kerbowa force-pushed the users/kerbowa/rexlax-lds-dma-wait branch from cfd1d48 to 58faa78 Compare March 24, 2025 15:49

kerbowa merged commit e75f586 into main Mar 24, 2025
11 checks passed

kerbowa deleted the users/kerbowa/rexlax-lds-dma-wait branch March 24, 2025 17:38

searlmc1 pushed a commit to ROCm/llvm-project that referenced this pull request May 27, 2025

cherry-pick: [AMDGPU] Relax lds dma waitcnt with no aliasing pair (ll…

8c04208

…vm#131842) (llvm#2292)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[AMDGPU] Relax lds dma waitcnt with no aliasing pair #131842

[AMDGPU] Relax lds dma waitcnt with no aliasing pair #131842

Uh oh!

kerbowa commented Mar 18, 2025 •

edited

Loading

Uh oh!

kerbowa commented Mar 18, 2025

Uh oh!

kerbowa commented Mar 20, 2025

Uh oh!

llvmbot commented Mar 20, 2025

Uh oh!

cdevadas Mar 21, 2025

Uh oh!

Pierre-vh Mar 21, 2025

Uh oh!

kerbowa Mar 22, 2025

Uh oh!

Pierre-vh Mar 24, 2025

Uh oh!

kerbowa Mar 24, 2025

Uh oh!

jayfoad Mar 24, 2025

Uh oh!

Uh oh!

Uh oh!

		@@ -0,0 +1,146 @@
		; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5

[AMDGPU] Relax lds dma waitcnt with no aliasing pair #131842

[AMDGPU] Relax lds dma waitcnt with no aliasing pair #131842

Uh oh!

Conversation

kerbowa commented Mar 18, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

kerbowa commented Mar 18, 2025

Uh oh!

kerbowa commented Mar 20, 2025

Uh oh!

llvmbot commented Mar 20, 2025

Uh oh!

cdevadas Mar 21, 2025

Choose a reason for hiding this comment

Uh oh!

Pierre-vh Mar 21, 2025

Choose a reason for hiding this comment

Uh oh!

kerbowa Mar 22, 2025

Choose a reason for hiding this comment

Uh oh!

Pierre-vh Mar 24, 2025

Choose a reason for hiding this comment

Uh oh!

kerbowa Mar 24, 2025

Choose a reason for hiding this comment

Uh oh!

jayfoad Mar 24, 2025

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

kerbowa commented Mar 18, 2025 •

edited

Loading