Skip to content

[AMDGPU][AMDGPURegBankInfo] Map S_BUFFER_LOAD_XXX to its corresponding BUFFER_LOAD_XXX #117574

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 16, 2024

Conversation

jmmartinez
Copy link
Contributor

In one test code generation diverged between GISEL and DAG

For example, this intrinsic

%ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %offset, i32 0)

would be lowered into these two cases:

  • buffer_load_u8 v2, v2, s[0:3], null offen
  • buffer_load_b32 v2, v2, s[0:3], null offen

This patch fixes this issue.

@jmmartinez jmmartinez requested review from jayfoad and arsenm November 25, 2024 16:38
@jmmartinez jmmartinez self-assigned this Nov 25, 2024
@llvmbot
Copy link
Member

llvmbot commented Nov 25, 2024

@llvm/pr-subscribers-backend-amdgpu

Author: Juan Manuel Martinez Caamaño (jmmartinez)

Changes

In one test code generation diverged between GISEL and DAG

For example, this intrinsic

> %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %offset, i32 0)

would be lowered into these two cases:

  • buffer_load_u8 v2, v2, s[0:3], null offen
  • buffer_load_b32 v2, v2, s[0:3], null offen

This patch fixes this issue.


Full diff: https://github.com/llvm/llvm-project/pull/117574.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+31-10)
  • (modified) llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll (+24-54)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 8c050348f753bb..d3d4062c34f88c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1406,16 +1406,37 @@ bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
     if (i != 0)
       BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
 
-    B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
-      .addDef(LoadParts[i])       // vdata
-      .addUse(RSrc)               // rsrc
-      .addUse(VIndex)             // vindex
-      .addUse(VOffset)            // voffset
-      .addUse(SOffset)            // soffset
-      .addImm(ImmOffset + 16 * i) // offset(imm)
-      .addImm(0)                  // cachepolicy, swizzled buffer(imm)
-      .addImm(0)                  // idxen(imm)
-      .addMemOperand(MMO);
+    unsigned Opc;
+    switch (MI.getOpcode()) {
+    case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
+      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
+      break;
+    case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
+      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
+      break;
+    case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
+      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE;
+      break;
+    case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
+      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
+      break;
+    case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT:
+      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT;
+      break;
+    default:
+      llvm_unreachable("Unexpected opcode");
+    }
+
+    B.buildInstr(Opc)
+        .addDef(LoadParts[i])       // vdata
+        .addUse(RSrc)               // rsrc
+        .addUse(VIndex)             // vindex
+        .addUse(VOffset)            // voffset
+        .addUse(SOffset)            // soffset
+        .addImm(ImmOffset + 16 * i) // offset(imm)
+        .addImm(0)                  // cachepolicy, swizzled buffer(imm)
+        .addImm(0)                  // idxen(imm)
+        .addMemOperand(MMO);
   }
 
   // TODO: If only the resource is a VGPR, it may be better to execute the
diff --git a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
index 020c9dc130bb2a..61ae9639c52d00 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
@@ -465,19 +465,12 @@ main_body:
 }
 
 define amdgpu_ps void @s_buffer_load_byte_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) {
-; DAG-LABEL: s_buffer_load_byte_sgpr_or_imm_offset_divergent:
-; DAG:       ; %bb.0: ; %main_body
-; DAG-NEXT:    buffer_load_i8 v2, v2, s[0:3], null offen
-; DAG-NEXT:    s_wait_loadcnt 0x0
-; DAG-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-NEXT:    s_endpgm
-;
-; GISEL-LABEL: s_buffer_load_byte_sgpr_or_imm_offset_divergent:
-; GISEL:       ; %bb.0: ; %main_body
-; GISEL-NEXT:    buffer_load_b32 v2, v2, s[0:3], null offen
-; GISEL-NEXT:    s_wait_loadcnt 0x0
-; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_endpgm
+; GCN-LABEL: s_buffer_load_byte_sgpr_or_imm_offset_divergent:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    buffer_load_i8 v2, v2, s[0:3], null offen
+; GCN-NEXT:    s_wait_loadcnt 0x0
+; GCN-NEXT:    global_store_b32 v[0:1], v2, off
+; GCN-NEXT:    s_endpgm
 main_body:
   %ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 %offset, i32 0)
   %sext = sext i8 %ld to i32
@@ -538,20 +531,12 @@ main_body:
 }
 
 define amdgpu_ps void @s_buffer_load_ubyte_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) {
-; DAG-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset_divergent:
-; DAG:       ; %bb.0: ; %main_body
-; DAG-NEXT:    buffer_load_u8 v2, v2, s[0:3], null offen
-; DAG-NEXT:    s_wait_loadcnt 0x0
-; DAG-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-NEXT:    s_endpgm
-;
-; GISEL-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset_divergent:
-; GISEL:       ; %bb.0: ; %main_body
-; GISEL-NEXT:    buffer_load_b32 v2, v2, s[0:3], null offen
-; GISEL-NEXT:    s_wait_loadcnt 0x0
-; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_endpgm
+; GCN-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset_divergent:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    buffer_load_u8 v2, v2, s[0:3], null offen
+; GCN-NEXT:    s_wait_loadcnt 0x0
+; GCN-NEXT:    global_store_b32 v[0:1], v2, off
+; GCN-NEXT:    s_endpgm
 main_body:
   %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %offset, i32 0)
   %zext = zext i8 %ld to i32
@@ -606,19 +591,12 @@ main_body:
 }
 
 define amdgpu_ps void @s_buffer_load_short_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) {
-; DAG-LABEL: s_buffer_load_short_sgpr_or_imm_offset_divergent:
-; DAG:       ; %bb.0: ; %main_body
-; DAG-NEXT:    buffer_load_i16 v2, v2, s[0:3], null offen
-; DAG-NEXT:    s_wait_loadcnt 0x0
-; DAG-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-NEXT:    s_endpgm
-;
-; GISEL-LABEL: s_buffer_load_short_sgpr_or_imm_offset_divergent:
-; GISEL:       ; %bb.0: ; %main_body
-; GISEL-NEXT:    buffer_load_b32 v2, v2, s[0:3], null offen
-; GISEL-NEXT:    s_wait_loadcnt 0x0
-; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_endpgm
+; GCN-LABEL: s_buffer_load_short_sgpr_or_imm_offset_divergent:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    buffer_load_i16 v2, v2, s[0:3], null offen
+; GCN-NEXT:    s_wait_loadcnt 0x0
+; GCN-NEXT:    global_store_b32 v[0:1], v2, off
+; GCN-NEXT:    s_endpgm
 main_body:
   %ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 %offset, i32 0)
   %sext = sext i16 %ld to i32
@@ -679,20 +657,12 @@ main_body:
 }
 
 define amdgpu_ps void @s_buffer_load_ushort_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) {
-; DAG-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset_divergent:
-; DAG:       ; %bb.0: ; %main_body
-; DAG-NEXT:    buffer_load_u16 v2, v2, s[0:3], null offen
-; DAG-NEXT:    s_wait_loadcnt 0x0
-; DAG-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-NEXT:    s_endpgm
-;
-; GISEL-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset_divergent:
-; GISEL:       ; %bb.0: ; %main_body
-; GISEL-NEXT:    buffer_load_b32 v2, v2, s[0:3], null offen
-; GISEL-NEXT:    s_wait_loadcnt 0x0
-; GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_endpgm
+; GCN-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset_divergent:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    buffer_load_u16 v2, v2, s[0:3], null offen
+; GCN-NEXT:    s_wait_loadcnt 0x0
+; GCN-NEXT:    global_store_b32 v[0:1], v2, off
+; GCN-NEXT:    s_endpgm
 main_body:
   %ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %offset, i32 0)
   %zext = zext i16 %ld to i32

…g BUFFER_LOAD_XXX

In some tests code generation diverged between isel and selection-dag

For exmaple, this intrinsic

    call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %offset, i32
0)

would be lowered into these two cases:
* buffer_load_u16 v2, v2, s[0:3], null offen
* buffer_load_b32 v2, v2, s[0:3], null offen

This patch fixes this issue.
Copy link
Contributor

@jayfoad jayfoad left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks!

This is a bug fix so maybe it should be backported to 19.1.x. But I guess no-one is using GlobalISel for real, so maybe it does not matter.

@jmmartinez jmmartinez merged commit ace87ec into llvm:main Dec 16, 2024
8 checks passed
@llvm-ci
Copy link
Collaborator

llvm-ci commented Dec 16, 2024

LLVM Buildbot has detected a new failure on builder lldb-aarch64-ubuntu running on linaro-lldb-aarch64-ubuntu while building llvm at step 6 "test".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/59/builds/9870

Here is the relevant piece of the build log for the reference
Step 6 (test) failure: build (failure)
...
PASS: lldb-api :: functionalities/thread/concurrent_events/TestConcurrentDelayWatchBreak.py (616 of 2060)
PASS: lldb-api :: functionalities/thread/concurrent_events/TestConcurrentDelayedCrashWithBreakpointSignal.py (617 of 2060)
PASS: lldb-api :: functionalities/thread/concurrent_events/TestConcurrentDelayedCrashWithBreakpointWatchpoint.py (618 of 2060)
PASS: lldb-api :: functionalities/thread/concurrent_events/TestConcurrentManyBreakpoints.py (619 of 2060)
PASS: lldb-api :: functionalities/thread/concurrent_events/TestConcurrentNWatchNBreak.py (620 of 2060)
PASS: lldb-api :: functionalities/thread/concurrent_events/TestConcurrentSignalBreak.py (621 of 2060)
PASS: lldb-api :: functionalities/thread/concurrent_events/TestConcurrentSignalDelayBreak.py (622 of 2060)
PASS: lldb-api :: functionalities/thread/concurrent_events/TestConcurrentManyCrash.py (623 of 2060)
PASS: lldb-api :: functionalities/thread/concurrent_events/TestConcurrentManySignals.py (624 of 2060)
PASS: lldb-api :: functionalities/thread/concurrent_events/TestConcurrentManyWatchpoints.py (625 of 2060)
FAIL: lldb-api :: functionalities/thread/concurrent_events/TestConcurrentSignalWatch.py (626 of 2060)
******************** TEST 'lldb-api :: functionalities/thread/concurrent_events/TestConcurrentSignalWatch.py' FAILED ********************
Script:
--
/usr/bin/python3.10 /home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/llvm-project/lldb/test/API/dotest.py -u CXXFLAGS -u CFLAGS --env LLVM_LIBS_DIR=/home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/build/./lib --env LLVM_INCLUDE_DIR=/home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/build/include --env LLVM_TOOLS_DIR=/home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/build/./bin --arch aarch64 --build-dir /home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/build/lldb-test-build.noindex --lldb-module-cache-dir /home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/build/lldb-test-build.noindex/module-cache-lldb/lldb-api --clang-module-cache-dir /home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/build/lldb-test-build.noindex/module-cache-clang/lldb-api --executable /home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/build/./bin/lldb --compiler /home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/build/./bin/clang --dsymutil /home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/build/./bin/dsymutil --make /usr/bin/gmake --llvm-tools-dir /home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/build/./bin --lldb-obj-root /home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/build/tools/lldb --lldb-libs-dir /home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/build/./lib /home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/llvm-project/lldb/test/API/functionalities/thread/concurrent_events -p TestConcurrentSignalWatch.py
--
Exit Code: 1

Command Output (stdout):
--
lldb version 20.0.0git (https://github.com/llvm/llvm-project.git revision ace87ec04cd588e5fbe393c3b642bd759a7abadb)
  clang revision ace87ec04cd588e5fbe393c3b642bd759a7abadb
  llvm revision ace87ec04cd588e5fbe393c3b642bd759a7abadb
Skipping the following test categories: ['libc++', 'dsym', 'gmodules', 'debugserver', 'objc']

Watchpoint 1 hit:
old value: 0
new value: 1

--
Command Output (stderr):
--
FAIL: LLDB (/home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/build/bin/clang-aarch64) :: test (TestConcurrentSignalWatch.ConcurrentSignalWatch)
======================================================================
FAIL: test (TestConcurrentSignalWatch.ConcurrentSignalWatch)
   Test a watchpoint and a signal in multiple threads.
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/llvm-project/lldb/packages/Python/lldbsuite/test/decorators.py", line 148, in wrapper
    return func(*args, **kwargs)
  File "/home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/llvm-project/lldb/test/API/functionalities/thread/concurrent_events/TestConcurrentSignalWatch.py", line 14, in test
    self.do_thread_actions(num_signal_threads=1, num_watchpoint_threads=1)
  File "/home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/llvm-project/lldb/packages/Python/lldbsuite/test/concurrent_base.py", line 333, in do_thread_actions
    self.assertEqual(
AssertionError: 1 != 2 : Expected 1 stops due to signal delivery, but got 2
Config=aarch64-/home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/build/bin/clang
----------------------------------------------------------------------
Ran 1 test in 0.660s


Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

5 participants