-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU] Add some more GFX12 test coverage #120581
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-mc Author: Jay Foad (jayfoad) ChangesPatch is 277.43 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/120581.diff 19 Files Affected:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index dc9e1f24438302..d62da6921b3479 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -4,6 +4,7 @@
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
; FIXME: Merge with other test. DS offset folding doesn't work due to
; register bank copies, and no return optimization is missing.
@@ -85,6 +86,18 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: lds_atomic_inc_ret_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: ds_inc_rtn_u32 v0, v0, v1
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_endpgm
%result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4
store i32 %result, ptr addrspace(1) %out, align 4
ret void
@@ -162,6 +175,18 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out,
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: lds_atomic_inc_ret_i32_offset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
%result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i32 42 syncscope("agent") seq_cst, align 4
store i32 %result, ptr addrspace(1) %out, align 4
@@ -221,6 +246,16 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: lds_atomic_inc_noret_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: ds_inc_u32 v0, v1
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_endpgm
%result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4
ret void
}
@@ -278,6 +313,16 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: lds_atomic_inc_noret_i32_offset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: ds_inc_u32 v1, v0 offset:16
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
%result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i32 42 syncscope("agent") seq_cst, align 4
ret void
@@ -350,6 +395,17 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_atomic_inc_ret_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_endpgm
%result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4
store i32 %result, ptr addrspace(1) %out, align 4
ret void
@@ -426,6 +482,17 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_atomic_inc_ret_i32_offset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
%result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
store i32 %result, ptr addrspace(1) %out, align 4
@@ -503,6 +570,18 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_atomic_inc_ret_i32_offset_sistem:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX12-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
%result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
store i32 %result, ptr addrspace(1) %out, align 4
@@ -567,6 +646,16 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_atomic_inc_noret_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_endpgm
%result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4
ret void
}
@@ -633,6 +722,16 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_atomic_inc_noret_i32_offset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
%result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
ret void
@@ -700,6 +799,17 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_atomic_inc_noret_i32_offset_system:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX12-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
%result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
ret void
@@ -788,6 +898,19 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_atomic_inc_ret_i32_offset_addr64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX12-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_atomic_inc_u32 v1, v0, v1, s[2:3] offset:20 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id
@@ -867,6 +990,18 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_atomic_inc_noret_i32_offset_addr64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_atomic_inc_u32 v0, v1, s[0:1] offset:20 scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
%gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5
@@ -956,6 +1091,23 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out,
; GFX11-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-NEXT: global_store_b32 v2, v1, s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_inc_shl_base_lds_0_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, 9 :: v_dual_lshlrev_b32 v1, 2, v0
+; GFX12-NEXT: v_add_nc_u32_e32 v0, 2, v0
+; GFX12-NEXT: ds_inc_rtn_u32 v1, v1, v2 offset:8
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b32 v2, v0, s[2:3]
+; GFX12-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(3) @lds0, i32 0, i32 %idx.0
@@ -1042,6 +1194,19 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: lds_atomic_inc_ret_i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, 42
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
+; GFX12-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_endpgm
%result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8
store i64 %result, ptr addrspace(1) %out, align 4
ret void
@@ -1124,6 +1289,19 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: lds_atomic_inc_ret_i64_offset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, 42
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
+; GFX12-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
%result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i64 42 syncscope("agent") seq_cst, align 8
store i64 %result, ptr addrspace(1) %out, align 4
@@ -1188,6 +1366,17 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: lds_atomic_inc_noret_i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, 42
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: ds_inc_u64 v2, v[0:1]
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_endpgm
%result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8
ret void
}
@@ -1250,6 +1439,17 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: lds_atomic_inc_noret_i64_offset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, 42
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: ds_inc_u64 v2, v[0:1] offset:32
+; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
%result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i64 42 syncscope("agent") seq_cst, align 8
ret void
@@ -1327,6 +1527,18 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_atomic_inc_ret_i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, 42
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_endpgm
%result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8
store i64 %result, ptr addrspace(1) %out, align 4
ret void
@@ -1408,6 +1620,18 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_atomic_inc_ret_i64_offset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, 42
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
%result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
store i64 %result, ptr addrspace(1) %out, align 4
@@ -1490,6 +1714,19 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_atomic_inc_ret_i64_offset_system:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, 42
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX12-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
%result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
store i64 %result, ptr addrspace(1) %out, align 4
@@ -1559,6 +1796,17 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_atomic_inc_noret_i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, 42
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_endpgm
%result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8
ret void
}
@@ -1630,6 +1878,17 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_atomic_inc_noret_i64_offset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, 42
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
%result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
ret void
@@ -1702,6 +1961,18 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_atomic_inc_noret_i64_offset_system:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, 42
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX12-NEXT: ...
[truncated]
|
%26 = fcmp oeq float %25, 1.000000e+00 | ||
%27 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %10, i32 0, i32 0, i32 0) | ||
%.not2 = icmp eq i32 %27, 2752 | ||
%28 = select i1 %.not2, i1 %26, i1 false |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use named values
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
@@ -3766,3 +3766,69 @@ define <2 x double> @v_no_fmaximum3_f64__multi_use(double %a, double %b, double | |||
%insert.1 = insertelement <2 x double> %insert.0, double %max1, i32 1 | |||
ret <2 x double> %insert.1 | |||
} | |||
|
|||
; Checks whether the test passes; performMinMaxCombine() should not optimize vector patterns of maximum3 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is achieved by v_fmaximum3_v2f16 already? This one just has a second instance
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good point. This had been removed upstream but for some reason lived on in our downstream branch.
No description provided.