Skip to content

Commit 570f362

Browse files
committed
AMDGPU: Fix legalization for llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16
We somehow ended up with both llvm.amdgcn.raw.ptr.buffer.atomic.fadd and llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16 intrinsic definitions, even though the second name is the canonical mangling of the first intrinsic with a v2bf16 type. This requires us to handle it as a separate case. Surprisingly, cases that generate the regular first intrinsic with bfloat end up being identified as the second. Selection is still broken for the gfx9 cases.
1 parent 55696db commit 570f362

File tree

2 files changed

+110
-0
lines changed

2 files changed

+110
-0
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8833,6 +8833,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
88338833
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
88348834
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
88358835
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
8836+
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16:
88368837
case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
88378838
return lowerRawBufferAtomicIntrin(Op, DAG,
88388839
AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; FIXME: Also test gfx90a and gfx940; gfx908 should fail to select.
3+
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
4+
5+
define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset(<2 x bfloat> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) #0 {
6+
; GFX12-LABEL: raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
7+
; GFX12: ; %bb.0:
8+
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
9+
; GFX12-NEXT: s_wait_expcnt 0x0
10+
; GFX12-NEXT: s_wait_samplecnt 0x0
11+
; GFX12-NEXT: s_wait_bvhcnt 0x0
12+
; GFX12-NEXT: s_wait_kmcnt 0x0
13+
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen offset:128 th:TH_ATOMIC_RETURN
14+
; GFX12-NEXT: s_wait_loadcnt 0x0
15+
; GFX12-NEXT: s_setpc_b64 s[30:31]
16+
%voffset.add = add i32 %voffset, 128
17+
%ret = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
18+
ret <2 x bfloat> %ret
19+
}
20+
21+
define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset__slc(<2 x bfloat> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) #0 {
22+
; GFX12-LABEL: raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset__slc:
23+
; GFX12: ; %bb.0:
24+
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
25+
; GFX12-NEXT: s_wait_expcnt 0x0
26+
; GFX12-NEXT: s_wait_samplecnt 0x0
27+
; GFX12-NEXT: s_wait_bvhcnt 0x0
28+
; GFX12-NEXT: s_wait_kmcnt 0x0
29+
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_NT_RETURN
30+
; GFX12-NEXT: s_wait_loadcnt 0x0
31+
; GFX12-NEXT: s_setpc_b64 s[30:31]
32+
%ret = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 2)
33+
ret <2 x bfloat> %ret
34+
}
35+
36+
define void @raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset(<2 x bfloat> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) #0 {
37+
; GFX12-LABEL: raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
38+
; GFX12: ; %bb.0:
39+
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
40+
; GFX12-NEXT: s_wait_expcnt 0x0
41+
; GFX12-NEXT: s_wait_samplecnt 0x0
42+
; GFX12-NEXT: s_wait_bvhcnt 0x0
43+
; GFX12-NEXT: s_wait_kmcnt 0x0
44+
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen offset:128
45+
; GFX12-NEXT: s_setpc_b64 s[30:31]
46+
%voffset.add = add i32 %voffset, 128
47+
%unused = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
48+
ret void
49+
}
50+
51+
define void @raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset__slc(<2 x bfloat> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) #0 {
52+
; GFX12-LABEL: raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset__slc:
53+
; GFX12: ; %bb.0:
54+
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
55+
; GFX12-NEXT: s_wait_expcnt 0x0
56+
; GFX12-NEXT: s_wait_samplecnt 0x0
57+
; GFX12-NEXT: s_wait_bvhcnt 0x0
58+
; GFX12-NEXT: s_wait_kmcnt 0x0
59+
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_NT
60+
; GFX12-NEXT: s_setpc_b64 s[30:31]
61+
%unused = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 2)
62+
ret void
63+
}
64+
65+
; Test waterfall loop
66+
define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset_add__vgpr_soffset(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset) #0 {
67+
; GFX12-LABEL: raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset_add__vgpr_soffset:
68+
; GFX12: ; %bb.0:
69+
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
70+
; GFX12-NEXT: s_wait_expcnt 0x0
71+
; GFX12-NEXT: s_wait_samplecnt 0x0
72+
; GFX12-NEXT: s_wait_bvhcnt 0x0
73+
; GFX12-NEXT: s_wait_kmcnt 0x0
74+
; GFX12-NEXT: s_mov_b32 s2, exec_lo
75+
; GFX12-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
76+
; GFX12-NEXT: v_readfirstlane_b32 s4, v1
77+
; GFX12-NEXT: v_readfirstlane_b32 s5, v2
78+
; GFX12-NEXT: v_readfirstlane_b32 s6, v3
79+
; GFX12-NEXT: v_readfirstlane_b32 s7, v4
80+
; GFX12-NEXT: v_readfirstlane_b32 s3, v6
81+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
82+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
83+
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
84+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
85+
; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v6
86+
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
87+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
88+
; GFX12-NEXT: s_and_b32 s0, s0, s1
89+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
90+
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
91+
; GFX12-NEXT: s_wait_loadcnt 0x0
92+
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v5, s[4:7], s3 offen offset:128 th:TH_ATOMIC_RETURN
93+
; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
94+
; GFX12-NEXT: ; implicit-def: $vgpr6
95+
; GFX12-NEXT: ; implicit-def: $vgpr5
96+
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
97+
; GFX12-NEXT: s_cbranch_execnz .LBB4_1
98+
; GFX12-NEXT: ; %bb.2:
99+
; GFX12-NEXT: s_mov_b32 exec_lo, s2
100+
; GFX12-NEXT: s_wait_loadcnt 0x0
101+
; GFX12-NEXT: s_setpc_b64 s[30:31]
102+
%voffset.add = add i32 %voffset, 128
103+
%ret = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
104+
ret <2 x bfloat> %ret
105+
}
106+
107+
declare <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat>, ptr addrspace(8), i32, i32, i32 immarg)
108+
109+
attributes #0 = { nounwind }

0 commit comments

Comments
 (0)