Skip to content

Commit 2a95022

Browse files
committed
AMDGPU: Add atomic bfloat load/store codegen tests
1 parent 80628ee commit 2a95022

File tree

4 files changed

+659
-0
lines changed

4 files changed

+659
-0
lines changed

llvm/test/CodeGen/AMDGPU/atomic_load_local.ll

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,3 +152,57 @@ define ptr addrspace(3) @atomic_load_monotonic_p3i8_offset(ptr addrspace(3) %ptr
152152
%load = load atomic ptr addrspace(3), ptr addrspace(3) %gep monotonic, align 4
153153
ret ptr addrspace(3) %load
154154
}
155+
156+
; Monotonic atomic load of a half from addrspace(3), handed back as its raw
; 16-bit pattern. The expected selection is a single ds_read_u16 with no
; offset; the m0 write is expected on the CI run and must be absent on GFX9.
; GCN-LABEL: {{^}}atomic_load_monotonic_f16:
; GCN: s_waitcnt
; GFX9-NOT: s_mov_b32 m0
; CI-NEXT: s_mov_b32 m0
; GCN-NEXT: ds_read_u16 v0, v0{{$}}
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64
define i16 @atomic_load_monotonic_f16(ptr addrspace(3) %ptr) {
  %val = load atomic half, ptr addrspace(3) %ptr monotonic, align 2
  %bits = bitcast half %val to i16
  ret i16 %bits
}
168+
169+
; Same as the plain half load, but the address is 16 half elements past %ptr.
; 16 elements * 2 bytes is expected to fold into the instruction as offset:32.
; GCN-LABEL: {{^}}atomic_load_monotonic_f16_offset:
; GCN: s_waitcnt
; GFX9-NOT: s_mov_b32 m0
; CI-NEXT: s_mov_b32 m0
; GCN-NEXT: ds_read_u16 v0, v0 offset:32{{$}}
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64
define i16 @atomic_load_monotonic_f16_offset(ptr addrspace(3) %ptr) {
  %addr = getelementptr inbounds half, ptr addrspace(3) %ptr, i32 16
  %val = load atomic half, ptr addrspace(3) %addr monotonic, align 2
  %bits = bitcast half %val to i16
  ret i16 %bits
}
182+
183+
; Monotonic atomic load of a bfloat from addrspace(3), handed back as its raw
; 16-bit pattern. The expected code is the same single ds_read_u16 that the
; half variant produces.
; GCN-LABEL: {{^}}atomic_load_monotonic_bf16:
; GCN: s_waitcnt
; GFX9-NOT: s_mov_b32 m0
; CI-NEXT: s_mov_b32 m0
; GCN-NEXT: ds_read_u16 v0, v0{{$}}
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64
define i16 @atomic_load_monotonic_bf16(ptr addrspace(3) %ptr) {
  %val = load atomic bfloat, ptr addrspace(3) %ptr monotonic, align 2
  %bits = bitcast bfloat %val to i16
  ret i16 %bits
}
195+
196+
; bfloat load from 16 elements past %ptr; 16 elements * 2 bytes is expected
; to appear as offset:32 on the ds_read_u16.
; GCN-LABEL: {{^}}atomic_load_monotonic_bf16_offset:
; GCN: s_waitcnt
; GFX9-NOT: s_mov_b32 m0
; CI-NEXT: s_mov_b32 m0
; GCN-NEXT: ds_read_u16 v0, v0 offset:32{{$}}
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64
define i16 @atomic_load_monotonic_bf16_offset(ptr addrspace(3) %ptr) {
  %addr = getelementptr inbounds bfloat, ptr addrspace(3) %ptr, i32 16
  %val = load atomic bfloat, ptr addrspace(3) %addr monotonic, align 2
  %bits = bitcast bfloat %val to i16
  ret i16 %bits
}

llvm/test/CodeGen/AMDGPU/atomic_store_local.ll

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,3 +101,56 @@ define void @atomic_store_monotonic_offset_i64(ptr addrspace(3) %ptr, i64 %val)
101101
ret void
102102
}
103103

104+
; Monotonic atomic store of a half to addrspace(3). The value arrives as an
; i16 and is reinterpreted as half before the store; a single ds_write_b16
; with no offset is expected.
; GCN-LABEL: {{^}}atomic_store_monotonic_f16:
; GCN: s_waitcnt
; GFX9-NOT: s_mov_b32 m0
; CI-NEXT: s_mov_b32 m0
; GCN-NEXT: ds_write_b16 v0, v1{{$}}
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @atomic_store_monotonic_f16(ptr addrspace(3) %ptr, i16 %arg.val) {
  %fp = bitcast i16 %arg.val to half
  store atomic half %fp, ptr addrspace(3) %ptr monotonic, align 2
  ret void
}
116+
117+
; half store to 16 elements past %ptr; 16 elements * 2 bytes is expected to
; appear as offset:32 on the ds_write_b16.
; GCN-LABEL: {{^}}atomic_store_monotonic_offset_f16:
; GCN: s_waitcnt
; GFX9-NOT: s_mov_b32 m0
; CI-NEXT: s_mov_b32 m0
; GCN-NEXT: ds_write_b16 v0, v1 offset:32{{$}}
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @atomic_store_monotonic_offset_f16(ptr addrspace(3) %ptr, i16 %arg.val) {
  %addr = getelementptr inbounds half, ptr addrspace(3) %ptr, i32 16
  %fp = bitcast i16 %arg.val to half
  store atomic half %fp, ptr addrspace(3) %addr monotonic, align 2
  ret void
}
130+
131+
; Monotonic atomic store of a bfloat to addrspace(3). The value arrives as an
; i16 and is reinterpreted as bfloat before the store; the expected code is
; the same single ds_write_b16 that the half variant produces.
; GCN-LABEL: {{^}}atomic_store_monotonic_bf16:
; GCN: s_waitcnt
; GFX9-NOT: s_mov_b32 m0
; CI-NEXT: s_mov_b32 m0
; GCN-NEXT: ds_write_b16 v0, v1{{$}}
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @atomic_store_monotonic_bf16(ptr addrspace(3) %ptr, i16 %arg.val) {
  %fp = bitcast i16 %arg.val to bfloat
  store atomic bfloat %fp, ptr addrspace(3) %ptr monotonic, align 2
  ret void
}
143+
144+
; bfloat store to 16 elements past %ptr; 16 elements * 2 bytes is expected to
; appear as offset:32 on the ds_write_b16.
; GCN-LABEL: {{^}}atomic_store_monotonic_offset_bf16:
; GCN: s_waitcnt
; GFX9-NOT: s_mov_b32 m0
; CI-NEXT: s_mov_b32 m0
; GCN-NEXT: ds_write_b16 v0, v1 offset:32{{$}}
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @atomic_store_monotonic_offset_bf16(ptr addrspace(3) %ptr, i16 %arg.val) {
  %addr = getelementptr inbounds bfloat, ptr addrspace(3) %ptr, i32 16
  %fp = bitcast i16 %arg.val to bfloat
  store atomic bfloat %fp, ptr addrspace(3) %addr monotonic, align 2
  ret void
}

llvm/test/CodeGen/AMDGPU/flat_atomics.ll

Lines changed: 273 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6741,6 +6741,81 @@ entry:
67416741
ret void
67426742
}
67436743

6744+
; Seq_cst atomic bfloat store through a flat pointer, 8 elements (16 bytes)
; past %out. The store has to address %gep: storing to %out leaves the gep
; dead and makes this test identical to the non-offset atomic_store_bf16.
; NOTE(review): the expected output below was adjusted by hand to carry the
; 16-byte displacement (scalar add/addc on the first two runs, offset:16 on
; the third, as in atomic_load_bf16_offset). Regenerate it with
; utils/update_llc_test_checks.py and confirm the register assignment.
define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) {
; GCN1-LABEL: atomic_store_bf16_offset:
; GCN1:       ; %bb.0:
; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; GCN1-NEXT:    s_load_dword s4, s[0:1], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s2, 16
; GCN1-NEXT:    s_addc_u32 s1, s3, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    v_mov_b32_e32 v2, s4
; GCN1-NEXT:    flat_store_short v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_store_bf16_offset:
; GCN2:       ; %bb.0:
; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GCN2-NEXT:    s_load_dword s4, s[0:1], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s2, 16
; GCN2-NEXT:    s_addc_u32 s1, s3, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    v_mov_b32_e32 v2, s4
; GCN2-NEXT:    flat_store_short v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_store_bf16_offset:
; GCN3:       ; %bb.0:
; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GCN3-NEXT:    s_load_dword s4, s[0:1], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s2
; GCN3-NEXT:    v_mov_b32_e32 v1, s3
; GCN3-NEXT:    v_mov_b32_e32 v2, s4
; GCN3-NEXT:    flat_store_short v[0:1], v2 offset:16
; GCN3-NEXT:    s_endpgm
  %gep = getelementptr bfloat, ptr %out, i64 8
  store atomic bfloat %in, ptr %gep seq_cst, align 2
  ret void
}
6781+
6782+
; Seq_cst atomic bfloat store straight to the flat pointer %out, with no
; address arithmetic. Each run configuration is expected to copy the pointer
; and value into VGPRs and emit one flat_store_short.
define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) {
; GCN1-LABEL: atomic_store_bf16:
; GCN1:       ; %bb.0:
; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; GCN1-NEXT:    s_load_dword s0, s[0:1], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    v_mov_b32_e32 v0, s2
; GCN1-NEXT:    v_mov_b32_e32 v1, s3
; GCN1-NEXT:    v_mov_b32_e32 v2, s0
; GCN1-NEXT:    flat_store_short v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_store_bf16:
; GCN2:       ; %bb.0:
; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GCN2-NEXT:    s_load_dword s0, s[0:1], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    v_mov_b32_e32 v0, s2
; GCN2-NEXT:    v_mov_b32_e32 v1, s3
; GCN2-NEXT:    v_mov_b32_e32 v2, s0
; GCN2-NEXT:    flat_store_short v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_store_bf16:
; GCN3:       ; %bb.0:
; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GCN3-NEXT:    s_load_dword s4, s[0:1], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s2
; GCN3-NEXT:    v_mov_b32_e32 v1, s3
; GCN3-NEXT:    v_mov_b32_e32 v2, s4
; GCN3-NEXT:    flat_store_short v[0:1], v2
; GCN3-NEXT:    s_endpgm
  store atomic bfloat %in, ptr %out seq_cst, align 2
  ret void
}
6818+
67446819
define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) {
67456820
; GCN1-LABEL: atomic_inc_i32_offset:
67466821
; GCN1: ; %bb.0: ; %entry
@@ -7868,3 +7943,201 @@ entry:
78687943
store i32 %val, ptr %out2
78697944
ret void
78707945
}
7946+
7947+
; Seq_cst atomic half load through a flat pointer, 8 elements (16 bytes) past
; %in, followed by a plain store of the loaded value to %out. The first two
; run configurations are expected to form the address with a scalar add/addc
; pair; the third folds it into the load as offset:16.
define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_f16_offset:
; GCN1:       ; %bb.0:
; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_mov_b32_e32 v0, s2
; GCN1-NEXT:    v_mov_b32_e32 v1, s3
; GCN1-NEXT:    flat_store_short v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_load_f16_offset:
; GCN2:       ; %bb.0:
; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_mov_b32_e32 v0, s2
; GCN2-NEXT:    v_mov_b32_e32 v1, s3
; GCN2-NEXT:    flat_store_short v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_load_f16_offset:
; GCN3:       ; %bb.0:
; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    flat_load_ushort v2, v[0:1] offset:16 glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_mov_b32_e32 v0, s2
; GCN3-NEXT:    v_mov_b32_e32 v1, s3
; GCN3-NEXT:    flat_store_short v[0:1], v2
; GCN3-NEXT:    s_endpgm
  %addr = getelementptr half, ptr %in, i64 8
  %loaded = load atomic half, ptr %addr seq_cst, align 2
  store half %loaded, ptr %out
  ret void
}
7998+
7999+
; Seq_cst atomic half load straight from the flat pointer %in, followed by a
; plain store of the loaded value to %out. Every run configuration expects a
; glc flat_load_ushort, a wait, and a buffer_wbinvl1_vol before the store.
define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_f16:
; GCN1:       ; %bb.0:
; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_mov_b32_e32 v0, s2
; GCN1-NEXT:    v_mov_b32_e32 v1, s3
; GCN1-NEXT:    flat_store_short v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_load_f16:
; GCN2:       ; %bb.0:
; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_mov_b32_e32 v0, s2
; GCN2-NEXT:    v_mov_b32_e32 v1, s3
; GCN2-NEXT:    flat_store_short v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_load_f16:
; GCN3:       ; %bb.0:
; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_mov_b32_e32 v0, s2
; GCN3-NEXT:    v_mov_b32_e32 v1, s3
; GCN3-NEXT:    flat_store_short v[0:1], v2
; GCN3-NEXT:    s_endpgm
  %loaded = load atomic half, ptr %in seq_cst, align 2
  store half %loaded, ptr %out
  ret void
}
8045+
8046+
; Seq_cst atomic bfloat load through a flat pointer, 8 elements (16 bytes)
; past %in, followed by a plain store of the loaded value to %out. The first
; two run configurations are expected to form the address with a scalar
; add/addc pair; the third folds it into the load as offset:16.
define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_bf16_offset:
; GCN1:       ; %bb.0:
; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    s_add_u32 s0, s0, 16
; GCN1-NEXT:    s_addc_u32 s1, s1, 0
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_mov_b32_e32 v0, s2
; GCN1-NEXT:    v_mov_b32_e32 v1, s3
; GCN1-NEXT:    flat_store_short v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_load_bf16_offset:
; GCN2:       ; %bb.0:
; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    s_add_u32 s0, s0, 16
; GCN2-NEXT:    s_addc_u32 s1, s1, 0
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_mov_b32_e32 v0, s2
; GCN2-NEXT:    v_mov_b32_e32 v1, s3
; GCN2-NEXT:    flat_store_short v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_load_bf16_offset:
; GCN3:       ; %bb.0:
; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    flat_load_ushort v2, v[0:1] offset:16 glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_mov_b32_e32 v0, s2
; GCN3-NEXT:    v_mov_b32_e32 v1, s3
; GCN3-NEXT:    flat_store_short v[0:1], v2
; GCN3-NEXT:    s_endpgm
  %addr = getelementptr bfloat, ptr %in, i64 8
  %loaded = load atomic bfloat, ptr %addr seq_cst, align 2
  store bfloat %loaded, ptr %out
  ret void
}
8097+
8098+
; Seq_cst atomic bfloat load straight from the flat pointer %in, followed by
; a plain store of the loaded value to %out. Every run configuration expects
; a glc flat_load_ushort, a wait, and a buffer_wbinvl1_vol before the store.
define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) {
; GCN1-LABEL: atomic_load_bf16:
; GCN1:       ; %bb.0:
; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
; GCN1-NEXT:    v_mov_b32_e32 v0, s0
; GCN1-NEXT:    v_mov_b32_e32 v1, s1
; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT:    buffer_wbinvl1_vol
; GCN1-NEXT:    v_mov_b32_e32 v0, s2
; GCN1-NEXT:    v_mov_b32_e32 v1, s3
; GCN1-NEXT:    flat_store_short v[0:1], v2
; GCN1-NEXT:    s_endpgm
;
; GCN2-LABEL: atomic_load_bf16:
; GCN2:       ; %bb.0:
; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
; GCN2-NEXT:    v_mov_b32_e32 v0, s0
; GCN2-NEXT:    v_mov_b32_e32 v1, s1
; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT:    buffer_wbinvl1_vol
; GCN2-NEXT:    v_mov_b32_e32 v0, s2
; GCN2-NEXT:    v_mov_b32_e32 v1, s3
; GCN2-NEXT:    flat_store_short v[0:1], v2
; GCN2-NEXT:    s_endpgm
;
; GCN3-LABEL: atomic_load_bf16:
; GCN3:       ; %bb.0:
; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
; GCN3-NEXT:    v_mov_b32_e32 v0, s0
; GCN3-NEXT:    v_mov_b32_e32 v1, s1
; GCN3-NEXT:    flat_load_ushort v2, v[0:1] glc
; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT:    buffer_wbinvl1_vol
; GCN3-NEXT:    v_mov_b32_e32 v0, s2
; GCN3-NEXT:    v_mov_b32_e32 v1, s3
; GCN3-NEXT:    flat_store_short v[0:1], v2
; GCN3-NEXT:    s_endpgm
  %loaded = load atomic bfloat, ptr %in seq_cst, align 2
  store bfloat %loaded, ptr %out
  ret void
}

0 commit comments

Comments
 (0)