Skip to content

Commit cc3aab5

Browse files
authored
[AMDGPU] Handle nontemporal and amdgpu.last.use metadata in amdgpu-lower-buffer-fat-pointers (#120139)
1 parent 9d7d8d2 commit cc3aab5

File tree

5 files changed

+1233
-22
lines changed

5 files changed

+1233
-22
lines changed

llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp

Lines changed: 0 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1074,18 +1074,6 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
1074 1074
Args.push_back(IRB.getInt32(0));
1075 1075

1076 1076
uint32_t Aux = 0;
1077-
bool IsInvariant =
1078-
(isa<LoadInst>(I) && I->getMetadata(LLVMContext::MD_invariant_load));
1079-
bool IsNonTemporal = I->getMetadata(LLVMContext::MD_nontemporal);
1080-
// Atomic loads and stores need glc, atomic read-modify-write doesn't.
1081-
bool IsOneWayAtomic =
1082-
!isa<AtomicRMWInst>(I) && Order != AtomicOrdering::NotAtomic;
1083-
if (IsOneWayAtomic)
1084-
Aux |= AMDGPU::CPol::GLC;
1085-
if (IsNonTemporal && !IsInvariant)
1086-
Aux |= AMDGPU::CPol::SLC;
1087-
if (isa<LoadInst>(I) && ST->getGeneration() == AMDGPUSubtarget::GFX10)
1088-
Aux |= (Aux & AMDGPU::CPol::GLC ? AMDGPU::CPol::DLC : 0);
1089 1077
if (IsVolatile)
1090 1078
Aux |= AMDGPU::CPol::VOLATILE;
1091 1079
Args.push_back(IRB.getInt32(Aux));

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1202,6 +1202,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
1202 1202
Info.flags = MachineMemOperand::MONone;
1203 1203
if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1204 1204
Info.flags |= MachineMemOperand::MOInvariant;
1205+
if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1206+
Info.flags |= MachineMemOperand::MONonTemporal;
1207+
Info.flags |= getTargetMMOFlags(CI);
1205 1208

1206 1209
if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1207 1210
AMDGPU::lookupRsrcIntrinsic(IntrID)) {
Lines changed: 334 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,334 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s
3+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefix=GFX12 %s
4+
5+
6+
define amdgpu_kernel void @buffer_last_use_load_0(ptr addrspace(7) %in, ptr addrspace(7) %out) {
7+
; GFX12-LABEL: buffer_last_use_load_0:
8+
; GFX12: ; %bb.0: ; %entry
9+
; GFX12-NEXT: s_clause 0x2
10+
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
11+
; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x20
12+
; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x10
13+
; GFX12-NEXT: s_wait_kmcnt 0x0
14+
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
15+
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
16+
; GFX12-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9
17+
; GFX12-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11
18+
; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:32
19+
; GFX12-NEXT: s_clause 0x1
20+
; GFX12-NEXT: scratch_load_b64 v[5:6], off, off offset:40
21+
; GFX12-NEXT: scratch_load_b32 v4, off, off offset:36
22+
; GFX12-NEXT: s_load_b32 s1, s[4:5], 0x30
23+
; GFX12-NEXT: scratch_store_b128 off, v[7:10], off
24+
; GFX12-NEXT: s_clause 0x1
25+
; GFX12-NEXT: scratch_load_b64 v[1:2], off, off offset:8
26+
; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4
27+
; GFX12-NEXT: v_mov_b32_e32 v7, s6
28+
; GFX12-NEXT: v_mov_b32_e32 v9, s0
29+
; GFX12-NEXT: s_wait_kmcnt 0x0
30+
; GFX12-NEXT: v_mov_b32_e32 v3, s1
31+
; GFX12-NEXT: s_mov_b32 s1, exec_lo
32+
; GFX12-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
33+
; GFX12-NEXT: s_wait_loadcnt 0x2
34+
; GFX12-NEXT: v_readfirstlane_b32 s4, v4
35+
; GFX12-NEXT: v_readfirstlane_b32 s5, v5
36+
; GFX12-NEXT: v_readfirstlane_b32 s6, v6
37+
; GFX12-NEXT: v_readfirstlane_b32 s7, v7
38+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
39+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
40+
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
41+
; GFX12-NEXT: s_wait_alu 0xfffe
42+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
43+
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
44+
; GFX12-NEXT: s_wait_alu 0xfffe
45+
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
46+
; GFX12-NEXT: s_wait_loadcnt 0x0
47+
; GFX12-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_LU
48+
; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
49+
; GFX12-NEXT: ; implicit-def: $vgpr9
50+
; GFX12-NEXT: s_wait_alu 0xfffe
51+
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
52+
; GFX12-NEXT: s_cbranch_execnz .LBB0_1
53+
; GFX12-NEXT: ; %bb.2:
54+
; GFX12-NEXT: s_mov_b32 exec_lo, s1
55+
; GFX12-NEXT: v_mov_b32_e32 v4, s8
56+
; GFX12-NEXT: s_mov_b32 s0, exec_lo
57+
; GFX12-NEXT: .LBB0_3: ; =>This Inner Loop Header: Depth=1
58+
; GFX12-NEXT: s_wait_loadcnt 0x1
59+
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
60+
; GFX12-NEXT: v_readfirstlane_b32 s5, v1
61+
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
62+
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
63+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
64+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
65+
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
66+
; GFX12-NEXT: s_wait_alu 0xfffe
67+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
68+
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
69+
; GFX12-NEXT: s_wait_alu 0xfffe
70+
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
71+
; GFX12-NEXT: s_wait_loadcnt 0x0
72+
; GFX12-NEXT: buffer_store_b32 v8, v4, s[4:7], null offen
73+
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
74+
; GFX12-NEXT: ; implicit-def: $vgpr8
75+
; GFX12-NEXT: ; implicit-def: $vgpr4
76+
; GFX12-NEXT: s_wait_alu 0xfffe
77+
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
78+
; GFX12-NEXT: s_cbranch_execnz .LBB0_3
79+
; GFX12-NEXT: ; %bb.4:
80+
; GFX12-NEXT: s_endpgm
81+
entry:
82+
%val = load i32, ptr addrspace(7) %in, !amdgpu.last.use !{}
83+
store i32 %val, ptr addrspace(7) %out
84+
ret void
85+
}
86+
87+
define amdgpu_kernel void @buffer_last_use_load_1(ptr addrspace(7) %in, ptr addrspace(7) %out) {
88+
; GFX12-LABEL: buffer_last_use_load_1:
89+
; GFX12: ; %bb.0: ; %entry
90+
; GFX12-NEXT: s_clause 0x2
91+
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
92+
; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x20
93+
; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x10
94+
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
95+
; GFX12-NEXT: s_wait_kmcnt 0x0
96+
; GFX12-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
97+
; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
98+
; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
99+
; GFX12-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
100+
; GFX12-NEXT: scratch_store_b128 off, v[1:4], off offset:32
101+
; GFX12-NEXT: s_clause 0x1
102+
; GFX12-NEXT: scratch_load_b64 v[6:7], off, off offset:40
103+
; GFX12-NEXT: scratch_load_b32 v5, off, off offset:36
104+
; GFX12-NEXT: s_load_b32 s1, s[4:5], 0x30
105+
; GFX12-NEXT: scratch_store_b128 off, v[8:11], off
106+
; GFX12-NEXT: s_clause 0x1
107+
; GFX12-NEXT: scratch_load_b64 v[2:3], off, off offset:8
108+
; GFX12-NEXT: scratch_load_b32 v1, off, off offset:4
109+
; GFX12-NEXT: v_mov_b32_e32 v8, s6
110+
; GFX12-NEXT: v_lshl_add_u32 v9, v0, 2, s0
111+
; GFX12-NEXT: s_wait_kmcnt 0x0
112+
; GFX12-NEXT: v_mov_b32_e32 v4, s1
113+
; GFX12-NEXT: s_mov_b32 s1, exec_lo
114+
; GFX12-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
115+
; GFX12-NEXT: s_wait_loadcnt 0x2
116+
; GFX12-NEXT: v_readfirstlane_b32 s4, v5
117+
; GFX12-NEXT: v_readfirstlane_b32 s5, v6
118+
; GFX12-NEXT: v_readfirstlane_b32 s6, v7
119+
; GFX12-NEXT: v_readfirstlane_b32 s7, v8
120+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
121+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[5:6]
122+
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
123+
; GFX12-NEXT: s_wait_alu 0xfffe
124+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
125+
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
126+
; GFX12-NEXT: s_wait_alu 0xfffe
127+
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
128+
; GFX12-NEXT: s_wait_loadcnt 0x0
129+
; GFX12-NEXT: buffer_load_b32 v0, v9, s[4:7], null offen th:TH_LOAD_LU
130+
; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8
131+
; GFX12-NEXT: ; implicit-def: $vgpr9
132+
; GFX12-NEXT: s_wait_alu 0xfffe
133+
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
134+
; GFX12-NEXT: s_cbranch_execnz .LBB1_1
135+
; GFX12-NEXT: ; %bb.2:
136+
; GFX12-NEXT: s_mov_b32 exec_lo, s1
137+
; GFX12-NEXT: v_mov_b32_e32 v5, s8
138+
; GFX12-NEXT: s_mov_b32 s0, exec_lo
139+
; GFX12-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1
140+
; GFX12-NEXT: s_wait_loadcnt 0x1
141+
; GFX12-NEXT: v_readfirstlane_b32 s4, v1
142+
; GFX12-NEXT: v_readfirstlane_b32 s5, v2
143+
; GFX12-NEXT: v_readfirstlane_b32 s6, v3
144+
; GFX12-NEXT: v_readfirstlane_b32 s7, v4
145+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
146+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
147+
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
148+
; GFX12-NEXT: s_wait_alu 0xfffe
149+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
150+
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
151+
; GFX12-NEXT: s_wait_alu 0xfffe
152+
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
153+
; GFX12-NEXT: s_wait_loadcnt 0x0
154+
; GFX12-NEXT: buffer_store_b32 v0, v5, s[4:7], null offen
155+
; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
156+
; GFX12-NEXT: ; implicit-def: $vgpr0
157+
; GFX12-NEXT: ; implicit-def: $vgpr5
158+
; GFX12-NEXT: s_wait_alu 0xfffe
159+
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
160+
; GFX12-NEXT: s_cbranch_execnz .LBB1_3
161+
; GFX12-NEXT: ; %bb.4:
162+
; GFX12-NEXT: s_endpgm
163+
entry:
164+
%tid = call i32 @llvm.amdgcn.workitem.id.x()
165+
%val.gep = getelementptr inbounds i32, ptr addrspace(7) %in, i32 %tid
166+
%val = load i32, ptr addrspace(7) %val.gep, align 4, !amdgpu.last.use !{}
167+
store i32 %val, ptr addrspace(7) %out
168+
ret void
169+
}
170+
171+
define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %in, ptr addrspace(7) %out) {
172+
; GFX12-LABEL: buffer_last_use_and_volatile_load:
173+
; GFX12: ; %bb.0: ; %entry
174+
; GFX12-NEXT: s_clause 0x2
175+
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
176+
; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x20
177+
; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x10
178+
; GFX12-NEXT: s_wait_kmcnt 0x0
179+
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
180+
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
181+
; GFX12-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9
182+
; GFX12-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11
183+
; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:32
184+
; GFX12-NEXT: s_clause 0x1
185+
; GFX12-NEXT: scratch_load_b64 v[5:6], off, off offset:40
186+
; GFX12-NEXT: scratch_load_b32 v4, off, off offset:36
187+
; GFX12-NEXT: s_load_b32 s1, s[4:5], 0x30
188+
; GFX12-NEXT: scratch_store_b128 off, v[7:10], off
189+
; GFX12-NEXT: s_clause 0x1
190+
; GFX12-NEXT: scratch_load_b64 v[1:2], off, off offset:8
191+
; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4
192+
; GFX12-NEXT: v_mov_b32_e32 v7, s6
193+
; GFX12-NEXT: v_mov_b32_e32 v9, s0
194+
; GFX12-NEXT: s_wait_kmcnt 0x0
195+
; GFX12-NEXT: v_mov_b32_e32 v3, s1
196+
; GFX12-NEXT: s_mov_b32 s1, exec_lo
197+
; GFX12-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
198+
; GFX12-NEXT: s_wait_loadcnt 0x2
199+
; GFX12-NEXT: v_readfirstlane_b32 s4, v4
200+
; GFX12-NEXT: v_readfirstlane_b32 s5, v5
201+
; GFX12-NEXT: v_readfirstlane_b32 s6, v6
202+
; GFX12-NEXT: v_readfirstlane_b32 s7, v7
203+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
204+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
205+
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
206+
; GFX12-NEXT: s_wait_alu 0xfffe
207+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
208+
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
209+
; GFX12-NEXT: s_wait_alu 0xfffe
210+
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
211+
; GFX12-NEXT: s_wait_loadcnt 0x0
212+
; GFX12-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS
213+
; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
214+
; GFX12-NEXT: ; implicit-def: $vgpr9
215+
; GFX12-NEXT: s_wait_alu 0xfffe
216+
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
217+
; GFX12-NEXT: s_cbranch_execnz .LBB2_1
218+
; GFX12-NEXT: ; %bb.2:
219+
; GFX12-NEXT: s_mov_b32 exec_lo, s1
220+
; GFX12-NEXT: v_mov_b32_e32 v4, s8
221+
; GFX12-NEXT: s_mov_b32 s0, exec_lo
222+
; GFX12-NEXT: .LBB2_3: ; =>This Inner Loop Header: Depth=1
223+
; GFX12-NEXT: s_wait_loadcnt 0x1
224+
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
225+
; GFX12-NEXT: v_readfirstlane_b32 s5, v1
226+
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
227+
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
228+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
229+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
230+
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
231+
; GFX12-NEXT: s_wait_alu 0xfffe
232+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
233+
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
234+
; GFX12-NEXT: s_wait_alu 0xfffe
235+
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
236+
; GFX12-NEXT: s_wait_loadcnt 0x0
237+
; GFX12-NEXT: buffer_store_b32 v8, v4, s[4:7], null offen
238+
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
239+
; GFX12-NEXT: ; implicit-def: $vgpr8
240+
; GFX12-NEXT: ; implicit-def: $vgpr4
241+
; GFX12-NEXT: s_wait_alu 0xfffe
242+
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
243+
; GFX12-NEXT: s_cbranch_execnz .LBB2_3
244+
; GFX12-NEXT: ; %bb.4:
245+
; GFX12-NEXT: s_endpgm
246+
entry:
247+
%val = load volatile i32, ptr addrspace(7) %in, !amdgpu.last.use !{}
248+
store i32 %val, ptr addrspace(7) %out
249+
ret void
250+
}
251+
252+
define amdgpu_kernel void @buffer_last_use_and_nontemporal_load(ptr addrspace(7) %in, ptr addrspace(7) %out) {
253+
; GFX12-LABEL: buffer_last_use_and_nontemporal_load:
254+
; GFX12: ; %bb.0: ; %entry
255+
; GFX12-NEXT: s_clause 0x2
256+
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
257+
; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x20
258+
; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x10
259+
; GFX12-NEXT: s_wait_kmcnt 0x0
260+
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
261+
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
262+
; GFX12-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9
263+
; GFX12-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11
264+
; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:32
265+
; GFX12-NEXT: s_clause 0x1
266+
; GFX12-NEXT: scratch_load_b64 v[5:6], off, off offset:40
267+
; GFX12-NEXT: scratch_load_b32 v4, off, off offset:36
268+
; GFX12-NEXT: s_load_b32 s1, s[4:5], 0x30
269+
; GFX12-NEXT: scratch_store_b128 off, v[7:10], off
270+
; GFX12-NEXT: s_clause 0x1
271+
; GFX12-NEXT: scratch_load_b64 v[1:2], off, off offset:8
272+
; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4
273+
; GFX12-NEXT: v_mov_b32_e32 v7, s6
274+
; GFX12-NEXT: v_mov_b32_e32 v9, s0
275+
; GFX12-NEXT: s_wait_kmcnt 0x0
276+
; GFX12-NEXT: v_mov_b32_e32 v3, s1
277+
; GFX12-NEXT: s_mov_b32 s1, exec_lo
278+
; GFX12-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
279+
; GFX12-NEXT: s_wait_loadcnt 0x2
280+
; GFX12-NEXT: v_readfirstlane_b32 s4, v4
281+
; GFX12-NEXT: v_readfirstlane_b32 s5, v5
282+
; GFX12-NEXT: v_readfirstlane_b32 s6, v6
283+
; GFX12-NEXT: v_readfirstlane_b32 s7, v7
284+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
285+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
286+
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
287+
; GFX12-NEXT: s_wait_alu 0xfffe
288+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
289+
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
290+
; GFX12-NEXT: s_wait_alu 0xfffe
291+
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
292+
; GFX12-NEXT: s_wait_loadcnt 0x0
293+
; GFX12-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_LU
294+
; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
295+
; GFX12-NEXT: ; implicit-def: $vgpr9
296+
; GFX12-NEXT: s_wait_alu 0xfffe
297+
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
298+
; GFX12-NEXT: s_cbranch_execnz .LBB3_1
299+
; GFX12-NEXT: ; %bb.2:
300+
; GFX12-NEXT: s_mov_b32 exec_lo, s1
301+
; GFX12-NEXT: v_mov_b32_e32 v4, s8
302+
; GFX12-NEXT: s_mov_b32 s0, exec_lo
303+
; GFX12-NEXT: .LBB3_3: ; =>This Inner Loop Header: Depth=1
304+
; GFX12-NEXT: s_wait_loadcnt 0x1
305+
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
306+
; GFX12-NEXT: v_readfirstlane_b32 s5, v1
307+
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
308+
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
309+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
310+
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
311+
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
312+
; GFX12-NEXT: s_wait_alu 0xfffe
313+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
314+
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
315+
; GFX12-NEXT: s_wait_alu 0xfffe
316+
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
317+
; GFX12-NEXT: s_wait_loadcnt 0x0
318+
; GFX12-NEXT: buffer_store_b32 v8, v4, s[4:7], null offen
319+
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
320+
; GFX12-NEXT: ; implicit-def: $vgpr8
321+
; GFX12-NEXT: ; implicit-def: $vgpr4
322+
; GFX12-NEXT: s_wait_alu 0xfffe
323+
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
324+
; GFX12-NEXT: s_cbranch_execnz .LBB3_3
325+
; GFX12-NEXT: ; %bb.4:
326+
; GFX12-NEXT: s_endpgm
327+
entry:
328+
%val = load i32, ptr addrspace(7) %in, !amdgpu.last.use !{}, !nontemporal !0
329+
store i32 %val, ptr addrspace(7) %out
330+
ret void
331+
}
332+
333+
!0 = !{i32 1}
334+
declare i32 @llvm.amdgcn.workitem.id.x()

0 commit comments

Comments
 (0)