Commit 67c7932

rtayl, jayfoad, mariusz-sikora-at-amd, and OutOfCache committed
[AMDGPU] Add raw.atomic.buffer.load intrinsic
This adds the llvm.amdgcn.raw.atomic.buffer.load intrinsic to support OpAtomicLoad lowering on AMDGPU. Previously, OpAtomicLoad was lowered to llvm.amdgcn.raw.buffer.load, which in some cases caused the load to be marked as invariant and hoisted by LICM.

Co-authored-by: Jay Foad <[email protected]>
Co-authored-by: Mariusz Sikora <[email protected]>
Co-authored-by: Jessica Del <[email protected]>
1 parent c02e8f7 commit 67c7932
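A minimal sketch of the problem this solves, assuming a hypothetical spin-wait loop (illustrative IR, not taken from the commit; %rsrc and %expected are made-up names). If the load were expressed with llvm.amdgcn.raw.buffer.load, it could be marked invariant and hoisted out of the loop by LICM, so the loop would never observe a store from another wave; the atomic variant keeps the load inside the loop:

loop:
  ; reloaded every iteration; LICM must not hoist this
  %flag = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 1)
  %done = icmp eq i32 %flag, %expected
  br i1 %done, label %exit, label %loop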

File tree

6 files changed, +329 -0 lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 13 additions & 0 deletions
@@ -1116,6 +1116,19 @@ class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsi
 def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>;
 def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;
 
+class AMDGPURawAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
+  [data_ty],
+  [llvm_v4i32_ty, // rsrc(SGPR)
+   llvm_i32_ty,   // offset(VGPR/imm, included in bounds checking and swizzling)
+   llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+   llvm_i32_ty],  // auxiliary data (imm, cachepolicy (bit 0 = glc,
+                  //                                   bit 1 = slc,
+                  //                                   bit 2 = dlc on gfx10+),
+                  //                 swizzled buffer (bit 3 = swz))
+  [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
+  AMDGPURsrcIntrinsic<0>;
+def int_amdgcn_raw_atomic_buffer_load : AMDGPURawAtomicBufferLoad;
+
 class AMDGPURawPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
   [data_ty],
   [AMDGPUBufferRsrcTy, // rsrc(SGPR)
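From IR, the new intrinsic is called exactly like its non-atomic counterpart. For example (mirroring the tests added below), this loads an i32 with the glc cache-policy bit (bit 0 of the auxiliary operand) set:

%val = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 1)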

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Lines changed: 2 additions & 0 deletions
@@ -1245,6 +1245,7 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
   unsigned OffsetIdx;
   switch (II.getIntrinsicID()) {
   case Intrinsic::amdgcn_raw_buffer_load:
+  case Intrinsic::amdgcn_raw_atomic_buffer_load:
   case Intrinsic::amdgcn_raw_ptr_buffer_load:
     OffsetIdx = 1;
     break;
@@ -1378,6 +1379,7 @@ std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
   case Intrinsic::amdgcn_raw_ptr_buffer_load:
   case Intrinsic::amdgcn_raw_buffer_load_format:
   case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
+  case Intrinsic::amdgcn_raw_atomic_buffer_load:
   case Intrinsic::amdgcn_raw_tbuffer_load:
   case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
   case Intrinsic::amdgcn_s_buffer_load:
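These switch entries let InstCombine's demanded-elements simplification handle the new intrinsic like the other raw buffer loads. A hedged sketch of the kind of rewrite this enables (illustrative IR, not part of this commit): when only the leading element of a vector result is used, the load can be narrowed to the demanded width:

; before: only element 0 of the result is demanded
%vec = call <4 x i32> @llvm.amdgcn.raw.atomic.buffer.load.v4i32(<4 x i32> %rsrc, i32 0, i32 0, i32 1)
%elt = extractelement <4 x i32> %vec, i64 0
; after: the call is shrunk to a scalar load
%elt = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 1)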

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 1 addition & 0 deletions
@@ -7345,6 +7345,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     return legalizeBufferStore(MI, MRI, B, true, true);
   case Intrinsic::amdgcn_raw_buffer_load:
   case Intrinsic::amdgcn_raw_ptr_buffer_load:
+  case Intrinsic::amdgcn_raw_atomic_buffer_load:
   case Intrinsic::amdgcn_struct_buffer_load:
   case Intrinsic::amdgcn_struct_ptr_buffer_load:
     return legalizeBufferLoad(MI, MRI, B, false, false);

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 0 deletions
@@ -4984,6 +4984,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   }
   case Intrinsic::amdgcn_raw_buffer_load:
   case Intrinsic::amdgcn_raw_ptr_buffer_load:
+  case Intrinsic::amdgcn_raw_atomic_buffer_load:
   case Intrinsic::amdgcn_raw_tbuffer_load:
   case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
     // FIXME: Should make intrinsic ID the last operand of the instruction,

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 8 additions & 0 deletions
@@ -1272,6 +1272,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.ptrVal = CI.getArgOperand(1);
     return true;
   }
+  case Intrinsic::amdgcn_raw_atomic_buffer_load: {
+    Info.memVT =
+        memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
+                                std::numeric_limits<unsigned>::max());
+    Info.flags &= ~MachineMemOperand::MOStore;
+    return true;
+  }
   }
   }
   return true;
@@ -8897,6 +8904,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
   }
   case Intrinsic::amdgcn_raw_buffer_load:
   case Intrinsic::amdgcn_raw_ptr_buffer_load:
+  case Intrinsic::amdgcn_raw_atomic_buffer_load:
   case Intrinsic::amdgcn_raw_buffer_load_format:
   case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
     const bool IsFormat =
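The getTgtMemIntrinsic change gives the call a load-only memory operand (MOStore is cleared and memVT is taken from the return type), so the backend treats it as a genuine, non-invariant memory read. The cache-policy bits of the auxiliary operand flow through to the selected instruction; for example, from the gfx1100 tests below, aux = 4 sets dlc:

%load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 4, i32 0, i32 4)
; selects to: buffer_load_b32 v1, off, s[0:3], 0 offset:4 dlc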
Lines changed: 304 additions & 0 deletions
@@ -0,0 +1,304 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -global-isel=0 | FileCheck %s -check-prefix=CHECK
; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -global-isel=1 | FileCheck %s -check-prefix=CHECK

define amdgpu_kernel void @raw_atomic_buffer_load_i32(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_i32:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: .LBB0_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB0_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 0, i32 0, i32 1)
  %cmp = icmp eq i32 %load, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}

define amdgpu_kernel void @raw_atomic_buffer_load_i32_off(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_i32_off:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: .LBB1_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB1_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 0, i32 0, i32 1)
  %cmp = icmp eq i32 %load, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}
define amdgpu_kernel void @raw_atomic_buffer_load_i32_soff(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_i32_soff:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: .LBB2_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 4 offset:4 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB2_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 4, i32 4, i32 1)
  %cmp = icmp eq i32 %load, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}
define amdgpu_kernel void @raw_atomic_buffer_load_i32_dlc(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_i32_dlc:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: .LBB3_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 dlc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB3_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 4, i32 0, i32 4)
  %cmp = icmp eq i32 %load, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}

define amdgpu_kernel void @raw_nonatomic_buffer_load_i32(<4 x i32> %addr) {
; CHECK-LABEL: raw_nonatomic_buffer_load_i32:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT: .LBB4_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_and_b32 s1, exec_lo, vcc_lo
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; CHECK-NEXT: s_or_b32 s0, s1, s0
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; CHECK-NEXT: s_cbranch_execnz .LBB4_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %addr, i32 4, i32 0, i32 1)
  %cmp = icmp eq i32 %load, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}
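Note the contrast in @raw_nonatomic_buffer_load_i32 just above: with plain llvm.amdgcn.raw.buffer.load, the generated code performs the load once in the entry block, before the loop label:

; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc
; ...
; CHECK-NEXT: .LBB4_1: ; %bb1

whereas in every atomic test the buffer_load sits inside the .LBB*_1 loop body. That hoisting is exactly the behavior the new intrinsic prevents.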

define amdgpu_kernel void @raw_atomic_buffer_load_i64(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_i64:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: .LBB5_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_load_b64 v[2:3], off, s[0:3], 0 offset:4 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB5_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %id.zext = zext i32 %id to i64
  br label %bb1
bb1:
  %load = call i64 @llvm.amdgcn.raw.atomic.buffer.load.i64(<4 x i32> %addr, i32 4, i32 0, i32 1)
  %cmp = icmp eq i64 %load, %id.zext
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}

define amdgpu_kernel void @raw_atomic_buffer_load_v2i16(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_v2i16:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: .LBB6_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB6_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call <2 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v2i16(<4 x i32> %addr, i32 0, i32 0, i32 1)
  %bitcast = bitcast <2 x i16> %load to i32
  %cmp = icmp eq i32 %bitcast, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}

define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_v4i16:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: .LBB7_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; CHECK-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB7_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call <4 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v4i16(<4 x i32> %addr, i32 4, i32 0, i32 1)
  %shortened = shufflevector <4 x i16> %load, <4 x i16> poison, <2 x i32> <i32 0, i32 2>
  %bitcast = bitcast <2 x i16> %shortened to i32
  %cmp = icmp eq i32 %bitcast, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}

define amdgpu_kernel void @raw_atomic_buffer_load_v4i32(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_v4i32:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: .LBB8_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0 offset:4 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB8_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call <4 x i32> @llvm.amdgcn.raw.atomic.buffer.load.v4i32(<4 x i32> %addr, i32 4, i32 0, i32 1)
  %extracted = extractelement <4 x i32> %load, i32 3
  %cmp = icmp eq i32 %extracted, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}

define amdgpu_kernel void @raw_atomic_buffer_load_ptr(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_ptr:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: .LBB9_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_load_b32 v1, v[1:2]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB9_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb1
bb1:
  %load = call ptr @llvm.amdgcn.raw.atomic.buffer.load.ptr(<4 x i32> %addr, i32 4, i32 0, i32 1)
  %elem = load i32, ptr %load
  %cmp = icmp eq i32 %elem, %id
  br i1 %cmp, label %bb1, label %bb2
bb2:
  ret void
}

; Function Attrs: nounwind readonly
declare i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg)
declare i64 @llvm.amdgcn.raw.atomic.buffer.load.i64(<4 x i32>, i32, i32, i32 immarg)
declare <2 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v2i16(<4 x i32>, i32, i32, i32 immarg)
declare <4 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v4i16(<4 x i32>, i32, i32, i32 immarg)
declare <4 x i32> @llvm.amdgcn.raw.atomic.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg)
declare ptr @llvm.amdgcn.raw.atomic.buffer.load.ptr(<4 x i32>, i32, i32, i32 immarg)
declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg)
declare i32 @llvm.amdgcn.workitem.id.x()
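As the NOTE line at the top of the test records, the CHECK assertions were generated with utils/update_llc_test_checks.py. A typical invocation to regenerate them after a codegen change might look like this (binary and test paths are illustrative, not from the commit):

llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc llvm/test/CodeGen/AMDGPU/<this-test>.ll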
