
Commit ec7f8e1

Authored by OutOfCache, jayfoad, and mariusz-sikora-at-amd

[AMDGPU] Add intrinsic for raw atomic buffer loads (#97707)

Upstream the intrinsics `llvm.amdgcn.raw.atomic.buffer.load` and
`llvm.amdgcn.raw.ptr.atomic.buffer.load`. These additional intrinsics mark
atomic buffer loads as atomic to LLVM by dropping the `IntrReadMem` attribute
that the plain buffer-load intrinsics carry. Without this, LLVM could mark
such loads as invariant and hoist them out of loops, which can cause issues
such as infinite loops.

Continuation of https://reviews.llvm.org/D138786, with the additional use in
the buffer fat pointer lowering, more test cases, and the ptr versions of
these intrinsics.

Co-authored-by: rtayl <>
Co-authored-by: Jay Foad <[email protected]>
Co-authored-by: Mariusz Sikora <[email protected]>

1 parent 4010ddf · commit ec7f8e1
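To make the motivation concrete, here is a minimal IR sketch of the pattern these intrinsics exist for (hypothetical kernel name; the call signature and the spin-loop shape are taken from the new tests below):

define amdgpu_kernel void @spin_until_nonzero(<4 x i32> %rsrc) {
entry:
  br label %loop
loop:
  ; aux = 1 (glc). The atomic variant carries no IntrReadMem, so LLVM must
  ; re-issue this load on every iteration; the plain raw.buffer.load could be
  ; treated as loop-invariant, hoisted, and leave the loop spinning forever.
  %v = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 1)
  %done = icmp ne i32 %v, 0
  br i1 %done, label %exit, label %loop
exit:
  ret void
}

declare i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg)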

8 files changed: +654 -5 lines

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 26 additions & 0 deletions
@@ -1138,6 +1138,19 @@ class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsi
 def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>;
 def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;
 
+class AMDGPURawAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
+  [data_ty],
+  [llvm_v4i32_ty, // rsrc(SGPR)
+   llvm_i32_ty,   // offset(VGPR/imm, included in bounds checking and swizzling)
+   llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+   llvm_i32_ty],  // auxiliary data (imm, cachepolicy (bit 0 = glc,
+                  //                                   bit 1 = slc,
+                  //                                   bit 2 = dlc on gfx10+),
+                  //                 swizzled buffer (bit 3 = swz))
+  [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
+  AMDGPURsrcIntrinsic<0>;
+def int_amdgcn_raw_atomic_buffer_load : AMDGPURawAtomicBufferLoad;
+
 class AMDGPURawPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
   [data_ty],
   [AMDGPUBufferRsrcTy, // rsrc(SGPR)

@@ -1156,6 +1169,19 @@ class AMDGPURawPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntri
 def int_amdgcn_raw_ptr_buffer_load_format : AMDGPURawPtrBufferLoad<llvm_anyfloat_ty>;
 def int_amdgcn_raw_ptr_buffer_load : AMDGPURawPtrBufferLoad;
 
+class AMDGPURawPtrAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
+  [data_ty],
+  [AMDGPUBufferRsrcTy, // rsrc(SGPR)
+   llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
+   llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+   llvm_i32_ty],       // auxiliary data (imm, cachepolicy (bit 0 = glc,
+                       //                                   bit 1 = slc,
+                       //                                   bit 2 = dlc on gfx10+),
+                       //                 swizzled buffer (bit 3 = swz))
+  [IntrArgMemOnly, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
+  AMDGPURsrcIntrinsic<0>;
+def int_amdgcn_raw_ptr_atomic_buffer_load : AMDGPURawPtrAtomicBufferLoad;
+
 class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
   [data_ty],
   [llvm_v4i32_ty, // rsrc(SGPR)
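In IR, these definitions surface as intrinsics overloaded on the return type. The first two declarations below are copied from the new test file; the ptr-variant line is a sketch that assumes AMDGPUBufferRsrcTy lowers to the usual addrspace(8) buffer resource pointer:

declare i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg)
declare <2 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v2i16(<4 x i32>, i32, i32, i32 immarg)
; assumed shape, not taken from this commit's tests:
declare i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8), i32, i32, i32 immarg)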

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 2 additions & 0 deletions
@@ -7366,6 +7366,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     return legalizeBufferStore(MI, MRI, B, true, true);
   case Intrinsic::amdgcn_raw_buffer_load:
   case Intrinsic::amdgcn_raw_ptr_buffer_load:
+  case Intrinsic::amdgcn_raw_atomic_buffer_load:
+  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
   case Intrinsic::amdgcn_struct_buffer_load:
   case Intrinsic::amdgcn_struct_ptr_buffer_load:
     return legalizeBufferLoad(MI, MRI, B, false, false);

llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp

Lines changed: 3 additions & 2 deletions
@@ -1092,8 +1092,9 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
 
   Intrinsic::ID IID = Intrinsic::not_intrinsic;
   if (isa<LoadInst>(I))
-    // TODO: Do we need to do something about atomic loads?
-    IID = Intrinsic::amdgcn_raw_ptr_buffer_load;
+    IID = Order == AtomicOrdering::NotAtomic
+              ? Intrinsic::amdgcn_raw_ptr_buffer_load
+              : Intrinsic::amdgcn_raw_ptr_atomic_buffer_load;
   else if (isa<StoreInst>(I))
     IID = Intrinsic::amdgcn_raw_ptr_buffer_store;
   else if (auto *RMW = dyn_cast<AtomicRMWInst>(I)) {
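As a small before/after IR sketch of what this hunk changes (illustrative names and operand values; the pass splits buffer fat pointers, i.e. addrspace(7) pointers, into a resource plus an offset):

; input to the pass: an atomic load through a buffer fat pointer
%v = load atomic i32, ptr addrspace(7) %fat monotonic, align 4

; previously this was rewritten to the plain load intrinsic regardless of
; the atomic ordering; with Order != NotAtomic it now becomes:
%v.new = call i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8) %rsrc, i32 %off, i32 0, i32 0)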

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 2 additions & 0 deletions
@@ -4985,6 +4985,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   }
   case Intrinsic::amdgcn_raw_buffer_load:
   case Intrinsic::amdgcn_raw_ptr_buffer_load:
+  case Intrinsic::amdgcn_raw_atomic_buffer_load:
+  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
   case Intrinsic::amdgcn_raw_tbuffer_load:
   case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
     // FIXME: Should make intrinsic ID the last operand of the instruction,

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 10 additions & 0 deletions
@@ -1277,6 +1277,14 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
       Info.ptrVal = CI.getArgOperand(1);
       return true;
     }
+    case Intrinsic::amdgcn_raw_atomic_buffer_load:
+    case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load: {
+      Info.memVT =
+          memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
+                                  std::numeric_limits<unsigned>::max());
+      Info.flags &= ~MachineMemOperand::MOStore;
+      return true;
+    }
     }
   }
   return true;

@@ -8889,6 +8897,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
   }
   case Intrinsic::amdgcn_raw_buffer_load:
   case Intrinsic::amdgcn_raw_ptr_buffer_load:
+  case Intrinsic::amdgcn_raw_atomic_buffer_load:
+  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
   case Intrinsic::amdgcn_raw_buffer_load_format:
   case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
     const bool IsFormat =
Lines changed: 304 additions & 0 deletions
@@ -0,0 +1,304 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -global-isel=0 | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -global-isel=1 | FileCheck %s -check-prefix=CHECK
+
+define amdgpu_kernel void @raw_atomic_buffer_load_i32(<4 x i32> %addr) {
+; CHECK-LABEL: raw_atomic_buffer_load_i32:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: .LBB0_1: ; %bb1
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT: s_cbranch_execnz .LBB0_1
+; CHECK-NEXT: ; %bb.2: ; %bb2
+; CHECK-NEXT: s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 0, i32 0, i32 1)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @raw_atomic_buffer_load_i32_off(<4 x i32> %addr) {
+; CHECK-LABEL: raw_atomic_buffer_load_i32_off:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: .LBB1_1: ; %bb1
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT: s_cbranch_execnz .LBB1_1
+; CHECK-NEXT: ; %bb.2: ; %bb2
+; CHECK-NEXT: s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 0, i32 0, i32 1)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+define amdgpu_kernel void @raw_atomic_buffer_load_i32_soff(<4 x i32> %addr) {
+; CHECK-LABEL: raw_atomic_buffer_load_i32_soff:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: .LBB2_1: ; %bb1
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 4 offset:4 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT: s_cbranch_execnz .LBB2_1
+; CHECK-NEXT: ; %bb.2: ; %bb2
+; CHECK-NEXT: s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 4, i32 4, i32 1)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+define amdgpu_kernel void @raw_atomic_buffer_load_i32_dlc(<4 x i32> %addr) {
+; CHECK-LABEL: raw_atomic_buffer_load_i32_dlc:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: .LBB3_1: ; %bb1
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 dlc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT: s_cbranch_execnz .LBB3_1
+; CHECK-NEXT: ; %bb.2: ; %bb2
+; CHECK-NEXT: s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 4, i32 0, i32 4)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @raw_nonatomic_buffer_load_i32(<4 x i32> %addr) {
+; CHECK-LABEL: raw_nonatomic_buffer_load_i32:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc
+; CHECK-NEXT: s_mov_b32 s0, 0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT: .LBB4_1: ; %bb1
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_and_b32 s1, exec_lo, vcc_lo
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; CHECK-NEXT: s_or_b32 s0, s1, s0
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; CHECK-NEXT: s_cbranch_execnz .LBB4_1
+; CHECK-NEXT: ; %bb.2: ; %bb2
+; CHECK-NEXT: s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %addr, i32 4, i32 0, i32 1)
+  %cmp = icmp eq i32 %load, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @raw_atomic_buffer_load_i64(<4 x i32> %addr) {
+; CHECK-LABEL: raw_atomic_buffer_load_i64:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: .LBB5_1: ; %bb1
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_load_b64 v[2:3], off, s[0:3], 0 offset:4 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT: s_cbranch_execnz .LBB5_1
+; CHECK-NEXT: ; %bb.2: ; %bb2
+; CHECK-NEXT: s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.zext = zext i32 %id to i64
+  br label %bb1
+bb1:
+  %load = call i64 @llvm.amdgcn.raw.atomic.buffer.load.i64(<4 x i32> %addr, i32 4, i32 0, i32 1)
+  %cmp = icmp eq i64 %load, %id.zext
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @raw_atomic_buffer_load_v2i16(<4 x i32> %addr) {
+; CHECK-LABEL: raw_atomic_buffer_load_v2i16:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: .LBB6_1: ; %bb1
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT: s_cbranch_execnz .LBB6_1
+; CHECK-NEXT: ; %bb.2: ; %bb2
+; CHECK-NEXT: s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call <2 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v2i16(<4 x i32> %addr, i32 0, i32 0, i32 1)
+  %bitcast = bitcast <2 x i16> %load to i32
+  %cmp = icmp eq i32 %bitcast, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) {
+; CHECK-LABEL: raw_atomic_buffer_load_v4i16:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: .LBB7_1: ; %bb1
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT: s_cbranch_execnz .LBB7_1
+; CHECK-NEXT: ; %bb.2: ; %bb2
+; CHECK-NEXT: s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call <4 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v4i16(<4 x i32> %addr, i32 4, i32 0, i32 1)
+  %shortened = shufflevector <4 x i16> %load, <4 x i16> poison, <2 x i32> <i32 0, i32 2>
+  %bitcast = bitcast <2 x i16> %shortened to i32
+  %cmp = icmp eq i32 %bitcast, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @raw_atomic_buffer_load_v4i32(<4 x i32> %addr) {
+; CHECK-LABEL: raw_atomic_buffer_load_v4i32:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: .LBB8_1: ; %bb1
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0 offset:4 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT: s_cbranch_execnz .LBB8_1
+; CHECK-NEXT: ; %bb.2: ; %bb2
+; CHECK-NEXT: s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call <4 x i32> @llvm.amdgcn.raw.atomic.buffer.load.v4i32(<4 x i32> %addr, i32 4, i32 0, i32 1)
+  %extracted = extractelement <4 x i32> %load, i32 3
+  %cmp = icmp eq i32 %extracted, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+define amdgpu_kernel void @raw_atomic_buffer_load_ptr(<4 x i32> %addr) {
+; CHECK-LABEL: raw_atomic_buffer_load_ptr:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: .LBB9_1: ; %bb1
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_load_b32 v1, v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT: s_cbranch_execnz .LBB9_1
+; CHECK-NEXT: ; %bb.2: ; %bb2
+; CHECK-NEXT: s_endpgm
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+bb1:
+  %load = call ptr @llvm.amdgcn.raw.atomic.buffer.load.ptr(<4 x i32> %addr, i32 4, i32 0, i32 1)
+  %elem = load i32, ptr %load
+  %cmp = icmp eq i32 %elem, %id
+  br i1 %cmp, label %bb1, label %bb2
+bb2:
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg)
+declare i64 @llvm.amdgcn.raw.atomic.buffer.load.i64(<4 x i32>, i32, i32, i32 immarg)
+declare <2 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v2i16(<4 x i32>, i32, i32, i32 immarg)
+declare <4 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v4i16(<4 x i32>, i32, i32, i32 immarg)
+declare <4 x i32> @llvm.amdgcn.raw.atomic.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg)
+declare ptr @llvm.amdgcn.raw.atomic.buffer.load.ptr(<4 x i32>, i32, i32, i32 immarg)
+declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg)
+declare i32 @llvm.amdgcn.workitem.id.x()
