[AMDGPU] Add intrinsic for raw atomic buffer loads #97707


Merged
merged 9 commits on Jul 22, 2024
26 changes: 26 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1116,6 +1116,19 @@ class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsi
def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;

class AMDGPURawAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
[data_ty],
[llvm_v4i32_ty, // rsrc(SGPR)
Contributor:
Why introduce a new buffer-as-vector intrinsic? Shouldn't this use the new pointer approach?

Contributor Author:
I understand the reasoning; however, llpc still uses the vector variants, which is why we would like to keep both.
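
To make the two styles in this exchange concrete, here is a minimal sketch (not part of the patch; argument values are assumed) contrasting the vector-descriptor form added here with the ptr addrspace(8) resource form added further down:

; Sketch only: both calls load an i32 with glc set (aux = 1).
; The vector form takes the <4 x i32> descriptor directly; the ptr form
; takes a buffer resource pointer in addrspace(8).
define i32 @vector_form(<4 x i32> %rsrc) {
  %v = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 1)
  ret i32 %v
}
define i32 @ptr_form(ptr addrspace(8) %rsrc) {
  %v = call i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 1)
  ret i32 %v
}
declare i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg)
declare i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8), i32, i32, i32 immarg)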

llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc,
// bit 1 = slc,
// bit 2 = dlc on gfx10+),
// swizzled buffer (bit 3 = swz))
Contributor:
So, weird ask ... could we get a way to put the syncscope (maybe as a metadata arg) on the intrinsic? That feels like it'd be useful.

Contributor:
If so, it seems like that should be done for all of the buffer intrinsics, and it would be orthogonal to this patch.

Contributor:
I was wondering if we should carry this as an operand bundle.

Contributor:
Yeah, that'd be a change to all the buffer atomics ... but that's orthogonal to this patch, agreed
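
Purely as an illustration of the operand-bundle idea discussed here (not implemented by this patch; the "amdgpu.syncscope" tag and its operand encoding are made up), a scope could ride along on the call roughly like this:

; Hypothetical sketch only: this bundle tag does not exist and the call is not
; valid IR today; it just shows how a syncscope might be attached through an
; operand bundle rather than a new intrinsic argument.
define i32 @scoped_load(ptr addrspace(8) %rsrc) {
  %v = call i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 1) [ "amdgpu.syncscope"(i32 2) ]
  ret i32 %v
}
declare i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8), i32, i32, i32 immarg)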

[ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_atomic_buffer_load : AMDGPURawAtomicBufferLoad;

class AMDGPURawPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
[data_ty],
[AMDGPUBufferRsrcTy, // rsrc(SGPR)
@@ -1134,6 +1147,19 @@ class AMDGPURawPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntri
def int_amdgcn_raw_ptr_buffer_load_format : AMDGPURawPtrBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_raw_ptr_buffer_load : AMDGPURawPtrBufferLoad;

class AMDGPURawPtrAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
[data_ty],
[AMDGPUBufferRsrcTy,// rsrc(SGPR)
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc,
// bit 1 = slc,
// bit 2 = dlc on gfx10+),
// swizzled buffer (bit 3 = swz))
[IntrArgMemOnly, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_ptr_atomic_buffer_load : AMDGPURawPtrAtomicBufferLoad;

class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
[data_ty],
[llvm_v4i32_ty, // rsrc(SGPR)
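As a side note on the auxiliary-data operand documented in the definitions above: the cache-policy bits are OR-ed together, so glc together with dlc is 1 | 4 = 5, and adding swz (bit 3) would give 13. A minimal sketch (argument values assumed):

; Sketch only: aux = 5 sets glc (bit 0) and dlc (bit 2, gfx10+).
define i32 @load_glc_dlc(ptr addrspace(8) %rsrc) {
  %v = call i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 5)
  ret i32 %v
}
declare i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8), i32, i32, i32 immarg)
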
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -7345,6 +7345,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return legalizeBufferStore(MI, MRI, B, true, true);
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_ptr_buffer_load:
case Intrinsic::amdgcn_raw_atomic_buffer_load:
case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load:
case Intrinsic::amdgcn_struct_ptr_buffer_load:
return legalizeBufferLoad(MI, MRI, B, false, false);
5 changes: 3 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1092,8 +1092,9 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,

Intrinsic::ID IID = Intrinsic::not_intrinsic;
if (isa<LoadInst>(I))
-    // TODO: Do we need to do something about atomic loads?
-    IID = Intrinsic::amdgcn_raw_ptr_buffer_load;
+    IID = Order == AtomicOrdering::NotAtomic
+              ? Intrinsic::amdgcn_raw_ptr_buffer_load
+              : Intrinsic::amdgcn_raw_ptr_atomic_buffer_load;
else if (isa<StoreInst>(I))
IID = Intrinsic::amdgcn_raw_ptr_buffer_store;
else if (auto *RMW = dyn_cast<AtomicRMWInst>(I)) {
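For context on this hunk, a minimal sketch (not from the patch) of the kind of IR it affects: an atomic load through a buffer fat pointer, which SplitPtrStructs now lowers to the atomic intrinsic whenever the ordering is anything other than NotAtomic:

; Sketch only: an acquire atomic i32 load through a buffer fat pointer
; (addrspace(7)). With this change, AMDGPULowerBufferFatPointers selects
; llvm.amdgcn.raw.ptr.atomic.buffer.load for it instead of the non-atomic
; llvm.amdgcn.raw.ptr.buffer.load.
define i32 @fat_ptr_atomic_load(ptr addrspace(7) %p) {
  %v = load atomic i32, ptr addrspace(7) %p acquire, align 4
  ret i32 %v
}
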
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4984,6 +4984,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_ptr_buffer_load:
case Intrinsic::amdgcn_raw_atomic_buffer_load:
case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
case Intrinsic::amdgcn_raw_tbuffer_load:
case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
// FIXME: Should make intrinsic ID the last operand of the instruction,
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1272,6 +1272,14 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = CI.getArgOperand(1);
return true;
}
case Intrinsic::amdgcn_raw_atomic_buffer_load:
case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load: {
Info.memVT =
memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
std::numeric_limits<unsigned>::max());
Info.flags &= ~MachineMemOperand::MOStore;
return true;
}
}
}
return true;
@@ -8897,6 +8905,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
}
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_ptr_buffer_load:
case Intrinsic::amdgcn_raw_atomic_buffer_load:
case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format:
case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
const bool IsFormat =
304 changes: 304 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
@@ -0,0 +1,304 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -global-isel=0 | FileCheck %s -check-prefix=CHECK
; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -global-isel=1 | FileCheck %s -check-prefix=CHECK

define amdgpu_kernel void @raw_atomic_buffer_load_i32(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_i32:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: .LBB0_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB0_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
bb1:
%load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 0, i32 0, i32 1)
%cmp = icmp eq i32 %load, %id
br i1 %cmp, label %bb1, label %bb2
bb2:
ret void
}

define amdgpu_kernel void @raw_atomic_buffer_load_i32_off(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_i32_off:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: .LBB1_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB1_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
bb1:
%load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 0, i32 0, i32 1)
%cmp = icmp eq i32 %load, %id
br i1 %cmp, label %bb1, label %bb2
bb2:
ret void
}
define amdgpu_kernel void @raw_atomic_buffer_load_i32_soff(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_i32_soff:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: .LBB2_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 4 offset:4 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB2_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
bb1:
%load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 4, i32 4, i32 1)
%cmp = icmp eq i32 %load, %id
br i1 %cmp, label %bb1, label %bb2
bb2:
ret void
}
define amdgpu_kernel void @raw_atomic_buffer_load_i32_dlc(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_i32_dlc:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: .LBB3_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 dlc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB3_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
bb1:
%load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 4, i32 0, i32 4)
%cmp = icmp eq i32 %load, %id
br i1 %cmp, label %bb1, label %bb2
bb2:
ret void
}

define amdgpu_kernel void @raw_nonatomic_buffer_load_i32(<4 x i32> %addr) {
; CHECK-LABEL: raw_nonatomic_buffer_load_i32:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT: .LBB4_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_and_b32 s1, exec_lo, vcc_lo
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; CHECK-NEXT: s_or_b32 s0, s1, s0
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; CHECK-NEXT: s_cbranch_execnz .LBB4_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
bb1:
%load = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %addr, i32 4, i32 0, i32 1)
%cmp = icmp eq i32 %load, %id
br i1 %cmp, label %bb1, label %bb2
bb2:
ret void
}

define amdgpu_kernel void @raw_atomic_buffer_load_i64(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_i64:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: .LBB5_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_load_b64 v[2:3], off, s[0:3], 0 offset:4 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB5_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%id.zext = zext i32 %id to i64
br label %bb1
bb1:
%load = call i64 @llvm.amdgcn.raw.atomic.buffer.load.i64(<4 x i32> %addr, i32 4, i32 0, i32 1)
%cmp = icmp eq i64 %load, %id.zext
br i1 %cmp, label %bb1, label %bb2
bb2:
ret void
}

define amdgpu_kernel void @raw_atomic_buffer_load_v2i16(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_v2i16:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: .LBB6_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB6_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
bb1:
%load = call <2 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v2i16(<4 x i32> %addr, i32 0, i32 0, i32 1)
%bitcast = bitcast <2 x i16> %load to i32
%cmp = icmp eq i32 %bitcast, %id
br i1 %cmp, label %bb1, label %bb2
bb2:
ret void
}

define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_v4i16:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: .LBB7_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; CHECK-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB7_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
bb1:
%load = call <4 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v4i16(<4 x i32> %addr, i32 4, i32 0, i32 1)
%shortened = shufflevector <4 x i16> %load, <4 x i16> poison, <2 x i32> <i32 0, i32 2>
%bitcast = bitcast <2 x i16> %shortened to i32
%cmp = icmp eq i32 %bitcast, %id
br i1 %cmp, label %bb1, label %bb2
bb2:
ret void
}

define amdgpu_kernel void @raw_atomic_buffer_load_v4i32(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_v4i32:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: .LBB8_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0 offset:4 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB8_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
bb1:
%load = call <4 x i32> @llvm.amdgcn.raw.atomic.buffer.load.v4i32(<4 x i32> %addr, i32 4, i32 0, i32 1)
%extracted = extractelement <4 x i32> %load, i32 3
%cmp = icmp eq i32 %extracted, %id
br i1 %cmp, label %bb1, label %bb2
bb2:
ret void
}

define amdgpu_kernel void @raw_atomic_buffer_load_ptr(<4 x i32> %addr) {
; CHECK-LABEL: raw_atomic_buffer_load_ptr:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: .LBB9_1: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_load_b32 v1, v[1:2]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB9_1
; CHECK-NEXT: ; %bb.2: ; %bb2
; CHECK-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
bb1:
%load = call ptr @llvm.amdgcn.raw.atomic.buffer.load.ptr(<4 x i32> %addr, i32 4, i32 0, i32 1)
%elem = load i32, ptr %load
%cmp = icmp eq i32 %elem, %id
br i1 %cmp, label %bb1, label %bb2
bb2:
ret void
}

; Function Attrs: nounwind readonly
declare i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg)
declare i64 @llvm.amdgcn.raw.atomic.buffer.load.i64(<4 x i32>, i32, i32, i32 immarg)
declare <2 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v2i16(<4 x i32>, i32, i32, i32 immarg)
declare <4 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v4i16(<4 x i32>, i32, i32, i32 immarg)
declare <4 x i32> @llvm.amdgcn.raw.atomic.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg)
declare ptr @llvm.amdgcn.raw.atomic.buffer.load.ptr(<4 x i32>, i32, i32, i32 immarg)
declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg)
declare i32 @llvm.amdgcn.workitem.id.x()