
[AMDGPU] Add target intrinsic for s_buffer_prefetch_data #107293

Merged: 3 commits, Sep 6, 2024
1 change: 1 addition & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -449,6 +449,7 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_prefetch_data, "vvC*Ui", "nc", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_buffer_prefetch_data, "vQbIiUi", "nc", "gfx12-insts")

TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32")
5 changes: 5 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl
@@ -22,3 +22,8 @@ kernel void builtins_amdgcn_s_barrier_signal_isfirst_err(global int* in, global
__builtin_amdgcn_s_barrier_wait(-1);
*out = *in;
}

void test_s_buffer_prefetch_data(__amdgpu_buffer_rsrc_t rsrc, unsigned int off)
{
__builtin_amdgcn_s_buffer_prefetch_data(rsrc, off, 31); // expected-error {{'__builtin_amdgcn_s_buffer_prefetch_data' must be a constant integer}}
}
19 changes: 19 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
@@ -281,3 +281,22 @@ void test_s_prefetch_data(int *fp, global float *gp, constant char *cp, unsigned
__builtin_amdgcn_s_prefetch_data(gp, len);
__builtin_amdgcn_s_prefetch_data(cp, 31);
}

// CHECK-LABEL: @test_s_buffer_prefetch_data(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5)
// CHECK-NEXT: [[LEN_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr addrspace(5) [[RSRC_ADDR]], align 16
// CHECK-NEXT: store i32 [[LEN:%.*]], ptr addrspace(5) [[LEN_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(8), ptr addrspace(5) [[RSRC_ADDR]], align 16
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[LEN_ADDR]], align 4
// CHECK-NEXT: call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) [[TMP0]], i32 128, i32 [[TMP1]])
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(8), ptr addrspace(5) [[RSRC_ADDR]], align 16
// CHECK-NEXT: call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) [[TMP2]], i32 0, i32 31)
// CHECK-NEXT: ret void
//
void test_s_buffer_prefetch_data(__amdgpu_buffer_rsrc_t rsrc, unsigned int len)
{
__builtin_amdgcn_s_buffer_prefetch_data(rsrc, 128, len);
__builtin_amdgcn_s_buffer_prefetch_data(rsrc, 0, 31);
}
9 changes: 9 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1723,6 +1723,15 @@ class AMDGPUStructPtrBufferLoadLDS : Intrinsic <
ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_ptr_buffer_load_lds : AMDGPUStructPtrBufferLoadLDS;

def int_amdgcn_s_buffer_prefetch_data : DefaultAttrsIntrinsic <
[],
[AMDGPUBufferRsrcTy, // rsrc(SGPR)
llvm_i32_ty, // offset (imm)
llvm_i32_ty], // len (SGPR/imm)
[IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<1>>], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<0>,
ClangBuiltin<"__builtin_amdgcn_s_buffer_prefetch_data">;

} // defset AMDGPUBufferIntrinsics

// Uses that do not set the done bit should set IntrWriteMem on the
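For orientation, this is the shape the new intrinsic takes in IR. It is a minimal sketch distilled from the codegen test added at the end of this PR (llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.prefetch.data.ll); the function name prefetch_sketch is made up for illustration, and the operand roles (SGPR rsrc, immediate offset, SGPR-or-immediate len) follow the comments in the .td definition above.

; Returns void; operands are the 128-bit buffer resource, an immediate byte
; offset, and a length that may be a uniform register value or an immediate.
declare void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8), i32, i32)

define amdgpu_ps void @prefetch_sketch(ptr addrspace(8) inreg %rsrc, i32 inreg %len) {
entry:
  ; constant offset (ImmArg), length held in an SGPR
  tail call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) %rsrc, i32 128, i32 %len)
  ; both offset and length as immediates
  tail call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) %rsrc, i32 0, i32 31)
  ret void
}
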
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -296,6 +296,7 @@ def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SBYTE, SIsbuffer_load_byte>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_UBYTE, SIsbuffer_load_ubyte>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SSHORT, SIsbuffer_load_short>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_USHORT, SIsbuffer_load_ushort>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_PREFETCH, SIsbuffer_prefetch>;

class GISelSop2Pat <
SDPatternOperator node,
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5545,6 +5545,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
NODE_NAME_CASE(SBUFFER_PREFETCH_DATA)
NODE_NAME_CASE(BUFFER_STORE)
NODE_NAME_CASE(BUFFER_STORE_BYTE)
NODE_NAME_CASE(BUFFER_STORE_SHORT)
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -589,6 +589,7 @@ enum NodeType : unsigned {
SBUFFER_LOAD_UBYTE,
SBUFFER_LOAD_SHORT,
SBUFFER_LOAD_USHORT,
SBUFFER_PREFETCH_DATA,
BUFFER_STORE,
BUFFER_STORE_BYTE,
BUFFER_STORE_SHORT,
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5237,7 +5237,8 @@ getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
std::optional<uint64_t> OffsetVal =
Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
if (!OffsetVal)
return {};

14 changes: 14 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -6797,6 +6797,18 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
return true;
}

bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,
MachineInstr &MI) const {
MachineIRBuilder &B = Helper.MIRBuilder;
GISelChangeObserver &Observer = Helper.Observer;
Observer.changingInstr(MI);
MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
MI.removeOperand(0); // Remove intrinsic ID
castBufferRsrcArgToV4I32(MI, B, 0);
Observer.changedInstr(MI);
return true;
}

// TODO: Move to selection
bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
MachineRegisterInfo &MRI,
@@ -7485,6 +7497,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_permlanex16:
case Intrinsic::amdgcn_permlane64:
return legalizeLaneOp(Helper, MI, IntrID);
case Intrinsic::amdgcn_s_buffer_prefetch_data:
return legalizeSBufferPrefetch(Helper, MI);
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrID))
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -227,6 +227,8 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {

bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const;

bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const;

bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI,
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3101,6 +3101,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
applyMappingSBufferLoad(B, OpdMapper);
return;
}
case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH:
constrainOpWithReadfirstlane(B, MI, 0);
constrainOpWithReadfirstlane(B, MI, 2);
return;
case AMDGPU::G_INTRINSIC:
case AMDGPU::G_INTRINSIC_CONVERGENT: {
switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
@@ -4464,6 +4468,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
break;
}
case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH:
OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
break;
case AMDGPU::G_INTRINSIC:
case AMDGPU::G_INTRINSIC_CONVERGENT: {
switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
34 changes: 26 additions & 8 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1210,9 +1210,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = RsrcArg;
}

auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
Info.flags |= MachineMemOperand::MOVolatile;
bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
if (!IsSPrefetch) {
auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
Info.flags |= MachineMemOperand::MOVolatile;
}

Info.flags |= MachineMemOperand::MODereferenceable;
if (ME.onlyReadsMemory()) {
if (RsrcIntr->IsImage) {
@@ -1251,16 +1255,18 @@

Info.flags |= MachineMemOperand::MOStore;
} else {
// Atomic or NoReturn Sampler
// Atomic, NoReturn Sampler or prefetch
Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
ISD::INTRINSIC_W_CHAIN;
Info.flags |= MachineMemOperand::MOLoad |
MachineMemOperand::MOStore |
MachineMemOperand::MODereferenceable;
Info.flags |=
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;

if (!IsSPrefetch)
Info.flags |= MachineMemOperand::MOStore;

switch (IntrID) {
default:
if (RsrcIntr->IsImage && BaseOpcode->NoReturn) {
if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
// Fake memory access type for no return sampler intrinsics
Info.memVT = MVT::i32;
} else {
@@ -9934,6 +9940,18 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return Op.getOperand(0);
return Op;
}
case Intrinsic::amdgcn_s_buffer_prefetch_data: {
SDValue Ops[] = {
Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
Op.getOperand(3), // offset
Op.getOperand(4), // length
};

MemSDNode *M = cast<MemSDNode>(Op);
return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
Op->getVTList(), Ops, M->getMemoryVT(),
M->getMemOperand());
}
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -312,6 +312,14 @@ class isIntType<ValueType SrcVT> {
bit ret = !and(SrcVT.isInteger, !ne(SrcVT.Value, i1.Value));
}

def SDTSBufferPrefetch : SDTypeProfile<0, 3,
[SDTCisVT<0, v4i32>, // rsrc
SDTCisVT<1, i32>, // offset(imm)
SDTCisVT<2, i32>]>; // length

def SIsbuffer_prefetch : SDNode<"AMDGPUISD::SBUFFER_PREFETCH_DATA", SDTSBufferPrefetch,
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;

//===----------------------------------------------------------------------===//
// SDNodes PatFrags for loads/stores with a glue input.
// This is for SDNodes and PatFrag for local loads and stores to
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3978,6 +3978,16 @@ def G_AMDGPU_S_BUFFER_LOAD_UBYTE : SBufferLoadInstruction;
def G_AMDGPU_S_BUFFER_LOAD_SSHORT : SBufferLoadInstruction;
def G_AMDGPU_S_BUFFER_LOAD_USHORT : SBufferLoadInstruction;

class SBufferPrefetchInstruction : AMDGPUGenericInstruction {
let OutOperandList = (outs);
let InOperandList = (ins type0:$rsrc, untyped_imm_0:$offset, type1:$len);
let hasSideEffects = 0;
let mayLoad = 1;
let mayStore = 1;
}

def G_AMDGPU_S_BUFFER_PREFETCH : SBufferPrefetchInstruction;

def G_AMDGPU_S_MUL_U64_U32 : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type0:$src0, type0:$src1);
11 changes: 11 additions & 0 deletions llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -1171,6 +1171,17 @@ let SubtargetPredicate = isGFX12Plus in {
def : GCNPat <
(int_amdgcn_s_prefetch_data (i64 SReg_64:$sbase), imm:$len),
(S_PREFETCH_DATA $sbase, 0, (i32 SGPR_NULL), (as_i8timm $len))

>;

def : GCNPat <
(SIsbuffer_prefetch v4i32:$sbase, (SMRDBufferImm i32:$offset), (i32 SReg_32:$len)),
(S_BUFFER_PREFETCH_DATA SReg_128:$sbase, i32imm:$offset, $len, 0)
>;

def : GCNPat <
(SIsbuffer_prefetch v4i32:$sbase, (SMRDBufferImm i32:$offset), imm:$len),
(S_BUFFER_PREFETCH_DATA SReg_128:$sbase, i32imm:$offset, (i32 SGPR_NULL), (as_i8timm $len))
>;
} // End let SubtargetPredicate = isGFX12Plus
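The two patterns above cover the two forms exercised by the test at the end of this PR: a uniform length is selected into the SGPR operand with a zero immediate, while a constant length uses null for the register and carries the value in the immediate field. For reference only, taken from the checks in that test under the same gfx1200 assumptions:

; uniform length in an SGPR:
;   call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) %rsrc, i32 128, i32 %len)
;   -> s_buffer_prefetch_data s[0:3], 0x80, s4, 0
; constant length of 31:
;   call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) %rsrc, i32 0, i32 31)
;   -> s_buffer_prefetch_data s[0:3], 0x0, null, 31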

50 changes: 50 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.prefetch.data.ll
@@ -0,0 +1,50 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=GCN %s
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=GCN %s

declare void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) %rsrc, i32 %offset, i32 %len)

define amdgpu_ps void @buffer_prefetch_data_imm_offset_sgpr_len(ptr addrspace(8) inreg %rsrc, i32 inreg %len) {
; GCN-LABEL: buffer_prefetch_data_imm_offset_sgpr_len:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_buffer_prefetch_data s[0:3], 0x80, s4, 0
; GCN-NEXT: s_endpgm
entry:
tail call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) inreg %rsrc, i32 128, i32 %len)
ret void
}

define amdgpu_ps void @buffer_prefetch_data_imm_offset_imm_len(ptr addrspace(8) inreg %rsrc) {
; GCN-LABEL: buffer_prefetch_data_imm_offset_imm_len:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_buffer_prefetch_data s[0:3], 0x0, null, 31
; GCN-NEXT: s_endpgm
entry:
tail call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) inreg %rsrc, i32 0, i32 31)
ret void
}

define amdgpu_ps void @buffer_prefetch_data_imm_offset_vgpr_len(ptr addrspace(8) inreg %rsrc, i32 %len) {
; GCN-LABEL: buffer_prefetch_data_imm_offset_vgpr_len:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_readfirstlane_b32 s4, v0
; GCN-NEXT: s_buffer_prefetch_data s[0:3], 0x80, s4, 0
; GCN-NEXT: s_endpgm
entry:
tail call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) inreg %rsrc, i32 128, i32 %len)
ret void
}

define amdgpu_ps void @buffer_prefetch_data_vgpr_rsrc_imm_offset_sgpr_len(ptr addrspace(8) %rsrc, i32 inreg %len) {
; GCN-LABEL: buffer_prefetch_data_vgpr_rsrc_imm_offset_sgpr_len:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_readfirstlane_b32 s4, v0
; GCN-NEXT: v_readfirstlane_b32 s5, v1
; GCN-NEXT: v_readfirstlane_b32 s6, v2
; GCN-NEXT: v_readfirstlane_b32 s7, v3
; GCN-NEXT: s_buffer_prefetch_data s[4:7], 0x80, s0, 0
; GCN-NEXT: s_endpgm
entry:
tail call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) inreg %rsrc, i32 128, i32 %len)
ret void
}