Skip to content

Commit bd840a4

Browse files
authored
[AMDGPU] Add target intrinsic for s_prefetch_data (#107133)
1 parent 1e98aa4 commit bd840a4

File tree

10 files changed

+230
-3
lines changed

10 files changed

+230
-3
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -448,6 +448,7 @@ TARGET_BUILTIN(__builtin_amdgcn_s_barrier_join, "vi", "n", "gfx12-insts")
448448
TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts")
449449
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts")
450450
TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
451+
TARGET_BUILTIN(__builtin_amdgcn_s_prefetch_data, "vvC*Ui", "nc", "gfx12-insts")
451452

452453
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32")
453454
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32")

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19608,6 +19608,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1960819608
F, {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)),
1960919609
EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3))});
1961019610
}
19611+
case AMDGPU::BI__builtin_amdgcn_s_prefetch_data:
19612+
return emitBuiltinWithOneOverloadedType<2>(
19613+
*this, E, Intrinsic::amdgcn_s_prefetch_data);
1961119614
default:
1961219615
return nullptr;
1961319616
}

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -256,4 +256,28 @@ void test_s_ttracedata_imm()
256256
__builtin_amdgcn_s_ttracedata_imm(1);
257257
}
258258

259-
259+
// CHECK-LABEL: @test_s_prefetch_data(
260+
// CHECK-NEXT: entry:
261+
// CHECK-NEXT: [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
262+
// CHECK-NEXT: [[GP_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
263+
// CHECK-NEXT: [[CP_ADDR:%.*]] = alloca ptr addrspace(4), align 8, addrspace(5)
264+
// CHECK-NEXT: [[LEN_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
265+
// CHECK-NEXT: store ptr [[FP:%.*]], ptr addrspace(5) [[FP_ADDR]], align 8
266+
// CHECK-NEXT: store ptr addrspace(1) [[GP:%.*]], ptr addrspace(5) [[GP_ADDR]], align 8
267+
// CHECK-NEXT: store ptr addrspace(4) [[CP:%.*]], ptr addrspace(5) [[CP_ADDR]], align 8
268+
// CHECK-NEXT: store i32 [[LEN:%.*]], ptr addrspace(5) [[LEN_ADDR]], align 4
269+
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[FP_ADDR]], align 8
270+
// CHECK-NEXT: call void @llvm.amdgcn.s.prefetch.data.p0(ptr [[TMP0]], i32 0)
271+
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[GP_ADDR]], align 8
272+
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[LEN_ADDR]], align 4
273+
// CHECK-NEXT: call void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1) [[TMP1]], i32 [[TMP2]])
274+
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(5) [[CP_ADDR]], align 8
275+
// CHECK-NEXT: call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) [[TMP3]], i32 31)
276+
// CHECK-NEXT: ret void
277+
//
278+
void test_s_prefetch_data(int *fp, global float *gp, constant char *cp, unsigned int len)
279+
{
280+
__builtin_amdgcn_s_prefetch_data(fp, 0);
281+
__builtin_amdgcn_s_prefetch_data(gp, len);
282+
__builtin_amdgcn_s_prefetch_data(cp, 31);
283+
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2689,6 +2689,14 @@ def int_amdgcn_global_load_tr_b128 : AMDGPULoadIntrinsic<global_ptr_ty>;
26892689
def int_amdgcn_wave_id :
26902690
DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;
26912691

2692+
def int_amdgcn_s_prefetch_data :
2693+
Intrinsic<[],
2694+
[llvm_anyptr_ty, // Pointer to constant/global memory
2695+
llvm_i32_ty], // Length to prefetch 0-31 (1-32 chunks, units of 128 bytes)
2696+
[IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree],
2697+
"", [SDNPMemOperand]
2698+
>;
2699+
26922700
//===----------------------------------------------------------------------===//
26932701
// Deep learning intrinsics.
26942702
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5541,7 +5541,12 @@ void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
55415541
void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
55425542
const MachineInstr &MI,
55435543
int OpIdx) const {
5544-
MIB.addImm(MI.getOperand(OpIdx).getImm());
5544+
const MachineOperand &Op = MI.getOperand(OpIdx);
5545+
int64_t Imm;
5546+
if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
5547+
MIB.addImm(Imm);
5548+
else
5549+
MIB.addImm(Op.getImm());
55455550
}
55465551

55475552
void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3290,6 +3290,16 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
32903290
constrainOpWithReadfirstlane(B, MI, 2);
32913291
return;
32923292
}
3293+
case Intrinsic::amdgcn_s_prefetch_data: {
3294+
Register PtrReg = MI.getOperand(1).getReg();
3295+
unsigned AS = MRI.getType(PtrReg).getAddressSpace();
3296+
if (AMDGPU::isFlatGlobalAddrSpace(AS)) {
3297+
constrainOpWithReadfirstlane(B, MI, 1);
3298+
constrainOpWithReadfirstlane(B, MI, 2);
3299+
} else
3300+
MI.eraseFromParent();
3301+
return;
3302+
}
32933303
default: {
32943304
if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
32953305
AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -5151,6 +5161,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
51515161
}
51525162
case Intrinsic::amdgcn_pops_exiting_wave_id:
51535163
return getDefaultMappingSOP(MI);
5164+
case Intrinsic::amdgcn_s_prefetch_data: {
5165+
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
5166+
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
5167+
break;
5168+
}
51545169
default:
51555170
return getInvalidInstructionMapping();
51565171
}

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1430,6 +1430,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
14301430
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
14311431
return true;
14321432
}
1433+
case Intrinsic::amdgcn_s_prefetch_data: {
1434+
Info.opc = ISD::INTRINSIC_VOID;
1435+
Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1436+
Info.ptrVal = CI.getArgOperand(0);
1437+
Info.flags |= MachineMemOperand::MOLoad;
1438+
return true;
1439+
}
14331440
default:
14341441
return false;
14351442
}
@@ -9921,6 +9928,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
99219928
auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
99229929
return SDValue(NewMI, 0);
99239930
}
9931+
case Intrinsic::amdgcn_s_prefetch_data: {
9932+
// For non-global address space preserve the chain and remove the call.
9933+
if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
9934+
return Op.getOperand(0);
9935+
return Op;
9936+
}
99249937
default: {
99259938
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
99269939
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6231,7 +6231,7 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
62316231
SBase->setReg(SGPR);
62326232
}
62336233
MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6234-
if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
6234+
if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
62356235
Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
62366236
SOff->setReg(SGPR);
62376237
}

llvm/lib/Target/AMDGPU/SMInstructions.td

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1152,6 +1152,28 @@ multiclass SMPrefetchPat<string type, TImmLeaf cache_type> {
11521152
defm : SMPrefetchPat<"INST", i32imm_zero>;
11531153
defm : SMPrefetchPat<"DATA", i32imm_one>;
11541154

1155+
let SubtargetPredicate = isGFX12Plus in {
1156+
def : GCNPat <
1157+
(int_amdgcn_s_prefetch_data (SMRDImm i64:$sbase, i32:$offset), (i32 SReg_32:$len)),
1158+
(S_PREFETCH_DATA $sbase, $offset, $len, 0)
1159+
>;
1160+
1161+
def : GCNPat <
1162+
(int_amdgcn_s_prefetch_data (i64 SReg_64:$sbase), (i32 SReg_32:$len)),
1163+
(S_PREFETCH_DATA $sbase, 0, $len, 0)
1164+
>;
1165+
1166+
def : GCNPat <
1167+
(int_amdgcn_s_prefetch_data (SMRDImm i64:$sbase, i32:$offset), imm:$len),
1168+
(S_PREFETCH_DATA $sbase, $offset, (i32 SGPR_NULL), (as_i8timm $len))
1169+
>;
1170+
1171+
def : GCNPat <
1172+
(int_amdgcn_s_prefetch_data (i64 SReg_64:$sbase), imm:$len),
1173+
(S_PREFETCH_DATA $sbase, 0, (i32 SGPR_NULL), (as_i8timm $len))
1174+
>;
1175+
} // End let SubtargetPredicate = isGFX12Plus
1176+
11551177
//===----------------------------------------------------------------------===//
11561178
// GFX10.
11571179
//===----------------------------------------------------------------------===//
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GCN,SDAG %s
3+
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GCN,GISEL %s
4+
5+
define amdgpu_ps void @prefetch_data_sgpr_base_sgpr_len(ptr addrspace(4) inreg %ptr, i32 inreg %len) {
6+
; GCN-LABEL: prefetch_data_sgpr_base_sgpr_len:
7+
; GCN: ; %bb.0: ; %entry
8+
; GCN-NEXT: s_prefetch_data s[0:1], 0x0, s2, 0
9+
; GCN-NEXT: s_endpgm
10+
entry:
11+
tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len)
12+
ret void
13+
}
14+
15+
define amdgpu_ps void @prefetch_data_sgpr_imm_base_sgpr_len(ptr addrspace(4) inreg %ptr, i32 inreg %len) {
16+
; GCN-LABEL: prefetch_data_sgpr_imm_base_sgpr_len:
17+
; GCN: ; %bb.0: ; %entry
18+
; GCN-NEXT: s_prefetch_data s[0:1], 0x200, s2, 0
19+
; GCN-NEXT: s_endpgm
20+
entry:
21+
%gep = getelementptr i32, ptr addrspace(4) %ptr, i32 128
22+
tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %gep, i32 %len)
23+
ret void
24+
}
25+
26+
define amdgpu_ps void @prefetch_data_sgpr_base_imm_len(ptr addrspace(4) inreg %ptr) {
27+
; GCN-LABEL: prefetch_data_sgpr_base_imm_len:
28+
; GCN: ; %bb.0: ; %entry
29+
; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 31
30+
; GCN-NEXT: s_endpgm
31+
entry:
32+
tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 31)
33+
ret void
34+
}
35+
36+
define amdgpu_ps void @prefetch_data_sgpr_imm_base_imm_len(ptr addrspace(4) inreg %ptr) {
37+
; GCN-LABEL: prefetch_data_sgpr_imm_base_imm_len:
38+
; GCN: ; %bb.0: ; %entry
39+
; GCN-NEXT: s_prefetch_data s[0:1], 0x200, null, 31
40+
; GCN-NEXT: s_endpgm
41+
entry:
42+
%gep = getelementptr i32, ptr addrspace(4) %ptr, i32 128
43+
tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %gep, i32 31)
44+
ret void
45+
}
46+
47+
define amdgpu_ps void @prefetch_data_vgpr_base_sgpr_len(ptr addrspace(4) %ptr, i32 inreg %len) {
48+
; GCN-LABEL: prefetch_data_vgpr_base_sgpr_len:
49+
; GCN: ; %bb.0: ; %entry
50+
; GCN-NEXT: v_readfirstlane_b32 s2, v0
51+
; GCN-NEXT: v_readfirstlane_b32 s3, v1
52+
; GCN-NEXT: s_prefetch_data s[2:3], 0x0, s0, 0
53+
; GCN-NEXT: s_endpgm
54+
entry:
55+
tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len)
56+
ret void
57+
}
58+
59+
define amdgpu_ps void @prefetch_data_vgpr_imm_base_sgpr_len(ptr addrspace(4) %ptr, i32 inreg %len) {
60+
; SDAG-LABEL: prefetch_data_vgpr_imm_base_sgpr_len:
61+
; SDAG: ; %bb.0: ; %entry
62+
; SDAG-NEXT: v_readfirstlane_b32 s2, v0
63+
; SDAG-NEXT: v_readfirstlane_b32 s3, v1
64+
; SDAG-NEXT: s_prefetch_data s[2:3], 0x200, s0, 0
65+
; SDAG-NEXT: s_endpgm
66+
;
67+
; GISEL-LABEL: prefetch_data_vgpr_imm_base_sgpr_len:
68+
; GISEL: ; %bb.0: ; %entry
69+
; GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x200, v0
70+
; GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
71+
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
72+
; GISEL-NEXT: v_readfirstlane_b32 s2, v0
73+
; GISEL-NEXT: v_readfirstlane_b32 s3, v1
74+
; GISEL-NEXT: s_prefetch_data s[2:3], 0x0, s0, 0
75+
; GISEL-NEXT: s_endpgm
76+
entry:
77+
%gep = getelementptr i32, ptr addrspace(4) %ptr, i32 128
78+
tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %gep, i32 %len)
79+
ret void
80+
}
81+
82+
define amdgpu_ps void @prefetch_data_sgpr_base_vgpr_len(ptr addrspace(4) inreg %ptr, i32 %len) {
83+
; GCN-LABEL: prefetch_data_sgpr_base_vgpr_len:
84+
; GCN: ; %bb.0: ; %entry
85+
; GCN-NEXT: v_readfirstlane_b32 s2, v0
86+
; GCN-NEXT: s_prefetch_data s[0:1], 0x0, s2, 0
87+
; GCN-NEXT: s_endpgm
88+
entry:
89+
tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len)
90+
ret void
91+
}
92+
93+
define amdgpu_ps void @prefetch_data_sgpr_base_imm_len_global(ptr addrspace(1) inreg %ptr) {
94+
; GCN-LABEL: prefetch_data_sgpr_base_imm_len_global:
95+
; GCN: ; %bb.0: ; %entry
96+
; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 31
97+
; GCN-NEXT: s_endpgm
98+
entry:
99+
tail call void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1) %ptr, i32 31)
100+
ret void
101+
}
102+
103+
define amdgpu_ps void @prefetch_data_sgpr_base_imm_len_flat(ptr inreg %ptr) {
104+
; GCN-LABEL: prefetch_data_sgpr_base_imm_len_flat:
105+
; GCN: ; %bb.0: ; %entry
106+
; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 31
107+
; GCN-NEXT: s_endpgm
108+
entry:
109+
tail call void @llvm.amdgcn.s.prefetch.data.p0(ptr %ptr, i32 31)
110+
ret void
111+
}
112+
113+
define amdgpu_ps void @prefetch_data_sgpr_base_imm_len_local(ptr addrspace(3) inreg %ptr) {
114+
; GCN-LABEL: prefetch_data_sgpr_base_imm_len_local:
115+
; GCN: ; %bb.0: ; %entry
116+
; GCN-NEXT: s_endpgm
117+
entry:
118+
tail call void @llvm.amdgcn.s.prefetch.data.p3(ptr addrspace(3) %ptr, i32 31)
119+
ret void
120+
}
121+
122+
define amdgpu_ps void @prefetch_data_vgpr_base_imm_len(ptr addrspace(4) %ptr) {
123+
; GCN-LABEL: prefetch_data_vgpr_base_imm_len:
124+
; GCN: ; %bb.0: ; %entry
125+
; GCN-NEXT: v_readfirstlane_b32 s0, v0
126+
; GCN-NEXT: v_readfirstlane_b32 s1, v1
127+
; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
128+
; GCN-NEXT: s_endpgm
129+
entry:
130+
tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 0)
131+
ret void
132+
}
133+
134+
declare void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len)
135+
declare void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1) %ptr, i32 %len)
136+
declare void @llvm.amdgcn.s.prefetch.data.p0(ptr %ptr, i32 %len)

0 commit comments

Comments
 (0)