[AMDGPU] GFX12: select @llvm.prefetch intrinsic #74576
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Mariusz Sikora (mariusz-sikora-at-amd)

Changes

Patch is 25.26 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/74576.diff

8 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 62996a3b3fb79..f0b3ed7adc294 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3101,6 +3101,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(B, MI, 8); // M0
return;
+ case Intrinsic::prefetch: {
+ if (!Subtarget.hasPrefetch()) {
+ MI.eraseFromParent();
+ return;
+ }
+ unsigned PtrBank =
+ getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID);
+ if (PtrBank == AMDGPU::VGPRRegBankID) {
+ MI.eraseFromParent();
+ return;
+ }
+ // FIXME: There is currently no support for prefetch in global isel.
+ // There is no node equivalence and what's worse there is no MMO produced
+ // for a prefetch on global isel path.
+ // Prefetch does not affect execution so erase it for now.
+ MI.eraseFromParent();
+ return;
+ }
default: {
if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -4830,6 +4848,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
break;
}
+ case Intrinsic::prefetch:
+ OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+ break;
default:
return getInvalidInstructionMapping();
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 94b9e49b765a6..21a9b8147034f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -828,6 +828,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasInstPrefetch() const { return getGeneration() >= GFX10; }
+ bool hasPrefetch() const { return GFX12Insts; }
+
// Scratch is allocated in 256 dword per wave blocks for the entire
// wavefront. When viewed from the perspective of an arbitrary workitem, this
// is 4-byte aligned.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a7f4d63229b7e..93af38d877c5d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -763,6 +763,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->hasMad64_32())
setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
+ if (Subtarget->hasPrefetch())
+ setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
MVT::v2i16, MVT::v2f16, MVT::i128},
@@ -3858,6 +3861,23 @@ SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
}
+SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
+ if (Op->isDivergent())
+ return SDValue();
+
+ switch (cast<MemSDNode>(Op)->getAddressSpace()) {
+ case AMDGPUAS::FLAT_ADDRESS:
+ case AMDGPUAS::GLOBAL_ADDRESS:
+ case AMDGPUAS::CONSTANT_ADDRESS:
+ case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
+ break;
+ default:
+ return SDValue();
+ }
+
+ return Op;
+}
+
Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const {
Register Reg = StringSwitch<Register>(RegName)
@@ -5395,6 +5415,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerSTACKSAVE(Op, DAG);
case ISD::GET_ROUNDING:
return lowerGET_ROUNDING(Op, DAG);
+ case ISD::PREFETCH:
+ return lowerPREFETCH(Op, DAG);
}
return SDValue();
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index c9cc149218a99..5bc091d6e84de 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -416,6 +416,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
+
Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index b5b456d691254..8e96d5f8abe15 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -483,6 +483,8 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth(
Offset = OffsetOp ? OffsetOp->getImm() : 0;
// Get appropriate operand, and compute width accordingly.
DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
+ if (DataOpIdx == -1)
+ return false;
Width = getOpSize(LdSt, DataOpIdx);
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 9362fe5d9678b..6d513fb0bdecc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3164,6 +3164,18 @@ def : GCNPat <
(as_i1timm $bound_ctrl))
>;
+class SMPrefetchGetPcPat<string type, int cache_type> : GCNPat <
+ (prefetch (i64 imm:$offset), timm, timm, (i32 cache_type)),
+ (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) (S_ADD_U64_PSEUDO (S_GETPC_B64), $offset),
+ (i32 20), (i32 SGPR_NULL), (i8 0))
+ // Offset 20 should roughly adjust getpc sequence length.
+ > {
+ let AddedComplexity = 9;
+}
+
+def : SMPrefetchGetPcPat<"INST", 0>;
+def : SMPrefetchGetPcPat<"DATA", 1>;
+
//===----------------------------------------------------------------------===//
// Fract Patterns
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index c18846483cf95..a77856caae7a6 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -814,6 +814,14 @@ def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformL
}];
}
+def smrd_prefetch : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
+ (prefetch node:$ptr, node:$rw, node:$loc, node:$type),
+ [{ return !N->getOperand(1)->isDivergent();}]> {
+ let GISelPredicateCode = [{
+ return isInstrUniform(MI);
+ }];
+}
+
def SMRDImm : ComplexPattern<iPTR, 2, "SelectSMRDImm">;
def SMRDImm32 : ComplexPattern<iPTR, 2, "SelectSMRDImm32">;
def SMRDSgpr : ComplexPattern<iPTR, 2, "SelectSMRDSgpr">;
@@ -959,6 +967,32 @@ def : GCNPat <
}
} // let OtherPredicates = [HasShaderCyclesRegister]
+def SIMM24bitPtr : ImmLeaf <iPTR,
+ [{return isInt<24>(Imm);}]
+>;
+
+multiclass SMPrefetchPat<string type, int cache_type> {
+ def : GCNPat <
+ (smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, (i32 cache_type)),
+ (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) $sbase, $offset, (i32 SGPR_NULL), (i8 0))
+ >;
+
+ def : GCNPat <
+ (smrd_prefetch (i64 SReg_64:$sbase), timm, timm, (i32 cache_type)),
+ (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) $sbase, 0, (i32 SGPR_NULL), (i8 0))
+ >;
+
+ def : GCNPat <
+ (prefetch SIMM24bitPtr:$offset, timm, timm, (i32 cache_type)),
+ (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type#"_PC_REL") (as_i32timm $offset), (i32 SGPR_NULL), (i8 0))
+ > {
+ let AddedComplexity = 10;
+ }
+}
+
+defm : SMPrefetchPat<"INST", 0>;
+defm : SMPrefetchPat<"DATA", 1>;
+
//===----------------------------------------------------------------------===//
// GFX10.
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
new file mode 100644
index 0000000000000..bca76770953b9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
@@ -0,0 +1,496 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX12-SDAG %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX12-GISEL %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
+
+; Scalar data prefetch
+
+define amdgpu_ps void @prefetch_data_sgpr(ptr addrspace(4) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_data_sgpr:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_sgpr:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_sgpr:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %ptr, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_offset(ptr addrspace(4) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_data_sgpr_offset:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x200, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_sgpr_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_sgpr_offset:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr float, ptr addrspace(4) %ptr, i32 128
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_pc_rel() {
+; GFX12-SDAG-LABEL: prefetch_data_pc_rel:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_data_pc_rel 0x0, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_pc_rel:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_pc_rel:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) null, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_pc_rel_offset() {
+; GFX12-SDAG-LABEL: prefetch_data_pc_rel_offset:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_data_pc_rel 0x200, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_pc_rel_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_pc_rel_offset:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr float, ptr addrspace(4) null, i32 128
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
+ ret void
+}
+
+; Check large offsets
+
+define amdgpu_ps void @prefetch_data_sgpr_max_offset(ptr addrspace(4) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_data_sgpr_max_offset:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x7fffff, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_sgpr_max_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_sgpr_max_offset:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388607
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_min_offset(ptr addrspace(4) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_data_sgpr_min_offset:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], -0x800000, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_sgpr_min_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_sgpr_min_offset:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 -8388608
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_too_large_offset(ptr addrspace(4) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_data_sgpr_too_large_offset:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_add_co_u32 s0, s0, 0x800000
+; GFX12-SDAG-NEXT: s_add_co_ci_u32 s1, s1, 0
+; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_sgpr_too_large_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_sgpr_too_large_offset:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388608
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_pc_rel_max_offset() {
+; GFX12-SDAG-LABEL: prefetch_data_pc_rel_max_offset:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_data_pc_rel 0x7fffff, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_pc_rel_max_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_pc_rel_max_offset:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(4) null, i32 8388607
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_pc_rel_min_offset() {
+; GFX12-SDAG-LABEL: prefetch_data_pc_rel_min_offset:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_data_pc_rel -0x800000, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_pc_rel_min_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_pc_rel_min_offset:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(4) null, i32 -8388608
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_pc_rel_too_large_offset() {
+; GFX12-SDAG-LABEL: prefetch_data_pc_rel_too_large_offset:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_getpc_b64 s[0:1]
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_add_co_u32 s0, s0, 0x800000
+; GFX12-SDAG-NEXT: s_add_co_ci_u32 s1, s1, 0
+; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x14, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_pc_rel_too_large_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_pc_rel_too_large_offset:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(4) null, i32 8388608
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
+ ret void
+}
+
+; Check divergent address
+
+define amdgpu_ps void @prefetch_data_vgpr(ptr addrspace(1) %ptr) {
+; GCN-LABEL: prefetch_data_vgpr:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 0, i32 0, i32 1)
+ ret void
+}
+
+; Check LDS and Scratch, we cannot prefetch it
+
+define amdgpu_ps void @prefetch_data_lds(ptr addrspace(3) inreg %ptr) {
+; GCN-LABEL: prefetch_data_lds:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p3(ptr addrspace(3) %ptr, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_scratch(ptr addrspace(5) inreg %ptr) {
+; GCN-LABEL: prefetch_data_scratch:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p5(ptr addrspace(5) %ptr, i32 0, i32 0, i32 1)
+ ret void
+}
+
+; Check supported address spaces
+
+define amdgpu_ps void @prefetch_data_sgpr_flat(ptr inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_data_sgpr_flat:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_sgpr_flat:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_sgpr_flat:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.pf(ptr %ptr, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_global(ptr addrspace(1) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_data_sgpr_global:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_sgpr_global:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_sgpr_global:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_constant_32bit(ptr addrspace(6) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_data_sgpr_constant_32bit:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_mov_b32 s1, 0
+; GFX12-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_data_sgpr_constant_32bit:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_data_sgpr_constant_32bit:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p6(ptr addrspace(6) %ptr, i32 0, i32 0, i32 1)
+ ret void
+}
+
+; I$ prefetch
+
+define amdgpu_ps void @prefetch_inst_sgpr(ptr addrspace(4) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_inst_sgpr:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_inst_sgpr:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_inst_sgpr:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %ptr, i32 0, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_inst_sgpr_offset(ptr addrspace(4) inreg %ptr) {
+; GFX12-SDAG-LABEL: prefetch_inst_sgpr_offset:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_inst s[0:1], 0x80, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_inst_sgpr_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_inst_sgpr_offset:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 128
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_inst_pc_rel() {
+; GFX12-SDAG-LABEL: prefetch_inst_pc_rel:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_prefetch_inst_pc_rel 0x0, null, 0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX11-LABEL: prefetch_inst_pc_rel:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: prefetch_inst_pc_rel:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p4(ptr addrspace(4) null, i32 0, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_inst_pc_rel_offset() {
+; GFX12-SDAG-LABEL: prefetch_inst_pc_rel_offset:
+; GFX12-SDAG: ; %bb.0:...
[truncated]
def : GCNPat <
  (prefetch SIMM24bitPtr:$offset, timm, timm, (i32 cache_type)),
  (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type#"_PC_REL") (as_i32timm $offset), (i32 SGPR_NULL), (i8 0))
> {
  let AddedComplexity = 10;
}
I don't understand this pattern. (prefetch 0x12345678, 0, 0, 0) should prefetch from the absolute address 0x12345678, but this pattern selects S_PREFETCH_*_PC_REL 0x12345678, null, 0, which will prefetch from pc+0x12345678, which is different. @rampitec can you explain?
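(For reference, the absolute-address case under discussion would look roughly like this in IR; the constant and function name here are illustrative only.)
declare void @llvm.prefetch.p0(ptr, i32, i32, i32)

define void @prefetch_fixed_address() {
entry:
  ; Prefetch from the fixed address 0x12345678 (305419896) with
  ; rw = 0 (read), locality = 0, cache type = 0 (instruction cache).
  tail call void @llvm.prefetch.p0(ptr inttoptr (i64 305419896 to ptr), i32 0, i32 0, i32 0)
  ret void
}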
If you do not have a pointer (essentially you provide null to the prefetch intrinsic as the base pointer), this pc_rel pattern will be used. It may have no value as a data prefetch, but it makes sense as an instruction prefetch, say if you want to prefetch the next page of code. Like this:
define amdgpu_ps void @prefetch_inst_pc_rel_offset() {
; GFX12-SDAG-LABEL: prefetch_inst_pc_rel_offset:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_prefetch_inst_pc_rel 0x80, null, 0
; GFX12-SDAG-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr addrspace(4) null, i32 128
tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)
ret void
}
I would interpret this as using the absolute address; you would need something else to represent a PC-relative input.
Maybe for now I will remove the PC_REL part.
Prefetch on an absolute address is practically useless.
But that is how llvm.prefetch is defined: "address is the address to be prefetched". A different operation should use a different intrinsic.
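(For reference, the generic intrinsic's operands, per the LangRef, are the address to prefetch, rw (0 = read, 1 = write), locality (0 = none, up to 3 = extremely local), and cache type (0 = instruction, 1 = data). A minimal data-prefetch sketch mirroring the tests in this patch:)
declare void @llvm.prefetch.p4(ptr addrspace(4), i32, i32, i32)

define amdgpu_ps void @data_prefetch(ptr addrspace(4) inreg %ptr) {
entry:
  ; Read prefetch of a uniform constant-address-space pointer into the data cache.
  tail call void @llvm.prefetch.p4(ptr addrspace(4) %ptr, i32 0, i32 0, i32 1)
  ret void
}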
So you want a target intrinsic?
I really don't know. What would the use cases look like? Maybe it could be a generic intrinsic, if there is consensus that it is useful.
For the existing llvm.prefetch intrinsic, the only useful case I can think of for instruction prefetching is:
define @f0() {
call @llvm.prefetch(@f1, ...)
...
call @f1()
}
define @f1() { ... }
to prefetch the code at the start of a function you are going to call. We could codegen that case using the _pc_rel form of the instruction.
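(A complete IR version of that sketch might look like the following; the function names are hypothetical.)
declare void @llvm.prefetch.p0(ptr, i32, i32, i32)
declare void @f1()

define void @f0() {
entry:
  ; Prefetch the instructions at the start of @f1 before calling it
  ; (cache type 0 selects the instruction cache).
  tail call void @llvm.prefetch.p0(ptr @f1, i32 0, i32 0, i32 0)
  call void @f1()
  ret void
}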
I do not think we need to use the PC_REL form to prefetch a function's address. The instruction can take a full 64-bit address, so one can just use that address. My understanding is that the PC_REL form can be useful if you expect something like a huge loop or a local branch and want to prefetch, say, 1K ahead of the PC. I am not sure, though, how useful this is at a high-language level or even in IR.
@@ -3101,6 +3101,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(B, MI, 8); // M0
return;
case Intrinsic::prefetch: {
This should have a G_ instruction.
Why erase here and not in the legalizer?
It should have G_PREFETCH, but we do not currently have it.
And I honestly do not remember why I erase it here. Likely because I am erasing it here for the VGPR bank anyway.