-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU] Set inst_pref_size to maximum #126981
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Set inst_pref_size to maximum #126981
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-backend-amdgpu Author: Stanislav Mekhanoshin (rampitec) Changes: On gfx11 and gfx12 set initial instruction prefetch size to a minimum of kernel size and maximum allowed value. Fixes: SWDEV-513122 Full diff: https://github.com/llvm/llvm-project/pull/126981.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 031d8f0560ff2..d1d5b9a79ec5a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -622,12 +622,13 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
int64_t PGRM_Rsrc3 = 1;
bool EvaluatableRsrc3 =
- CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(PGRM_Rsrc3);
+ CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGRM_Rsrc3);
(void)PGRM_Rsrc3;
(void)EvaluatableRsrc3;
- assert(STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
+ assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
+ STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
static_cast<uint64_t>(PGRM_Rsrc3) == 0);
- KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A;
+ KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;
KernelDescriptor.kernarg_preload = MCConstantExpr::create(
AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
@@ -748,7 +749,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
OutContext, IsLocal)
->getVariableValue(),
- getFunctionCodeSize(MF), MFI);
+ CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
return false;
}
@@ -757,7 +758,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
CurrentProgramInfo.NumArchVGPR,
STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
- CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
+ CurrentProgramInfo.ScratchSize,
+ CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
OutStreamer->emitRawComment(
" FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
@@ -821,22 +823,22 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
false);
[[maybe_unused]] int64_t PGMRSrc3;
- assert(STM.hasGFX90AInsts() ||
- (CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(
- PGMRSrc3) &&
+ assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
+ STM.hasGFX90AInsts() ||
+ (CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
static_cast<uint64_t>(PGMRSrc3) == 0));
if (STM.hasGFX90AInsts()) {
OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
getMCExprStr(MCKernelDescriptor::bits_get(
- CurrentProgramInfo.ComputePGMRSrc3GFX90A,
+ CurrentProgramInfo.ComputePGMRSrc3,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
false);
OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
getMCExprStr(MCKernelDescriptor::bits_get(
- CurrentProgramInfo.ComputePGMRSrc3GFX90A,
+ CurrentProgramInfo.ComputePGMRSrc3,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
false);
@@ -893,27 +895,6 @@ void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
}
}
-uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
- const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = STM.getInstrInfo();
-
- uint64_t CodeSize = 0;
-
- for (const MachineBasicBlock &MBB : MF) {
- for (const MachineInstr &MI : MBB) {
- // TODO: CodeSize should account for multiple functions.
-
- // TODO: Should we count size of debug info?
- if (MI.isDebugInstr())
- continue;
-
- CodeSize += TII->getInstSizeInBytes(MI);
- }
- }
-
- return CodeSize;
-}
-
// AccumOffset computed for the MCExpr equivalent of:
// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
@@ -1249,24 +1230,24 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
ProgInfo.EXCPEnable = 0;
+ // return ((Dst & ~Mask) | (Value << Shift))
+ auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
+ uint32_t Shift) {
+ const auto *Shft = MCConstantExpr::create(Shift, Ctx);
+ const auto *Msk = MCConstantExpr::create(Mask, Ctx);
+ Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
+ Dst = MCBinaryExpr::createOr(Dst, MCBinaryExpr::createShl(Value, Shft, Ctx),
+ Ctx);
+ return Dst;
+ };
+
if (STM.hasGFX90AInsts()) {
- // return ((Dst & ~Mask) | (Value << Shift))
- auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
- uint32_t Shift) {
- const auto *Shft = MCConstantExpr::create(Shift, Ctx);
- const auto *Msk = MCConstantExpr::create(Mask, Ctx);
- Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
- Dst = MCBinaryExpr::createOr(
- Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
- return Dst;
- };
-
- ProgInfo.ComputePGMRSrc3GFX90A =
- SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset,
+ ProgInfo.ComputePGMRSrc3 =
+ SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
- ProgInfo.ComputePGMRSrc3GFX90A =
- SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit),
+ ProgInfo.ComputePGMRSrc3 =
+ SetBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
}
@@ -1287,6 +1268,26 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
", final occupancy is " + Twine(Occupancy));
F.getContext().diagnose(Diag);
}
+
+ if (isGFX11Plus(STM)) {
+ uint32_t CodeSizeInBytes =
+ (uint32_t)std::min(ProgInfo.getFunctionCodeSize(MF),
+ (uint64_t)std::numeric_limits<uint32_t>::max());
+ uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128);
+ uint32_t Field, Shift, Width;
+ if (isGFX11(STM)) {
+ Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
+ Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
+ Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
+ } else {
+ Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
+ Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
+ Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
+ }
+ uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1);
+ ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3,
+ CreateExpr(InstPrefSize), Field, Shift);
+ }
}
static unsigned getRsrcReg(CallingConv::ID CallConv) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index cc8c4411805e2..2c959d7dbbd07 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -50,8 +50,6 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
MCCodeEmitter *DumpCodeInstEmitter = nullptr;
- uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
-
void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
void getAmdKernelCode(AMDGPU::AMDGPUMCKernelCodeT &Out,
const SIProgramInfo &KernelInfo,
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
index 212edff097837..85ea3a9e17e09 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -27,6 +27,8 @@ void SIProgramInfo::reset(const MachineFunction &MF) {
const MCExpr *ZeroExpr = MCConstantExpr::create(0, Ctx);
+ CodeSizeInBytes.reset();
+
VGPRBlocks = ZeroExpr;
SGPRBlocks = ZeroExpr;
Priority = 0;
@@ -55,7 +57,7 @@ void SIProgramInfo::reset(const MachineFunction &MF) {
LdsSize = 0;
EXCPEnable = 0;
- ComputePGMRSrc3GFX90A = ZeroExpr;
+ ComputePGMRSrc3 = ZeroExpr;
NumVGPR = ZeroExpr;
NumArchVGPR = ZeroExpr;
@@ -199,3 +201,28 @@ const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC,
return MCConstantExpr::create(0, Ctx);
}
+
+uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) {
+ if (!CodeSizeInBytes.has_value()) {
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = STM.getInstrInfo();
+
+ uint64_t CodeSize = 0;
+
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ // TODO: CodeSize should account for multiple functions.
+
+ // TODO: Should we count size of debug info?
+ if (MI.isDebugInstr())
+ continue;
+
+ CodeSize += TII->getInstSizeInBytes(MI);
+ }
+ }
+
+ CodeSizeInBytes = CodeSize;
+ }
+
+ return *CodeSizeInBytes;
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index c358a2d9db10b..2836af033d4a4 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -19,6 +19,7 @@
#include "llvm/IR/CallingConv.h"
#include "llvm/Support/Compiler.h"
#include <cstdint>
+#include <optional>
namespace llvm {
@@ -29,6 +30,8 @@ class MachineFunction;
/// Track resource usage for kernels / entry functions.
struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
+ std::optional<uint64_t> CodeSizeInBytes;
+
// Fields set in PGM_RSRC1 pm4 packet.
const MCExpr *VGPRBlocks = nullptr;
const MCExpr *SGPRBlocks = nullptr;
@@ -60,7 +63,7 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
uint32_t LdsSize = 0;
uint32_t EXCPEnable = 0;
- const MCExpr *ComputePGMRSrc3GFX90A = nullptr;
+ const MCExpr *ComputePGMRSrc3 = nullptr;
const MCExpr *NumVGPR = nullptr;
const MCExpr *NumArchVGPR = nullptr;
@@ -97,6 +100,9 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
// non-MCExpr members.
void reset(const MachineFunction &MF);
+ // Get function code size and cache the value.
+ uint64_t getFunctionCodeSize(const MachineFunction &MF);
+
/// Compute the value of the ComputePGMRsrc1 register.
const MCExpr *getComputePGMRSrc1(const GCNSubtarget &ST,
MCContext &Ctx) const;
diff --git a/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll b/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll
new file mode 100644
index 0000000000000..671352446ca31
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX12 %s
+
+; GCN-LABEL: .amdhsa_kernel large
+; GFX11: .amdhsa_inst_pref_size 3
+; GFX11: codeLenInByte = 3{{[0-9][0-9]$}}
+; GFX12: .amdhsa_inst_pref_size 4
+; GFX12: codeLenInByte = 4{{[0-9][0-9]$}}
+define amdgpu_kernel void @large(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+bb:
+ call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 256, i1 false)
+ ret void
+}
+
+; GCN-LABEL: .amdhsa_kernel small
+; GCN: .amdhsa_inst_pref_size 1
+; GCN: codeLenInByte = {{[0-9]$}}
+define amdgpu_kernel void @small() {
+bb:
+ ret void
+}
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
@@ -199,3 +201,28 @@ const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC, | |||
|
|||
return MCConstantExpr::create(0, Ctx); | |||
} | |||
|
|||
uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This function isn't even correct. We really should fix this to emit the MC resolved code size. Can we do that now?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wanted to look at this separately. Right now the problem is that AsmPrinter emits the function end label in an incorrect place: it actually lands in the kernel descriptor in .rodata, which is not even the right section. Fixing that will take more work and is really a separate thing, but once it is fixed I could replace this with an MCExpr. I.e., I could emit a separate end label now, but that is also a hack.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hm... I have the draft, but the output comes out as
.amdhsa_inst_pref_size ((((255*((((.Lfunc_end0-large)+127)/128)>255))+((((.Lfunc_end0-large)+127)/128)*((((.Lfunc_end0-large)+127)/128)<=255)))<<4)&4080)>>4
I am not sure we really want this.
ddd7760
to
6efc6a3
Compare
6efc6a3
to
f55b179
Compare
912528b
to
d7bfaee
Compare
I had to change the code to use a lower bound for the code size estimate, to avoid prefetching unmapped memory. |
ping |
On gfx11 and gfx12 set initial instruction prefetch size to a minimum of kernel size and maximum allowed value. Fixes: SWDEV-513122
d7bfaee
to
35ab461
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Our instruction size estimates aren't super reliable either
On gfx11 and gfx12 set initial instruction prefetch size to a minimum of kernel size and maximum allowed value. Fixes: SWDEV-513122
On gfx11 and gfx12 set initial instruction prefetch size to a
minimum of kernel size and maximum allowed value.
Fixes: SWDEV-513122