Skip to content

Commit 6efc6a3

Browse files
committed
[AMDGPU] Set inst_pref_size to maximum
On gfx11 and gfx12, set the initial instruction prefetch size (inst_pref_size) to the smaller of the kernel code size, measured in 128-byte lines and rounded up, and the maximum value the field can hold. Fixes: SWDEV-513122
1 parent 3141afa commit 6efc6a3

File tree

5 files changed

+103
-50
lines changed

5 files changed

+103
-50
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 47 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -622,12 +622,13 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
622622

623623
int64_t PGRM_Rsrc3 = 1;
624624
bool EvaluatableRsrc3 =
625-
CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(PGRM_Rsrc3);
625+
CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGRM_Rsrc3);
626626
(void)PGRM_Rsrc3;
627627
(void)EvaluatableRsrc3;
628-
assert(STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
628+
assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
629+
STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
629630
static_cast<uint64_t>(PGRM_Rsrc3) == 0);
630-
KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A;
631+
KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;
631632

632633
KernelDescriptor.kernarg_preload = MCConstantExpr::create(
633634
AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
@@ -748,7 +749,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
748749
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
749750
OutContext, IsLocal)
750751
->getVariableValue(),
751-
getFunctionCodeSize(MF), MFI);
752+
CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
752753
return false;
753754
}
754755

@@ -757,7 +758,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
757758
CurrentProgramInfo.NumArchVGPR,
758759
STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
759760
CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
760-
CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
761+
CurrentProgramInfo.ScratchSize,
762+
CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
761763

762764
OutStreamer->emitRawComment(
763765
" FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
@@ -821,22 +823,22 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
821823
false);
822824

823825
[[maybe_unused]] int64_t PGMRSrc3;
824-
assert(STM.hasGFX90AInsts() ||
825-
(CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(
826-
PGMRSrc3) &&
826+
assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
827+
STM.hasGFX90AInsts() ||
828+
(CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
827829
static_cast<uint64_t>(PGMRSrc3) == 0));
828830
if (STM.hasGFX90AInsts()) {
829831
OutStreamer->emitRawComment(
830832
" COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
831833
getMCExprStr(MCKernelDescriptor::bits_get(
832-
CurrentProgramInfo.ComputePGMRSrc3GFX90A,
834+
CurrentProgramInfo.ComputePGMRSrc3,
833835
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
834836
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
835837
false);
836838
OutStreamer->emitRawComment(
837839
" COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
838840
getMCExprStr(MCKernelDescriptor::bits_get(
839-
CurrentProgramInfo.ComputePGMRSrc3GFX90A,
841+
CurrentProgramInfo.ComputePGMRSrc3,
840842
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
841843
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
842844
false);
@@ -893,27 +895,6 @@ void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
893895
}
894896
}
895897

896-
uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
897-
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
898-
const SIInstrInfo *TII = STM.getInstrInfo();
899-
900-
uint64_t CodeSize = 0;
901-
902-
for (const MachineBasicBlock &MBB : MF) {
903-
for (const MachineInstr &MI : MBB) {
904-
// TODO: CodeSize should account for multiple functions.
905-
906-
// TODO: Should we count size of debug info?
907-
if (MI.isDebugInstr())
908-
continue;
909-
910-
CodeSize += TII->getInstSizeInBytes(MI);
911-
}
912-
}
913-
914-
return CodeSize;
915-
}
916-
917898
// AccumOffset computed for the MCExpr equivalent of:
918899
// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
919900
static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
@@ -1249,24 +1230,24 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
12491230
ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
12501231
ProgInfo.EXCPEnable = 0;
12511232

1233+
// return ((Dst & ~Mask) | (Value << Shift))
1234+
auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
1235+
uint32_t Shift) {
1236+
const auto *Shft = MCConstantExpr::create(Shift, Ctx);
1237+
const auto *Msk = MCConstantExpr::create(Mask, Ctx);
1238+
Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
1239+
Dst = MCBinaryExpr::createOr(Dst, MCBinaryExpr::createShl(Value, Shft, Ctx),
1240+
Ctx);
1241+
return Dst;
1242+
};
1243+
12521244
if (STM.hasGFX90AInsts()) {
1253-
// return ((Dst & ~Mask) | (Value << Shift))
1254-
auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
1255-
uint32_t Shift) {
1256-
const auto *Shft = MCConstantExpr::create(Shift, Ctx);
1257-
const auto *Msk = MCConstantExpr::create(Mask, Ctx);
1258-
Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
1259-
Dst = MCBinaryExpr::createOr(
1260-
Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
1261-
return Dst;
1262-
};
1263-
1264-
ProgInfo.ComputePGMRSrc3GFX90A =
1265-
SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset,
1245+
ProgInfo.ComputePGMRSrc3 =
1246+
SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
12661247
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
12671248
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
1268-
ProgInfo.ComputePGMRSrc3GFX90A =
1269-
SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit),
1249+
ProgInfo.ComputePGMRSrc3 =
1250+
SetBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
12701251
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
12711252
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
12721253
}
@@ -1287,6 +1268,26 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
12871268
", final occupancy is " + Twine(Occupancy));
12881269
F.getContext().diagnose(Diag);
12891270
}
1271+
1272+
if (isGFX11Plus(STM)) {
1273+
uint32_t CodeSizeInBytes =
1274+
(uint32_t)std::min(ProgInfo.getFunctionCodeSize(MF),
1275+
(uint64_t)std::numeric_limits<uint32_t>::max());
1276+
uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128);
1277+
uint32_t Field, Shift, Width;
1278+
if (isGFX11(STM)) {
1279+
Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
1280+
Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
1281+
Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
1282+
} else {
1283+
Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
1284+
Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
1285+
Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
1286+
}
1287+
uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1);
1288+
ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3,
1289+
CreateExpr(InstPrefSize), Field, Shift);
1290+
}
12901291
}
12911292

12921293
static unsigned getRsrcReg(CallingConv::ID CallConv) {

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,6 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
5050

5151
MCCodeEmitter *DumpCodeInstEmitter = nullptr;
5252

53-
uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
54-
5553
void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
5654
void getAmdKernelCode(AMDGPU::AMDGPUMCKernelCodeT &Out,
5755
const SIProgramInfo &KernelInfo,

llvm/lib/Target/AMDGPU/SIProgramInfo.cpp

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ void SIProgramInfo::reset(const MachineFunction &MF) {
2727

2828
const MCExpr *ZeroExpr = MCConstantExpr::create(0, Ctx);
2929

30+
CodeSizeInBytes.reset();
31+
3032
VGPRBlocks = ZeroExpr;
3133
SGPRBlocks = ZeroExpr;
3234
Priority = 0;
@@ -55,7 +57,7 @@ void SIProgramInfo::reset(const MachineFunction &MF) {
5557
LdsSize = 0;
5658
EXCPEnable = 0;
5759

58-
ComputePGMRSrc3GFX90A = ZeroExpr;
60+
ComputePGMRSrc3 = ZeroExpr;
5961

6062
NumVGPR = ZeroExpr;
6163
NumArchVGPR = ZeroExpr;
@@ -199,3 +201,28 @@ const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC,
199201

200202
return MCConstantExpr::create(0, Ctx);
201203
}
204+
205+
uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) {
206+
if (!CodeSizeInBytes.has_value()) {
207+
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
208+
const SIInstrInfo *TII = STM.getInstrInfo();
209+
210+
uint64_t CodeSize = 0;
211+
212+
for (const MachineBasicBlock &MBB : MF) {
213+
for (const MachineInstr &MI : MBB) {
214+
// TODO: CodeSize should account for multiple functions.
215+
216+
// TODO: Should we count size of debug info?
217+
if (MI.isDebugInstr())
218+
continue;
219+
220+
CodeSize += TII->getInstSizeInBytes(MI);
221+
}
222+
}
223+
224+
CodeSizeInBytes = CodeSize;
225+
}
226+
227+
return *CodeSizeInBytes;
228+
}

llvm/lib/Target/AMDGPU/SIProgramInfo.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "llvm/IR/CallingConv.h"
2020
#include "llvm/Support/Compiler.h"
2121
#include <cstdint>
22+
#include <optional>
2223

2324
namespace llvm {
2425

@@ -29,6 +30,8 @@ class MachineFunction;
2930

3031
/// Track resource usage for kernels / entry functions.
3132
struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
33+
std::optional<uint64_t> CodeSizeInBytes;
34+
3235
// Fields set in PGM_RSRC1 pm4 packet.
3336
const MCExpr *VGPRBlocks = nullptr;
3437
const MCExpr *SGPRBlocks = nullptr;
@@ -60,7 +63,7 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
6063
uint32_t LdsSize = 0;
6164
uint32_t EXCPEnable = 0;
6265

63-
const MCExpr *ComputePGMRSrc3GFX90A = nullptr;
66+
const MCExpr *ComputePGMRSrc3 = nullptr;
6467

6568
const MCExpr *NumVGPR = nullptr;
6669
const MCExpr *NumArchVGPR = nullptr;
@@ -97,6 +100,9 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
97100
// non-MCExpr members.
98101
void reset(const MachineFunction &MF);
99102

103+
// Get function code size and cache the value.
104+
uint64_t getFunctionCodeSize(const MachineFunction &MF);
105+
100106
/// Compute the value of the ComputePGMRsrc1 register.
101107
const MCExpr *getComputePGMRSrc1(const GCNSubtarget &ST,
102108
MCContext &Ctx) const;
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX11 %s
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX12 %s
3+
4+
; GCN-LABEL: .amdhsa_kernel large
5+
; GFX11: .amdhsa_inst_pref_size 3
6+
; GFX11: codeLenInByte = 3{{[0-9][0-9]$}}
7+
; GFX12: .amdhsa_inst_pref_size 4
8+
; GFX12: codeLenInByte = 4{{[0-9][0-9]$}}
9+
define amdgpu_kernel void @large(ptr addrspace(1) %out, ptr addrspace(1) %in) {
10+
bb:
11+
call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 256, i1 false)
12+
ret void
13+
}
14+
15+
; GCN-LABEL: .amdhsa_kernel small
16+
; GCN: .amdhsa_inst_pref_size 1
17+
; GCN: codeLenInByte = {{[0-9]$}}
18+
define amdgpu_kernel void @small() {
19+
bb:
20+
ret void
21+
}

0 commit comments

Comments
 (0)