Skip to content

Commit d7bfaee

Browse files
committed
[AMDGPU] Set inst_pref_size to maximum
On gfx11 and gfx12, set the initial instruction prefetch size to the minimum of the kernel code size (in 128-byte lines) and the maximum value the field allows. Fixes: SWDEV-513122
1 parent 5bf3748 commit d7bfaee

File tree

4 files changed

+92
-29
lines changed

4 files changed

+92
-29
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 44 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -622,12 +622,13 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
622622

623623
int64_t PGRM_Rsrc3 = 1;
624624
bool EvaluatableRsrc3 =
625-
CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(PGRM_Rsrc3);
625+
CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGRM_Rsrc3);
626626
(void)PGRM_Rsrc3;
627627
(void)EvaluatableRsrc3;
628-
assert(STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
628+
assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
629+
STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
629630
static_cast<uint64_t>(PGRM_Rsrc3) == 0);
630-
KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A;
631+
KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;
631632

632633
KernelDescriptor.kernarg_preload = MCConstantExpr::create(
633634
AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
@@ -822,22 +823,22 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
822823
false);
823824

824825
[[maybe_unused]] int64_t PGMRSrc3;
825-
assert(STM.hasGFX90AInsts() ||
826-
(CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(
827-
PGMRSrc3) &&
826+
assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
827+
STM.hasGFX90AInsts() ||
828+
(CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
828829
static_cast<uint64_t>(PGMRSrc3) == 0));
829830
if (STM.hasGFX90AInsts()) {
830831
OutStreamer->emitRawComment(
831832
" COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
832833
getMCExprStr(MCKernelDescriptor::bits_get(
833-
CurrentProgramInfo.ComputePGMRSrc3GFX90A,
834+
CurrentProgramInfo.ComputePGMRSrc3,
834835
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
835836
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
836837
false);
837838
OutStreamer->emitRawComment(
838839
" COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
839840
getMCExprStr(MCKernelDescriptor::bits_get(
840-
CurrentProgramInfo.ComputePGMRSrc3GFX90A,
841+
CurrentProgramInfo.ComputePGMRSrc3,
841842
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
842843
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
843844
false);
@@ -1229,24 +1230,24 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
12291230
ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
12301231
ProgInfo.EXCPEnable = 0;
12311232

1233+
// return ((Dst & ~Mask) | (Value << Shift))
1234+
auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
1235+
uint32_t Shift) {
1236+
const auto *Shft = MCConstantExpr::create(Shift, Ctx);
1237+
const auto *Msk = MCConstantExpr::create(Mask, Ctx);
1238+
Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
1239+
Dst = MCBinaryExpr::createOr(Dst, MCBinaryExpr::createShl(Value, Shft, Ctx),
1240+
Ctx);
1241+
return Dst;
1242+
};
1243+
12321244
if (STM.hasGFX90AInsts()) {
1233-
// return ((Dst & ~Mask) | (Value << Shift))
1234-
auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
1235-
uint32_t Shift) {
1236-
const auto *Shft = MCConstantExpr::create(Shift, Ctx);
1237-
const auto *Msk = MCConstantExpr::create(Mask, Ctx);
1238-
Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
1239-
Dst = MCBinaryExpr::createOr(
1240-
Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
1241-
return Dst;
1242-
};
1243-
1244-
ProgInfo.ComputePGMRSrc3GFX90A =
1245-
SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset,
1245+
ProgInfo.ComputePGMRSrc3 =
1246+
SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
12461247
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
12471248
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
1248-
ProgInfo.ComputePGMRSrc3GFX90A =
1249-
SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit),
1249+
ProgInfo.ComputePGMRSrc3 =
1250+
SetBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
12501251
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
12511252
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
12521253
}
@@ -1267,6 +1268,26 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
12671268
", final occupancy is " + Twine(Occupancy));
12681269
F.getContext().diagnose(Diag);
12691270
}
1271+
1272+
if (isGFX11Plus(STM)) {
1273+
uint32_t CodeSizeInBytes = (uint32_t)std::min(
1274+
ProgInfo.getFunctionCodeSize(MF, true /* IsLowerBound */),
1275+
(uint64_t)std::numeric_limits<uint32_t>::max());
1276+
uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128);
1277+
uint32_t Field, Shift, Width;
1278+
if (isGFX11(STM)) {
1279+
Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
1280+
Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
1281+
Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
1282+
} else {
1283+
Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
1284+
Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
1285+
Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
1286+
}
1287+
uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1);
1288+
ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3,
1289+
CreateExpr(InstPrefSize), Field, Shift);
1290+
}
12701291
}
12711292

12721293
static unsigned getRsrcReg(CallingConv::ID CallConv) {

llvm/lib/Target/AMDGPU/SIProgramInfo.cpp

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ void SIProgramInfo::reset(const MachineFunction &MF) {
5757
LdsSize = 0;
5858
EXCPEnable = 0;
5959

60-
ComputePGMRSrc3GFX90A = ZeroExpr;
60+
ComputePGMRSrc3 = ZeroExpr;
6161

6262
NumVGPR = ZeroExpr;
6363
NumArchVGPR = ZeroExpr;
@@ -202,8 +202,9 @@ const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC,
202202
return MCConstantExpr::create(0, Ctx);
203203
}
204204

205-
uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) {
206-
if (CodeSizeInBytes.has_value())
205+
uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF,
206+
bool IsLowerBound) {
207+
if (!IsLowerBound && CodeSizeInBytes.has_value())
207208
return *CodeSizeInBytes;
208209

209210
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
@@ -216,14 +217,20 @@ uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) {
216217
// overestimated. In case of inline asm used getInstSizeInBytes() will
217218
// return a maximum size of a single instruction, where the real size may
218219
// differ. At this point CodeSize may be already off.
219-
CodeSize = alignTo(CodeSize, MBB.getAlignment());
220+
if (!IsLowerBound)
221+
CodeSize = alignTo(CodeSize, MBB.getAlignment());
220222

221223
for (const MachineInstr &MI : MBB) {
222224
// TODO: CodeSize should account for multiple functions.
223225

224226
if (MI.isMetaInstruction())
225227
continue;
226228

229+
// We cannot properly estimate inline asm size. It can be as small as zero
230+
// if that is just a comment.
231+
if (IsLowerBound && MI.isInlineAsm())
232+
continue;
233+
227234
CodeSize += TII->getInstSizeInBytes(MI);
228235
}
229236
}

llvm/lib/Target/AMDGPU/SIProgramInfo.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
6363
uint32_t LdsSize = 0;
6464
uint32_t EXCPEnable = 0;
6565

66-
const MCExpr *ComputePGMRSrc3GFX90A = nullptr;
66+
const MCExpr *ComputePGMRSrc3 = nullptr;
6767

6868
const MCExpr *NumVGPR = nullptr;
6969
const MCExpr *NumArchVGPR = nullptr;
@@ -101,7 +101,10 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
101101
void reset(const MachineFunction &MF);
102102

103103
// Get function code size and cache the value.
104-
uint64_t getFunctionCodeSize(const MachineFunction &MF);
104+
// If \p IsLowerBound is set it returns a minimal code size which is safe
105+
// to address.
106+
uint64_t getFunctionCodeSize(const MachineFunction &MF,
107+
bool IsLowerBound = false);
105108

106109
/// Compute the value of the ComputePGMRsrc1 register.
107110
const MCExpr *getComputePGMRSrc1(const GCNSubtarget &ST,
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX11 %s
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX12 %s
3+
4+
; GCN-LABEL: .amdhsa_kernel large
5+
; GFX11: .amdhsa_inst_pref_size 3
6+
; GFX11: codeLenInByte = 3{{[0-9][0-9]$}}
7+
; GFX12: .amdhsa_inst_pref_size 4
8+
; GFX12: codeLenInByte = 4{{[0-9][0-9]$}}
9+
define amdgpu_kernel void @large(ptr addrspace(1) %out, ptr addrspace(1) %in) {
10+
bb:
11+
call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 256, i1 false)
12+
ret void
13+
}
14+
15+
; GCN-LABEL: .amdhsa_kernel small
16+
; GCN: .amdhsa_inst_pref_size 1
17+
; GCN: codeLenInByte = {{[0-9]$}}
18+
define amdgpu_kernel void @small() {
19+
bb:
20+
ret void
21+
}
22+
23+
; Ignore inline asm in size calculation
24+
25+
; GCN-LABEL: .amdhsa_kernel inline_asm
26+
; GCN: .amdhsa_inst_pref_size 1
27+
; GCN: codeLenInByte = {{[0-9]$}}
28+
define amdgpu_kernel void @inline_asm() {
29+
bb:
30+
call void asm sideeffect ".fill 256, 4, 0", ""()
31+
ret void
32+
}

0 commit comments

Comments
 (0)