Skip to content

Commit 912528b

Browse files
committed
[AMDGPU] Set inst_pref_size to maximum
On gfx11 and gfx12, set the initial instruction prefetch size to the minimum of the kernel code size and the maximum allowed value. Fixes: SWDEV-513122
1 parent 03cb46d commit 912528b

File tree

4 files changed

+67
-25
lines changed

4 files changed

+67
-25
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 44 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -622,12 +622,13 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
622622

623623
int64_t PGRM_Rsrc3 = 1;
624624
bool EvaluatableRsrc3 =
625-
CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(PGRM_Rsrc3);
625+
CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGRM_Rsrc3);
626626
(void)PGRM_Rsrc3;
627627
(void)EvaluatableRsrc3;
628-
assert(STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
628+
assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
629+
STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
629630
static_cast<uint64_t>(PGRM_Rsrc3) == 0);
630-
KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A;
631+
KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;
631632

632633
KernelDescriptor.kernarg_preload = MCConstantExpr::create(
633634
AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
@@ -822,22 +823,22 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
822823
false);
823824

824825
[[maybe_unused]] int64_t PGMRSrc3;
825-
assert(STM.hasGFX90AInsts() ||
826-
(CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(
827-
PGMRSrc3) &&
826+
assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
827+
STM.hasGFX90AInsts() ||
828+
(CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
828829
static_cast<uint64_t>(PGMRSrc3) == 0));
829830
if (STM.hasGFX90AInsts()) {
830831
OutStreamer->emitRawComment(
831832
" COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
832833
getMCExprStr(MCKernelDescriptor::bits_get(
833-
CurrentProgramInfo.ComputePGMRSrc3GFX90A,
834+
CurrentProgramInfo.ComputePGMRSrc3,
834835
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
835836
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
836837
false);
837838
OutStreamer->emitRawComment(
838839
" COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
839840
getMCExprStr(MCKernelDescriptor::bits_get(
840-
CurrentProgramInfo.ComputePGMRSrc3GFX90A,
841+
CurrentProgramInfo.ComputePGMRSrc3,
841842
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
842843
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
843844
false);
@@ -1229,24 +1230,24 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
12291230
ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
12301231
ProgInfo.EXCPEnable = 0;
12311232

1233+
// return ((Dst & ~Mask) | (Value << Shift))
1234+
auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
1235+
uint32_t Shift) {
1236+
const auto *Shft = MCConstantExpr::create(Shift, Ctx);
1237+
const auto *Msk = MCConstantExpr::create(Mask, Ctx);
1238+
Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
1239+
Dst = MCBinaryExpr::createOr(Dst, MCBinaryExpr::createShl(Value, Shft, Ctx),
1240+
Ctx);
1241+
return Dst;
1242+
};
1243+
12321244
if (STM.hasGFX90AInsts()) {
1233-
// return ((Dst & ~Mask) | (Value << Shift))
1234-
auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
1235-
uint32_t Shift) {
1236-
const auto *Shft = MCConstantExpr::create(Shift, Ctx);
1237-
const auto *Msk = MCConstantExpr::create(Mask, Ctx);
1238-
Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
1239-
Dst = MCBinaryExpr::createOr(
1240-
Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
1241-
return Dst;
1242-
};
1243-
1244-
ProgInfo.ComputePGMRSrc3GFX90A =
1245-
SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset,
1245+
ProgInfo.ComputePGMRSrc3 =
1246+
SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
12461247
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
12471248
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
1248-
ProgInfo.ComputePGMRSrc3GFX90A =
1249-
SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit),
1249+
ProgInfo.ComputePGMRSrc3 =
1250+
SetBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
12501251
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
12511252
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
12521253
}
@@ -1267,6 +1268,26 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
12671268
", final occupancy is " + Twine(Occupancy));
12681269
F.getContext().diagnose(Diag);
12691270
}
1271+
1272+
if (isGFX11Plus(STM)) {
1273+
uint32_t CodeSizeInBytes =
1274+
(uint32_t)std::min(ProgInfo.getFunctionCodeSize(MF),
1275+
(uint64_t)std::numeric_limits<uint32_t>::max());
1276+
uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128);
1277+
uint32_t Field, Shift, Width;
1278+
if (isGFX11(STM)) {
1279+
Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
1280+
Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
1281+
Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
1282+
} else {
1283+
Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
1284+
Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
1285+
Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
1286+
}
1287+
uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1);
1288+
ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3,
1289+
CreateExpr(InstPrefSize), Field, Shift);
1290+
}
12701291
}
12711292

12721293
static unsigned getRsrcReg(CallingConv::ID CallConv) {

llvm/lib/Target/AMDGPU/SIProgramInfo.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ void SIProgramInfo::reset(const MachineFunction &MF) {
5757
LdsSize = 0;
5858
EXCPEnable = 0;
5959

60-
ComputePGMRSrc3GFX90A = ZeroExpr;
60+
ComputePGMRSrc3 = ZeroExpr;
6161

6262
NumVGPR = ZeroExpr;
6363
NumArchVGPR = ZeroExpr;

llvm/lib/Target/AMDGPU/SIProgramInfo.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
6363
uint32_t LdsSize = 0;
6464
uint32_t EXCPEnable = 0;
6565

66-
const MCExpr *ComputePGMRSrc3GFX90A = nullptr;
66+
const MCExpr *ComputePGMRSrc3 = nullptr;
6767

6868
const MCExpr *NumVGPR = nullptr;
6969
const MCExpr *NumArchVGPR = nullptr;
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX11 %s
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX12 %s
3+
4+
; GCN-LABEL: .amdhsa_kernel large
5+
; GFX11: .amdhsa_inst_pref_size 3
6+
; GFX11: codeLenInByte = 3{{[0-9][0-9]$}}
7+
; GFX12: .amdhsa_inst_pref_size 4
8+
; GFX12: codeLenInByte = 4{{[0-9][0-9]$}}
9+
define amdgpu_kernel void @large(ptr addrspace(1) %out, ptr addrspace(1) %in) {
10+
bb:
11+
call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 256, i1 false)
12+
ret void
13+
}
14+
15+
; GCN-LABEL: .amdhsa_kernel small
16+
; GCN: .amdhsa_inst_pref_size 1
17+
; GCN: codeLenInByte = {{[0-9]$}}
18+
define amdgpu_kernel void @small() {
19+
bb:
20+
ret void
21+
}

0 commit comments

Comments
 (0)