Skip to content

[AMDGPU] Set inst_pref_size to maximum #126981

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 31 additions & 11 deletions llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1230,18 +1230,18 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
ProgInfo.EXCPEnable = 0;

if (STM.hasGFX90AInsts()) {
// return ((Dst & ~Mask) | (Value << Shift))
auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
uint32_t Shift) {
const auto *Shft = MCConstantExpr::create(Shift, Ctx);
const auto *Msk = MCConstantExpr::create(Mask, Ctx);
Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
Dst = MCBinaryExpr::createOr(
Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
return Dst;
};
// Builds the expression ((Dst & ~Mask) | (Value << Shift)): the bits selected
// by Mask are cleared in Dst, then Value, moved left by Shift bits, is OR-ed
// in. Mask is used exactly as passed; it is not shifted here.
auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
                      uint32_t Shift) -> const MCExpr * {
  const MCExpr *ShiftAmount = MCConstantExpr::create(Shift, Ctx);
  const MCExpr *InvertedMask =
      MCUnaryExpr::createNot(MCConstantExpr::create(Mask, Ctx), Ctx);
  // Dst with the target field zeroed out.
  const MCExpr *Cleared = MCBinaryExpr::createAnd(Dst, InvertedMask, Ctx);
  // Value moved into the field's bit position.
  const MCExpr *Inserted = MCBinaryExpr::createShl(Value, ShiftAmount, Ctx);
  return MCBinaryExpr::createOr(Cleared, Inserted, Ctx);
};

if (STM.hasGFX90AInsts()) {
ProgInfo.ComputePGMRSrc3 =
SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
Expand All @@ -1268,6 +1268,26 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
", final occupancy is " + Twine(Occupancy));
F.getContext().diagnose(Diag);
}

if (isGFX11Plus(STM)) {
uint32_t CodeSizeInBytes = (uint32_t)std::min(
ProgInfo.getFunctionCodeSize(MF, true /* IsLowerBound */),
(uint64_t)std::numeric_limits<uint32_t>::max());
uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128);
uint32_t Field, Shift, Width;
if (isGFX11(STM)) {
Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
} else {
Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
}
uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1);
ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3,
CreateExpr(InstPrefSize), Field, Shift);
}
}

static unsigned getRsrcReg(CallingConv::ID CallConv) {
Expand Down
13 changes: 10 additions & 3 deletions llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,8 +202,9 @@ const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC,
return MCConstantExpr::create(0, Ctx);
}

uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) {
if (CodeSizeInBytes.has_value())
uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF,
bool IsLowerBound) {
if (!IsLowerBound && CodeSizeInBytes.has_value())
return *CodeSizeInBytes;

const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
Expand All @@ -216,14 +217,20 @@ uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) {
// overestimated. In case of inline asm used getInstSizeInBytes() will
// return a maximum size of a single instruction, where the real size may
// differ. At this point CodeSize may be already off.
CodeSize = alignTo(CodeSize, MBB.getAlignment());
if (!IsLowerBound)
CodeSize = alignTo(CodeSize, MBB.getAlignment());

for (const MachineInstr &MI : MBB) {
// TODO: CodeSize should account for multiple functions.

if (MI.isMetaInstruction())
continue;

// We cannot properly estimate inline asm size. It can be as small as zero
// if that is just a comment.
if (IsLowerBound && MI.isInlineAsm())
continue;

CodeSize += TII->getInstSizeInBytes(MI);
}
}
Expand Down
5 changes: 4 additions & 1 deletion llvm/lib/Target/AMDGPU/SIProgramInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,10 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
void reset(const MachineFunction &MF);

// Get function code size and cache the value.
uint64_t getFunctionCodeSize(const MachineFunction &MF);
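// If \p IsLowerBound is set, the cached value is not returned; instead a
// lower bound on the code size is computed which is safe to address: basic
// block alignment padding and inline asm are not counted.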
uint64_t getFunctionCodeSize(const MachineFunction &MF,
bool IsLowerBound = false);

/// Compute the value of the ComputePGMRsrc1 register.
const MCExpr *getComputePGMRSrc1(const GCNSubtarget &ST,
Expand Down
32 changes: 32 additions & 0 deletions llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX11 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX12 %s

; GCN-LABEL: .amdhsa_kernel large
; GFX11: .amdhsa_inst_pref_size 3
; GFX11: codeLenInByte = 3{{[0-9][0-9]$}}
; GFX12: .amdhsa_inst_pref_size 4
; GFX12: codeLenInByte = 4{{[0-9][0-9]$}}
; Kernel whose body is a single 256-byte, non-volatile memcpy from %in to %out.
; NOTE(review): presumably the very large --amdgpu-memcpy-loop-unroll value on
; the RUN lines makes the copy expand to straight-line code, so the kernel
; spans several 128-byte lines -- confirm.
define amdgpu_kernel void @large(ptr addrspace(1) %out, ptr addrspace(1) %in) {
bb:
call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 256, i1 false)
ret void
}

; GCN-LABEL: .amdhsa_kernel small
; GCN: .amdhsa_inst_pref_size 1
; GCN: codeLenInByte = {{[0-9]$}}
; Minimal kernel: no arguments, and the body is only a return. The checks
; above expect a single-digit code length and a prefetch size of 1.
define amdgpu_kernel void @small() {
bb:
ret void
}

; Ignore inline asm in size calculation

; GCN-LABEL: .amdhsa_kernel inline_asm
; GCN: .amdhsa_inst_pref_size 1
; GCN: codeLenInByte = {{[0-9]$}}
; Kernel whose only non-return content is a side-effecting inline asm
; statement containing a .fill directive. The checks above expect the same
; prefetch size and single-digit code length as for an empty kernel.
define amdgpu_kernel void @inline_asm() {
bb:
; The asm string has no operands or constraints.
call void asm sideeffect ".fill 256, 4, 0", ""()
ret void
}
}