Skip to content

Commit f4ba2bf

Browse files
authored
AMDGPU: Add amdgpu-agpr-alloc attribute to control AGPR allocation (#128034)
This provides a range to decide how to subdivide the vector register budget on gfx90a+. A single value declares the minimum number of AGPRs that should be allocatable. Eventually this should replace amdgpu-no-agpr. I want this primarily for testing AGPR allocation behavior. We should have a heuristic that tries to detect a reasonable number of AGPRs to keep allocatable.
1 parent 560cfd5 commit f4ba2bf

File tree

4 files changed

+620
-11
lines changed

4 files changed

+620
-11
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1707,6 +1707,24 @@ The AMDGPU backend supports the following LLVM IR attributes.
17071707
as hidden. Hidden arguments are managed by the compiler and are not part of
17081708
the explicit arguments supplied by the user.
17091709

1710+
"amdgpu-agpr-alloc"="min(,max)" Indicates a minimum and maximum range for the number of AGPRs to make
1711+
available to allocate. The values will be rounded up to the next multiple
1712+
of the allocation granularity (4). The minimum value is interpreted as the
1713+
minimum required number of AGPRs for the function to allocate (that is, the
1714+
function requires no more than min registers). If only one value is specified,
1715+
it is interpreted as the minimum register budget. The maximum will restrict
1716+
allocation to use no more than max AGPRs.
1717+
1718+
The values may be ignored if satisfying them would violate other allocation
1719+
constraints.
1720+
1721+
The behavior is undefined if a function which requires more AGPRs than the
1722+
lower bound is reached through any function marked with a higher value of this
1723+
attribute. A minimum value of 0 indicates the function does not require
1724+
any AGPRs. A minimum of 0 is equivalent to "amdgpu-no-agpr".
1725+
1726+
This is only relevant on targets with AGPRs which support accum_offset (gfx90a+).
1727+
17101728
"amdgpu-sgpr-hazard-wait" Disables SGPR hazard wait insertion if set to 0.
17111729
Exists for testing performance impact of SGPR hazard waits only.
17121730

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 45 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -572,9 +572,10 @@ MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
572572
std::pair<unsigned, unsigned>
573573
SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const {
574574
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
575-
unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
576-
unsigned MaxNumAGPRs = MaxNumVGPRs;
577-
unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
575+
const unsigned MaxVectorRegs = ST.getMaxNumVGPRs(MF);
576+
577+
unsigned MaxNumVGPRs = MaxVectorRegs;
578+
unsigned MaxNumAGPRs = 0;
578579

579580
// On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
580581
// a wave may have up to 512 total vector registers combining together both
@@ -585,16 +586,49 @@ SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const {
585586
// TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
586587
// register file accordingly.
587588
if (ST.hasGFX90AInsts()) {
588-
if (MFI->mayNeedAGPRs()) {
589-
MaxNumVGPRs /= 2;
590-
MaxNumAGPRs = MaxNumVGPRs;
589+
unsigned MinNumAGPRs = 0;
590+
const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();
591+
const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
592+
593+
const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u};
594+
595+
// TODO: Replace amdgpu-no-agpr with amdgpu-agpr-alloc=0
596+
// TODO: Move this logic into subtarget on IR function
597+
//
598+
// TODO: The lower bound should probably force the number of required
599+
// registers up, overriding amdgpu-waves-per-eu.
600+
std::tie(MinNumAGPRs, MaxNumAGPRs) = AMDGPU::getIntegerPairAttribute(
601+
MF.getFunction(), "amdgpu-agpr-alloc", DefaultNumAGPR,
602+
/*OnlyFirstRequired=*/true);
603+
604+
if (MinNumAGPRs == DefaultNumAGPR.first) {
605+
// Default to splitting half the registers if AGPRs are required.
606+
607+
if (MFI->mayNeedAGPRs())
608+
MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2;
609+
else
610+
MinNumAGPRs = 0;
591611
} else {
592-
if (MaxNumVGPRs > TotalNumVGPRs) {
593-
MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs;
594-
MaxNumVGPRs = TotalNumVGPRs;
595-
} else
596-
MaxNumAGPRs = 0;
612+
// Align to accum_offset's allocation granularity.
613+
MinNumAGPRs = alignTo(MinNumAGPRs, 4);
614+
615+
MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs);
597616
}
617+
618+
// Clamp values to be inbounds of our limits, and ensure min <= max.
619+
620+
MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs);
621+
MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs);
622+
623+
MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs);
624+
MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs);
625+
626+
assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
627+
MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs &&
628+
"invalid register counts");
629+
} else if (ST.hasMAIInsts()) {
630+
// On gfx908 the number of AGPRs always equals the number of VGPRs.
631+
MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs;
598632
}
599633

600634
return std::pair(MaxNumVGPRs, MaxNumAGPRs);
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefixes=CHECK,GFX908 %s
2+
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s 2> %t.err | FileCheck -check-prefixes=CHECK,GFX90A %s
3+
; RUN: FileCheck -check-prefix=ERR < %t.err %s
4+
5+
; Test undefined behavior where a function ends up needing AGPRs that
6+
; was marked with "amdgpu-agpr-alloc"="0". There should be no asserts.
7+
8+
; TODO: Should this be an error, or let UB happen?
9+
10+
; ERR: error: <unknown>:0:0: no registers from class available to allocate in function 'kernel_illegal_agpr_use_asm'
11+
; ERR: error: <unknown>:0:0: no registers from class available to allocate in function 'func_illegal_agpr_use_asm'
12+
; ERR: error: <unknown>:0:0: no registers from class available to allocate in function 'kernel_calls_mfma.f32.32x32x1f32'
13+
14+
; CHECK: {{^}}kernel_illegal_agpr_use_asm:
15+
; CHECK: ; use a0
16+
17+
; CHECK: NumVgprs: 0
18+
; CHECK: NumAgprs: 1
19+
define amdgpu_kernel void @kernel_illegal_agpr_use_asm() #0 {
20+
call void asm sideeffect "; use $0", "a"(i32 poison)
21+
ret void
22+
}
23+
24+
; CHECK: {{^}}func_illegal_agpr_use_asm:
25+
; CHECK: ; use a0
26+
27+
; CHECK: NumVgprs: 0
28+
; CHECK: NumAgprs: 1
29+
define void @func_illegal_agpr_use_asm() #0 {
30+
call void asm sideeffect "; use $0", "a"(i32 poison)
31+
ret void
32+
}
33+
34+
; CHECK-LABEL: {{^}}kernel_calls_mfma.f32.32x32x1f32:
35+
; CHECK: v_accvgpr_write_b32
36+
37+
; GFX908: NumVgprs: 5
38+
; GFX90A: NumVgprs: 36
39+
; CHECK: NumAgprs: 32
40+
41+
; GFX908: TotalNumVgprs: 32
42+
; GFX90A: TotalNumVgprs: 68
43+
define amdgpu_kernel void @kernel_calls_mfma.f32.32x32x1f32(ptr addrspace(1) %out, float %a, float %b, <32 x float> %c) #0 {
44+
%result = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0)
45+
store <32 x float> %result, ptr addrspace(1) %out
46+
ret void
47+
}
48+
49+
attributes #0 = { "amdgpu-agpr-alloc"="0" }

0 commit comments

Comments
 (0)