Skip to content

Commit 74e928a

Browse files
[amdgpu][lds] Remove recalculation of LDS frame from backend
Do the LDS frame calculation once, in the IR pass, instead of repeating the work in the backend. Prior to this patch: The IR lowering pass sets up a per-kernel LDS frame and annotates the variables with absolute_symbol metadata so that the assembler can build lookup tables out of it. There is a fragile association between kernel functions and named structs which is used to recompute the frame layout in the backend, with fatal_errors catching inconsistencies in the second calculation. After this patch: The IR lowering pass additionally sets a frame size attribute on kernels. The backend uses the same absolute_symbol metadata that the assembler uses to place objects within that frame size. Deleted the now dead allocation code from the backend. Left for a later cleanup: - enabling lowering for anonymous functions - removing the elide-module-lds attribute (test churn, it's not used by llc any more) - adjusting the dynamic alignment check to not use symbol names Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D155190
1 parent bd03e0c commit 74e928a

13 files changed

+105
-123
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1090,6 +1090,12 @@ The AMDGPU backend supports the following LLVM IR attributes.
10901090
kernel argument that holds the completion action pointer. If this
10911091
attribute is absent, then the amdgpu-no-implicitarg-ptr is also removed.
10921092

1093+
"amdgpu-lds-size" The number of bytes that will be allocated in the Local Data Store at
1094+
address zero. Variables are allocated within this frame using absolute
1095+
symbol metadata, primarily by the AMDGPULowerModuleLDS pass. Internal
1096+
detail of how LDS variables are lowered, language front ends should not
1097+
set this.
1098+
10931099
======================================= ==========================================================
10941100

10951101
Calling Conventions

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -512,8 +512,6 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
512512
const SITargetLowering &TLI = *getTLI<SITargetLowering>();
513513
const DataLayout &DL = F.getParent()->getDataLayout();
514514

515-
Info->allocateKnownAddressLDSGlobal(F);
516-
517515
SmallVector<CCValAssign, 16> ArgLocs;
518516
CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
519517

@@ -596,8 +594,6 @@ bool AMDGPUCallLowering::lowerFormalArguments(
596594
const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
597595
const DataLayout &DL = F.getParent()->getDataLayout();
598596

599-
Info->allocateKnownAddressLDSGlobal(F);
600-
601597
SmallVector<CCValAssign, 16> ArgLocs;
602598
CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
603599

llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1106,6 +1106,8 @@ class AMDGPULowerModuleLDS : public ModulePass {
11061106
return KernelToCreatedDynamicLDS;
11071107
}
11081108

1109+
// This attribute is no longer used by the backend. TODO: Delete it in favour
1110+
// of pass-local state and update the tests to remove the string.
11091111
static bool canElideModuleLDS(const Function &F) {
11101112
return F.hasFnAttribute("amdgpu-elide-module-lds");
11111113
}
@@ -1211,7 +1213,6 @@ class AMDGPULowerModuleLDS : public ModulePass {
12111213

12121214
// All kernel frames have been allocated. Calculate and record the
12131215
// addresses.
1214-
12151216
{
12161217
const DataLayout &DL = M.getDataLayout();
12171218

@@ -1220,8 +1221,8 @@ class AMDGPULowerModuleLDS : public ModulePass {
12201221
continue;
12211222

12221223
// All three of these are optional. The first variable is allocated at
1223-
// zero. They are allocated by allocateKnownAddressLDSGlobal in the
1224-
// following order:
1224+
// zero. They are allocated by AMDGPUMachineFunction as one block.
1225+
// Layout:
12251226
//{
12261227
// module.lds
12271228
// alignment padding
@@ -1250,22 +1251,23 @@ class AMDGPULowerModuleLDS : public ModulePass {
12501251

12511252
if (AllocateKernelScopeStruct) {
12521253
GlobalVariable *KernelStruct = Replacement->second.SGV;
1253-
12541254
Offset = alignTo(Offset, AMDGPU::getAlign(DL, KernelStruct));
1255-
12561255
recordLDSAbsoluteAddress(&M, KernelStruct, Offset);
1257-
12581256
Offset += DL.getTypeAllocSize(KernelStruct->getValueType());
1259-
12601257
}
12611258

1259+
// If there is dynamic allocation, the alignment needed is included in
1260+
// the static frame size. There may be no reference to the dynamic
1261+
// variable in the kernel itself, so without including it here, that
1262+
// alignment padding could be missed.
12621263
if (AllocateDynamicVariable) {
12631264
GlobalVariable *DynamicVariable = KernelToCreatedDynamicLDS[&Func];
1264-
12651265
Offset = alignTo(Offset, AMDGPU::getAlign(DL, DynamicVariable));
1266-
12671266
recordLDSAbsoluteAddress(&M, DynamicVariable, Offset);
12681267
}
1268+
1269+
if (Offset != 0)
1270+
Func.addFnAttr("amdgpu-lds-size", std::to_string(Offset));
12691271
}
12701272
}
12711273

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp

Lines changed: 42 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,12 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F,
4343
// Assume the attribute allocates before any known GDS globals.
4444
StaticGDSSize = GDSSize;
4545

46+
// The two separate variables are only profitable when the LDS module lowering
47+
// pass is disabled. If graphics does not use dynamic LDS, this is never
48+
// profitable. Leaving cleanup for a later change.
49+
LDSSize = F.getFnAttributeAsParsedInteger("amdgpu-lds-size", 0);
50+
StaticLDSSize = LDSSize;
51+
4652
CallingConv::ID CC = F.getCallingConv();
4753
if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
4854
ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
@@ -65,6 +71,42 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
6571

6672
unsigned Offset;
6773
if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
74+
75+
std::optional<uint32_t> MaybeAbs = getLDSAbsoluteAddress(GV);
76+
if (MaybeAbs) {
77+
// Absolute address LDS variables that exist prior to the LDS lowering
78+
// pass raise a fatal error in that pass. These failure modes are only
79+
// reachable if that lowering pass is disabled or broken. If/when adding
80+
// support for absolute addresses on user specified variables, the
81+
// alignment check moves to the lowering pass and the frame calculation
82+
// needs to take the user variables into consideration.
83+
84+
uint32_t ObjectStart = *MaybeAbs;
85+
86+
if (ObjectStart != alignTo(ObjectStart, Alignment)) {
87+
report_fatal_error("Absolute address LDS variable inconsistent with "
88+
"variable alignment");
89+
}
90+
91+
if (isModuleEntryFunction()) {
92+
// If this is a module entry function, we can also sanity check against
93+
// the static frame. Strictly it would be better to check against the
94+
// attribute, i.e. that the variable is within the always-allocated
95+
// section, and not within some other non-absolute-address object
96+
// allocated here, but the extra error detection is minimal and we would
97+
// have to pass the Function around or cache the attribute value.
98+
uint32_t ObjectEnd =
99+
ObjectStart + DL.getTypeAllocSize(GV.getValueType());
100+
if (ObjectEnd > StaticLDSSize) {
101+
report_fatal_error(
102+
"Absolute address LDS variable outside of static frame");
103+
}
104+
}
105+
106+
Entry.first->second = ObjectStart;
107+
return ObjectStart;
108+
}
109+
68110
/// TODO: We should sort these to minimize wasted space due to alignment
69111
/// padding. Currently the padding is decided by the first encountered use
70112
/// during lowering.
@@ -89,16 +131,6 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
89131
return Offset;
90132
}
91133

92-
static constexpr StringLiteral ModuleLDSName = "llvm.amdgcn.module.lds";
93-
94-
static const GlobalVariable *getKernelLDSGlobalFromFunction(const Function &F) {
95-
const Module *M = F.getParent();
96-
std::string KernelLDSName = "llvm.amdgcn.kernel.";
97-
KernelLDSName += F.getName();
98-
KernelLDSName += ".lds";
99-
return M->getNamedGlobal(KernelLDSName);
100-
}
101-
102134
static const GlobalVariable *
103135
getKernelDynLDSGlobalFromFunction(const Function &F) {
104136
const Module *M = F.getParent();
@@ -108,73 +140,6 @@ getKernelDynLDSGlobalFromFunction(const Function &F) {
108140
return M->getNamedGlobal(KernelDynLDSName);
109141
}
110142

111-
// This kernel calls no functions that require the module lds struct
112-
static bool canElideModuleLDS(const Function &F) {
113-
return F.hasFnAttribute("amdgpu-elide-module-lds");
114-
}
115-
116-
void AMDGPUMachineFunction::allocateKnownAddressLDSGlobal(const Function &F) {
117-
const Module *M = F.getParent();
118-
// This function is called before allocating any other LDS so that it can
119-
// reliably put values at known addresses. Consequently, dynamic LDS, if
120-
// present, will not yet have been allocated
121-
122-
assert(getDynLDSAlign() == Align() && "dynamic LDS not yet allocated");
123-
124-
if (isModuleEntryFunction()) {
125-
126-
// Pointer values start from zero, memory allocated per-kernel-launch
127-
// Variables can be grouped into a module level struct and a struct per
128-
// kernel function by AMDGPULowerModuleLDSPass. If that is done, they
129-
// are allocated at statically computable addresses here.
130-
//
131-
// Address 0
132-
// {
133-
// llvm.amdgcn.module.lds
134-
// }
135-
// alignment padding
136-
// {
137-
// llvm.amdgcn.kernel.some-name.lds
138-
// }
139-
// other variables, e.g. dynamic lds, allocated after this call
140-
141-
const GlobalVariable *GV = M->getNamedGlobal(ModuleLDSName);
142-
const GlobalVariable *KV = getKernelLDSGlobalFromFunction(F);
143-
const GlobalVariable *Dyn = getKernelDynLDSGlobalFromFunction(F);
144-
145-
if (GV && !canElideModuleLDS(F)) {
146-
unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV, Align());
147-
std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*GV);
148-
if (!Expect || (Offset != *Expect)) {
149-
report_fatal_error("Inconsistent metadata on module LDS variable");
150-
}
151-
}
152-
153-
if (KV) {
154-
// The per-kernel offset is deterministic because it is allocated
155-
// before any other non-module LDS variables.
156-
unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *KV, Align());
157-
std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*KV);
158-
if (!Expect || (Offset != *Expect)) {
159-
report_fatal_error("Inconsistent metadata on kernel LDS variable");
160-
}
161-
}
162-
163-
if (Dyn) {
164-
// The dynamic LDS is deterministic because the per-kernel one has the
165-
// maximum alignment of any reachable and all remaining LDS variables,
166-
// if this is present, are themselves dynamic LDS and will be allocated
167-
// at the same address.
168-
setDynLDSAlign(F, *Dyn);
169-
unsigned Offset = LDSSize;
170-
std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*Dyn);
171-
if (!Expect || (Offset != *Expect)) {
172-
report_fatal_error("Inconsistent metadata on dynamic LDS variable");
173-
}
174-
}
175-
}
176-
}
177-
178143
std::optional<uint32_t>
179144
AMDGPUMachineFunction::getLDSKernelIdMetadata(const Function &F) {
180145
// TODO: Would be more consistent with the abs symbols to use a range

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,8 +104,6 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
104104
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV,
105105
Align Trailing);
106106

107-
void allocateKnownAddressLDSGlobal(const Function &F);
108-
109107
static std::optional<uint32_t> getLDSKernelIdMetadata(const Function &F);
110108
static std::optional<uint32_t> getLDSAbsoluteAddress(const GlobalValue &GV);
111109

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2460,8 +2460,6 @@ SDValue SITargetLowering::LowerFormalArguments(
24602460
return DAG.getEntryNode();
24612461
}
24622462

2463-
Info->allocateKnownAddressLDSGlobal(Fn);
2464-
24652463
SmallVector<ISD::InputArg, 16> Splits;
24662464
SmallVector<CCValAssign, 16> ArgLocs;
24672465
BitVector Skipped(Ins.size());

llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
; CHECK: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t undef, align 4, !absolute_symbol !0
2323
;.
2424
define amdgpu_kernel void @k0() #0 {
25-
; CHECK-LABEL: @k0(
25+
; CHECK-LABEL: @k0() #0
2626
; CHECK-NEXT: store i8 1, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 3), align 2, !alias.scope !1, !noalias !4
2727
; CHECK-NEXT: store i8 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2), align 4, !alias.scope !8, !noalias !9
2828
; CHECK-NEXT: store i8 4, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1), align 16, !alias.scope !10, !noalias !11
@@ -40,7 +40,7 @@ define amdgpu_kernel void @k0() #0 {
4040
}
4141

4242
define amdgpu_kernel void @k1() #0 {
43-
; CHECK-LABEL: @k1(
43+
; CHECK-LABEL: @k1() #1
4444
; CHECK-NEXT: store i8 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2), align 4, !alias.scope !14, !noalias !17
4545
; CHECK-NEXT: store i8 4, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1), align 16, !alias.scope !20, !noalias !21
4646
; CHECK-NEXT: store i8 16, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, align 16, !alias.scope !22, !noalias !23
@@ -56,7 +56,7 @@ define amdgpu_kernel void @k1() #0 {
5656
}
5757

5858
define amdgpu_kernel void @k2() #0 {
59-
; CHECK-LABEL: @k2(
59+
; CHECK-LABEL: @k2() #2
6060
; CHECK-NEXT: store i8 2, ptr addrspace(3) @llvm.amdgcn.kernel.k2.lds, align 2
6161
; CHECK-NEXT: ret void
6262
;
@@ -66,7 +66,7 @@ define amdgpu_kernel void @k2() #0 {
6666
}
6767

6868
define amdgpu_kernel void @k3() #0 {
69-
; CHECK-LABEL: @k3(
69+
; CHECK-LABEL: @k3() #3
7070
; CHECK-NEXT: store i8 4, ptr addrspace(3) @llvm.amdgcn.kernel.k3.lds, align 4
7171
; CHECK-NEXT: ret void
7272
;
@@ -75,14 +75,14 @@ define amdgpu_kernel void @k3() #0 {
7575
ret void
7676
}
7777

78-
78+
; CHECK-LABEL: @calls_f0() #4
7979
define amdgpu_kernel void @calls_f0() {
8080
call void @f0()
8181
ret void
8282
}
8383

8484
define void @f0() {
85-
; CHECK-LABEL: define void @f0(
85+
; CHECK-LABEL: define void @f0()
8686
; CHECK-NEXT: store i8 1, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.module.lds.t, ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 8, !noalias !24
8787
; CHECK-NEXT: store i8 8, ptr addrspace(3) @llvm.amdgcn.module.lds, align 8, !noalias !24
8888
; CHECK-NEXT: ret void
@@ -93,7 +93,10 @@ define void @f0() {
9393
ret void
9494
}
9595

96-
attributes #0 = { "amdgpu-elide-module-lds" }
97-
; CHECK: attributes #0 = { "amdgpu-elide-module-lds" }
96+
; CHECK: attributes #0 = { "amdgpu-elide-module-lds" "amdgpu-lds-size"="23" }
97+
; CHECK: attributes #1 = { "amdgpu-elide-module-lds" "amdgpu-lds-size"="22" }
98+
; CHECK: attributes #2 = { "amdgpu-elide-module-lds" "amdgpu-lds-size"="2" }
99+
; CHECK: attributes #3 = { "amdgpu-elide-module-lds" "amdgpu-lds-size"="4" }
100+
; CHECK: attributes #4 = { "amdgpu-lds-size"="9" }
98101

99102
; CHECK: !0 = !{i64 0, i64 1}

llvm/test/CodeGen/AMDGPU/lower-module-lds-all-indirect-accesses.ll

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
@B = external addrspace(3) global [0 x i32]
1010

1111
define amdgpu_kernel void @kernel_0() {
12-
; CHECK-LABEL: define amdgpu_kernel void @kernel_0() !llvm.amdgcn.lds.kernel.id !1 {
12+
; CHECK-LABEL: define amdgpu_kernel void @kernel_0() #0 !llvm.amdgcn.lds.kernel.id !1 {
1313
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kernel_0.lds) ]
1414
; CHECK-NEXT: call void @call_store_A()
1515
; CHECK-NEXT: ret void
@@ -29,7 +29,7 @@ define amdgpu_kernel void @kernel_1() {
2929
}
3030

3131
define amdgpu_kernel void @kernel_2() {
32-
; CHECK-LABEL: define amdgpu_kernel void @kernel_2() !llvm.amdgcn.lds.kernel.id !3 {
32+
; CHECK-LABEL: define amdgpu_kernel void @kernel_2() #0 !llvm.amdgcn.lds.kernel.id !3 {
3333
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kernel_2.lds) ]
3434
; CHECK-NEXT: call void @store_A()
3535
; CHECK-NEXT: ret void
@@ -82,3 +82,5 @@ define private ptr @get_B_ptr() {
8282
;
8383
ret ptr addrspacecast (ptr addrspace(3) @B to ptr)
8484
}
85+
86+
; CHECK: attributes #0 = { "amdgpu-lds-size"="64" }

llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ entry:
4848
ret void
4949
}
5050

51-
; CHECK-LABEL: @timestwo() #0
51+
; CHECK-LABEL: @timestwo() #1
5252
; CHECK-NOT: call void @llvm.donothing()
5353

5454
; CHECK: %1 = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.timestwo.lds to ptr
@@ -67,14 +67,14 @@ entry:
6767
; CHECK: %12 = inttoptr i64 %11 to ptr
6868
; CHECK: store i32 %mul, ptr %12, align 4
6969
; CHECK: ret void
70-
define amdgpu_kernel void @timestwo() {
70+
define amdgpu_kernel void @timestwo() #1 {
7171
%ld = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @b_both to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @kern to ptr) to i64)) to ptr), align 4
7272
%mul = mul i32 %ld, 2
7373
store i32 %mul, ptr inttoptr (i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @kern to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @b_both to ptr) to i64)) to ptr), align 4
7474
ret void
7575
}
7676

77-
; CHECK-LABEL: @through_functions()
77+
; CHECK-LABEL: @through_functions() #2
7878
define amdgpu_kernel void @through_functions() {
7979
%ld = call i32 @get_func()
8080
%mul = mul i32 %ld, 4
@@ -84,3 +84,5 @@ define amdgpu_kernel void @through_functions() {
8484

8585
attributes #0 = { "amdgpu-elide-module-lds" }
8686
; CHECK: attributes #0 = { "amdgpu-elide-module-lds" }
87+
; CHECK: attributes #1 = { "amdgpu-elide-module-lds" "amdgpu-lds-size"="8" }
88+
; CHECK: attributes #2 = { "amdgpu-lds-size"="8" }

0 commit comments

Comments
 (0)