Skip to content

Commit 7ed197b

Browse files
committed
[AMDGPU] Add dynamic LDS size implicit kernel argument to CO-v5 (#65273)
The "hidden_dynamic_lds_size" argument is added in the reserved section at offset 120 of the implicit argument layout. A "UsesDynamicLDS" flag (queried via isDynamicLDSUsed()) is added to AMDGPUMachineFunction to identify whether a function uses dynamic LDS. The hidden argument is added in the following cases: (1) an LDS global is used in the kernel; (2) the kernel calls a function which uses an LDS global; (3) an LDS pointer is passed as an argument to the kernel itself.
1 parent 6214496 commit 7ed197b

10 files changed

+430
-11
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4114,6 +4114,9 @@ Code object V5 metadata is the same as
41144114
buffer that conforms to the requirements of the malloc/free
41154115
device library V1 version implementation.
41164116

4117+
"hidden_dynamic_lds_size"
4118+
Size of the dynamically allocated LDS memory is passed in the kernarg.
4119+
41174120
"hidden_private_base"
41184121
The high 32 bits of the flat addressing private aperture base.
41194122
Only used by GFX8 to allow conversion between private segment

llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ bool MetadataVerifier::verifyKernelArgs(msgpack::DocNode &Node) {
134134
.Case("hidden_default_queue", true)
135135
.Case("hidden_completion_action", true)
136136
.Case("hidden_multigrid_sync_arg", true)
137+
.Case("hidden_dynamic_lds_size", true)
137138
.Case("hidden_private_base", true)
138139
.Case("hidden_shared_base", true)
139140
.Case("hidden_queue_ptr", true)

llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -646,7 +646,15 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs(
646646
Offset += 8; // Skipped.
647647
}
648648

649-
Offset += 72; // Reserved.
649+
// Emit argument for hidden dynamic lds size
650+
if (MFI.isDynamicLDSUsed()) {
651+
emitKernelArg(DL, Int32Ty, Align(4), "hidden_dynamic_lds_size", Offset,
652+
Args);
653+
} else {
654+
Offset += 4; // skipped
655+
}
656+
657+
Offset += 68; // Reserved.
650658

651659
// hidden_private_base and hidden_shared_base are only when the subtarget has
652660
// ApertureRegs.

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp

Lines changed: 30 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,26 @@
1919

2020
using namespace llvm;
2121

22+
// Looks up, in F's parent module, the global variable whose name is
// "llvm.amdgcn." + F's name + ".dynlds" and returns the result of
// Module::getNamedGlobal for that name.
// NOTE(review): presumably null when no such global exists, since the
// caller tests the result for truthiness -- confirm against getNamedGlobal.
static const GlobalVariable *
23+
getKernelDynLDSGlobalFromFunction(const Function &F) {
24+
const Module *M = F.getParent();
25+
// Assemble the per-function name piece by piece in a SmallString.
SmallString<64> KernelDynLDSName("llvm.amdgcn.");
26+
KernelDynLDSName += F.getName();
27+
KernelDynLDSName += ".dynlds";
28+
return M->getNamedGlobal(KernelDynLDSName);
29+
}
30+
31+
// Returns true if at least one formal argument of F has a pointer type whose
// address space is AMDGPUAS::LOCAL_ADDRESS; returns false otherwise,
// including when F has no arguments.
static bool hasLDSKernelArgument(const Function &F) {
32+
for (const Argument &Arg : F.args()) {
33+
Type *ArgTy = Arg.getType();
34+
// Non-pointer arguments are skipped: dyn_cast yields a null PtrTy for them.
if (auto PtrTy = dyn_cast<PointerType>(ArgTy)) {
35+
if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
36+
return true;
37+
}
38+
}
39+
return false;
40+
}
41+
2242
AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F,
2343
const AMDGPUSubtarget &ST)
2444
: IsEntryFunction(AMDGPU::isEntryFunctionCC(F.getCallingConv())),
@@ -65,6 +85,10 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F,
6585
Attribute NSZAttr = F.getFnAttribute("no-signed-zeros-fp-math");
6686
NoSignedZerosFPMath =
6787
NSZAttr.isStringAttribute() && NSZAttr.getValueAsString() == "true";
88+
89+
const GlobalVariable *DynLdsGlobal = getKernelDynLDSGlobalFromFunction(F);
90+
if (DynLdsGlobal || hasLDSKernelArgument(F))
91+
UsesDynamicLDS = true;
6892
}
6993

7094
unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
@@ -139,15 +163,6 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
139163
return Offset;
140164
}
141165

142-
static const GlobalVariable *
143-
getKernelDynLDSGlobalFromFunction(const Function &F) {
144-
const Module *M = F.getParent();
145-
std::string KernelDynLDSName = "llvm.amdgcn.";
146-
KernelDynLDSName += F.getName();
147-
KernelDynLDSName += ".dynlds";
148-
return M->getNamedGlobal(KernelDynLDSName);
149-
}
150-
151166
std::optional<uint32_t>
152167
AMDGPUMachineFunction::getLDSKernelIdMetadata(const Function &F) {
153168
// TODO: Would be more consistent with the abs symbols to use a range
@@ -210,3 +225,9 @@ void AMDGPUMachineFunction::setDynLDSAlign(const Function &F,
210225
}
211226
}
212227
}
228+
229+
// Overwrites the UsesDynamicLDS flag with DynLDS. The value is assigned
// unconditionally, so passing false clears a previously set flag.
void AMDGPUMachineFunction::setUsesDynamicLDS(bool DynLDS) {
230+
UsesDynamicLDS = DynLDS;
231+
}
232+
233+
// Returns the current value of the UsesDynamicLDS flag.
bool AMDGPUMachineFunction::isDynamicLDSUsed() const { return UsesDynamicLDS; }

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
4646
/// stages.
4747
Align DynLDSAlign;
4848

49+
// Flag to check dynamic LDS usage by kernel.
50+
bool UsesDynamicLDS = false;
51+
4952
// Kernels + shaders. i.e. functions called by the hardware and not called
5053
// by other functions.
5154
bool IsEntryFunction = false;
@@ -119,6 +122,10 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
119122
Align getDynLDSAlign() const { return DynLDSAlign; }
120123

121124
void setDynLDSAlign(const Function &F, const GlobalVariable &GV);
125+
126+
void setUsesDynamicLDS(bool DynLDS);
127+
128+
bool isDynamicLDSUsed() const;
122129
};
123130

124131
}

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6890,6 +6890,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
68906890
// Adjust alignment for that dynamic shared memory array.
68916891
Function &F = DAG.getMachineFunction().getFunction();
68926892
MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
6893+
MFI->setUsesDynamicLDS(true);
68936894
return SDValue(
68946895
DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
68956896
}
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
3+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
4+
5+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefix=CHECK %s
6+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s
7+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s
8+
9+
10+
; CHECK: amdhsa.kernels:
11+
; CHECK-NEXT: - .args:
12+
; CHECK-NEXT: - .address_space: global
13+
; CHECK-NEXT: .name: r
14+
; CHECK-NEXT: .offset: 0
15+
; CHECK-NEXT: .size: 8
16+
; CHECK-NEXT: .value_kind: global_buffer
17+
; CHECK-NEXT: - .address_space: global
18+
; CHECK-NEXT: .name: a
19+
; CHECK-NEXT: .offset: 8
20+
; CHECK-NEXT: .size: 8
21+
; CHECK-NEXT: .value_kind: global_buffer
22+
; CHECK-NEXT: - .address_space: global
23+
; CHECK-NEXT: .name: b
24+
; CHECK-NEXT: .offset: 16
25+
; CHECK-NEXT: .size: 8
26+
; CHECK-NEXT: .value_kind: global_buffer
27+
; CHECK-NEXT: - .offset: 24
28+
; CHECK-NEXT: .size: 4
29+
; CHECK-NEXT: .value_kind: hidden_block_count_x
30+
; CHECK-NEXT: - .offset: 28
31+
; CHECK-NEXT: .size: 4
32+
; CHECK-NEXT: .value_kind: hidden_block_count_y
33+
; CHECK-NEXT: - .offset: 32
34+
; CHECK-NEXT: .size: 4
35+
; CHECK-NEXT: .value_kind: hidden_block_count_z
36+
; CHECK-NEXT: - .offset: 36
37+
; CHECK-NEXT: .size: 2
38+
; CHECK-NEXT: .value_kind: hidden_group_size_x
39+
; CHECK-NEXT: - .offset: 38
40+
; CHECK-NEXT: .size: 2
41+
; CHECK-NEXT: .value_kind: hidden_group_size_y
42+
; CHECK-NEXT: - .offset: 40
43+
; CHECK-NEXT: .size: 2
44+
; CHECK-NEXT: .value_kind: hidden_group_size_z
45+
; CHECK-NEXT: - .offset: 42
46+
; CHECK-NEXT: .size: 2
47+
; CHECK-NEXT: .value_kind: hidden_remainder_x
48+
; CHECK-NEXT: - .offset: 44
49+
; CHECK-NEXT: .size: 2
50+
; CHECK-NEXT: .value_kind: hidden_remainder_y
51+
; CHECK-NEXT: - .offset: 46
52+
; CHECK-NEXT: .size: 2
53+
; CHECK-NEXT: .value_kind: hidden_remainder_z
54+
; CHECK-NEXT: - .offset: 64
55+
; CHECK-NEXT: .size: 8
56+
; CHECK-NEXT: .value_kind: hidden_global_offset_x
57+
; CHECK-NEXT: - .offset: 72
58+
; CHECK-NEXT: .size: 8
59+
; CHECK-NEXT: .value_kind: hidden_global_offset_y
60+
; CHECK-NEXT: - .offset: 80
61+
; CHECK-NEXT: .size: 8
62+
; CHECK-NEXT: .value_kind: hidden_global_offset_z
63+
; CHECK-NEXT: - .offset: 88
64+
; CHECK-NEXT: .size: 2
65+
; CHECK-NEXT: .value_kind: hidden_grid_dims
66+
; CHECK-NEXT: - .offset: 96
67+
; CHECK-NEXT: .size: 8
68+
; CHECK-NEXT: .value_kind: hidden_printf_buffer
69+
; CHECK-NEXT: - .offset: 104
70+
; CHECK-NEXT: .size: 8
71+
; CHECK-NEXT: .value_kind: hidden_hostcall_buffer
72+
; CHECK-NEXT: - .offset: 112
73+
; CHECK-NEXT: .size: 8
74+
; CHECK-NEXT: .value_kind: hidden_multigrid_sync_arg
75+
; CHECK-NEXT: - .offset: 120
76+
; CHECK-NEXT: .size: 8
77+
; CHECK-NEXT: .value_kind: hidden_heap_v1
78+
; CHECK-NEXT: - .offset: 128
79+
; CHECK-NEXT: .size: 8
80+
; CHECK-NEXT: .value_kind: hidden_default_queue
81+
; CHECK-NEXT: - .offset: 136
82+
; CHECK-NEXT: .size: 8
83+
; CHECK-NEXT: .value_kind: hidden_completion_action
84+
; CHECK: - .offset: 144
85+
; CHECK-NEXT: .size: 4
86+
; CHECK-NEXT: .value_kind: hidden_dynamic_lds_size
87+
; CHECK: - .offset: 224
88+
; CHECK-NEXT: .size: 8
89+
; CHECK-NEXT: .value_kind: hidden_queue_ptr
90+
91+
; CHECK: .name: test_v5
92+
; CHECK: .symbol: test_v5.kd
93+
94+
; CHECK: amdhsa.version:
95+
; CHECK-NEXT: - 1
96+
; CHECK-NEXT: - 2
97+
@lds = external hidden addrspace(3) global [0 x i32], align 4
98+
99+
; Non-kernel helper that references @lds directly: it stores the i32 value
; 1234 through @lds after an addrspacecast from addrspace(3) to a plain ptr.
define void @funcs_dyn_lds() {
100+
store i32 1234, ptr addrspacecast (ptr addrspace(3) @lds to ptr), align 4
101+
ret void
102+
}
103+
104+
; Kernel under test. It loads a half from %a and from %b, stores their fadd
; result to %r, and then calls @funcs_dyn_lds. The kernel body itself does
; not reference @lds; the only link to it is the call.
define amdgpu_kernel void @test_v5(
105+
ptr addrspace(1) %r,
106+
ptr addrspace(1) %a,
107+
ptr addrspace(1) %b) #0 {
108+
entry:
109+
%a.val = load half, ptr addrspace(1) %a
110+
%b.val = load half, ptr addrspace(1) %b
111+
%r.val = fadd half %a.val, %b.val
112+
store half %r.val, ptr addrspace(1) %r
113+
call void @funcs_dyn_lds()
114+
ret void
115+
}
116+
117+
!llvm.module.flags = !{!0}
118+
!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
119+
!llvm.printf.fmts = !{!1, !2}
120+
!1 = !{!"1:1:4:%d\5Cn"}
121+
!2 = !{!"2:1:8:%g\5Cn"}
122+
123+
attributes #0 = { optnone noinline }
124+
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
3+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
4+
5+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefix=CHECK %s
6+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s
7+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s
8+
9+
10+
; CHECK: amdhsa.kernels:
11+
; CHECK-NEXT: - .args:
12+
; CHECK-NEXT: - .address_space: global
13+
; CHECK-NEXT: .name: r
14+
; CHECK-NEXT: .offset: 0
15+
; CHECK-NEXT: .size: 8
16+
; CHECK-NEXT: .value_kind: global_buffer
17+
; CHECK-NEXT: - .address_space: global
18+
; CHECK-NEXT: .name: a
19+
; CHECK-NEXT: .offset: 8
20+
; CHECK-NEXT: .size: 8
21+
; CHECK-NEXT: .value_kind: global_buffer
22+
; CHECK-NEXT: - .address_space: global
23+
; CHECK-NEXT: .name: b
24+
; CHECK-NEXT: .offset: 16
25+
; CHECK-NEXT: .size: 8
26+
; CHECK-NEXT: .value_kind: global_buffer
27+
; CHECK-NEXT: - .offset: 24
28+
; CHECK-NEXT: .size: 4
29+
; CHECK-NEXT: .value_kind: hidden_block_count_x
30+
; CHECK-NEXT: - .offset: 28
31+
; CHECK-NEXT: .size: 4
32+
; CHECK-NEXT: .value_kind: hidden_block_count_y
33+
; CHECK-NEXT: - .offset: 32
34+
; CHECK-NEXT: .size: 4
35+
; CHECK-NEXT: .value_kind: hidden_block_count_z
36+
; CHECK-NEXT: - .offset: 36
37+
; CHECK-NEXT: .size: 2
38+
; CHECK-NEXT: .value_kind: hidden_group_size_x
39+
; CHECK-NEXT: - .offset: 38
40+
; CHECK-NEXT: .size: 2
41+
; CHECK-NEXT: .value_kind: hidden_group_size_y
42+
; CHECK-NEXT: - .offset: 40
43+
; CHECK-NEXT: .size: 2
44+
; CHECK-NEXT: .value_kind: hidden_group_size_z
45+
; CHECK-NEXT: - .offset: 42
46+
; CHECK-NEXT: .size: 2
47+
; CHECK-NEXT: .value_kind: hidden_remainder_x
48+
; CHECK-NEXT: - .offset: 44
49+
; CHECK-NEXT: .size: 2
50+
; CHECK-NEXT: .value_kind: hidden_remainder_y
51+
; CHECK-NEXT: - .offset: 46
52+
; CHECK-NEXT: .size: 2
53+
; CHECK-NEXT: .value_kind: hidden_remainder_z
54+
; CHECK-NEXT: - .offset: 64
55+
; CHECK-NEXT: .size: 8
56+
; CHECK-NEXT: .value_kind: hidden_global_offset_x
57+
; CHECK-NEXT: - .offset: 72
58+
; CHECK-NEXT: .size: 8
59+
; CHECK-NEXT: .value_kind: hidden_global_offset_y
60+
; CHECK-NEXT: - .offset: 80
61+
; CHECK-NEXT: .size: 8
62+
; CHECK-NEXT: .value_kind: hidden_global_offset_z
63+
; CHECK-NEXT: - .offset: 88
64+
; CHECK-NEXT: .size: 2
65+
; CHECK-NEXT: .value_kind: hidden_grid_dims
66+
; CHECK-NEXT: - .offset: 96
67+
; CHECK-NEXT: .size: 8
68+
; CHECK-NEXT: .value_kind: hidden_printf_buffer
69+
; CHECK-NEXT: - .offset: 104
70+
; CHECK-NEXT: .size: 8
71+
; CHECK-NEXT: .value_kind: hidden_hostcall_buffer
72+
; CHECK-NEXT: - .offset: 112
73+
; CHECK-NEXT: .size: 8
74+
; CHECK-NEXT: .value_kind: hidden_multigrid_sync_arg
75+
; CHECK-NEXT: - .offset: 120
76+
; CHECK-NEXT: .size: 8
77+
; CHECK-NEXT: .value_kind: hidden_heap_v1
78+
; CHECK-NEXT: - .offset: 128
79+
; CHECK-NEXT: .size: 8
80+
; CHECK-NEXT: .value_kind: hidden_default_queue
81+
; CHECK-NEXT: - .offset: 136
82+
; CHECK-NEXT: .size: 8
83+
; CHECK-NEXT: .value_kind: hidden_completion_action
84+
; CHECK: - .offset: 144
85+
; CHECK-NEXT: .size: 4
86+
; CHECK-NEXT: .value_kind: hidden_dynamic_lds_size
87+
; CHECK: - .offset: 224
88+
; CHECK-NEXT: .size: 8
89+
; CHECK-NEXT: .value_kind: hidden_queue_ptr
90+
91+
; CHECK: .name: test_v5
92+
; CHECK: .symbol: test_v5.kd
93+
94+
; CHECK: amdhsa.version:
95+
; CHECK-NEXT: - 1
96+
; CHECK-NEXT: - 2
97+
@lds = external hidden addrspace(3) global [0 x i32], align 4
98+
99+
; Non-kernel helper that receives an addrspace(3) pointer as its argument and
; stores the i32 value 1234 through it; it does not name @lds itself.
define void @funcs_dyn_lds(ptr addrspace(3) %lds_ptr) {
100+
store i32 1234, ptr addrspace(3) %lds_ptr, align 4
101+
ret void
102+
}
103+
104+
; Kernel under test. It loads a half from %a and from %b, stores their fadd
; result to %r, and then calls @funcs_dyn_lds with @lds as the pointer
; argument, so here the kernel body references @lds at the call site.
define amdgpu_kernel void @test_v5(
105+
ptr addrspace(1) %r,
106+
ptr addrspace(1) %a,
107+
ptr addrspace(1) %b) #0 {
108+
entry:
109+
%a.val = load half, ptr addrspace(1) %a
110+
%b.val = load half, ptr addrspace(1) %b
111+
%r.val = fadd half %a.val, %b.val
112+
store half %r.val, ptr addrspace(1) %r
113+
call void @funcs_dyn_lds(ptr addrspace(3) @lds)
114+
ret void
115+
}
116+
117+
!llvm.module.flags = !{!0}
118+
!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
119+
!llvm.printf.fmts = !{!1, !2}
120+
!1 = !{!"1:1:4:%d\5Cn"}
121+
!2 = !{!"2:1:8:%g\5Cn"}
122+
123+
attributes #0 = { optnone noinline }
124+

0 commit comments

Comments
 (0)