Skip to content

Commit 0ff80e0

Browse files
skc7ronlieb
authored and committed
[AMDGPU] Add dynamic LDS size implicit kernel argument to CO-v5 (llvm#65273)
The "hidden_dynamic_lds_size" argument is added in the reserved section at offset 120 of the implicit argument layout. Add an "isDynamicLDSUsed" flag to AMDGPUMachineFunction to identify whether a function uses dynamic LDS. The hidden argument is added in the following cases: - An LDS global is used in the kernel. - The kernel calls a function which uses an LDS global. - An LDS pointer is passed as an argument to the kernel itself. Change-Id: Idfa96ab8941b56277aaf35b416725a2522c85e61
1 parent dc4eba5 commit 0ff80e0

10 files changed

+404
-12
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4231,6 +4231,9 @@ Code object V5 metadata is the same as
42314231
buffer that conforms to the requirements of the malloc/free
42324232
device library V1 version implementation.
42334233

4234+
"hidden_dynamic_lds_size"
4235+
Size of the dynamically allocated LDS memory is passed in the kernarg.
4236+
42344237
"hidden_private_base"
42354238
The high 32 bits of the flat addressing private aperture base.
42364239
Only used by GFX8 to allow conversion between private segment

llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ bool MetadataVerifier::verifyKernelArgs(msgpack::DocNode &Node) {
134134
.Case("hidden_default_queue", true)
135135
.Case("hidden_completion_action", true)
136136
.Case("hidden_multigrid_sync_arg", true)
137+
.Case("hidden_dynamic_lds_size", true)
137138
.Case("hidden_private_base", true)
138139
.Case("hidden_shared_base", true)
139140
.Case("hidden_queue_ptr", true)

llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -651,7 +651,15 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs(
651651
Offset += 8; // Skipped.
652652
}
653653

654-
Offset += 72; // Reserved.
654+
// Emit argument for hidden dynamic lds size
655+
if (MFI.isDynamicLDSUsed()) {
656+
emitKernelArg(DL, Int32Ty, Align(4), "hidden_dynamic_lds_size", Offset,
657+
Args);
658+
} else {
659+
Offset += 4; // skipped
660+
}
661+
662+
Offset += 68; // Reserved.
655663

656664
// hidden_private_base and hidden_shared_base are only when the subtarget has
657665
// ApertureRegs.

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp

Lines changed: 30 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,26 @@
1919

2020
using namespace llvm;
2121

22+
static const GlobalVariable *
23+
getKernelDynLDSGlobalFromFunction(const Function &F) {
24+
const Module *M = F.getParent();
25+
SmallString<64> KernelDynLDSName("llvm.amdgcn.");
26+
KernelDynLDSName += F.getName();
27+
KernelDynLDSName += ".dynlds";
28+
return M->getNamedGlobal(KernelDynLDSName);
29+
}
30+
31+
static bool hasLDSKernelArgument(const Function &F) {
32+
for (const Argument &Arg : F.args()) {
33+
Type *ArgTy = Arg.getType();
34+
if (auto PtrTy = dyn_cast<PointerType>(ArgTy)) {
35+
if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
36+
return true;
37+
}
38+
}
39+
return false;
40+
}
41+
2242
AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F,
2343
const AMDGPUSubtarget &ST)
2444
: IsEntryFunction(AMDGPU::isEntryFunctionCC(F.getCallingConv())),
@@ -65,6 +85,10 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F,
6585
Attribute NSZAttr = F.getFnAttribute("no-signed-zeros-fp-math");
6686
NoSignedZerosFPMath =
6787
NSZAttr.isStringAttribute() && NSZAttr.getValueAsString() == "true";
88+
89+
const GlobalVariable *DynLdsGlobal = getKernelDynLDSGlobalFromFunction(F);
90+
if (DynLdsGlobal || hasLDSKernelArgument(F))
91+
UsesDynamicLDS = true;
6892
}
6993

7094
unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
@@ -139,15 +163,6 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
139163
return Offset;
140164
}
141165

142-
static const GlobalVariable *
143-
getKernelDynLDSGlobalFromFunction(const Function &F) {
144-
const Module *M = F.getParent();
145-
std::string KernelDynLDSName = "llvm.amdgcn.";
146-
KernelDynLDSName += F.getName();
147-
KernelDynLDSName += ".dynlds";
148-
return M->getNamedGlobal(KernelDynLDSName);
149-
}
150-
151166
std::optional<uint32_t>
152167
AMDGPUMachineFunction::getLDSKernelIdMetadata(const Function &F) {
153168
// TODO: Would be more consistent with the abs symbols to use a range
@@ -210,3 +225,9 @@ void AMDGPUMachineFunction::setDynLDSAlign(const Function &F,
210225
}
211226
}
212227
}
228+
229+
void AMDGPUMachineFunction::setUsesDynamicLDS(bool DynLDS) {
230+
UsesDynamicLDS = DynLDS;
231+
}
232+
233+
bool AMDGPUMachineFunction::isDynamicLDSUsed() const { return UsesDynamicLDS; }

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
4646
/// stages.
4747
Align DynLDSAlign;
4848

49+
// Flag to check dynamic LDS usage by kernel.
50+
bool UsesDynamicLDS = false;
51+
4952
// Kernels + shaders. i.e. functions called by the hardware and not called
5053
// by other functions.
5154
bool IsEntryFunction = false;
@@ -119,6 +122,10 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
119122
Align getDynLDSAlign() const { return DynLDSAlign; }
120123

121124
void setDynLDSAlign(const Function &F, const GlobalVariable &GV);
125+
126+
void setUsesDynamicLDS(bool DynLDS);
127+
128+
bool isDynamicLDSUsed() const;
122129
};
123130

124131
}

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7216,6 +7216,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
72167216
// Adjust alignment for that dynamic shared memory array.
72177217
Function &F = DAG.getMachineFunction().getFunction();
72187218
MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
7219+
MFI->setUsesDynamicLDS(true);
72197220
return SDValue(
72207221
DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
72217222
}
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
2+
3+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s
4+
5+
; CHECK: amdhsa.kernels:
6+
; CHECK-NEXT: - .args:
7+
; CHECK-NEXT: - .address_space: global
8+
; CHECK-NEXT: .name: r
9+
; CHECK-NEXT: .offset: 0
10+
; CHECK-NEXT: .size: 8
11+
; CHECK-NEXT: .value_kind: global_buffer
12+
; CHECK-NEXT: - .address_space: global
13+
; CHECK-NEXT: .name: a
14+
; CHECK-NEXT: .offset: 8
15+
; CHECK-NEXT: .size: 8
16+
; CHECK-NEXT: .value_kind: global_buffer
17+
; CHECK-NEXT: - .address_space: global
18+
; CHECK-NEXT: .name: b
19+
; CHECK-NEXT: .offset: 16
20+
; CHECK-NEXT: .size: 8
21+
; CHECK-NEXT: .value_kind: global_buffer
22+
; CHECK-NEXT: - .offset: 24
23+
; CHECK-NEXT: .size: 4
24+
; CHECK-NEXT: .value_kind: hidden_block_count_x
25+
; CHECK-NEXT: - .offset: 28
26+
; CHECK-NEXT: .size: 4
27+
; CHECK-NEXT: .value_kind: hidden_block_count_y
28+
; CHECK-NEXT: - .offset: 32
29+
; CHECK-NEXT: .size: 4
30+
; CHECK-NEXT: .value_kind: hidden_block_count_z
31+
; CHECK-NEXT: - .offset: 36
32+
; CHECK-NEXT: .size: 2
33+
; CHECK-NEXT: .value_kind: hidden_group_size_x
34+
; CHECK-NEXT: - .offset: 38
35+
; CHECK-NEXT: .size: 2
36+
; CHECK-NEXT: .value_kind: hidden_group_size_y
37+
; CHECK-NEXT: - .offset: 40
38+
; CHECK-NEXT: .size: 2
39+
; CHECK-NEXT: .value_kind: hidden_group_size_z
40+
; CHECK-NEXT: - .offset: 42
41+
; CHECK-NEXT: .size: 2
42+
; CHECK-NEXT: .value_kind: hidden_remainder_x
43+
; CHECK-NEXT: - .offset: 44
44+
; CHECK-NEXT: .size: 2
45+
; CHECK-NEXT: .value_kind: hidden_remainder_y
46+
; CHECK-NEXT: - .offset: 46
47+
; CHECK-NEXT: .size: 2
48+
; CHECK-NEXT: .value_kind: hidden_remainder_z
49+
; CHECK-NEXT: - .offset: 64
50+
; CHECK-NEXT: .size: 8
51+
; CHECK-NEXT: .value_kind: hidden_global_offset_x
52+
; CHECK-NEXT: - .offset: 72
53+
; CHECK-NEXT: .size: 8
54+
; CHECK-NEXT: .value_kind: hidden_global_offset_y
55+
; CHECK-NEXT: - .offset: 80
56+
; CHECK-NEXT: .size: 8
57+
; CHECK-NEXT: .value_kind: hidden_global_offset_z
58+
; CHECK-NEXT: - .offset: 88
59+
; CHECK-NEXT: .size: 2
60+
; CHECK-NEXT: .value_kind: hidden_grid_dims
61+
; CHECK-NEXT: - .offset: 96
62+
; CHECK-NEXT: .size: 8
63+
; CHECK-NEXT: .value_kind: hidden_printf_buffer
64+
; CHECK-NEXT: - .offset: 104
65+
; CHECK-NEXT: .size: 8
66+
; CHECK-NEXT: .value_kind: hidden_hostcall_buffer
67+
; CHECK-NEXT: - .offset: 112
68+
; CHECK-NEXT: .size: 8
69+
; CHECK-NEXT: .value_kind: hidden_multigrid_sync_arg
70+
; CHECK-NEXT: - .offset: 120
71+
; CHECK-NEXT: .size: 8
72+
; CHECK-NEXT: .value_kind: hidden_heap_v1
73+
; CHECK-NEXT: - .offset: 128
74+
; CHECK-NEXT: .size: 8
75+
; CHECK-NEXT: .value_kind: hidden_default_queue
76+
; CHECK-NEXT: - .offset: 144
77+
; CHECK-NEXT: .size: 4
78+
; CHECK-NEXT: .value_kind: hidden_dynamic_lds_size
79+
; CHECK: - .offset: 224
80+
; CHECK-NEXT: .size: 8
81+
; CHECK-NEXT: .value_kind: hidden_queue_ptr
82+
83+
; CHECK: .name: test_v5
84+
; CHECK: .symbol: test_v5.kd
85+
86+
; CHECK: amdhsa.version:
87+
; CHECK-NEXT: - 1
88+
; CHECK-NEXT: - 2
89+
@lds = external hidden addrspace(3) global [0 x i32], align 4
90+
91+
define void @funcs_dyn_lds() {
92+
store i32 1234, ptr addrspacecast (ptr addrspace(3) @lds to ptr), align 4
93+
ret void
94+
}
95+
96+
define amdgpu_kernel void @test_v5(
97+
ptr addrspace(1) %r,
98+
ptr addrspace(1) %a,
99+
ptr addrspace(1) %b) #0 {
100+
entry:
101+
%a.val = load half, ptr addrspace(1) %a
102+
%b.val = load half, ptr addrspace(1) %b
103+
%r.val = fadd half %a.val, %b.val
104+
store half %r.val, ptr addrspace(1) %r
105+
call void @funcs_dyn_lds()
106+
ret void
107+
}
108+
109+
!llvm.module.flags = !{!0}
110+
!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
111+
!llvm.printf.fmts = !{!1, !2}
112+
!1 = !{!"1:1:4:%d\5Cn"}
113+
!2 = !{!"2:1:8:%g\5Cn"}
114+
115+
attributes #0 = { optnone noinline }
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
2+
3+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s
4+
5+
; CHECK: amdhsa.kernels:
6+
; CHECK-NEXT: - .args:
7+
; CHECK-NEXT: - .address_space: global
8+
; CHECK-NEXT: .name: r
9+
; CHECK-NEXT: .offset: 0
10+
; CHECK-NEXT: .size: 8
11+
; CHECK-NEXT: .value_kind: global_buffer
12+
; CHECK-NEXT: - .address_space: global
13+
; CHECK-NEXT: .name: a
14+
; CHECK-NEXT: .offset: 8
15+
; CHECK-NEXT: .size: 8
16+
; CHECK-NEXT: .value_kind: global_buffer
17+
; CHECK-NEXT: - .address_space: global
18+
; CHECK-NEXT: .name: b
19+
; CHECK-NEXT: .offset: 16
20+
; CHECK-NEXT: .size: 8
21+
; CHECK-NEXT: .value_kind: global_buffer
22+
; CHECK-NEXT: - .offset: 24
23+
; CHECK-NEXT: .size: 4
24+
; CHECK-NEXT: .value_kind: hidden_block_count_x
25+
; CHECK-NEXT: - .offset: 28
26+
; CHECK-NEXT: .size: 4
27+
; CHECK-NEXT: .value_kind: hidden_block_count_y
28+
; CHECK-NEXT: - .offset: 32
29+
; CHECK-NEXT: .size: 4
30+
; CHECK-NEXT: .value_kind: hidden_block_count_z
31+
; CHECK-NEXT: - .offset: 36
32+
; CHECK-NEXT: .size: 2
33+
; CHECK-NEXT: .value_kind: hidden_group_size_x
34+
; CHECK-NEXT: - .offset: 38
35+
; CHECK-NEXT: .size: 2
36+
; CHECK-NEXT: .value_kind: hidden_group_size_y
37+
; CHECK-NEXT: - .offset: 40
38+
; CHECK-NEXT: .size: 2
39+
; CHECK-NEXT: .value_kind: hidden_group_size_z
40+
; CHECK-NEXT: - .offset: 42
41+
; CHECK-NEXT: .size: 2
42+
; CHECK-NEXT: .value_kind: hidden_remainder_x
43+
; CHECK-NEXT: - .offset: 44
44+
; CHECK-NEXT: .size: 2
45+
; CHECK-NEXT: .value_kind: hidden_remainder_y
46+
; CHECK-NEXT: - .offset: 46
47+
; CHECK-NEXT: .size: 2
48+
; CHECK-NEXT: .value_kind: hidden_remainder_z
49+
; CHECK-NEXT: - .offset: 64
50+
; CHECK-NEXT: .size: 8
51+
; CHECK-NEXT: .value_kind: hidden_global_offset_x
52+
; CHECK-NEXT: - .offset: 72
53+
; CHECK-NEXT: .size: 8
54+
; CHECK-NEXT: .value_kind: hidden_global_offset_y
55+
; CHECK-NEXT: - .offset: 80
56+
; CHECK-NEXT: .size: 8
57+
; CHECK-NEXT: .value_kind: hidden_global_offset_z
58+
; CHECK-NEXT: - .offset: 88
59+
; CHECK-NEXT: .size: 2
60+
; CHECK-NEXT: .value_kind: hidden_grid_dims
61+
; CHECK-NEXT: - .offset: 96
62+
; CHECK-NEXT: .size: 8
63+
; CHECK-NEXT: .value_kind: hidden_printf_buffer
64+
; CHECK-NEXT: - .offset: 104
65+
; CHECK-NEXT: .size: 8
66+
; CHECK-NEXT: .value_kind: hidden_hostcall_buffer
67+
; CHECK-NEXT: - .offset: 112
68+
; CHECK-NEXT: .size: 8
69+
; CHECK-NEXT: .value_kind: hidden_multigrid_sync_arg
70+
; CHECK-NEXT: - .offset: 120
71+
; CHECK-NEXT: .size: 8
72+
; CHECK-NEXT: .value_kind: hidden_heap_v1
73+
; CHECK-NEXT: - .offset: 128
74+
; CHECK-NEXT: .size: 8
75+
; CHECK-NEXT: .value_kind: hidden_default_queue
76+
; CHECK-NEXT: - .offset: 144
77+
; CHECK-NEXT: .size: 4
78+
; CHECK-NEXT: .value_kind: hidden_dynamic_lds_size
79+
; CHECK: - .offset: 224
80+
; CHECK-NEXT: .size: 8
81+
; CHECK-NEXT: .value_kind: hidden_queue_ptr
82+
83+
; CHECK: .name: test_v5
84+
; CHECK: .symbol: test_v5.kd
85+
86+
; CHECK: amdhsa.version:
87+
; CHECK-NEXT: - 1
88+
; CHECK-NEXT: - 2
89+
@lds = external hidden addrspace(3) global [0 x i32], align 4
90+
91+
define void @funcs_dyn_lds(ptr addrspace(3) %lds_ptr) {
92+
store i32 1234, ptr addrspace(3) %lds_ptr, align 4
93+
ret void
94+
}
95+
96+
define amdgpu_kernel void @test_v5(
97+
ptr addrspace(1) %r,
98+
ptr addrspace(1) %a,
99+
ptr addrspace(1) %b) #0 {
100+
entry:
101+
%a.val = load half, ptr addrspace(1) %a
102+
%b.val = load half, ptr addrspace(1) %b
103+
%r.val = fadd half %a.val, %b.val
104+
store half %r.val, ptr addrspace(1) %r
105+
call void @funcs_dyn_lds(ptr addrspace(3) @lds)
106+
ret void
107+
}
108+
109+
!llvm.module.flags = !{!0}
110+
!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
111+
!llvm.printf.fmts = !{!1, !2}
112+
!1 = !{!"1:1:4:%d\5Cn"}
113+
!2 = !{!"2:1:8:%g\5Cn"}
114+
115+
attributes #0 = { optnone noinline }

0 commit comments

Comments
 (0)