Skip to content

Commit ff55c9b

Browse files
ergawy and Jon Chesterfield
authored
[llvm][amdgpu] Handle indirect refs to LDS GVs during LDS lowering (#124089)
Fixes #123800. Extends LDS lowering by allowing it to discover transitive indirect/escaping references to LDS GVs. For example, given the following input: ```llvm @lds_item_to_indirectly_load = internal addrspace(3) global ptr undef, align 8 %store_type = type { i32, ptr } @place_to_store_indirect_caller = internal addrspace(3) global %store_type undef, align 8 define amdgpu_kernel void @offloading_kernel() { store ptr @indirectly_load_lds, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @place_to_store_indirect_caller, i32 0), align 8 call void @call_unknown() ret void } define void @call_unknown() { %1 = alloca ptr, align 8 %2 = call i32 %1() ret void } define void @indirectly_load_lds() { call void @directly_load_lds() ret void } define void @directly_load_lds() { %2 = load ptr, ptr addrspace(3) @lds_item_to_indirectly_load, align 8 ret void } ``` With the above input, prior to this patch, LDS lowering failed to lower the reference to `@lds_item_to_indirectly_load` because: 1. it is indirectly called by a function whose address is taken in the kernel. 2. we did not check if the kernel indirectly makes any calls to unknown functions (we only checked the direct calls). Co-authored-by: Jon Chesterfield <[email protected]>
1 parent d8eb4ac commit ff55c9b

File tree

3 files changed

+90
-8
lines changed

3 files changed

+90
-8
lines changed

llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp

Lines changed: 45 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -141,20 +141,25 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
141141
FunctionVariableMap DirectMapFunction;
142142
getUsesOfLDSByFunction(CG, M, DirectMapKernel, DirectMapFunction);
143143

144-
// Collect variables that are used by functions whose address has escaped
145-
DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
144+
// Collect functions whose address has escaped
145+
DenseSet<Function *> AddressTakenFuncs;
146146
for (Function &F : M.functions()) {
147147
if (!isKernelLDS(&F))
148148
if (F.hasAddressTaken(nullptr,
149149
/* IgnoreCallbackUses */ false,
150150
/* IgnoreAssumeLikeCalls */ false,
151151
/* IgnoreLLVMUsed */ true,
152152
/* IgnoreArcAttachedCall */ false)) {
153-
set_union(VariablesReachableThroughFunctionPointer,
154-
DirectMapFunction[&F]);
153+
AddressTakenFuncs.insert(&F);
155154
}
156155
}
157156

157+
// Collect variables that are used by functions whose address has escaped
158+
DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
159+
for (Function *F : AddressTakenFuncs) {
160+
set_union(VariablesReachableThroughFunctionPointer, DirectMapFunction[F]);
161+
}
162+
158163
auto FunctionMakesUnknownCall = [&](const Function *F) -> bool {
159164
assert(!F->isDeclaration());
160165
for (const CallGraphNode::CallRecord &R : *CG[F]) {
@@ -206,6 +211,13 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
206211
}
207212
}
208213

214+
// Collect variables that are transitively used by functions whose address has
215+
// escaped
216+
for (Function *F : AddressTakenFuncs) {
217+
set_union(VariablesReachableThroughFunctionPointer,
218+
TransitiveMapFunction[F]);
219+
}
220+
209221
// DirectMapKernel lists which variables are used by the kernel
210222
// find the variables which are used through a function call
211223
FunctionVariableMap IndirectMapKernel;
@@ -218,11 +230,37 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
218230
Function *Ith = R.second->getFunction();
219231
if (Ith) {
220232
set_union(IndirectMapKernel[&Func], TransitiveMapFunction[Ith]);
221-
} else {
222-
set_union(IndirectMapKernel[&Func],
223-
VariablesReachableThroughFunctionPointer);
224233
}
225234
}
235+
236+
// Check if the kernel encounters unknown calls, whether directly or
237+
// indirectly.
238+
bool SeesUnknownCalls = [&]() {
239+
SmallVector<Function *> WorkList = {CG[&Func]->getFunction()};
240+
SmallPtrSet<Function *, 8> Visited;
241+
242+
while (!WorkList.empty()) {
243+
Function *F = WorkList.pop_back_val();
244+
245+
for (const CallGraphNode::CallRecord &CallRecord : *CG[F]) {
246+
if (!CallRecord.second)
247+
continue;
248+
249+
Function *Callee = CallRecord.second->getFunction();
250+
if (!Callee)
251+
return true;
252+
253+
if (Visited.insert(Callee).second)
254+
WorkList.push_back(Callee);
255+
}
256+
}
257+
return false;
258+
}();
259+
260+
if (SeesUnknownCalls) {
261+
set_union(IndirectMapKernel[&Func],
262+
VariablesReachableThroughFunctionPointer);
263+
}
226264
}
227265

228266
// Verify that we fall into one of 2 cases:
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
2+
3+
; Tests that the LDS lowering pass handles indirect references to LDS GVs; i.e.
4+
; that it lowers to accesses into the generated LDS struct if these references
5+
; are deep in the call graph starting at the kernel.
6+
7+
@lds_item_to_indirectly_load = internal addrspace(3) global ptr poison, align 8
8+
9+
%store_type = type { i32, ptr }
10+
@place_to_store_indirect_caller = internal addrspace(3) global %store_type poison, align 8
11+
12+
define amdgpu_kernel void @offloading_kernel() {
13+
store ptr @indirectly_load_lds, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @place_to_store_indirect_caller, i32 0), align 8
14+
call void @call_unknown()
15+
ret void
16+
}
17+
18+
define void @call_unknown() {
19+
%1 = alloca ptr, align 8
20+
%2 = call i32 %1()
21+
ret void
22+
}
23+
24+
define void @indirectly_load_lds() {
25+
call void @directly_load_lds()
26+
ret void
27+
}
28+
29+
define void @directly_load_lds() {
30+
%2 = load ptr, ptr addrspace(3) @lds_item_to_indirectly_load, align 8
31+
ret void
32+
}
33+
34+
; CHECK: %[[LDS_STRUCT_TY:.*]] = type { %store_type, ptr }
35+
; CHECK: @[[LDS_STRUCT:.*]] = {{.*}} %[[LDS_STRUCT_TY]] {{.*}} !absolute_symbol
36+
37+
; CHECK: define amdgpu_kernel void @offloading_kernel() {{.*}} {
38+
; CHECK: store ptr @indirectly_load_lds, {{.*}} @[[LDS_STRUCT]]
39+
; CHECK: call void @call_unknown()
40+
; CHECK: }
41+
42+
; CHECK: define void @directly_load_lds() {
43+
; CHECK: load ptr, {{.*}} (%[[LDS_STRUCT_TY]], {{.*}} @[[LDS_STRUCT]], i32 0, i32 1)
44+
; CHECK: }

llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ define amdgpu_kernel void @kernel_lds_recursion() {
196196
; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
197197
; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
198198
; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
199-
; CHECK: attributes #[[ATTR5]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
199+
; CHECK: attributes #[[ATTR5]] = { "amdgpu-lds-size"="4" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
200200
; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
201201
; CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
202202
;.

0 commit comments

Comments
 (0)