Skip to content

Commit 032f3d9

Browse files
committed
[AMDGPU][LTO] Introduce AMDGPUCloneModuleLDS
The purpose of this pass is to ensure that the combined module contains as many LDS global variables as there are kernels that (indirectly) access them. As LDS variables behave like C++ static variables, it is important that each partition contains a unique copy of the variable on a per kernel basis. This representation also prepares the combined module to eliminate cross-module dependencies of LDS variables. This pass operates as follows: 1. Firstly, traverse the call graph from each kernel to determine the number of kernels calling each device function. 2. For each LDS global variable GV, determine the function F that defines it. Collect its caller functions. Clone F and GV, and finally insert a call/invoke instruction in each caller function. Change-Id: I998291a389ea3db10de9122f08fe55c981da6049
1 parent 28d85e2 commit 032f3d9

File tree

9 files changed

+543
-0
lines changed

9 files changed

+543
-0
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,11 @@ struct AMDGPULowerBufferFatPointersPass
149149
const TargetMachine &TM;
150150
};
151151

152+
/// Module pass that clones LDS global variables, and the functions that use
/// them, so that the module holds one copy per kernel that (indirectly)
/// reaches them. Registered under the name "amdgpu-clone-module-lds".
struct AMDGPUCloneModuleLDSPass
153+
: public PassInfoMixin<AMDGPUCloneModuleLDSPass> {
154+
PreservedAnalyses run(Module &, ModuleAnalysisManager &);
155+
};
156+
152157
void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
153158
extern char &AMDGPURewriteOutArgumentsID;
154159

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
//===-- AMDGPUCloneModuleLDSPass.cpp ------------------------------*- C++ -*-=//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// The purpose of this pass is to ensure that the combined module contains
10+
// as many LDS global variables as there are kernels that (indirectly) access
11+
// them. As LDS variables behave like C++ static variables, it is important that
12+
// each partition contains a unique copy of the variable on a per kernel basis.
13+
// This representation also prepares the combined module to eliminate
14+
// cross-module dependencies of LDS variables.
15+
//
16+
// This pass operates as follows:
17+
// 1. Firstly, traverse the call graph from each kernel to determine the number
18+
// of kernels calling each device function.
19+
// 2. For each LDS global variable GV, determine the function F that defines it.
20+
// Collect its caller functions. Clone F and GV, and finally insert a
21+
// call/invoke instruction in each caller function.
22+
//
23+
//===----------------------------------------------------------------------===//
24+
25+
#include "AMDGPU.h"
26+
#include "llvm/ADT/DepthFirstIterator.h"
27+
#include "llvm/ADT/Twine.h"
28+
#include "llvm/Analysis/CallGraph.h"
29+
#include "llvm/IR/InstrTypes.h"
30+
#include "llvm/IR/Instructions.h"
31+
#include "llvm/Passes/PassBuilder.h"
32+
#include "llvm/Support/ScopedPrinter.h"
33+
#include "llvm/Transforms/Utils/Cloning.h"
34+
35+
using namespace llvm;
36+
37+
#define DEBUG_TYPE "amdgpu-clone-module-lds"
38+
39+
// Hidden command-line knob capping how many copies of a function (the
// original plus its clones) the pass may end up with. The pass creates
// min(#kernels reaching the function, this value) - 1 clones, and a value of
// 1 makes the pass return early without changing the module. Defaults to 16.
static cl::opt<unsigned int> MaxCountForClonedFunctions(
40+
"clone-lds-functions-max-count", cl::init(16), cl::Hidden,
41+
cl::desc("Specify a limit to the number of clones of a function"));
42+
43+
/// Return the function that defines \p GV
44+
/// \param GV The global variable in question
45+
/// \return The function defining \p GV
46+
static Function *getFunctionDefiningGV(GlobalVariable &GV) {
47+
SmallVector<User *> Worklist(GV.users());
48+
while (!Worklist.empty()) {
49+
User *U = Worklist.pop_back_val();
50+
if (auto *Inst = dyn_cast<Instruction>(U))
51+
return Inst->getFunction();
52+
if (auto *Op = dyn_cast<Operator>(U))
53+
append_range(Worklist, Op->users());
54+
}
55+
return nullptr;
56+
};
57+
58+
/// Clone LDS globals and the functions using them, once per reaching kernel.
///
/// Steps:
///  1. For every kernel, walk the call graph starting at that kernel and count,
///     per function, how many kernels can reach it.
///  2. For every LDS global with an initializer, find a function using it,
///     then create min(#reaching kernels, clone limit) - 1 clones of both the
///     global and the function, and insert a call to each function clone
///     right after every existing direct call to the original function.
///
/// \param M  The module to transform.
/// \param AM Analysis manager used to obtain the module's call graph.
/// \return PreservedAnalyses::all() if nothing was cloned, otherwise
///         PreservedAnalyses::none().
PreservedAnalyses AMDGPUCloneModuleLDSPass::run(Module &M,
                                                ModuleAnalysisManager &AM) {
  // With a limit of a single copy per function there is nothing to clone.
  const unsigned int MaxCopies = MaxCountForClonedFunctions.getValue();
  if (MaxCopies == 1)
    return PreservedAnalyses::all();

  bool Changed = false;
  auto &CG = AM.getResult<CallGraphAnalysis>(M);

  // For each function in the call graph, determine the number
  // of ancestor-caller kernels. The traversal must start at the kernel's own
  // node; starting at the graph root would count every function once per
  // kernel regardless of reachability.
  DenseMap<Function *, unsigned int> KernelRefsToFuncs;
  for (auto &Fn : M) {
    if (Fn.getCallingConv() != CallingConv::AMDGPU_KERNEL)
      continue;
    for (CallGraphNode *Node : depth_first(CG[&Fn]))
      if (auto *F = Node->getFunction())
        KernelRefsToFuncs[F]++;
  }

  // Snapshot the LDS globals up front: the loop below adds new globals to the
  // module, and the clones themselves must not be processed again.
  SmallVector<GlobalVariable *> LDSGlobals;
  for (auto &GV : M.globals())
    if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && GV.hasInitializer())
      LDSGlobals.push_back(&GV);

  for (GlobalVariable *GV : LDSGlobals) {
    Function *OldF = getFunctionDefiningGV(*GV);
    // An LDS global that no instruction uses has no function to clone.
    if (!OldF)
      continue;
    LLVM_DEBUG(dbgs() << "Found LDS " << GV->getName() << " used in function "
                      << OldF->getName() << '\n');

    // Collect all call instructions to OldF. Only uses where OldF is the
    // callee qualify; a call that merely passes OldF as an argument must not
    // be duplicated and retargeted.
    SmallVector<Instruction *> InstsCallingOldF;
    for (auto &U : OldF->uses())
      if (auto *CB = dyn_cast<CallBase>(U.getUser()))
        if (CB->isCallee(&U))
          InstsCallingOldF.push_back(CB);

    // Create as many clones of the function containing LDS global as
    // there are kernels calling the function (including the function
    // already defining the LDS global). Respectively, clone the
    // LDS global and the call instructions to the function.
    const unsigned int NumKernelRefs = KernelRefsToFuncs.lookup(OldF);
    LLVM_DEBUG(dbgs() << "\tFunction is referenced by " << NumKernelRefs
                      << " kernels.\n");
    const unsigned int NumCopies = std::min(NumKernelRefs, MaxCopies);
    for (unsigned int ID = 0; ID + 1 < NumCopies; ++ID) {
      // Clone LDS global variable
      auto *NewGV = new GlobalVariable(
          M, GV->getValueType(), GV->isConstant(), GlobalValue::InternalLinkage,
          PoisonValue::get(GV->getValueType()),
          GV->getName() + ".clone." + Twine(ID), GV,
          GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
      NewGV->copyAttributesFrom(GV);
      NewGV->copyMetadata(GV, 0);
      NewGV->setComdat(GV->getComdat());
      LLVM_DEBUG(dbgs() << "Inserting LDS clone with name " << NewGV->getName()
                        << '\n');

      // Clone function, redirecting its references from GV to the new copy.
      ValueToValueMapTy VMap;
      VMap[GV] = NewGV;
      auto *NewF = CloneFunction(OldF, VMap);
      NewF->setName(OldF->getName() + ".clone." + Twine(ID));
      LLVM_DEBUG(dbgs() << "Inserting function clone with name "
                        << NewF->getName() << '\n');

      // Create a new CallInst to call the cloned function
      // NOTE(review): an invoke/callbr is a terminator, so inserting its
      // clone after it would yield an ill-formed block — confirm such call
      // sites cannot reach this point or handle them separately.
      for (auto *Inst : InstsCallingOldF) {
        Instruction *I = Inst->clone();
        I->setName(Inst->getName() + ".clone." + Twine(ID));
        if (auto *CI = dyn_cast<CallBase>(I))
          CI->setCalledOperand(NewF);
        I->insertAfter(Inst);
        LLVM_DEBUG(dbgs() << "Inserting inst: " << *I << '\n');
      }
      Changed = true;
    }
  }
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}

llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ MODULE_PASS("amdgpu-lower-buffer-fat-pointers",
2222
AMDGPULowerBufferFatPointersPass(*this))
2323
MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass())
2424
MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
25+
MODULE_PASS("amdgpu-clone-module-lds", AMDGPUCloneModuleLDSPass())
2526
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
2627
MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
2728
#undef MODULE_PASS

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -725,6 +725,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(
725725
// We want to support the -lto-partitions=N option as "best effort".
726726
// For that, we need to lower LDS earlier in the pipeline before the
727727
// module is partitioned for codegen.
728+
PM.addPass(AMDGPUCloneModuleLDSPass());
728729
if (EnableLowerModuleLDS)
729730
PM.addPass(AMDGPULowerModuleLDSPass(*this));
730731
});

llvm/lib/Target/AMDGPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ add_llvm_target(AMDGPUCodeGen
5050
AMDGPUAtomicOptimizer.cpp
5151
AMDGPUAttributor.cpp
5252
AMDGPUCallLowering.cpp
53+
AMDGPUCloneModuleLDS.cpp
5354
AMDGPUCodeGenPrepare.cpp
5455
AMDGPUCombinerHelper.cpp
5556
AMDGPUCtorDtorLowering.cpp
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
; RUN: opt -passes=amdgpu-clone-module-lds %s -S | FileCheck %s
2+
3+
target triple = "amdgcn-amd-amdhsa"
4+
5+
; In this example, the CloneModuleLDS pass creates two copies of LDS_GV
6+
; as two kernels call the same device function where LDS_GV is used.
7+
8+
; CHECK: [[LDS_GV_CLONE:@.*\.clone\.0]] = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 16
9+
; CHECK: [[LDS_GV:@.*]] = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 16
10+
@lds_gv = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 16
11+
12+
; First kernel: calls @lds_func once. The expected output keeps that call and
; adds a call to @lds_func.clone.0 immediately after it.
define protected amdgpu_kernel void @kernel1(i32 %n) #3 {
13+
; CHECK-LABEL: define protected amdgpu_kernel void @kernel1(
14+
; CHECK-SAME: i32 [[N:%.*]]) {
15+
; CHECK-NEXT: entry:
16+
; CHECK-NEXT: [[CALL:%.*]] = call i32 @lds_func(i32 [[N]])
17+
; CHECK-NEXT: [[CALL_CLONE_0:%.*]] = call i32 @lds_func.clone.0(i32 [[N]])
18+
; CHECK-NEXT: ret void
19+
;
20+
entry:
21+
%call = call i32 @lds_func(i32 %n)
22+
ret void
23+
}
24+
25+
; Second kernel: same shape as @kernel1, so @lds_func is reached by two
; kernels and one clone of it is expected.
define protected amdgpu_kernel void @kernel2(i32 %n) #3 {
26+
; CHECK-LABEL: define protected amdgpu_kernel void @kernel2(
27+
; CHECK-SAME: i32 [[N:%.*]]) {
28+
; CHECK-NEXT: entry:
29+
; CHECK-NEXT: [[CALL:%.*]] = call i32 @lds_func(i32 [[N]])
30+
; CHECK-NEXT: [[CALL_CLONE_0:%.*]] = call i32 @lds_func.clone.0(i32 [[N]])
31+
; CHECK-NEXT: ret void
32+
;
33+
entry:
34+
%call = call i32 @lds_func(i32 %n)
35+
ret void
36+
}
37+
38+
39+
; Device function that stores its argument into element 0 of @lds_gv (through
; an addrspacecast constant expression) and returns the argument. It is the
; only user of the LDS global and must keep referring to the original global.
define i32 @lds_func(i32 %x) {
40+
; CHECK-LABEL: define i32 @lds_func(
41+
; CHECK-SAME: i32 [[X:%.*]]) {
42+
; CHECK-NEXT: entry:
43+
; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[LDS_GV]] to ptr), i64 0, i64 0
44+
; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 4
45+
; CHECK-NEXT: ret i32 [[X]]
46+
;
47+
entry:
48+
%p = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) @lds_gv to ptr), i64 0, i64 0
49+
store i32 %x, ptr %p
50+
ret i32 %x
51+
}
52+
53+
; CHECK-LABEL: define i32 @lds_func.clone.0(i32 %x) {
54+
; CHECK-NEXT: entry:
55+
; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[LDS_GV_CLONE]] to ptr), i64 0, i64 0
56+
; CHECK-NEXT: store i32 %x, ptr %p, align 4
57+
; CHECK-NEXT: ret i32 %x
58+
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2+
; RUN: opt -passes=amdgpu-clone-module-lds %s -S | FileCheck %s
3+
4+
target triple = "amdgcn-amd-amdhsa"
5+
6+
; Before transformation, After transformation,
7+
; K1 K2 K1 K2
8+
; | / | /
9+
; | / | /
10+
; A ==> A
11+
; | \ | \
12+
; | \ | \
13+
; B C B C
14+
; | | \
15+
; X X1 X2
16+
;
17+
; where X contains an LDS reference
18+
19+
; CHECK: [[GV_CLONE:@.*]] = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 16
20+
; CHECK: [[GV:@.*]] = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 16
21+
@lds_gv = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 16
22+
23+
; Kernel K1 of the diagram above: calls @A and is expected to be unchanged.
; NOTE(review): the call site is typed i32 (i32) while @A is defined as
; void () - confirm the mismatch is intentional.
define protected amdgpu_kernel void @kernel1(i32 %n) #3 {
24+
; CHECK-LABEL: define protected amdgpu_kernel void @kernel1(
25+
; CHECK-SAME: i32 [[N:%.*]]) {
26+
; CHECK-NEXT: entry:
27+
; CHECK-NEXT: [[CALL:%.*]] = call i32 @A(i32 [[N]])
28+
; CHECK-NEXT: ret void
29+
;
30+
entry:
31+
%call = call i32 @A(i32 %n)
32+
ret void
33+
}
34+
35+
; Kernel K2 of the diagram above: also calls @A and is expected to be
; unchanged.
; NOTE(review): the call site is typed i32 (i32) while @A is defined as
; void () - confirm the mismatch is intentional.
define protected amdgpu_kernel void @kernel2(i32 %n) #3 {
36+
; CHECK-LABEL: define protected amdgpu_kernel void @kernel2(
37+
; CHECK-SAME: i32 [[N:%.*]]) {
38+
; CHECK-NEXT: entry:
39+
; CHECK-NEXT: [[CALL:%.*]] = call i32 @A(i32 [[N]])
40+
; CHECK-NEXT: ret void
41+
;
42+
entry:
43+
%call = call i32 @A(i32 %n)
44+
ret void
45+
}
46+
47+
; Intermediate function called by both kernels; forwards to @B and @C and is
; expected to be left unchanged by the pass.
; NOTE(review): @B is called here as void () but is defined as i32 () -
; confirm the mismatch is intentional.
define void @A() {
48+
; CHECK-LABEL: define void @A() {
49+
; CHECK-NEXT: entry:
50+
; CHECK-NEXT: call void @B()
51+
; CHECK-NEXT: call void @C()
52+
; CHECK-NEXT: ret void
53+
;
54+
entry:
55+
call void @B()
56+
call void @C()
57+
ret void
58+
}
59+
60+
; Direct caller of @X, the function that uses the LDS global. The expected
; output adds a call to @X.clone.0 right after the original call; the returned
; value still comes from the original call.
define i32 @B() {
61+
; CHECK-LABEL: define i32 @B() {
62+
; CHECK-NEXT: entry:
63+
; CHECK-NEXT: [[P:%.*]] = alloca i32, align 4
64+
; CHECK-NEXT: store i32 5, ptr [[P]], align 4
65+
; CHECK-NEXT: [[RET:%.*]] = call i32 @X(ptr [[P]])
66+
; CHECK-NEXT: [[RET_CLONE_0:%.*]] = call i32 @X.clone.0(ptr [[P]])
67+
; CHECK-NEXT: ret i32 [[RET]]
68+
;
69+
entry:
70+
%p = alloca i32
71+
store i32 5, ptr %p
72+
%ret = call i32 @X(ptr %p)
73+
ret i32 %ret
74+
}
75+
76+
; Empty function with no LDS reference and no callees; expected to be left
; unchanged.
define void @C() {
77+
; CHECK-LABEL: define void @C() {
78+
; CHECK-NEXT: entry:
79+
; CHECK-NEXT: ret void
80+
;
81+
entry:
82+
ret void
83+
}
84+
85+
; Function holding the LDS reference: loads an i32 through %x, stores it into
; element 0 of @lds_gv and returns it. The original must keep using the
; original global, while its clone is expected to use the cloned global.
define i32 @X(ptr %x) {
86+
; CHECK-LABEL: define i32 @X(
87+
; CHECK-SAME: ptr [[X:%.*]]) {
88+
; CHECK-NEXT: entry:
89+
; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[GV]] to ptr), i64 0, i64 0
90+
; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[X]], align 4
91+
; CHECK-NEXT: store i32 [[V]], ptr [[P]], align 4
92+
; CHECK-NEXT: ret i32 [[V]]
93+
;
94+
entry:
95+
%p = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) @lds_gv to ptr), i64 0, i64 0
96+
%v = load i32, ptr %x
97+
store i32 %v, ptr %p
98+
ret i32 %v
99+
}
100+
101+
; CHECK-LABEL: define i32 @X.clone.0(ptr %x) {
102+
; CHECK-NEXT: entry:
103+
; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[GV_CLONE]] to ptr), i64 0, i64 0
104+
; CHECK-NEXT: %v = load i32, ptr %x, align 4
105+
; CHECK-NEXT: store i32 %v, ptr %p, align 4
106+
; CHECK-NEXT: ret i32 %v

0 commit comments

Comments
 (0)