Skip to content

Commit af7fb7a

Browse files
committed
[AMDGPU][LTO] Introduce AMDGPUCloneModuleLDS
The purpose of this pass is to ensure that the combined module contains as many LDS global variables as there are kernels that (indirectly) access them. As LDS variables behave like C++ static variables, it is important that each partition contains a unique copy of the variable on a per kernel basis. This representation also prepares the combined module to eliminate cross-module dependencies of LDS variables. This pass operates as follows: 1. Firstly, traverse the call graph from each kernel to determine the number of kernels calling each device function. 2. For each LDS global variable GV, determine the function F that defines it. Collect it's caller functions. Clone F and GV, and finally insert a call/invoke instruction in each caller function. Change-Id: I998291a389ea3db10de9122f08fe55c981da6049
1 parent 28d85e2 commit af7fb7a

File tree

9 files changed

+606
-0
lines changed

9 files changed

+606
-0
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,11 @@ struct AMDGPULowerBufferFatPointersPass
149149
const TargetMachine &TM;
150150
};
151151

152+
struct AMDGPUCloneModuleLDSPass
153+
: public PassInfoMixin<AMDGPUCloneModuleLDSPass> {
154+
PreservedAnalyses run(Module &, ModuleAnalysisManager &);
155+
};
156+
152157
void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
153158
extern char &AMDGPURewriteOutArgumentsID;
154159

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
//===-- AMDGPUCloneModuleLDSPass.cpp ------------------------------*- C++ -*-=//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// The purpose of this pass is to ensure that the combined module contains
10+
// as many LDS global variables as there are kernels that (indirectly) access
11+
// them. As LDS variables behave like C++ static variables, it is important that
12+
// each partition contains a unique copy of the variable on a per kernel basis.
13+
// This representation also prepares the combined module to eliminate
14+
// cross-module false dependencies of LDS variables. This pass runs prior to the
15+
// AMDGPULowerModuleLDS pass in the fullLTO pipeline and is used to improve
16+
// the functionality of --lto-partitions.
17+
//
18+
// This pass operates as follows:
19+
// 1. Firstly, traverse the call graph from each kernel to determine the number
20+
// of kernels calling each device function.
21+
// 2. For each LDS global variable GV, determine the function F that defines it.
22+
// Collect it's caller functions. Clone F and GV, and finally insert a
23+
// call/invoke instruction in each caller function.
24+
//
25+
//===----------------------------------------------------------------------===//
26+
27+
#include "AMDGPU.h"
28+
#include "llvm/ADT/DepthFirstIterator.h"
29+
#include "llvm/ADT/Twine.h"
30+
#include "llvm/Analysis/CallGraph.h"
31+
#include "llvm/IR/InstrTypes.h"
32+
#include "llvm/IR/Instructions.h"
33+
#include "llvm/Passes/PassBuilder.h"
34+
#include "llvm/Support/ScopedPrinter.h"
35+
#include "llvm/Transforms/Utils/Cloning.h"
36+
37+
using namespace llvm;
38+
39+
#define DEBUG_TYPE "amdgpu-clone-module-lds"
40+
41+
static cl::opt<unsigned int> MaxCountForClonedFunctions(
42+
"clone-lds-functions-max-count", cl::init(16), cl::Hidden,
43+
cl::desc("Specify a limit to the number of clones of a function"));
44+
45+
/// Return the function that defines \p GV
46+
/// \param GV The global variable in question
47+
/// \return The function defining \p GV
48+
static Function *getFunctionDefiningGV(GlobalVariable &GV) {
49+
SmallVector<User *> Worklist(GV.users());
50+
while (!Worklist.empty()) {
51+
User *U = Worklist.pop_back_val();
52+
if (auto *Inst = dyn_cast<Instruction>(U))
53+
return Inst->getFunction();
54+
if (auto *Op = dyn_cast<Operator>(U))
55+
append_range(Worklist, Op->users());
56+
}
57+
return nullptr;
58+
};
59+
60+
PreservedAnalyses AMDGPUCloneModuleLDSPass::run(Module &M,
61+
ModuleAnalysisManager &AM) {
62+
if (MaxCountForClonedFunctions.getValue() == 1)
63+
return PreservedAnalyses::all();
64+
65+
bool Changed = false;
66+
auto &CG = AM.getResult<CallGraphAnalysis>(M);
67+
68+
// For each function in the call graph, determine the number
69+
// of ancestor-caller kernels.
70+
DenseMap<Function *, unsigned int> KernelRefsToFuncs;
71+
for (auto &Fn : M) {
72+
if (Fn.getCallingConv() != CallingConv::AMDGPU_KERNEL)
73+
continue;
74+
for (auto I = df_begin(&CG), E = df_end(&CG); I != E; ++I) {
75+
if (auto *F = I->getFunction())
76+
KernelRefsToFuncs[F]++;
77+
}
78+
}
79+
80+
DenseMap<GlobalVariable *, Function *> GVToFnMap;
81+
for (auto &GV : M.globals()) {
82+
if (GVToFnMap.contains(&GV) ||
83+
GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS || !GV.hasInitializer())
84+
continue;
85+
86+
auto *OldF = getFunctionDefiningGV(GV);
87+
GVToFnMap.insert({&GV, OldF});
88+
LLVM_DEBUG(dbgs() << "Found LDS " << GV.getName() << " used in function "
89+
<< OldF->getName() << '\n');
90+
91+
// Collect all call instructions to OldF
92+
SmallVector<Instruction *> InstsCallingOldF;
93+
for (auto &I : OldF->uses()) {
94+
if (auto *CI = dyn_cast<CallBase>(I.getUser()))
95+
InstsCallingOldF.push_back(CI);
96+
}
97+
98+
// Create as many clones of the function containing LDS global as
99+
// there are kernels calling the function (including the function
100+
// already defining the LDS global). Respectively, clone the
101+
// LDS global and the call instructions to the function.
102+
LLVM_DEBUG(dbgs() << "\tFunction is referenced by "
103+
<< KernelRefsToFuncs[OldF] << " kernels.\n");
104+
for (unsigned int ID = 0;
105+
ID + 1 < std::min(KernelRefsToFuncs[OldF],
106+
MaxCountForClonedFunctions.getValue());
107+
++ID) {
108+
// Clone LDS global variable
109+
auto *NewGV = new GlobalVariable(
110+
M, GV.getValueType(), GV.isConstant(), GlobalValue::InternalLinkage,
111+
PoisonValue::get(GV.getValueType()),
112+
GV.getName() + ".clone." + Twine(ID), &GV,
113+
GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
114+
NewGV->copyAttributesFrom(&GV);
115+
NewGV->copyMetadata(&GV, 0);
116+
NewGV->setComdat(GV.getComdat());
117+
LLVM_DEBUG(dbgs() << "Inserting LDS clone with name " << NewGV->getName()
118+
<< '\n');
119+
120+
// Clone function
121+
ValueToValueMapTy VMap;
122+
VMap[&GV] = NewGV;
123+
auto *NewF = CloneFunction(OldF, VMap);
124+
NewF->setName(OldF->getName() + ".clone." + Twine(ID));
125+
LLVM_DEBUG(dbgs() << "Inserting function clone with name "
126+
<< NewF->getName() << '\n');
127+
128+
// Create a new CallInst to call the cloned function
129+
for (auto *Inst : InstsCallingOldF) {
130+
Instruction *I = Inst->clone();
131+
I->setName(Inst->getName() + ".clone." + Twine(ID));
132+
if (auto *CI = dyn_cast<CallBase>(I))
133+
CI->setCalledOperand(NewF);
134+
I->insertAfter(Inst);
135+
LLVM_DEBUG(dbgs() << "Inserting inst: " << *I << '\n');
136+
}
137+
Changed = true;
138+
}
139+
}
140+
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
141+
}

llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ MODULE_PASS("amdgpu-lower-buffer-fat-pointers",
2222
AMDGPULowerBufferFatPointersPass(*this))
2323
MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass())
2424
MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
25+
MODULE_PASS("amdgpu-clone-module-lds", AMDGPUCloneModuleLDSPass())
2526
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
2627
MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
2728
#undef MODULE_PASS

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -725,6 +725,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(
725725
// We want to support the -lto-partitions=N option as "best effort".
726726
// For that, we need to lower LDS earlier in the pipeline before the
727727
// module is partitioned for codegen.
728+
PM.addPass(AMDGPUCloneModuleLDSPass());
728729
if (EnableLowerModuleLDS)
729730
PM.addPass(AMDGPULowerModuleLDSPass(*this));
730731
});

llvm/lib/Target/AMDGPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ add_llvm_target(AMDGPUCodeGen
5050
AMDGPUAtomicOptimizer.cpp
5151
AMDGPUAttributor.cpp
5252
AMDGPUCallLowering.cpp
53+
AMDGPUCloneModuleLDS.cpp
5354
AMDGPUCodeGenPrepare.cpp
5455
AMDGPUCombinerHelper.cpp
5556
AMDGPUCtorDtorLowering.cpp
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
; RUN: opt -passes=amdgpu-clone-module-lds %s -S | FileCheck %s
2+
3+
; RUN: opt -passes=amdgpu-clone-module-lds %s -S -o %t
4+
; RUN: llvm-split -o %u %t -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a
5+
; RUN: llvm-dis -o - %u0 | FileCheck --check-prefix=MOD0 %s
6+
; RUN: llvm-dis -o - %u1 | FileCheck --check-prefix=MOD1 %s
7+
8+
target triple = "amdgcn-amd-amdhsa"
9+
10+
; Before transformation, After transformation,
11+
; K1 K2 K1 K2
12+
; | / | /
13+
; | / | /
14+
; A ==> A
15+
; | \ | \
16+
; | \ | \
17+
; B C B C
18+
; | | \
19+
; X X1 X2
20+
;
21+
; where X contains an LDS reference
22+
23+
; CHECK: [[GV_CLONE:@.*]] = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 16
24+
; CHECK: [[GV:@.*]] = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 16
25+
@lds_gv = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 16
26+
27+
define protected amdgpu_kernel void @kernel1(i32 %n) #3 {
28+
; CHECK-LABEL: define protected amdgpu_kernel void @kernel1(
29+
; CHECK-SAME: i32 [[N:%.*]]) {
30+
; CHECK-NEXT: entry:
31+
; CHECK-NEXT: [[CALL:%.*]] = call i32 @A(i32 [[N]])
32+
; CHECK-NEXT: ret void
33+
;
34+
entry:
35+
%call = call i32 @A(i32 %n)
36+
ret void
37+
}
38+
39+
define protected amdgpu_kernel void @kernel2(i32 %n) #3 {
40+
; CHECK-LABEL: define protected amdgpu_kernel void @kernel2(
41+
; CHECK-SAME: i32 [[N:%.*]]) {
42+
; CHECK-NEXT: entry:
43+
; CHECK-NEXT: [[CALL:%.*]] = call i32 @A(i32 [[N]])
44+
; CHECK-NEXT: ret void
45+
;
46+
entry:
47+
%call = call i32 @A(i32 %n)
48+
ret void
49+
}
50+
51+
define void @A() {
52+
; CHECK-LABEL: define void @A() {
53+
; CHECK-NEXT: entry:
54+
; CHECK-NEXT: call void @B()
55+
; CHECK-NEXT: call void @C()
56+
; CHECK-NEXT: ret void
57+
;
58+
entry:
59+
call void @B()
60+
call void @C()
61+
ret void
62+
}
63+
64+
define i32 @B() {
65+
; CHECK-LABEL: define i32 @B() {
66+
; CHECK-NEXT: entry:
67+
; CHECK-NEXT: [[P:%.*]] = alloca i32, align 4
68+
; CHECK-NEXT: store i32 5, ptr [[P]], align 4
69+
; CHECK-NEXT: [[RET:%.*]] = call i32 @X(ptr [[P]])
70+
; CHECK-NEXT: [[RET_CLONE_0:%.*]] = call i32 @X.clone.0(ptr [[P]])
71+
; CHECK-NEXT: ret i32 [[RET]]
72+
;
73+
entry:
74+
%p = alloca i32
75+
store i32 5, ptr %p
76+
%ret = call i32 @X(ptr %p)
77+
ret i32 %ret
78+
}
79+
80+
define void @C() {
81+
; CHECK-LABEL: define void @C() {
82+
; CHECK-NEXT: entry:
83+
; CHECK-NEXT: ret void
84+
;
85+
entry:
86+
ret void
87+
}
88+
89+
define i32 @X(ptr %x) {
90+
; CHECK-LABEL: define i32 @X(
91+
; CHECK-SAME: ptr [[X:%.*]]) {
92+
; CHECK-NEXT: entry:
93+
; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[GV]] to ptr), i64 0, i64 0
94+
; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[X]], align 4
95+
; CHECK-NEXT: store i32 [[V]], ptr [[P]], align 4
96+
; CHECK-NEXT: ret i32 [[V]]
97+
;
98+
entry:
99+
%p = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) @lds_gv to ptr), i64 0, i64 0
100+
%v = load i32, ptr %x
101+
store i32 %v, ptr %p
102+
ret i32 %v
103+
}
104+
105+
; CHECK-LABEL: define i32 @X.clone.0(ptr %x) {
106+
; CHECK-NEXT: entry:
107+
; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[GV_CLONE]] to ptr), i64 0, i64 0
108+
; CHECK-NEXT: %v = load i32, ptr %x, align 4
109+
; CHECK-NEXT: store i32 %v, ptr %p, align 4
110+
; CHECK-NEXT: ret i32 %v
111+
112+
; MOD0: ModuleID = '%u0'
113+
; MOD0: {{.*}} addrspace(3) global [64 x i32] undef, align 16
114+
; MOD0: define i32 @X(ptr %x)
115+
116+
; MOD1: ModuleID = '%u1'
117+
; MOD1: {{.*}} addrspace(3) global [64 x i32] poison, align 16
118+
; MOD1: define protected amdgpu_kernel void @kernel1(i32 %n)
119+
; MOD1: define protected amdgpu_kernel void @kernel2(i32 %n)
120+
; MOD1: define void @A()
121+
; MOD1: define i32 @B()
122+
; MOD1: define i32 @X.clone.0(ptr %x)

0 commit comments

Comments
 (0)