Skip to content

Commit 7384287

Browse files
committed
[AMDGPU] Run LowerLDS at the end of the fullLTO pipeline
This change allows us to use `--lto-partitions` in some cases (it is not guaranteed to work perfectly), as LDS is lowered before the module is split for parallel codegen. LowerModuleLDS doesn't support being run twice because it will treat the already-lowered LDS GVs as "absolute address" LDS variables, which aren't supported, so I just added a module flag to detect multiple runs.
1 parent 95ef8e3 commit 7384287

File tree

4 files changed

+73
-4
lines changed

4 files changed

+73
-4
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,10 @@ extern char &AMDGPULowerModuleLDSLegacyPassID;
131131

132132
struct AMDGPULowerModuleLDSPass : PassInfoMixin<AMDGPULowerModuleLDSPass> {
133133
const AMDGPUTargetMachine &TM;
134-
AMDGPULowerModuleLDSPass(const AMDGPUTargetMachine &TM_) : TM(TM_) {}
134+
bool IsEarlyRun;
135+
AMDGPULowerModuleLDSPass(const AMDGPUTargetMachine &TM_,
136+
bool IsEarlyRun = false)
137+
: TM(TM_), IsEarlyRun(IsEarlyRun) {}
135138

136139
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
137140
};

llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,12 @@ using namespace llvm;
215215

216216
namespace {
217217

218+
cl::opt<bool>
219+
ForceAddModuleFlag("amdgpu-lower-module-lds-force-add-moduleflag",
220+
cl::desc("Always add the module flag that prevents "
221+
"multiple runs of LowerModuleLDS."),
222+
cl::init(false), cl::ReallyHidden);
223+
218224
cl::opt<bool> SuperAlignLDSGlobals(
219225
"amdgpu-super-align-lds-globals",
220226
cl::desc("Increase alignment of LDS if it is not on align boundary"),
@@ -254,6 +260,7 @@ template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
254260

255261
class AMDGPULowerModuleLDS {
256262
const AMDGPUTargetMachine &TM;
263+
bool IsEarlyRun;
257264

258265
static void
259266
removeLocalVarsFromUsedLists(Module &M,
@@ -328,7 +335,8 @@ class AMDGPULowerModuleLDS {
328335
}
329336

330337
public:
331-
AMDGPULowerModuleLDS(const AMDGPUTargetMachine &TM_) : TM(TM_) {}
338+
AMDGPULowerModuleLDS(const AMDGPUTargetMachine &TM_, bool IsEarlyRun = false)
339+
: TM(TM_), IsEarlyRun(IsEarlyRun) {}
332340

333341
using FunctionVariableMap = DenseMap<Function *, DenseSet<GlobalVariable *>>;
334342

@@ -1133,6 +1141,15 @@ class AMDGPULowerModuleLDS {
11331141
}
11341142

11351143
bool runOnModule(Module &M) {
1144+
// This pass may run twice in a full LTO pipeline.
1145+
//
1146+
// If we ran it early, we'll have added metadata to skip next runs.
1147+
if (M.getModuleFlag("amdgcn.lowered_module_lds"))
1148+
return false;
1149+
if (IsEarlyRun || ForceAddModuleFlag)
1150+
M.addModuleFlag(Module::ModFlagBehavior::Warning,
1151+
"amdgcn.lowered_module_lds", 1);
1152+
11361153
CallGraph CG = CallGraph(M);
11371154
bool Changed = superAlignLDSGlobals(M);
11381155

@@ -1626,6 +1643,7 @@ llvm::createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM) {
16261643

16271644
PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
16281645
ModuleAnalysisManager &) {
1629-
return AMDGPULowerModuleLDS(TM).runOnModule(M) ? PreservedAnalyses::none()
1630-
: PreservedAnalyses::all();
1646+
return AMDGPULowerModuleLDS(TM, IsEarlyRun).runOnModule(M)
1647+
? PreservedAnalyses::none()
1648+
: PreservedAnalyses::all();
16311649
}

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -779,6 +779,15 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(
779779

780780
PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
781781
});
782+
783+
PB.registerFullLinkTimeOptimizationLastEPCallback(
784+
[this](ModulePassManager &PM, OptimizationLevel Level) {
785+
// We want to support the -lto-partitions=N option as "best effort".
786+
// For that, we need to lower LDS earlier in the pipeline before the
787+
// module is partitioned for codegen.
788+
if (EnableLowerModuleLDS)
789+
PM.addPass(AMDGPULowerModuleLDSPass(*this, /*IsEarlyRun*/ true));
790+
});
782791
}
783792

784793
int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module %s -o %t.ll
2+
; RUN: not --crash opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module %t.ll -o - 2>&1 | FileCheck %s --check-prefix=ERR
3+
4+
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module --amdgpu-lower-module-lds-force-add-moduleflag=1 %s -o %t.ll
5+
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module %t.ll -o - | FileCheck %s
6+
7+
; Check that re-running LowerModuleLDS doesn't crash when the module flag is used.
8+
;
9+
; We first check this test still crashes when run twice. If it no longer crashes at some point
10+
; we should update it to ensure the flag still does its job.
11+
;
12+
; This test just has the bare minimum checks to see if the pass ran.
13+
14+
; ERR: LLVM ERROR: LDS variables with absolute addresses are unimplemented.
15+
16+
; CHECK: %llvm.amdgcn.module.lds.t = type { float, [4 x i8], i32 }
17+
; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 8
18+
19+
; CHECK: attributes #0 = { "amdgpu-lds-size"="12" }
20+
21+
@var0 = addrspace(3) global float poison, align 8
22+
@var1 = addrspace(3) global i32 poison, align 8
23+
@ptr = addrspace(1) global ptr addrspace(3) @var1, align 4
24+
@with_init = addrspace(3) global i64 0
25+
26+
define void @func() {
27+
%dec = atomicrmw fsub ptr addrspace(3) @var0, float 1.0 monotonic
28+
%val0 = load i32, ptr addrspace(3) @var1, align 4
29+
%val1 = add i32 %val0, 4
30+
store i32 %val1, ptr addrspace(3) @var1, align 4
31+
%unused0 = atomicrmw add ptr addrspace(3) @with_init, i64 1 monotonic
32+
ret void
33+
}
34+
35+
define amdgpu_kernel void @kern_call() {
36+
call void @func()
37+
%dec = atomicrmw fsub ptr addrspace(3) @var0, float 2.0 monotonic
38+
ret void
39+
}

0 commit comments

Comments
 (0)