Skip to content

Commit f59b39f

Browse files
committed
[AMDGPU] Run LowerLDS at the end of the fullLTO pipeline
This change allows us to use `--lto-partitions` in some cases (it is not guaranteed to work perfectly), as LDS is now lowered before the module is split for parallel codegen. LowerModuleLDS doesn't support being run twice: on a second run it treats the already-lowered LDS GVs as "absolute address" LDS variables, which aren't supported. To avoid that, I added a module flag to detect multiple runs.
1 parent e418988 commit f59b39f

File tree

4 files changed

+73
-4
lines changed

4 files changed

+73
-4
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,10 @@ extern char &AMDGPULowerModuleLDSLegacyPassID;
130130

131131
struct AMDGPULowerModuleLDSPass : PassInfoMixin<AMDGPULowerModuleLDSPass> {
132132
const AMDGPUTargetMachine &TM;
133-
AMDGPULowerModuleLDSPass(const AMDGPUTargetMachine &TM_) : TM(TM_) {}
133+
bool IsEarlyRun;
134+
AMDGPULowerModuleLDSPass(const AMDGPUTargetMachine &TM_,
135+
bool IsEarlyRun = false)
136+
: TM(TM_), IsEarlyRun(IsEarlyRun) {}
134137

135138
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
136139
};

llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,12 @@ using namespace llvm;
215215

216216
namespace {
217217

218+
cl::opt<bool>
219+
ForceAddModuleFlag("amdgpu-lower-module-lds-force-add-moduleflag",
220+
cl::desc("Always add the module flag that prevents "
221+
"multiple runs of LowerModuleLDS."),
222+
cl::init(false), cl::ReallyHidden);
223+
218224
cl::opt<bool> SuperAlignLDSGlobals(
219225
"amdgpu-super-align-lds-globals",
220226
cl::desc("Increase alignment of LDS if it is not on align boundary"),
@@ -254,6 +260,7 @@ template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
254260

255261
class AMDGPULowerModuleLDS {
256262
const AMDGPUTargetMachine &TM;
263+
bool IsEarlyRun;
257264

258265
static void
259266
removeLocalVarsFromUsedLists(Module &M,
@@ -328,7 +335,8 @@ class AMDGPULowerModuleLDS {
328335
}
329336

330337
public:
331-
AMDGPULowerModuleLDS(const AMDGPUTargetMachine &TM_) : TM(TM_) {}
338+
AMDGPULowerModuleLDS(const AMDGPUTargetMachine &TM_, bool IsEarlyRun = false)
339+
: TM(TM_), IsEarlyRun(IsEarlyRun) {}
332340

333341
using FunctionVariableMap = DenseMap<Function *, DenseSet<GlobalVariable *>>;
334342

@@ -1088,6 +1096,15 @@ class AMDGPULowerModuleLDS {
10881096
}
10891097

10901098
bool runOnModule(Module &M) {
1099+
// This pass may run twice in a full LTO pipeline.
1100+
//
1101+
// If we ran it early, we'll have added metadata to skip next runs.
1102+
if (M.getModuleFlag("amdgcn.lowered_module_lds"))
1103+
return false;
1104+
if (IsEarlyRun || ForceAddModuleFlag)
1105+
M.addModuleFlag(Module::ModFlagBehavior::Warning,
1106+
"amdgcn.lowered_module_lds", 1);
1107+
10911108
CallGraph CG = CallGraph(M);
10921109
bool Changed = superAlignLDSGlobals(M);
10931110

@@ -1574,6 +1591,7 @@ llvm::createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM) {
15741591

15751592
PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
15761593
ModuleAnalysisManager &) {
1577-
return AMDGPULowerModuleLDS(TM).runOnModule(M) ? PreservedAnalyses::none()
1578-
: PreservedAnalyses::all();
1594+
return AMDGPULowerModuleLDS(TM, IsEarlyRun).runOnModule(M)
1595+
? PreservedAnalyses::none()
1596+
: PreservedAnalyses::all();
15791597
}

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -770,6 +770,15 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
770770

771771
PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
772772
});
773+
774+
PB.registerFullLinkTimeOptimizationLastEPCallback(
775+
[this](ModulePassManager &PM, OptimizationLevel Level) {
776+
// We want to support the -lto-partitions=N option as "best effort".
777+
// For that, we need to lower LDS earlier in the pipeline before the
778+
// module is partitioned for codegen.
779+
if (EnableLowerModuleLDS)
780+
PM.addPass(AMDGPULowerModuleLDSPass(*this, /*IsEarlyRun*/ true));
781+
});
773782
}
774783

775784
int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module %s -o %t.ll
2+
; RUN: not --crash opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module %t.ll -o - 2>&1 | FileCheck %s --check-prefix=ERR
3+
4+
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module --amdgpu-lower-module-lds-force-add-moduleflag=1 %s -o %t.ll
5+
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module %t.ll -o - | FileCheck %s
6+
7+
; Check that re-running LowerModuleLDS doesn't crash when the module flag is used.
8+
;
9+
; We first check that this test still crashes when run twice. If it no longer crashes at some point,
10+
; we should update it to ensure the flag still does its job.
11+
;
12+
; This test just has the bare minimum checks to see if the pass ran.
13+
14+
; ERR: LLVM ERROR: LDS variables with absolute addresses are unimplemented.
15+
16+
; CHECK: %llvm.amdgcn.module.lds.t = type { float, [4 x i8], i32 }
17+
; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 8
18+
19+
; CHECK: attributes #0 = { "amdgpu-lds-size"="12" }
20+
21+
@var0 = addrspace(3) global float poison, align 8
22+
@var1 = addrspace(3) global i32 poison, align 8
23+
@ptr = addrspace(1) global ptr addrspace(3) @var1, align 4
24+
@with_init = addrspace(3) global i64 0
25+
26+
define void @func() {
27+
%dec = atomicrmw fsub ptr addrspace(3) @var0, float 1.0 monotonic
28+
%val0 = load i32, ptr addrspace(3) @var1, align 4
29+
%val1 = add i32 %val0, 4
30+
store i32 %val1, ptr addrspace(3) @var1, align 4
31+
%unused0 = atomicrmw add ptr addrspace(3) @with_init, i64 1 monotonic
32+
ret void
33+
}
34+
35+
define amdgpu_kernel void @kern_call() {
36+
call void @func()
37+
%dec = atomicrmw fsub ptr addrspace(3) @var0, float 2.0 monotonic
38+
ret void
39+
}

0 commit comments

Comments
 (0)