Skip to content

Commit f05ce90

Browse files
committed
[NVPTX] Add NVPTXCtorDtorLoweringPass to handle global ctors / dtors
This patch mostly adapts the existing AMDGPUCtorDtorLoweringPass for use by the Nvidia backend. This pass transforms the ctor / dtor list into a kernel call that can be used to invoke those functions. Furthermore, we emit globals such that the names and addresses of these constructor functions can be found by the driver. Unfortunately, since NVPTX has no way to emit variables in a named section, nor a functioning linker to provide the begin / end symbols, we need to mangle these names and have an external application find them. This work is related to the work in D149398 and D149340. Reviewed By: tra Differential Revision: https://reviews.llvm.org/D149451
1 parent 909095a commit f05ce90

File tree

10 files changed

+232
-7
lines changed

10 files changed

+232
-7
lines changed

clang/lib/Driver/ToolChains/Cuda.cpp

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -695,8 +695,9 @@ void NVPTX::getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple,
695695
/// toolchain.
696696
NVPTXToolChain::NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
697697
const llvm::Triple &HostTriple,
698-
const ArgList &Args)
699-
: ToolChain(D, Triple, Args), CudaInstallation(D, HostTriple, Args) {
698+
const ArgList &Args, bool Freestanding = false)
699+
: ToolChain(D, Triple, Args), CudaInstallation(D, HostTriple, Args),
700+
Freestanding(Freestanding) {
700701
if (CudaInstallation.isValid()) {
701702
CudaInstallation.WarnIfUnsupportedVersion();
702703
getProgramPaths().push_back(std::string(CudaInstallation.getBinPath()));
@@ -711,7 +712,8 @@ NVPTXToolChain::NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
711712
NVPTXToolChain::NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
712713
const ArgList &Args)
713714
: NVPTXToolChain(D, Triple,
714-
llvm::Triple(llvm::sys::getDefaultTargetTriple()), Args) {}
715+
llvm::Triple(llvm::sys::getDefaultTargetTriple()), Args,
716+
/*Freestanding=*/true) {}
715717

716718
llvm::opt::DerivedArgList *
717719
NVPTXToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
@@ -735,6 +737,16 @@ NVPTXToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
735737
return DAL;
736738
}
737739

740+
void NVPTXToolChain::addClangTargetOptions(
741+
const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args,
742+
Action::OffloadKind DeviceOffloadingKind) const {
743+
// If we are compiling with a standalone NVPTX toolchain we want to try to
744+
// mimic a standard environment as much as possible. So we enable lowering
745+
// ctor / dtor functions to global symbols that can be registered.
746+
if (Freestanding)
747+
CC1Args.append({"-mllvm", "--nvptx-lower-global-ctor-dtor"});
748+
}
749+
738750
bool NVPTXToolChain::supportsDebugInfoOption(const llvm::opt::Arg *A) const {
739751
const Option &O = A->getOption();
740752
return (O.matches(options::OPT_gN_Group) &&

clang/lib/Driver/ToolChains/Cuda.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,8 +132,8 @@ namespace toolchains {
132132
class LLVM_LIBRARY_VISIBILITY NVPTXToolChain : public ToolChain {
133133
public:
134134
NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
135-
const llvm::Triple &HostTriple,
136-
const llvm::opt::ArgList &Args);
135+
const llvm::Triple &HostTriple, const llvm::opt::ArgList &Args,
136+
bool Freestanding);
137137

138138
NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
139139
const llvm::opt::ArgList &Args);
@@ -142,6 +142,11 @@ class LLVM_LIBRARY_VISIBILITY NVPTXToolChain : public ToolChain {
142142
TranslateArgs(const llvm::opt::DerivedArgList &Args, StringRef BoundArch,
143143
Action::OffloadKind DeviceOffloadKind) const override;
144144

145+
void
146+
addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
147+
llvm::opt::ArgStringList &CC1Args,
148+
Action::OffloadKind DeviceOffloadKind) const override;
149+
145150
// Never try to use the integrated assembler with CUDA; always fork out to
146151
// ptxas.
147152
bool useIntegratedAs() const override { return false; }
@@ -168,6 +173,9 @@ class LLVM_LIBRARY_VISIBILITY NVPTXToolChain : public ToolChain {
168173
protected:
169174
Tool *buildAssembler() const override; // ptxas.
170175
Tool *buildLinker() const override; // nvlink.
176+
177+
private:
178+
bool Freestanding = false;
171179
};
172180

173181
class LLVM_LIBRARY_VISIBILITY CudaToolChain : public NVPTXToolChain {

clang/test/Driver/cuda-cross-compiling.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,3 +68,12 @@
6868
// DEFAULT: -cc1" "-triple" "nvptx64-nvidia-cuda" "-S" {{.*}} "-target-cpu" "sm_35" "-target-feature" "+ptx{{[0-9]+}}" {{.*}} "-o" "[[PTX:.+]].s"
6969
// DEFAULT-NEXT: ptxas{{.*}}"-m64" "-O0" "--gpu-name" "sm_35" "--output-file" "[[CUBIN:.+]].cubin" "[[PTX]].s" "-c"
7070
// DEFAULT-NEXT: nvlink{{.*}}"-o" "a.out" "-arch" "sm_35" {{.*}} "[[CUBIN]].cubin"
71+
72+
//
73+
// Test to ensure that we enable handling global constructors in a freestanding
74+
// Nvidia compilation.
75+
//
76+
// RUN: %clang -target nvptx64-nvidia-cuda -march=sm_70 %s -### 2>&1 \
77+
// RUN: | FileCheck -check-prefix=LOWERING %s
78+
79+
// LOWERING: -cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}} "-mllvm" "--nvptx-lower-global-ctor-dtor"

llvm/lib/Target/NVPTX/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ set(NVPTXCodeGen_sources
3737
NVVMIntrRange.cpp
3838
NVVMReflect.cpp
3939
NVPTXProxyRegErasure.cpp
40+
NVPTXCtorDtorLowering.cpp
4041
)
4142

4243
add_llvm_target(NVPTXCodeGen

llvm/lib/Target/NVPTX/NVPTX.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM,
3939
llvm::CodeGenOpt::Level OptLevel);
4040
ModulePass *createNVPTXAssignValidGlobalNamesPass();
4141
ModulePass *createGenericToNVVMLegacyPass();
42+
ModulePass *createNVPTXCtorDtorLoweringLegacyPass();
4243
FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion);
4344
FunctionPass *createNVVMReflectPass(unsigned int SmVersion);
4445
MachineFunctionPass *createNVPTXPrologEpilogPass();

llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,11 @@
9292

9393
using namespace llvm;
9494

95+
static cl::opt<bool>
96+
LowerCtorDtor("nvptx-lower-global-ctor-dtor",
97+
cl::desc("Lower GPU ctor / dtors to globals on the device."),
98+
cl::init(false), cl::Hidden);
99+
95100
#define DEPOTNAME "__local_depot"
96101

97102
/// DiscoverDependentGlobals - Return a set of GlobalVariables on which \p V
@@ -788,12 +793,14 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) {
788793
report_fatal_error("Module has aliases, which NVPTX does not support.");
789794
return true; // error
790795
}
791-
if (!isEmptyXXStructor(M.getNamedGlobal("llvm.global_ctors"))) {
796+
if (!isEmptyXXStructor(M.getNamedGlobal("llvm.global_ctors")) &&
797+
!LowerCtorDtor) {
792798
report_fatal_error(
793799
"Module has a nontrivial global ctor, which NVPTX does not support.");
794800
return true; // error
795801
}
796-
if (!isEmptyXXStructor(M.getNamedGlobal("llvm.global_dtors"))) {
802+
if (!isEmptyXXStructor(M.getNamedGlobal("llvm.global_dtors")) &&
803+
!LowerCtorDtor) {
797804
report_fatal_error(
798805
"Module has a nontrivial global dtor, which NVPTX does not support.");
799806
return true; // error
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
//===-- NVPTXCtorDtorLowering.cpp - Handle global ctors and dtors --------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
///
9+
/// \file
10+
/// This pass replaces llvm.global_ctors / llvm.global_dtors with one mangled,
/// externally visible global per entry so a runtime can locate the functions.
11+
//===----------------------------------------------------------------------===//
12+
13+
#include "NVPTXCtorDtorLowering.h"
14+
#include "NVPTX.h"
15+
#include "llvm/IR/Constants.h"
16+
#include "llvm/IR/Function.h"
17+
#include "llvm/IR/GlobalVariable.h"
18+
#include "llvm/IR/IRBuilder.h"
19+
#include "llvm/IR/Module.h"
20+
#include "llvm/IR/Value.h"
21+
#include "llvm/Pass.h"
22+
#include "llvm/Support/CommandLine.h"
23+
#include "llvm/Transforms/Utils/ModuleUtils.h"
24+
25+
using namespace llvm;
26+
27+
#define DEBUG_TYPE "nvptx-lower-ctor-dtor"
28+
29+
static cl::opt<std::string>
30+
GlobalStr("nvptx-lower-global-ctor-dtor-id",
31+
cl::desc("Override unique ID of ctor/dtor globals."),
32+
cl::init(""), cl::Hidden);
33+
34+
namespace {
35+
36+
/// Returns the lower-case hexadecimal rendering of MD5Result::low() for the
/// MD5 digest of \p Str.
static std::string getHash(StringRef Str) {
  llvm::MD5 Digest;
  Digest.update(Str);

  llvm::MD5::MD5Result Result;
  Digest.final(Result);

  const uint64_t LowWord = Result.low();
  return llvm::utohexstr(LowWord, /*LowerCase=*/true);
}
43+
44+
static bool createInitOrFiniGlobls(Module &M, StringRef GlobalName,
45+
bool IsCtor) {
46+
GlobalVariable *GV = M.getGlobalVariable(GlobalName);
47+
if (!GV || !GV->hasInitializer())
48+
return false;
49+
ConstantArray *GA = dyn_cast<ConstantArray>(GV->getInitializer());
50+
if (!GA || GA->getNumOperands() == 0)
51+
return false;
52+
53+
// NVPTX has no way to emit variables at specific sections or support for
54+
// the traditional constructor sections. Instead, we emit mangled global
55+
// names so the runtime can build the list manually.
56+
for (Value *V : GA->operands()) {
57+
auto *CS = cast<ConstantStruct>(V);
58+
auto *F = cast<Constant>(CS->getOperand(1));
59+
uint64_t Priority = cast<ConstantInt>(CS->getOperand(0))->getSExtValue();
60+
std::string PriorityStr = "." + std::to_string(Priority);
61+
// We append a semi-unique hash and the priority to the global name.
62+
std::string GlobalID =
63+
!GlobalStr.empty() ? GlobalStr : getHash(M.getSourceFileName());
64+
std::string NameStr =
65+
((IsCtor ? "__init_array_object_" : "__fini_array_object_") +
66+
F->getName() + "_" + GlobalID + "_" + std::to_string(Priority))
67+
.str();
68+
// PTX does not support exported names with '.' in them.
69+
llvm::transform(NameStr, NameStr.begin(),
70+
[](char c) { return c == '.' ? '_' : c; });
71+
72+
auto *GV = new GlobalVariable(M, F->getType(), /*IsConstant=*/true,
73+
GlobalValue::ExternalLinkage, F, NameStr,
74+
nullptr, GlobalValue::NotThreadLocal,
75+
/*AddressSpace=*/4);
76+
// This isn't respected by Nvidia, simply put here for clarity.
77+
GV->setSection(IsCtor ? ".init_array" + PriorityStr
78+
: ".fini_array" + PriorityStr);
79+
GV->setVisibility(GlobalVariable::ProtectedVisibility);
80+
appendToUsed(M, {GV});
81+
}
82+
83+
GV->eraseFromParent();
84+
return true;
85+
}
86+
87+
static bool lowerCtorsAndDtors(Module &M) {
88+
bool Modified = false;
89+
Modified |= createInitOrFiniGlobls(M, "llvm.global_ctors", /*IsCtor =*/true);
90+
Modified |= createInitOrFiniGlobls(M, "llvm.global_dtors", /*IsCtor =*/false);
91+
return Modified;
92+
}
93+
94+
class NVPTXCtorDtorLoweringLegacy final : public ModulePass {
95+
public:
96+
static char ID;
97+
NVPTXCtorDtorLoweringLegacy() : ModulePass(ID) {}
98+
bool runOnModule(Module &M) override { return lowerCtorsAndDtors(M); }
99+
};
100+
101+
} // End anonymous namespace
102+
103+
/// New pass manager entry point: lowers the ctor / dtor lists of \p M and
/// reports all analyses preserved only when the module was left unchanged.
PreservedAnalyses NVPTXCtorDtorLoweringPass::run(Module &M,
                                                 ModuleAnalysisManager &AM) {
  if (!lowerCtorsAndDtors(M))
    return PreservedAnalyses::all();
  return PreservedAnalyses::none();
}
108+
109+
char NVPTXCtorDtorLoweringLegacy::ID = 0;
110+
char &llvm::NVPTXCtorDtorLoweringLegacyPassID = NVPTXCtorDtorLoweringLegacy::ID;
111+
INITIALIZE_PASS(NVPTXCtorDtorLoweringLegacy, DEBUG_TYPE,
112+
"Lower ctors and dtors for NVPTX", false, false)
113+
114+
/// Factory for the legacy pass manager version of the ctor / dtor lowering.
/// Returns a newly allocated pass object.
ModulePass *llvm::createNVPTXCtorDtorLoweringLegacyPass() {
  ModulePass *Pass = new NVPTXCtorDtorLoweringLegacy();
  return Pass;
}
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
//===-- NVPTXCtorDtorLowering.h --------------------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXCTORDTORLOWERING_H
10+
#define LLVM_LIB_TARGET_NVPTX_NVPTXCTORDTORLOWERING_H
11+
12+
#include "llvm/IR/PassManager.h"
13+
14+
namespace llvm {
15+
class Module;
16+
class PassRegistry;
17+
18+
extern char &NVPTXCtorDtorLoweringLegacyPassID;
19+
extern void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &);
20+
21+
/// Lower llvm.global_ctors and llvm.global_dtors to named global variables.
22+
class NVPTXCtorDtorLoweringPass
23+
: public PassInfoMixin<NVPTXCtorDtorLoweringPass> {
24+
public:
25+
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
26+
};
27+
28+
} // namespace llvm
29+
30+
#endif // LLVM_LIB_TARGET_NVPTX_NVPTXCTORDTORLOWERING_H

llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "NVPTXAliasAnalysis.h"
1616
#include "NVPTXAllocaHoisting.h"
1717
#include "NVPTXAtomicLower.h"
18+
#include "NVPTXCtorDtorLowering.h"
1819
#include "NVPTXLowerAggrCopies.h"
1920
#include "NVPTXMachineFunctionInfo.h"
2021
#include "NVPTXTargetObjectFile.h"
@@ -68,8 +69,10 @@ void initializeGenericToNVVMLegacyPassPass(PassRegistry &);
6869
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
6970
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
7071
void initializeNVPTXAtomicLowerPass(PassRegistry &);
72+
void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &);
7173
void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
7274
void initializeNVPTXLowerAllocaPass(PassRegistry &);
75+
void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &);
7376
void initializeNVPTXLowerArgsPass(PassRegistry &);
7477
void initializeNVPTXProxyRegErasurePass(PassRegistry &);
7578
void initializeNVVMIntrRangePass(PassRegistry &);
@@ -95,6 +98,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
9598
initializeNVPTXAtomicLowerPass(PR);
9699
initializeNVPTXLowerArgsPass(PR);
97100
initializeNVPTXLowerAllocaPass(PR);
101+
initializeNVPTXCtorDtorLoweringLegacyPass(PR);
98102
initializeNVPTXLowerAggrCopiesPass(PR);
99103
initializeNVPTXProxyRegErasurePass(PR);
100104
initializeNVPTXDAGToDAGISelPass(PR);
@@ -249,6 +253,10 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
249253
PB.registerPipelineParsingCallback(
250254
[](StringRef PassName, ModulePassManager &PM,
251255
ArrayRef<PassBuilder::PipelineElement>) {
256+
if (PassName == "nvptx-lower-ctor-dtor") {
257+
PM.addPass(NVPTXCtorDtorLoweringPass());
258+
return true;
259+
}
252260
if (PassName == "generic-to-nvvm") {
253261
PM.addPass(GenericToNVVMPass());
254262
return true;
@@ -369,6 +377,7 @@ void NVPTXPassConfig::addIRPasses() {
369377
}
370378

371379
addPass(createAtomicExpandPass());
380+
addPass(createNVPTXCtorDtorLoweringLegacyPass());
372381

373382
// === LSR and other generic IR passes ===
374383
TargetPassConfig::addIRPasses();
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
; RUN: opt -S -mtriple=nvptx64-- -nvptx-lower-ctor-dtor < %s | FileCheck %s
2+
; RUN: opt -S -mtriple=nvptx64-- -passes=nvptx-lower-ctor-dtor < %s | FileCheck %s
3+
; RUN: opt -S -mtriple=nvptx64-- -passes=nvptx-lower-ctor-dtor \
4+
; RUN: -nvptx-lower-global-ctor-dtor-id=unique_id < %s | FileCheck %s --check-prefix=GLOBAL
5+
6+
; Make sure we get the same result if we run multiple times
7+
; RUN: opt -S -mtriple=nvptx64-- -passes=nvptx-lower-ctor-dtor,nvptx-lower-ctor-dtor < %s | FileCheck %s
8+
; RUN: llc -nvptx-lower-global-ctor-dtor -mtriple=nvptx64-amd-amdhsa -mcpu=sm_70 -filetype=asm -o - < %s | FileCheck %s -check-prefix=VISIBILITY
9+
10+
@llvm.global_ctors = appending addrspace(1) global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @foo, ptr null }]
11+
@llvm.global_dtors = appending addrspace(1) global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @bar, ptr null }]
12+
13+
; CHECK-NOT: @llvm.global_ctors
14+
; CHECK-NOT: @llvm.global_dtors
15+
16+
; CHECK: @__init_array_object_foo_[[HASH:[0-9a-f]+]]_1 = protected addrspace(4) constant ptr @foo, section ".init_array.1"
17+
; CHECK: @__fini_array_object_bar_[[HASH:[0-9a-f]+]]_1 = protected addrspace(4) constant ptr @bar, section ".fini_array.1"
18+
; CHECK: @llvm.used = appending global [2 x ptr] [ptr addrspacecast (ptr addrspace(4) @__init_array_object_foo_[[HASH]]_1 to ptr), ptr addrspacecast (ptr addrspace(4) @__fini_array_object_bar_[[HASH]]_1 to ptr)], section "llvm.metadata"
19+
; GLOBAL: @__init_array_object_foo_unique_id_1 = protected addrspace(4) constant ptr @foo, section ".init_array.1"
20+
; GLOBAL: @__fini_array_object_bar_unique_id_1 = protected addrspace(4) constant ptr @bar, section ".fini_array.1"
21+
; GLOBAL: @llvm.used = appending global [2 x ptr] [ptr addrspacecast (ptr addrspace(4) @__init_array_object_foo_unique_id_1 to ptr), ptr addrspacecast (ptr addrspace(4) @__fini_array_object_bar_unique_id_1 to ptr)], section "llvm.metadata"
22+
23+
; VISIBILITY: .visible .const .align 8 .u64 __init_array_object_foo_[[HASH:[0-9a-f]+]]_1 = foo;
24+
; VISIBILITY: .visible .const .align 8 .u64 __fini_array_object_bar_[[HASH:[0-9a-f]+]]_1 = bar;
25+
26+
define internal void @foo() {
27+
ret void
28+
}
29+
30+
define internal void @bar() {
31+
ret void
32+
}

0 commit comments

Comments
 (0)