
Commit 2e1c0ec

[libc] Support global constructors and destructors on NVPTX
This patch adds the necessary hacks to support global constructors and destructors. This is an incredibly hacky process caused primarily by the fact that Nvidia does not provide any binary tools and very little linker support. We first had to emit references to these functions and their priorities in D149451. Then we dig them out of the module once it is loaded to manually create the list that the linker should have made for us.

This patch also contains a few Nvidia-specific hacks, but it passes the test, albeit with a stack size warning from `ptxas` for the callback. This should be fine given the resource usage of a common test. This also adds a dependency on LLVM to the NVPTX loader, which hopefully doesn't cause problems with our CUDA buildbot.

Depends on D149451

Reviewed By: tra

Differential Revision: https://reviews.llvm.org/D149527
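
As a rough device-side sketch of what this machinery serves (the attributes are standard Clang/GCC; the symbol name in the comment is an assumption, only the `__init_array_object_`/`__fini_array_object_` prefixes and the trailing priority matter to the loader changes below):

    // Hypothetical user code in a GPU test. With the scheme from D149451 the
    // compiler emits a named global holding each function's address, with the
    // priority encoded at the end of the name (exact mangling assumed here),
    // which the loader digs back out of the loaded CUmodule.
    [[gnu::constructor(101)]] void ctor() { /* runs before main() */ }
    [[gnu::destructor(101)]] void dtor() { /* runs when exit() is reached */ }
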
1 parent f05ce90 commit 2e1c0ec

9 files changed, +222 −23 lines changed

libc/cmake/modules/LLVMLibCTestRules.cmake

Lines changed: 3 additions & 3 deletions
@@ -497,12 +497,12 @@ function(add_integration_test test_name)
   # The GPU build requires overriding the default CMake triple and architecture.
   if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
     target_compile_options(${fq_build_target_name} PRIVATE
-      -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
-      --target=${LIBC_GPU_TARGET_TRIPLE})
+      -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE}
+      -flto --target=${LIBC_GPU_TARGET_TRIPLE})
   elseif(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
     get_nvptx_compile_options(nvptx_options ${LIBC_GPU_TARGET_ARCHITECTURE})
     target_compile_options(${fq_build_target_name} PRIVATE
-      ${nvptx_options}
+      ${nvptx_options} -fno-use-cxa-atexit
       --target=${LIBC_GPU_TARGET_TRIPLE})
   endif()

libc/startup/gpu/nvptx/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,8 @@ add_startup_object(
   DEPENDS
     libc.src.__support.RPC.rpc_client
     libc.src.__support.GPU.utils
+    libc.src.stdlib.exit
+    libc.src.stdlib.atexit
   COMPILE_OPTIONS
     -ffreestanding # To avoid compiler warnings about calling the main function.
     -fno-builtin

libc/startup/gpu/nvptx/start.cpp

Lines changed: 70 additions & 8 deletions
@@ -8,36 +8,98 @@
 
 #include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
+#include "src/stdlib/atexit.h"
+#include "src/stdlib/exit.h"
 
 extern "C" int main(int argc, char **argv, char **envp);
 
 namespace __llvm_libc {
 
 static cpp::Atomic<uint32_t> lock = 0;
 
-static cpp::Atomic<uint32_t> init = 0;
+static cpp::Atomic<uint32_t> count = 0;
 
-void init_rpc(void *in, void *out, void *buffer) {
-  // Only a single thread should update the RPC data.
+extern "C" {
+// Nvidia's 'nvlink' linker does not provide these symbols. We instead need
+// to manually create them and update the globals in the loader implementation.
+uintptr_t *__init_array_start [[gnu::visibility("protected")]];
+uintptr_t *__init_array_end [[gnu::visibility("protected")]];
+uintptr_t *__fini_array_start [[gnu::visibility("protected")]];
+uintptr_t *__fini_array_end [[gnu::visibility("protected")]];
+}
+
+using InitCallback = void(int, char **, char **);
+using FiniCallback = void(void);
+
+static uint64_t get_grid_size() {
+  return gpu::get_num_threads() * gpu::get_num_blocks();
+}
+
+static void call_init_array_callbacks(int argc, char **argv, char **env) {
+  size_t init_array_size = __init_array_end - __init_array_start;
+  for (size_t i = 0; i < init_array_size; ++i)
+    reinterpret_cast<InitCallback *>(__init_array_start[i])(argc, argv, env);
+}
+
+static void call_fini_array_callbacks() {
+  size_t fini_array_size = __fini_array_end - __fini_array_start;
+  for (size_t i = 0; i < fini_array_size; ++i)
+    reinterpret_cast<FiniCallback *>(__fini_array_start[i])();
+}
+
+// TODO: Put this in a separate kernel and call it with one thread.
+void initialize(int argc, char **argv, char **env, void *in, void *out,
+                void *buffer) {
+  // We need a single GPU thread to perform the initialization of the global
+  // constructors and data. We simply mask off all but a single thread and
+  // execute.
+  count.fetch_add(1, cpp::MemoryOrder::RELAXED);
   if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+    // We need to set up the RPC client first in case any of the constructors
+    // require it.
     rpc::client.reset(&lock, in, out, buffer);
-    init.store(1, cpp::MemoryOrder::RELAXED);
+
+    // We want the fini array callbacks to be run after other atexit
+    // callbacks are run. So, we register them before running the init
+    // array callbacks as they can potentially register their own atexit
+    // callbacks.
+    // FIXME: The function pointer escaping this TU causes warnings.
+    __llvm_libc::atexit(&call_fini_array_callbacks);
+    call_init_array_callbacks(argc, argv, env);
   }
 
-  // Wait until the previous thread signals that the data has been written.
-  while (!init.load(cpp::MemoryOrder::RELAXED))
+  // We wait until every single thread launched on the GPU has seen the
+  // initialization code. This will get very, very slow for high thread counts,
+  // but for testing purposes it is unlikely to matter.
+  while (count.load(cpp::MemoryOrder::RELAXED) != get_grid_size())
     rpc::sleep_briefly();
+  gpu::sync_threads();
+}
 
-  // Wait for the threads in the block to converge and fence the write.
+// TODO: Put this in a separate kernel and call it with one thread.
+void finalize(int retval) {
+  // We wait until every single thread launched on the GPU has finished
+  // executing and reached the finalize region.
+  count.fetch_sub(1, cpp::MemoryOrder::RELAXED);
+  while (count.load(cpp::MemoryOrder::RELAXED) != 0)
+    rpc::sleep_briefly();
   gpu::sync_threads();
+  if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+    // Only a single thread should call `exit` here, the rest should gracefully
+    // return from the kernel. This is so only one thread calls the destructors
+    // registered with 'atexit' above.
+    __llvm_libc::exit(retval);
+  }
 }
 
 } // namespace __llvm_libc
 
 extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
 _start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
        void *buffer) {
-  __llvm_libc::init_rpc(in, out, buffer);
+  __llvm_libc::initialize(argc, argv, envp, in, out, buffer);
 
   __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
+
+  __llvm_libc::finalize(*ret);
 }
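
The comment in `initialize` about registering the fini array callbacks first relies on the LIFO ordering of `atexit`. A minimal host-side sketch of that ordering (the names here are illustrative, not part of the patch):

    #include <cstdio>
    #include <cstdlib>

    // atexit handlers run in reverse registration order, so the dispatcher for
    // the fini array, registered before any constructor runs, fires last.
    static void fini_array_dispatcher() { std::puts("fini array callbacks"); }
    static void handler_from_a_ctor() { std::puts("ctor-installed handler"); }

    int main() {
      std::atexit(&fini_array_dispatcher); // registered first, runs last
      std::atexit(&handler_from_a_ctor);   // registered second, runs first
      return 0;
    }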

libc/test/IntegrationTest/test.cpp

Lines changed: 4 additions & 0 deletions
@@ -22,6 +22,7 @@ int memcmp(const void *lhs, const void *rhs, size_t count);
 void *memcpy(void *__restrict, const void *__restrict, size_t);
 void *memmove(void *dst, const void *src, size_t count);
 void *memset(void *ptr, int value, size_t count);
+int atexit(void (*func)(void));
 
 } // namespace __llvm_libc
 
@@ -44,6 +45,9 @@ void *memset(void *ptr, int value, size_t count) {
   return __llvm_libc::memset(ptr, value, count);
 }
 
+// This is needed if the test was compiled with '-fno-use-cxa-atexit'.
+int atexit(void (*func)(void)) { return __llvm_libc::atexit(func); }
+
 } // extern "C"
 
 // Integration tests cannot use the SCUDO standalone allocator as SCUDO pulls
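
For context on why the shim is needed: with `-fno-use-cxa-atexit` (added to the NVPTX test flags above), Clang lowers static-destructor registration to plain `atexit` calls instead of `__cxa_atexit`, so the freestanding test image must provide that symbol. A small sketch of code that triggers such a registration (the type is hypothetical):

    // Any namespace-scope object with a non-trivial destructor makes the
    // compiler register that destructor at startup; under -fno-use-cxa-atexit
    // the registration call is atexit(), which the shim above forwards to
    // __llvm_libc::atexit.
    struct Tracker {
      ~Tracker() { /* runs at exit */ }
    };
    static Tracker tracker; // destructor registered via atexit()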

libc/test/integration/startup/gpu/CMakeLists.txt

Lines changed: 6 additions & 9 deletions
@@ -26,12 +26,9 @@ add_integration_test(
     --threads 1
 )
 
-# Constructors are currently only supported on AMDGPU.
-if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
-  add_integration_test(
-    init_fini_array_test
-    SUITE libc-startup-tests
-    SRCS
-      init_fini_array_test.cpp
-  )
-endif()
+add_integration_test(
+  init_fini_array_test
+  SUITE libc-startup-tests
+  SRCS
+    init_fini_array_test.cpp
+)

libc/test/integration/startup/gpu/init_fini_array_test.cpp

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ __attribute__((destructor(1))) void reset_initval() {
   initval = 0;
 }
 
-TEST_MAIN() {
+TEST_MAIN(int argc, char **argv, char **env) {
   ASSERT_EQ(global.get(GLOBAL_INDEX), INITVAL_INITIALIZER);
   ASSERT_EQ(initval, INITVAL_INITIALIZER);
   return 0;

libc/utils/gpu/loader/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
@@ -12,7 +12,9 @@ else()
 endif()
 
 find_package(CUDAToolkit QUIET)
-if(CUDAToolkit_FOUND)
+# The CUDA loader requires LLVM to traverse the ELF image for symbols.
+find_package(LLVM QUIET)
+if(CUDAToolkit_FOUND AND LLVM_FOUND)
   add_subdirectory(nvptx)
 else()
   message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected")

libc/utils/gpu/loader/nvptx/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
@@ -1,8 +1,14 @@
 add_executable(nvptx_loader Loader.cpp)
 add_dependencies(nvptx_loader libc.src.__support.RPC.rpc)
 
+if(NOT LLVM_ENABLE_RTTI)
+  target_compile_options(nvptx_loader PRIVATE -fno-rtti)
+endif()
+target_include_directories(nvptx_loader PRIVATE ${LLVM_INCLUDE_DIRS})
 target_link_libraries(nvptx_loader
   PRIVATE
     gpu_loader
     CUDA::cuda_driver
+    LLVMObject
+    LLVMSupport
 )

libc/utils/gpu/loader/nvptx/Loader.cpp

Lines changed: 127 additions & 1 deletion
@@ -17,10 +17,18 @@
 #include "Server.h"
 
 #include "cuda.h"
+
+#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
+
 #include <cstddef>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <vector>
+
+using namespace llvm;
+using namespace object;
 
 /// The arguments to the '_start' kernel.
 struct kernel_args_t {
@@ -51,11 +59,122 @@ static void handle_error(const char *msg) {
   exit(EXIT_FAILURE);
 }
 
+// Gets the names of all the globals that contain functions to initialize or
+// deinitialize. We need to do this manually because the NVPTX toolchain does
+// not contain the necessary binary manipulation tools.
+template <typename Alloc>
+Expected<void *> get_ctor_dtor_array(const void *image, const size_t size,
+                                     Alloc allocator, CUmodule binary) {
+  auto mem_buffer = MemoryBuffer::getMemBuffer(
+      StringRef(reinterpret_cast<const char *>(image), size), "image",
+      /*RequiresNullTerminator=*/false);
+  Expected<ELF64LEObjectFile> elf_or_err =
+      ELF64LEObjectFile::create(*mem_buffer);
+  if (!elf_or_err)
+    handle_error(toString(elf_or_err.takeError()).c_str());
+
+  std::vector<std::pair<const char *, uint16_t>> ctors;
+  std::vector<std::pair<const char *, uint16_t>> dtors;
+  // CUDA has no way to iterate over all the symbols so we need to inspect the
+  // ELF directly using the LLVM libraries.
+  for (const auto &symbol : elf_or_err->symbols()) {
+    auto name_or_err = symbol.getName();
+    if (!name_or_err)
+      handle_error(toString(name_or_err.takeError()).c_str());
+
+    // Search for all symbols that contain a constructor or destructor.
+    if (!name_or_err->starts_with("__init_array_object_") &&
+        !name_or_err->starts_with("__fini_array_object_"))
+      continue;
+
+    uint16_t priority;
+    if (name_or_err->rsplit('_').second.getAsInteger(10, priority))
+      handle_error("Invalid priority for constructor or destructor");
+
+    if (name_or_err->starts_with("__init"))
+      ctors.emplace_back(std::make_pair(name_or_err->data(), priority));
+    else
+      dtors.emplace_back(std::make_pair(name_or_err->data(), priority));
+  }
+  // Lower priority constructors are run before higher ones. The reverse is true
+  // for destructors.
+  llvm::sort(ctors, [](auto x, auto y) { return x.second < y.second; });
+  llvm::sort(dtors, [](auto x, auto y) { return x.second < y.second; });
+  llvm::reverse(dtors);
+
+  // Allocate host pinned memory to make these arrays visible to the GPU.
+  CUdeviceptr *dev_memory = reinterpret_cast<CUdeviceptr *>(allocator(
+      ctors.size() * sizeof(CUdeviceptr) + dtors.size() * sizeof(CUdeviceptr)));
+  uint64_t global_size = 0;
+
+  // Get the address of the global and then store the address of the constructor
+  // function to call in the constructor array.
+  CUdeviceptr *dev_ctors_start = dev_memory;
+  CUdeviceptr *dev_ctors_end = dev_ctors_start + ctors.size();
+  for (uint64_t i = 0; i < ctors.size(); ++i) {
+    CUdeviceptr dev_ptr;
+    if (CUresult err =
+            cuModuleGetGlobal(&dev_ptr, &global_size, binary, ctors[i].first))
+      handle_error(err);
+    if (CUresult err =
+            cuMemcpyDtoH(&dev_ctors_start[i], dev_ptr, sizeof(uintptr_t)))
+      handle_error(err);
+  }
+
+  // Get the address of the global and then store the address of the destructor
+  // function to call in the destructor array.
+  CUdeviceptr *dev_dtors_start = dev_ctors_end;
+  CUdeviceptr *dev_dtors_end = dev_dtors_start + dtors.size();
+  for (uint64_t i = 0; i < dtors.size(); ++i) {
+    CUdeviceptr dev_ptr;
+    if (CUresult err =
+            cuModuleGetGlobal(&dev_ptr, &global_size, binary, dtors[i].first))
+      handle_error(err);
+    if (CUresult err =
+            cuMemcpyDtoH(&dev_dtors_start[i], dev_ptr, sizeof(uintptr_t)))
+      handle_error(err);
+  }
+
+  // Obtain the address of the pointers the startup implementation uses to
+  // iterate the constructors and destructors.
+  CUdeviceptr init_start;
+  if (CUresult err = cuModuleGetGlobal(&init_start, &global_size, binary,
+                                       "__init_array_start"))
+    handle_error(err);
+  CUdeviceptr init_end;
+  if (CUresult err = cuModuleGetGlobal(&init_end, &global_size, binary,
+                                       "__init_array_end"))
+    handle_error(err);
+  CUdeviceptr fini_start;
+  if (CUresult err = cuModuleGetGlobal(&fini_start, &global_size, binary,
+                                       "__fini_array_start"))
+    handle_error(err);
+  CUdeviceptr fini_end;
+  if (CUresult err = cuModuleGetGlobal(&fini_end, &global_size, binary,
+                                       "__fini_array_end"))
+    handle_error(err);
+
+  // Copy the pointers to the newly written array to the symbols so the startup
+  // implementation can iterate them.
+  if (CUresult err =
+          cuMemcpyHtoD(init_start, &dev_ctors_start, sizeof(uintptr_t)))
+    handle_error(err);
+  if (CUresult err = cuMemcpyHtoD(init_end, &dev_ctors_end, sizeof(uintptr_t)))
+    handle_error(err);
+  if (CUresult err =
+          cuMemcpyHtoD(fini_start, &dev_dtors_start, sizeof(uintptr_t)))
+    handle_error(err);
+  if (CUresult err = cuMemcpyHtoD(fini_end, &dev_dtors_end, sizeof(uintptr_t)))
+    handle_error(err);
+
+  return dev_memory;
+}
+
 int load(int argc, char **argv, char **envp, void *image, size_t size,
          const LaunchParameters &params) {
+
   if (CUresult err = cuInit(0))
     handle_error(err);
-
   // Obtain the first device found on the system.
   CUdevice device;
   if (CUresult err = cuDeviceGet(&device, 0))
@@ -91,6 +210,11 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
     handle_error(err);
     return dev_ptr;
   };
+
+  auto memory_or_err = get_ctor_dtor_array(image, size, allocator, binary);
+  if (!memory_or_err)
+    handle_error(toString(memory_or_err.takeError()).c_str());
+
   void *dev_argv = copy_argument_vector(argc, argv, allocator);
   if (!dev_argv)
     handle_error("Failed to allocate device argv");
@@ -153,6 +277,8 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
     handle_error(err);
 
   // Free the memory allocated for the device.
+  if (CUresult err = cuMemFreeHost(*memory_or_err))
+    handle_error(err);
   if (CUresult err = cuMemFree(dev_ret))
     handle_error(err);
   if (CUresult err = cuMemFreeHost(dev_argv))
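
As a standalone illustration of the priority parsing in `get_ctor_dtor_array`, the sketch below (the symbol name is hypothetical) shows how the trailing suffix is recovered with `StringRef::rsplit`:

    #include "llvm/ADT/StringRef.h"
    #include <cstdint>
    #include <cstdio>

    int main() {
      // Split at the last '_' and parse the suffix as a base-10 integer;
      // getAsInteger returns true on failure, mirroring the loader's check.
      llvm::StringRef name = "__init_array_object_example_101";
      uint16_t priority = 0;
      if (name.rsplit('_').second.getAsInteger(10, priority))
        std::fprintf(stderr, "invalid priority suffix\n");
      else
        std::printf("priority = %u\n", static_cast<unsigned>(priority));
      return 0;
    }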
