
Commit 2e1c0ec

[libc] Support global constructors and destructors on NVPTX
This patch adds the necessary hacks to support global constructors and destructors. This is an incredibly hacky process caused primarily by the fact that Nvidia does not provide any binary tools and very little linker support. We first had to emit references to these functions and their priorities in D149451. Then we dig them out of the module once it is loaded to manually create the list that the linker should have made for us.

This patch also contains a few Nvidia-specific hacks, but it passes the test, albeit with a stack size warning from `ptxas` for the callback. This should be fine given the resource usage of a common test. This also adds a dependency on LLVM to the NVPTX loader, which hopefully doesn't cause problems with our CUDA buildbot.

Depends on D149451

Reviewed By: tra

Differential Revision: https://reviews.llvm.org/D149527
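
As a rough device-side sketch of what this machinery serves (the attributes are standard Clang/GCC; the symbol name in the comment is an assumption, only the `__init_array_object_`/`__fini_array_object_` prefixes and the trailing priority matter to the loader changes below):

    // Hypothetical user code in a GPU test. With the scheme from D149451 the
    // compiler emits a named global holding each function's address, with the
    // priority encoded at the end of the name (exact mangling assumed here),
    // which the loader digs back out of the loaded CUmodule.
    [[gnu::constructor(101)]] void ctor() { /* runs before main() */ }
    [[gnu::destructor(101)]] void dtor() { /* runs when exit() is reached */ }
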
1 parent f05ce90 commit 2e1c0ec

9 files changed, +222 −23 lines changed

libc/cmake/modules/LLVMLibCTestRules.cmake

Lines changed: 3 additions & 3 deletions
@@ -497,12 +497,12 @@ function(add_integration_test test_name)
   # The GPU build requires overriding the default CMake triple and architecture.
   if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
     target_compile_options(${fq_build_target_name} PRIVATE
-      -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
-      --target=${LIBC_GPU_TARGET_TRIPLE})
+      -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE}
+      -flto --target=${LIBC_GPU_TARGET_TRIPLE})
   elseif(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
     get_nvptx_compile_options(nvptx_options ${LIBC_GPU_TARGET_ARCHITECTURE})
     target_compile_options(${fq_build_target_name} PRIVATE
-      ${nvptx_options}
+      ${nvptx_options} -fno-use-cxa-atexit
       --target=${LIBC_GPU_TARGET_TRIPLE})
   endif()

libc/startup/gpu/nvptx/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,8 @@ add_startup_object(
   DEPENDS
     libc.src.__support.RPC.rpc_client
     libc.src.__support.GPU.utils
+    libc.src.stdlib.exit
+    libc.src.stdlib.atexit
   COMPILE_OPTIONS
     -ffreestanding # To avoid compiler warnings about calling the main function.
     -fno-builtin

libc/startup/gpu/nvptx/start.cpp

Lines changed: 70 additions & 8 deletions
@@ -8,36 +8,98 @@
 
 #include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
+#include "src/stdlib/atexit.h"
+#include "src/stdlib/exit.h"
 
 extern "C" int main(int argc, char **argv, char **envp);
 
 namespace __llvm_libc {
 
 static cpp::Atomic<uint32_t> lock = 0;
 
-static cpp::Atomic<uint32_t> init = 0;
+static cpp::Atomic<uint32_t> count = 0;
 
-void init_rpc(void *in, void *out, void *buffer) {
-  // Only a single thread should update the RPC data.
+extern "C" {
+// Nvidia's 'nvlink' linker does not provide these symbols. We instead need
+// to manually create them and update the globals in the loader implementation.
+uintptr_t *__init_array_start [[gnu::visibility("protected")]];
+uintptr_t *__init_array_end [[gnu::visibility("protected")]];
+uintptr_t *__fini_array_start [[gnu::visibility("protected")]];
+uintptr_t *__fini_array_end [[gnu::visibility("protected")]];
+}
+
+using InitCallback = void(int, char **, char **);
+using FiniCallback = void(void);
+
+static uint64_t get_grid_size() {
+  return gpu::get_num_threads() * gpu::get_num_blocks();
+}
+
+static void call_init_array_callbacks(int argc, char **argv, char **env) {
+  size_t init_array_size = __init_array_end - __init_array_start;
+  for (size_t i = 0; i < init_array_size; ++i)
+    reinterpret_cast<InitCallback *>(__init_array_start[i])(argc, argv, env);
+}
+
+static void call_fini_array_callbacks() {
+  size_t fini_array_size = __fini_array_end - __fini_array_start;
+  for (size_t i = 0; i < fini_array_size; ++i)
+    reinterpret_cast<FiniCallback *>(__fini_array_start[i])();
+}
+
+// TODO: Put this in a separate kernel and call it with one thread.
+void initialize(int argc, char **argv, char **env, void *in, void *out,
+                void *buffer) {
+  // We need a single GPU thread to perform the initialization of the global
+  // constructors and data. We simply mask off all but a single thread and
+  // execute.
+  count.fetch_add(1, cpp::MemoryOrder::RELAXED);
   if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+    // We need to set up the RPC client first in case any of the constructors
+    // require it.
     rpc::client.reset(&lock, in, out, buffer);
-    init.store(1, cpp::MemoryOrder::RELAXED);
+
+    // We want the fini array callbacks to be run after other atexit
+    // callbacks are run. So, we register them before running the init
+    // array callbacks as they can potentially register their own atexit
+    // callbacks.
+    // FIXME: The function pointer escaping this TU causes warnings.
+    __llvm_libc::atexit(&call_fini_array_callbacks);
+    call_init_array_callbacks(argc, argv, env);
   }
 
-  // Wait until the previous thread signals that the data has been written.
-  while (!init.load(cpp::MemoryOrder::RELAXED))
+  // We wait until every single thread launched on the GPU has seen the
+  // initialization code. This will get very, very slow for high thread counts,
+  // but for testing purposes it is unlikely to matter.
+  while (count.load(cpp::MemoryOrder::RELAXED) != get_grid_size())
     rpc::sleep_briefly();
+  gpu::sync_threads();
+}
 
-  // Wait for the threads in the block to converge and fence the write.
+// TODO: Put this in a separate kernel and call it with one thread.
+void finalize(int retval) {
+  // We wait until every single thread launched on the GPU has finished
+  // executing and reached the finalize region.
+  count.fetch_sub(1, cpp::MemoryOrder::RELAXED);
+  while (count.load(cpp::MemoryOrder::RELAXED) != 0)
+    rpc::sleep_briefly();
   gpu::sync_threads();
+  if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+    // Only a single thread should call `exit` here, the rest should gracefully
+    // return from the kernel. This is so only one thread calls the destructors
+    // registered with 'atexit' above.
+    __llvm_libc::exit(retval);
+  }
 }
 
 } // namespace __llvm_libc
 
 extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
 _start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
        void *buffer) {
-  __llvm_libc::init_rpc(in, out, buffer);
+  __llvm_libc::initialize(argc, argv, envp, in, out, buffer);
 
   __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
+
+  __llvm_libc::finalize(*ret);
 }
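
The comment in `initialize` about registering the fini array callbacks first relies on the LIFO ordering of `atexit`. A minimal host-side sketch of that ordering (the names here are illustrative, not part of the patch):

    #include <cstdio>
    #include <cstdlib>

    // atexit handlers run in reverse registration order, so the dispatcher for
    // the fini array, registered before any constructor runs, fires last.
    static void fini_array_dispatcher() { std::puts("fini array callbacks"); }
    static void handler_from_a_ctor() { std::puts("ctor-installed handler"); }

    int main() {
      std::atexit(&fini_array_dispatcher); // registered first, runs last
      std::atexit(&handler_from_a_ctor);   // registered second, runs first
      return 0;
    }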

libc/test/IntegrationTest/test.cpp

Lines changed: 4 additions & 0 deletions
@@ -22,6 +22,7 @@ int memcmp(const void *lhs, const void *rhs, size_t count);
 void *memcpy(void *__restrict, const void *__restrict, size_t);
 void *memmove(void *dst, const void *src, size_t count);
 void *memset(void *ptr, int value, size_t count);
+int atexit(void (*func)(void));
 
 } // namespace __llvm_libc
 
@@ -44,6 +45,9 @@ void *memset(void *ptr, int value, size_t count) {
   return __llvm_libc::memset(ptr, value, count);
 }
 
+// This is needed if the test was compiled with '-fno-use-cxa-atexit'.
+int atexit(void (*func)(void)) { return __llvm_libc::atexit(func); }
+
 } // extern "C"
 
 // Integration tests cannot use the SCUDO standalone allocator as SCUDO pulls
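
For context on why the shim is needed: with `-fno-use-cxa-atexit` (added to the NVPTX test flags above), Clang lowers static-destructor registration to plain `atexit` calls instead of `__cxa_atexit`, so the freestanding test image must provide that symbol. A small sketch of code that triggers such a registration (the type is hypothetical):

    // Any namespace-scope object with a non-trivial destructor makes the
    // compiler register that destructor at startup; under -fno-use-cxa-atexit
    // the registration call is atexit(), which the shim above forwards to
    // __llvm_libc::atexit.
    struct Tracker {
      ~Tracker() { /* runs at exit */ }
    };
    static Tracker tracker; // destructor registered via atexit()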

libc/test/integration/startup/gpu/CMakeLists.txt

Lines changed: 6 additions & 9 deletions
@@ -26,12 +26,9 @@ add_integration_test(
     --threads 1
 )
 
-# Constructors are currently only supported on AMDGPU.
-if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
-  add_integration_test(
-    init_fini_array_test
-    SUITE libc-startup-tests
-    SRCS
-      init_fini_array_test.cpp
-  )
-endif()
+add_integration_test(
+  init_fini_array_test
+  SUITE libc-startup-tests
+  SRCS
+    init_fini_array_test.cpp
+)

libc/test/integration/startup/gpu/init_fini_array_test.cpp

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ __attribute__((destructor(1))) void reset_initval() {
   initval = 0;
 }
 
-TEST_MAIN() {
+TEST_MAIN(int argc, char **argv, char **env) {
   ASSERT_EQ(global.get(GLOBAL_INDEX), INITVAL_INITIALIZER);
   ASSERT_EQ(initval, INITVAL_INITIALIZER);
   return 0;

libc/utils/gpu/loader/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
@@ -12,7 +12,9 @@ else()
 endif()
 
 find_package(CUDAToolkit QUIET)
-if(CUDAToolkit_FOUND)
+# The CUDA loader requires LLVM to traverse the ELF image for symbols.
+find_package(LLVM QUIET)
+if(CUDAToolkit_FOUND AND LLVM_FOUND)
   add_subdirectory(nvptx)
 else()
   message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected")

libc/utils/gpu/loader/nvptx/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
@@ -1,8 +1,14 @@
 add_executable(nvptx_loader Loader.cpp)
 add_dependencies(nvptx_loader libc.src.__support.RPC.rpc)
 
+if(NOT LLVM_ENABLE_RTTI)
+  target_compile_options(nvptx_loader PRIVATE -fno-rtti)
+endif()
+target_include_directories(nvptx_loader PRIVATE ${LLVM_INCLUDE_DIRS})
 target_link_libraries(nvptx_loader
   PRIVATE
     gpu_loader
     CUDA::cuda_driver
+    LLVMObject
+    LLVMSupport
 )

libc/utils/gpu/loader/nvptx/Loader.cpp

Lines changed: 127 additions & 1 deletion
@@ -17,10 +17,18 @@
 #include "Server.h"
 
 #include "cuda.h"
+
+#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
+
 #include <cstddef>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <vector>
+
+using namespace llvm;
+using namespace object;
 
 /// The arguments to the '_start' kernel.
 struct kernel_args_t {
@@ -51,11 +59,122 @@ static void handle_error(const char *msg) {
   exit(EXIT_FAILURE);
 }
 
+// Gets the names of all the globals that contain functions to initialize or
+// deinitialize. We need to do this manually because the NVPTX toolchain does
+// not contain the necessary binary manipulation tools.
+template <typename Alloc>
+Expected<void *> get_ctor_dtor_array(const void *image, const size_t size,
+                                     Alloc allocator, CUmodule binary) {
+  auto mem_buffer = MemoryBuffer::getMemBuffer(
+      StringRef(reinterpret_cast<const char *>(image), size), "image",
+      /*RequiresNullTerminator=*/false);
+  Expected<ELF64LEObjectFile> elf_or_err =
+      ELF64LEObjectFile::create(*mem_buffer);
+  if (!elf_or_err)
+    handle_error(toString(elf_or_err.takeError()).c_str());
+
+  std::vector<std::pair<const char *, uint16_t>> ctors;
+  std::vector<std::pair<const char *, uint16_t>> dtors;
+  // CUDA has no way to iterate over all the symbols so we need to inspect the
+  // ELF directly using the LLVM libraries.
+  for (const auto &symbol : elf_or_err->symbols()) {
+    auto name_or_err = symbol.getName();
+    if (!name_or_err)
+      handle_error(toString(name_or_err.takeError()).c_str());
+
+    // Search for all symbols that contain a constructor or destructor.
+    if (!name_or_err->starts_with("__init_array_object_") &&
+        !name_or_err->starts_with("__fini_array_object_"))
+      continue;
+
+    uint16_t priority;
+    if (name_or_err->rsplit('_').second.getAsInteger(10, priority))
+      handle_error("Invalid priority for constructor or destructor");
+
+    if (name_or_err->starts_with("__init"))
+      ctors.emplace_back(std::make_pair(name_or_err->data(), priority));
+    else
+      dtors.emplace_back(std::make_pair(name_or_err->data(), priority));
+  }
+  // Lower priority constructors are run before higher ones. The reverse is true
+  // for destructors.
+  llvm::sort(ctors, [](auto x, auto y) { return x.second < y.second; });
+  llvm::sort(dtors, [](auto x, auto y) { return x.second < y.second; });
+  llvm::reverse(dtors);
+
+  // Allocate host pinned memory to make these arrays visible to the GPU.
+  CUdeviceptr *dev_memory = reinterpret_cast<CUdeviceptr *>(allocator(
+      ctors.size() * sizeof(CUdeviceptr) + dtors.size() * sizeof(CUdeviceptr)));
+  uint64_t global_size = 0;
+
+  // Get the address of the global and then store the address of the constructor
+  // function to call in the constructor array.
+  CUdeviceptr *dev_ctors_start = dev_memory;
+  CUdeviceptr *dev_ctors_end = dev_ctors_start + ctors.size();
+  for (uint64_t i = 0; i < ctors.size(); ++i) {
+    CUdeviceptr dev_ptr;
+    if (CUresult err =
+            cuModuleGetGlobal(&dev_ptr, &global_size, binary, ctors[i].first))
+      handle_error(err);
+    if (CUresult err =
+            cuMemcpyDtoH(&dev_ctors_start[i], dev_ptr, sizeof(uintptr_t)))
+      handle_error(err);
+  }
+
+  // Get the address of the global and then store the address of the destructor
+  // function to call in the destructor array.
+  CUdeviceptr *dev_dtors_start = dev_ctors_end;
+  CUdeviceptr *dev_dtors_end = dev_dtors_start + dtors.size();
+  for (uint64_t i = 0; i < dtors.size(); ++i) {
+    CUdeviceptr dev_ptr;
+    if (CUresult err =
+            cuModuleGetGlobal(&dev_ptr, &global_size, binary, dtors[i].first))
+      handle_error(err);
+    if (CUresult err =
+            cuMemcpyDtoH(&dev_dtors_start[i], dev_ptr, sizeof(uintptr_t)))
+      handle_error(err);
+  }
+
+  // Obtain the address of the pointers the startup implementation uses to
+  // iterate the constructors and destructors.
+  CUdeviceptr init_start;
+  if (CUresult err = cuModuleGetGlobal(&init_start, &global_size, binary,
+                                       "__init_array_start"))
+    handle_error(err);
+  CUdeviceptr init_end;
+  if (CUresult err = cuModuleGetGlobal(&init_end, &global_size, binary,
+                                       "__init_array_end"))
+    handle_error(err);
+  CUdeviceptr fini_start;
+  if (CUresult err = cuModuleGetGlobal(&fini_start, &global_size, binary,
+                                       "__fini_array_start"))
+    handle_error(err);
+  CUdeviceptr fini_end;
+  if (CUresult err = cuModuleGetGlobal(&fini_end, &global_size, binary,
+                                       "__fini_array_end"))
+    handle_error(err);
+
+  // Copy the pointers to the newly written array to the symbols so the startup
+  // implementation can iterate them.
+  if (CUresult err =
+          cuMemcpyHtoD(init_start, &dev_ctors_start, sizeof(uintptr_t)))
+    handle_error(err);
+  if (CUresult err = cuMemcpyHtoD(init_end, &dev_ctors_end, sizeof(uintptr_t)))
+    handle_error(err);
+  if (CUresult err =
+          cuMemcpyHtoD(fini_start, &dev_dtors_start, sizeof(uintptr_t)))
+    handle_error(err);
+  if (CUresult err = cuMemcpyHtoD(fini_end, &dev_dtors_end, sizeof(uintptr_t)))
+    handle_error(err);
+
+  return dev_memory;
+}
+
 int load(int argc, char **argv, char **envp, void *image, size_t size,
          const LaunchParameters &params) {
+
   if (CUresult err = cuInit(0))
     handle_error(err);
-
   // Obtain the first device found on the system.
   CUdevice device;
   if (CUresult err = cuDeviceGet(&device, 0))
@@ -91,6 +210,11 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
     handle_error(err);
     return dev_ptr;
   };
+
+  auto memory_or_err = get_ctor_dtor_array(image, size, allocator, binary);
+  if (!memory_or_err)
+    handle_error(toString(memory_or_err.takeError()).c_str());
+
   void *dev_argv = copy_argument_vector(argc, argv, allocator);
   if (!dev_argv)
     handle_error("Failed to allocate device argv");
@@ -153,6 +277,8 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
     handle_error(err);
 
   // Free the memory allocated for the device.
+  if (CUresult err = cuMemFreeHost(*memory_or_err))
+    handle_error(err);
   if (CUresult err = cuMemFree(dev_ret))
     handle_error(err);
   if (CUresult err = cuMemFreeHost(dev_argv))
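
As a standalone illustration of the priority parsing in `get_ctor_dtor_array`, the sketch below (the symbol name is hypothetical) shows how the trailing suffix is recovered with `StringRef::rsplit`:

    #include "llvm/ADT/StringRef.h"
    #include <cstdint>
    #include <cstdio>

    int main() {
      // Split at the last '_' and parse the suffix as a base-10 integer;
      // getAsInteger returns true on failure, mirroring the loader's check.
      llvm::StringRef name = "__init_array_object_example_101";
      uint16_t priority = 0;
      if (name.rsplit('_').second.getAsInteger(10, priority))
        std::fprintf(stderr, "invalid priority suffix\n");
      else
        std::printf("priority = %u\n", static_cast<unsigned>(priority));
      return 0;
    }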
