|
17 | 17 | #include "Server.h"
|
18 | 18 |
|
19 | 19 | #include "cuda.h"
|
| 20 | + |
| 21 | +#include "llvm/Object/ELF.h" |
| 22 | +#include "llvm/Object/ELFObjectFile.h" |
| 23 | + |
20 | 24 | #include <cstddef>
|
21 | 25 | #include <cstdio>
|
22 | 26 | #include <cstdlib>
|
23 | 27 | #include <cstring>
|
| 28 | +#include <vector> |
| 29 | + |
| 30 | +using namespace llvm; |
| 31 | +using namespace object; |
24 | 32 |
|
25 | 33 | /// The arguments to the '_start' kernel.
|
26 | 34 | struct kernel_args_t {
|
@@ -51,11 +59,122 @@ static void handle_error(const char *msg) {
|
51 | 59 | exit(EXIT_FAILURE);
|
52 | 60 | }
|
53 | 61 |
|
| 62 | +// Gets the names of all the globals that contain functions to initialize or |
| 63 | +// deinitialize. We need to do this manually because the NVPTX toolchain does |
| 64 | +// not contain the necessary binary manipulation tools. |
| 65 | +template <typename Alloc> |
| 66 | +Expected<void *> get_ctor_dtor_array(const void *image, const size_t size, |
| 67 | + Alloc allocator, CUmodule binary) { |
| 68 | + auto mem_buffer = MemoryBuffer::getMemBuffer( |
| 69 | + StringRef(reinterpret_cast<const char *>(image), size), "image", |
| 70 | + /*RequiresNullTerminator=*/false); |
| 71 | + Expected<ELF64LEObjectFile> elf_or_err = |
| 72 | + ELF64LEObjectFile::create(*mem_buffer); |
| 73 | + if (!elf_or_err) |
| 74 | + handle_error(toString(elf_or_err.takeError()).c_str()); |
| 75 | + |
| 76 | + std::vector<std::pair<const char *, uint16_t>> ctors; |
| 77 | + std::vector<std::pair<const char *, uint16_t>> dtors; |
| 78 | + // CUDA has no way to iterate over all the symbols so we need to inspect the |
| 79 | + // ELF directly using the LLVM libraries. |
| 80 | + for (const auto &symbol : elf_or_err->symbols()) { |
| 81 | + auto name_or_err = symbol.getName(); |
| 82 | + if (!name_or_err) |
| 83 | + handle_error(toString(name_or_err.takeError()).c_str()); |
| 84 | + |
| 85 | + // Search for all symbols that contain a constructor or destructor. |
| 86 | + if (!name_or_err->starts_with("__init_array_object_") && |
| 87 | + !name_or_err->starts_with("__fini_array_object_")) |
| 88 | + continue; |
| 89 | + |
| 90 | + uint16_t priority; |
| 91 | + if (name_or_err->rsplit('_').second.getAsInteger(10, priority)) |
| 92 | + handle_error("Invalid priority for constructor or destructor"); |
| 93 | + |
| 94 | + if (name_or_err->starts_with("__init")) |
| 95 | + ctors.emplace_back(std::make_pair(name_or_err->data(), priority)); |
| 96 | + else |
| 97 | + dtors.emplace_back(std::make_pair(name_or_err->data(), priority)); |
| 98 | + } |
| 99 | + // Lower priority constructors are run before higher ones. The reverse is true |
| 100 | + // for destructors. |
| 101 | + llvm::sort(ctors, [](auto x, auto y) { return x.second < y.second; }); |
| 102 | + llvm::sort(dtors, [](auto x, auto y) { return x.second < y.second; }); |
| 103 | + llvm::reverse(dtors); |
| 104 | + |
| 105 | + // Allocate host pinned memory to make these arrays visible to the GPU. |
| 106 | + CUdeviceptr *dev_memory = reinterpret_cast<CUdeviceptr *>(allocator( |
| 107 | + ctors.size() * sizeof(CUdeviceptr) + dtors.size() * sizeof(CUdeviceptr))); |
| 108 | + uint64_t global_size = 0; |
| 109 | + |
| 110 | + // Get the address of the global and then store the address of the constructor |
| 111 | + // function to call in the constructor array. |
| 112 | + CUdeviceptr *dev_ctors_start = dev_memory; |
| 113 | + CUdeviceptr *dev_ctors_end = dev_ctors_start + ctors.size(); |
| 114 | + for (uint64_t i = 0; i < ctors.size(); ++i) { |
| 115 | + CUdeviceptr dev_ptr; |
| 116 | + if (CUresult err = |
| 117 | + cuModuleGetGlobal(&dev_ptr, &global_size, binary, ctors[i].first)) |
| 118 | + handle_error(err); |
| 119 | + if (CUresult err = |
| 120 | + cuMemcpyDtoH(&dev_ctors_start[i], dev_ptr, sizeof(uintptr_t))) |
| 121 | + handle_error(err); |
| 122 | + } |
| 123 | + |
| 124 | + // Get the address of the global and then store the address of the destructor |
| 125 | + // function to call in the destructor array. |
| 126 | + CUdeviceptr *dev_dtors_start = dev_ctors_end; |
| 127 | + CUdeviceptr *dev_dtors_end = dev_dtors_start + dtors.size(); |
| 128 | + for (uint64_t i = 0; i < dtors.size(); ++i) { |
| 129 | + CUdeviceptr dev_ptr; |
| 130 | + if (CUresult err = |
| 131 | + cuModuleGetGlobal(&dev_ptr, &global_size, binary, dtors[i].first)) |
| 132 | + handle_error(err); |
| 133 | + if (CUresult err = |
| 134 | + cuMemcpyDtoH(&dev_dtors_start[i], dev_ptr, sizeof(uintptr_t))) |
| 135 | + handle_error(err); |
| 136 | + } |
| 137 | + |
| 138 | + // Obtain the address of the pointers the startup implementation uses to |
| 139 | + // iterate the constructors and destructors. |
| 140 | + CUdeviceptr init_start; |
| 141 | + if (CUresult err = cuModuleGetGlobal(&init_start, &global_size, binary, |
| 142 | + "__init_array_start")) |
| 143 | + handle_error(err); |
| 144 | + CUdeviceptr init_end; |
| 145 | + if (CUresult err = cuModuleGetGlobal(&init_end, &global_size, binary, |
| 146 | + "__init_array_end")) |
| 147 | + handle_error(err); |
| 148 | + CUdeviceptr fini_start; |
| 149 | + if (CUresult err = cuModuleGetGlobal(&fini_start, &global_size, binary, |
| 150 | + "__fini_array_start")) |
| 151 | + handle_error(err); |
| 152 | + CUdeviceptr fini_end; |
| 153 | + if (CUresult err = cuModuleGetGlobal(&fini_end, &global_size, binary, |
| 154 | + "__fini_array_end")) |
| 155 | + handle_error(err); |
| 156 | + |
| 157 | + // Copy the pointers to the newly written array to the symbols so the startup |
| 158 | + // implementation can iterate them. |
| 159 | + if (CUresult err = |
| 160 | + cuMemcpyHtoD(init_start, &dev_ctors_start, sizeof(uintptr_t))) |
| 161 | + handle_error(err); |
| 162 | + if (CUresult err = cuMemcpyHtoD(init_end, &dev_ctors_end, sizeof(uintptr_t))) |
| 163 | + handle_error(err); |
| 164 | + if (CUresult err = |
| 165 | + cuMemcpyHtoD(fini_start, &dev_dtors_start, sizeof(uintptr_t))) |
| 166 | + handle_error(err); |
| 167 | + if (CUresult err = cuMemcpyHtoD(fini_end, &dev_dtors_end, sizeof(uintptr_t))) |
| 168 | + handle_error(err); |
| 169 | + |
| 170 | + return dev_memory; |
| 171 | +} |
| 172 | + |
54 | 173 | int load(int argc, char **argv, char **envp, void *image, size_t size,
|
55 | 174 | const LaunchParameters ¶ms) {
|
| 175 | + |
56 | 176 | if (CUresult err = cuInit(0))
|
57 | 177 | handle_error(err);
|
58 |
| - |
59 | 178 | // Obtain the first device found on the system.
|
60 | 179 | CUdevice device;
|
61 | 180 | if (CUresult err = cuDeviceGet(&device, 0))
|
@@ -91,6 +210,11 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
|
91 | 210 | handle_error(err);
|
92 | 211 | return dev_ptr;
|
93 | 212 | };
|
| 213 | + |
| 214 | + auto memory_or_err = get_ctor_dtor_array(image, size, allocator, binary); |
| 215 | + if (!memory_or_err) |
| 216 | + handle_error(toString(memory_or_err.takeError()).c_str()); |
| 217 | + |
94 | 218 | void *dev_argv = copy_argument_vector(argc, argv, allocator);
|
95 | 219 | if (!dev_argv)
|
96 | 220 | handle_error("Failed to allocate device argv");
|
@@ -153,6 +277,8 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
|
153 | 277 | handle_error(err);
|
154 | 278 |
|
155 | 279 | // Free the memory allocated for the device.
|
| 280 | + if (CUresult err = cuMemFreeHost(*memory_or_err)) |
| 281 | + handle_error(err); |
156 | 282 | if (CUresult err = cuMemFree(dev_ret))
|
157 | 283 | handle_error(err);
|
158 | 284 | if (CUresult err = cuMemFreeHost(dev_argv))
|
|
0 commit comments