Skip to content

Commit 421085f

Browse files
authored
[Offload] Change unregister library to use atexit instead of destructor (#86830)
Summary: The 'new driver' sets up the lifetime of a registered library using global constructors and destructors. Currently, this is put at priority 1, which isn't strictly conformant as it will conflict with system utilities. We now use 101, as this is the lowest priority suggested for non-system constructors and will still run before user constructors. Secondly, there were issues with the CUDA runtime when destructed with a global destructor. Because the global ones run in any order and potentially run before other things, we were hitting an edge case where the OpenMP runtime was uninitialized *after* `_dl_fini` was called. This would result in us erroring when we call into a destroyed `libcuda.so` instance. Using `atexit` is what CUDA / HIP use and it prevents this from happening. Most everything uses `atexit` except system utilities, and because of the constructor priority it will be unregistered *after* everything else but not after `_dl_fini`.
1 parent dd06b8e commit 421085f

File tree

2 files changed

+41
-37
lines changed

2 files changed

+41
-37
lines changed

clang/test/Driver/linker-wrapper-image.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,11 @@
2626
// OPENMP: @.omp_offloading.device_image = internal unnamed_addr constant [[[SIZE:[0-9]+]] x i8] c"\10\FF\10\AD{{.*}}", section ".llvm.offloading", align 8
2727
// OPENMP-NEXT: @.omp_offloading.device_images = internal unnamed_addr constant [1 x %__tgt_device_image] [%__tgt_device_image { ptr getelementptr inbounds ([[[BEGIN:[0-9]+]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr getelementptr inbounds ([[[END:[0-9]+]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }]
2828
// OPENMP-NEXT: @.omp_offloading.descriptor = internal constant %__tgt_bin_desc { i32 1, ptr @.omp_offloading.device_images, ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }
29-
// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_reg, ptr null }]
30-
// OPENMP-NEXT: @llvm.global_dtors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.omp_offloading.descriptor_unreg, ptr null }]
29+
// OPENMP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.omp_offloading.descriptor_reg, ptr null }]
3130

3231
// OPENMP: define internal void @.omp_offloading.descriptor_reg() section ".text.startup" {
3332
// OPENMP-NEXT: entry:
33+
// OPENMP-NEXT: %0 = call i32 @atexit(ptr @.omp_offloading.descriptor_unreg)
3434
// OPENMP-NEXT: call void @__tgt_register_lib(ptr @.omp_offloading.descriptor)
3535
// OPENMP-NEXT: ret void
3636
// OPENMP-NEXT: }
@@ -62,7 +62,7 @@
6262
// CUDA-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", align 8
6363
// CUDA-NEXT: @.cuda.binary_handle = internal global ptr null
6464

65-
// CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.cuda.fatbin_reg, ptr null }]
65+
// CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.cuda.fatbin_reg, ptr null }]
6666

6767
// CUDA: define internal void @.cuda.fatbin_reg() section ".text.startup" {
6868
// CUDA-NEXT: entry:
@@ -162,7 +162,7 @@
162162
// HIP-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1212764230, i32 1, ptr @.fatbin_image, ptr null }, section ".hipFatBinSegment", align 8
163163
// HIP-NEXT: @.hip.binary_handle = internal global ptr null
164164

165-
// HIP: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.hip.fatbin_reg, ptr null }]
165+
// HIP: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.hip.fatbin_reg, ptr null }]
166166

167167
// HIP: define internal void @.hip.fatbin_reg() section ".text.startup" {
168168
// HIP-NEXT: entry:

llvm/lib/Frontend/Offloading/OffloadWrapper.cpp

Lines changed: 37 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -186,57 +186,62 @@ GlobalVariable *createBinDesc(Module &M, ArrayRef<ArrayRef<char>> Bufs,
186186
".omp_offloading.descriptor" + Suffix);
187187
}
188188

189-
void createRegisterFunction(Module &M, GlobalVariable *BinDesc,
190-
StringRef Suffix) {
189+
Function *createUnregisterFunction(Module &M, GlobalVariable *BinDesc,
190+
StringRef Suffix) {
191191
LLVMContext &C = M.getContext();
192192
auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
193-
auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage,
194-
".omp_offloading.descriptor_reg" + Suffix, &M);
193+
auto *Func =
194+
Function::Create(FuncTy, GlobalValue::InternalLinkage,
195+
".omp_offloading.descriptor_unreg" + Suffix, &M);
195196
Func->setSection(".text.startup");
196197

197-
// Get __tgt_register_lib function declaration.
198-
auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
199-
/*isVarArg*/ false);
200-
FunctionCallee RegFuncC =
201-
M.getOrInsertFunction("__tgt_register_lib", RegFuncTy);
198+
// Get __tgt_unregister_lib function declaration.
199+
auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
200+
/*isVarArg*/ false);
201+
FunctionCallee UnRegFuncC =
202+
M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy);
202203

203204
// Construct function body
204205
IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func));
205-
Builder.CreateCall(RegFuncC, BinDesc);
206+
Builder.CreateCall(UnRegFuncC, BinDesc);
206207
Builder.CreateRetVoid();
207208

208-
// Add this function to constructors.
209-
// Set priority to 1 so that __tgt_register_lib is executed AFTER
210-
// __tgt_register_requires (we want to know what requirements have been
211-
// asked for before we load a libomptarget plugin so that by the time the
212-
// plugin is loaded it can report how many devices there are which can
213-
// satisfy these requirements).
214-
appendToGlobalCtors(M, Func, /*Priority*/ 1);
209+
return Func;
215210
}
216211

217-
void createUnregisterFunction(Module &M, GlobalVariable *BinDesc,
218-
StringRef Suffix) {
212+
void createRegisterFunction(Module &M, GlobalVariable *BinDesc,
213+
StringRef Suffix) {
219214
LLVMContext &C = M.getContext();
220215
auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
221-
auto *Func =
222-
Function::Create(FuncTy, GlobalValue::InternalLinkage,
223-
".omp_offloading.descriptor_unreg" + Suffix, &M);
216+
auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage,
217+
".omp_offloading.descriptor_reg" + Suffix, &M);
224218
Func->setSection(".text.startup");
225219

226-
// Get __tgt_unregister_lib function declaration.
227-
auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
228-
/*isVarArg*/ false);
229-
FunctionCallee UnRegFuncC =
230-
M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy);
220+
// Get __tgt_register_lib function declaration.
221+
auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M),
222+
/*isVarArg*/ false);
223+
FunctionCallee RegFuncC =
224+
M.getOrInsertFunction("__tgt_register_lib", RegFuncTy);
225+
226+
auto *AtExitTy = FunctionType::get(
227+
Type::getInt32Ty(C), PointerType::getUnqual(C), /*isVarArg=*/false);
228+
FunctionCallee AtExit = M.getOrInsertFunction("atexit", AtExitTy);
229+
230+
Function *UnregFunc = createUnregisterFunction(M, BinDesc, Suffix);
231231

232232
// Construct function body
233233
IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func));
234-
Builder.CreateCall(UnRegFuncC, BinDesc);
234+
235+
// Register the destructors with 'atexit'. This is expected by the CUDA
236+
// runtime and ensures that we clean up before dynamic objects are destroyed.
237+
// This needs to be done before the runtime is called and registers its own.
238+
Builder.CreateCall(AtExit, UnregFunc);
239+
240+
Builder.CreateCall(RegFuncC, BinDesc);
235241
Builder.CreateRetVoid();
236242

237-
// Add this function to global destructors.
238-
// Match priority of __tgt_register_lib
239-
appendToGlobalDtors(M, Func, /*Priority*/ 1);
243+
// Add this function to constructors.
244+
appendToGlobalCtors(M, Func, /*Priority=*/101);
240245
}
241246

242247
// struct fatbin_wrapper {
@@ -578,7 +583,7 @@ void createRegisterFatbinFunction(Module &M, GlobalVariable *FatbinDesc,
578583
DtorBuilder.CreateRetVoid();
579584

580585
// Add this function to constructors.
581-
appendToGlobalCtors(M, CtorFunc, /*Priority*/ 1);
586+
appendToGlobalCtors(M, CtorFunc, /*Priority=*/101);
582587
}
583588
} // namespace
584589

@@ -591,7 +596,6 @@ Error offloading::wrapOpenMPBinaries(Module &M, ArrayRef<ArrayRef<char>> Images,
591596
return createStringError(inconvertibleErrorCode(),
592597
"No binary descriptors created.");
593598
createRegisterFunction(M, Desc, Suffix);
594-
createUnregisterFunction(M, Desc, Suffix);
595599
return Error::success();
596600
}
597601

0 commit comments

Comments
 (0)