@@ -27,6 +27,8 @@ using namespace clang;
27
27
using namespace CodeGen ;
28
28
29
29
namespace {
30
+ constexpr unsigned CudaFatMagic = 0x466243b1 ;
31
+ constexpr unsigned HIPFatMagic = 0x48495046 ; // "HIPF"
30
32
31
33
class CGNVCUDARuntime : public CGCUDARuntime {
32
34
@@ -310,19 +312,20 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
310
312
/// }
311
313
/// \endcode
312
314
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction () {
315
+ bool IsHIP = CGM.getLangOpts ().HIP ;
313
316
// No need to generate ctors/dtors if there is no GPU binary.
314
- std::string GpuBinaryFileName = CGM.getCodeGenOpts ().CudaGpuBinaryFileName ;
315
- if (GpuBinaryFileName .empty ())
317
+ StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts ().CudaGpuBinaryFileName ;
318
+ if (CudaGpuBinaryFileName .empty () && !IsHIP )
316
319
return nullptr ;
317
320
318
- // void __cuda_register_globals (void* handle);
321
+ // void __{cuda|hip}_register_globals (void* handle);
319
322
llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn ();
320
323
// We always need a function to pass in as callback. Create a dummy
321
324
// implementation if we don't need to register anything.
322
325
if (RelocatableDeviceCode && !RegisterGlobalsFunc)
323
326
RegisterGlobalsFunc = makeDummyFunction (getRegisterGlobalsFnTy ());
324
327
325
- // void ** __cudaRegisterFatBinary (void *);
328
+ // void ** __{cuda|hip}RegisterFatBinary (void *);
326
329
llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction (
327
330
llvm::FunctionType::get (VoidPtrPtrTy, VoidPtrTy, false ),
328
331
addUnderscoredPrefixToName (" RegisterFatBinary" ));
@@ -334,12 +337,16 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
334
337
// global variable and save a reference in GpuBinaryHandle to be cleaned up
335
338
// in destructor on exit. Then associate all known kernels with the GPU binary
336
339
// handle so CUDA runtime can figure out what to call on the GPU side.
337
- llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =
338
- llvm::MemoryBuffer::getFileOrSTDIN (GpuBinaryFileName);
339
- if (std::error_code EC = GpuBinaryOrErr.getError ()) {
340
- CGM.getDiags ().Report (diag::err_cannot_open_file)
341
- << GpuBinaryFileName << EC.message ();
342
- return nullptr ;
340
+ std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary;
341
+ if (!IsHIP) {
342
+ llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
343
+ llvm::MemoryBuffer::getFileOrSTDIN (CudaGpuBinaryFileName);
344
+ if (std::error_code EC = CudaGpuBinaryOrErr.getError ()) {
345
+ CGM.getDiags ().Report (diag::err_cannot_open_file)
346
+ << CudaGpuBinaryFileName << EC.message ();
347
+ return nullptr ;
348
+ }
349
+ CudaGpuBinary = std::move (CudaGpuBinaryOrErr.get ());
343
350
}
344
351
345
352
llvm::Function *ModuleCtorFunc = llvm::Function::Create (
@@ -353,39 +360,71 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
353
360
CtorBuilder.SetInsertPoint (CtorEntryBB);
354
361
355
362
const char *FatbinConstantName;
356
- if (RelocatableDeviceCode)
363
+ const char *FatbinSectionName;
364
+ const char *ModuleIDSectionName;
365
+ StringRef ModuleIDPrefix;
366
+ llvm::Constant *FatBinStr;
367
+ unsigned FatMagic;
368
+ if (IsHIP) {
369
+ FatbinConstantName = " .hip_fatbin" ;
370
+ FatbinSectionName = " .hipFatBinSegment" ;
371
+
372
+ ModuleIDSectionName = " __hip_module_id" ;
373
+ ModuleIDPrefix = " __hip_" ;
374
+
375
+ // For HIP, create an external symbol __hip_fatbin in section .hip_fatbin.
376
+ // The external symbol is supposed to contain the fat binary but will be
377
+ // populated somewhere else, e.g. by lld through a linker script.
378
+ FatBinStr = new llvm::GlobalVariable (
379
+ CGM.getModule (), CGM.Int8Ty ,
380
+ /* isConstant=*/ true , llvm::GlobalValue::ExternalLinkage, nullptr ,
381
+ " __hip_fatbin" , nullptr ,
382
+ llvm::GlobalVariable::NotThreadLocal);
383
+ cast<llvm::GlobalVariable>(FatBinStr)->setSection (FatbinConstantName);
384
+
385
+ FatMagic = HIPFatMagic;
386
+ } else {
387
+ if (RelocatableDeviceCode)
388
+ // TODO: Figure out what this section is called on macOS!
389
+ FatbinConstantName = " __nv_relfatbin" ;
390
+ else
391
+ FatbinConstantName =
392
+ CGM.getTriple ().isMacOSX () ? " __NV_CUDA,__nv_fatbin" : " .nv_fatbin" ;
393
+ // NVIDIA's cuobjdump looks for fatbins in this section.
394
+ FatbinSectionName =
395
+ CGM.getTriple ().isMacOSX () ? " __NV_CUDA,__fatbin" : " .nvFatBinSegment" ;
396
+
357
397
// TODO: Figure out what this section is called on macOS!
358
- FatbinConstantName = " __nv_relfatbin " ;
359
- else
360
- FatbinConstantName =
361
- CGM. getTriple (). isMacOSX () ? " __NV_CUDA,__nv_fatbin " : " .nv_fatbin " ;
362
- // NVIDIA's cuobjdump looks for fatbins in this section .
363
- const char *FatbinSectionName =
364
- CGM. getTriple (). isMacOSX () ? " __NV_CUDA,__fatbin " : " .nvFatBinSegment " ;
365
- // TODO: Figure out how this is called on mac OS!
366
- const char *NVModuleIDSectionName = " __nv_module_id " ;
398
+ ModuleIDSectionName = " __nv_module_id " ;
399
+ ModuleIDPrefix = " __nv_ " ;
400
+
401
+ // For CUDA, create a string literal containing the fat binary loaded from
402
+ // the given file.
403
+ FatBinStr = makeConstantString (CudaGpuBinary-> getBuffer (), " " ,
404
+ FatbinConstantName, 8 ) ;
405
+ FatMagic = CudaFatMagic;
406
+ }
367
407
368
408
// Create initialized wrapper structure that points to the loaded GPU binary
369
409
ConstantInitBuilder Builder (CGM);
370
410
auto Values = Builder.beginStruct (FatbinWrapperTy);
371
411
// Fatbin wrapper magic.
372
- Values.addInt (IntTy, 0x466243b1 );
412
+ Values.addInt (IntTy, FatMagic );
373
413
// Fatbin version.
374
414
Values.addInt (IntTy, 1 );
375
415
// Data.
376
- Values.add (makeConstantString (GpuBinaryOrErr.get ()->getBuffer (), " " ,
377
- FatbinConstantName, 8 ));
416
+ Values.add (FatBinStr);
378
417
// Unused in fatbin v1.
379
418
Values.add (llvm::ConstantPointerNull::get (VoidPtrTy));
380
419
llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal (
381
420
addUnderscoredPrefixToName (" _fatbin_wrapper" ), CGM.getPointerAlign (),
382
421
/* constant*/ true );
383
422
FatbinWrapper->setSection (FatbinSectionName);
384
423
385
- // Register binary with CUDA runtime. This is substantially different in
424
+ // Register binary with CUDA/HIP runtime. This is substantially different in
386
425
// default mode vs. separate compilation!
387
426
if (!RelocatableDeviceCode) {
388
- // GpuBinaryHandle = __cudaRegisterFatBinary (&FatbinWrapper);
427
+ // GpuBinaryHandle = __{cuda|hip}RegisterFatBinary (&FatbinWrapper);
389
428
llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall (
390
429
RegisterFatbinFunc,
391
430
CtorBuilder.CreateBitCast (FatbinWrapper, VoidPtrTy));
@@ -397,34 +436,34 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
397
436
CtorBuilder.CreateAlignedStore (RegisterFatbinCall, GpuBinaryHandle,
398
437
CGM.getPointerAlign ());
399
438
400
- // Call __cuda_register_globals (GpuBinaryHandle);
439
+ // Call __{cuda|hip}_register_globals (GpuBinaryHandle);
401
440
if (RegisterGlobalsFunc)
402
441
CtorBuilder.CreateCall (RegisterGlobalsFunc, RegisterFatbinCall);
403
442
} else {
404
443
// Generate a unique module ID.
405
- SmallString<64 > NVModuleID;
406
- llvm::raw_svector_ostream OS (NVModuleID);
407
- OS << " __nv_" << llvm::format (" %x" , FatbinWrapper->getGUID ());
408
- llvm::Constant *NVModuleIDConstant =
409
- makeConstantString (NVModuleID.str (), " " , NVModuleIDSectionName, 32 );
410
-
411
- // Create an alias for the FatbinWrapper that nvcc will look for.
444
+ SmallString<64 > ModuleID;
445
+ llvm::raw_svector_ostream OS (ModuleID);
446
+ OS << ModuleIDPrefix << llvm::format (" %x" , FatbinWrapper->getGUID ());
447
+ llvm::Constant *ModuleIDConstant =
448
+ makeConstantString (ModuleID.str (), " " , ModuleIDSectionName, 32 );
449
+
450
+ // Create an alias for the FatbinWrapper that nvcc or hip backend will
451
+ // look for.
412
452
llvm::GlobalAlias::create (llvm::GlobalValue::ExternalLinkage,
413
- Twine (" __fatbinwrap" ) + NVModuleID,
414
- FatbinWrapper);
453
+ Twine (" __fatbinwrap" ) + ModuleID, FatbinWrapper);
415
454
416
- // void __cudaRegisterLinkedBinary%NVModuleID %(void (*)(void *), void *,
455
+ // void __{cuda|hip}RegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
417
456
// void *, void (*)(void **))
418
457
SmallString<128 > RegisterLinkedBinaryName (
419
458
addUnderscoredPrefixToName (" RegisterLinkedBinary" ));
420
- RegisterLinkedBinaryName += NVModuleID ;
459
+ RegisterLinkedBinaryName += ModuleID ;
421
460
llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction (
422
461
getRegisterLinkedBinaryFnTy (), RegisterLinkedBinaryName);
423
462
424
463
assert (RegisterGlobalsFunc && " Expecting at least dummy function!" );
425
464
llvm::Value *Args[] = {RegisterGlobalsFunc,
426
465
CtorBuilder.CreateBitCast (FatbinWrapper, VoidPtrTy),
427
- NVModuleIDConstant ,
466
+ ModuleIDConstant ,
428
467
makeDummyFunction (getCallbackFnTy ())};
429
468
CtorBuilder.CreateCall (RegisterLinkedBinaryFunc, Args);
430
469
}
0 commit comments