Skip to content

Commit 29155b0

Browse files
committed
[HIP] Support offloading by linker script
To support linking device code from different source files, the fat binary must be embedded at the host linking stage. This patch emits an external symbol for the fat binary in host codegen, then has lld embed the fat binary through a linker script. Differential Revision: https://reviews.llvm.org/D46472 llvm-svn: 332724
1 parent 655ef18 commit 29155b0

File tree

6 files changed

+237
-60
lines changed

6 files changed

+237
-60
lines changed

clang/include/clang/Driver/Options.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -586,6 +586,8 @@ def fno_cuda_rdc : Flag<["-"], "fno-cuda-rdc">;
586586
def fcuda_short_ptr : Flag<["-"], "fcuda-short-ptr">, Flags<[CC1Option]>,
587587
HelpText<"Use 32-bit pointers for accessing const/local/shared address spaces.">;
588588
def fno_cuda_short_ptr : Flag<["-"], "fno-cuda-short-ptr">;
589+
def fhip_dump_offload_linker_script : Flag<["-"], "fhip-dump-offload-linker-script">,
590+
Group<f_Group>, Flags<[NoArgumentUnused, HelpHidden]>;
589591
def dA : Flag<["-"], "dA">, Group<d_Group>;
590592
def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,
591593
HelpText<"Print macro definitions in -E mode in addition to normal output">;

clang/lib/CodeGen/CGCUDANV.cpp

Lines changed: 77 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ using namespace clang;
2727
using namespace CodeGen;
2828

2929
namespace {
30+
constexpr unsigned CudaFatMagic = 0x466243b1;
31+
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
3032

3133
class CGNVCUDARuntime : public CGCUDARuntime {
3234

@@ -310,19 +312,20 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
310312
/// }
311313
/// \endcode
312314
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
315+
bool IsHIP = CGM.getLangOpts().HIP;
313316
// No need to generate ctors/dtors if there is no GPU binary.
314-
std::string GpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
315-
if (GpuBinaryFileName.empty())
317+
StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
318+
if (CudaGpuBinaryFileName.empty() && !IsHIP)
316319
return nullptr;
317320

318-
// void __cuda_register_globals(void* handle);
321+
// void __{cuda|hip}_register_globals(void* handle);
319322
llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
320323
// We always need a function to pass in as callback. Create a dummy
321324
// implementation if we don't need to register anything.
322325
if (RelocatableDeviceCode && !RegisterGlobalsFunc)
323326
RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
324327

325-
// void ** __cudaRegisterFatBinary(void *);
328+
// void ** __{cuda|hip}RegisterFatBinary(void *);
326329
llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
327330
llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
328331
addUnderscoredPrefixToName("RegisterFatBinary"));
@@ -334,12 +337,16 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
334337
// global variable and save a reference in GpuBinaryHandle to be cleaned up
335338
// in destructor on exit. Then associate all known kernels with the GPU binary
336339
// handle so CUDA runtime can figure out what to call on the GPU side.
337-
llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =
338-
llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName);
339-
if (std::error_code EC = GpuBinaryOrErr.getError()) {
340-
CGM.getDiags().Report(diag::err_cannot_open_file)
341-
<< GpuBinaryFileName << EC.message();
342-
return nullptr;
340+
std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary;
341+
if (!IsHIP) {
342+
llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
343+
llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
344+
if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
345+
CGM.getDiags().Report(diag::err_cannot_open_file)
346+
<< CudaGpuBinaryFileName << EC.message();
347+
return nullptr;
348+
}
349+
CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
343350
}
344351

345352
llvm::Function *ModuleCtorFunc = llvm::Function::Create(
@@ -353,39 +360,71 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
353360
CtorBuilder.SetInsertPoint(CtorEntryBB);
354361

355362
const char *FatbinConstantName;
356-
if (RelocatableDeviceCode)
363+
const char *FatbinSectionName;
364+
const char *ModuleIDSectionName;
365+
StringRef ModuleIDPrefix;
366+
llvm::Constant *FatBinStr;
367+
unsigned FatMagic;
368+
if (IsHIP) {
369+
FatbinConstantName = ".hip_fatbin";
370+
FatbinSectionName = ".hipFatBinSegment";
371+
372+
ModuleIDSectionName = "__hip_module_id";
373+
ModuleIDPrefix = "__hip_";
374+
375+
// For HIP, create an external symbol __hip_fatbin in section .hip_fatbin.
376+
// The external symbol is supposed to contain the fat binary but will be
377+
// populated somewhere else, e.g. by lld through a linker script.
378+
FatBinStr = new llvm::GlobalVariable(
379+
CGM.getModule(), CGM.Int8Ty,
380+
/*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
381+
"__hip_fatbin", nullptr,
382+
llvm::GlobalVariable::NotThreadLocal);
383+
cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
384+
385+
FatMagic = HIPFatMagic;
386+
} else {
387+
if (RelocatableDeviceCode)
388+
// TODO: Figure out how this is called on mac OS!
389+
FatbinConstantName = "__nv_relfatbin";
390+
else
391+
FatbinConstantName =
392+
CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
393+
// NVIDIA's cuobjdump looks for fatbins in this section.
394+
FatbinSectionName =
395+
CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
396+
357397
// TODO: Figure out how this is called on mac OS!
358-
FatbinConstantName = "__nv_relfatbin";
359-
else
360-
FatbinConstantName =
361-
CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
362-
// NVIDIA's cuobjdump looks for fatbins in this section.
363-
const char *FatbinSectionName =
364-
CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
365-
// TODO: Figure out how this is called on mac OS!
366-
const char *NVModuleIDSectionName = "__nv_module_id";
398+
ModuleIDSectionName = "__nv_module_id";
399+
ModuleIDPrefix = "__nv_";
400+
401+
// For CUDA, create a string literal containing the fat binary loaded from
402+
// the given file.
403+
FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "",
404+
FatbinConstantName, 8);
405+
FatMagic = CudaFatMagic;
406+
}
367407

368408
// Create initialized wrapper structure that points to the loaded GPU binary
369409
ConstantInitBuilder Builder(CGM);
370410
auto Values = Builder.beginStruct(FatbinWrapperTy);
371411
// Fatbin wrapper magic.
372-
Values.addInt(IntTy, 0x466243b1);
412+
Values.addInt(IntTy, FatMagic);
373413
// Fatbin version.
374414
Values.addInt(IntTy, 1);
375415
// Data.
376-
Values.add(makeConstantString(GpuBinaryOrErr.get()->getBuffer(), "",
377-
FatbinConstantName, 8));
416+
Values.add(FatBinStr);
378417
// Unused in fatbin v1.
379418
Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
380419
llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
381420
addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
382421
/*constant*/ true);
383422
FatbinWrapper->setSection(FatbinSectionName);
384423

385-
// Register binary with CUDA runtime. This is substantially different in
424+
// Register binary with CUDA/HIP runtime. This is substantially different in
386425
// default mode vs. separate compilation!
387426
if (!RelocatableDeviceCode) {
388-
// GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
427+
// GpuBinaryHandle = __{cuda|hip}RegisterFatBinary(&FatbinWrapper);
389428
llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
390429
RegisterFatbinFunc,
391430
CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
@@ -397,34 +436,34 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
397436
CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
398437
CGM.getPointerAlign());
399438

400-
// Call __cuda_register_globals(GpuBinaryHandle);
439+
// Call __{cuda|hip}_register_globals(GpuBinaryHandle);
401440
if (RegisterGlobalsFunc)
402441
CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
403442
} else {
404443
// Generate a unique module ID.
405-
SmallString<64> NVModuleID;
406-
llvm::raw_svector_ostream OS(NVModuleID);
407-
OS << "__nv_" << llvm::format("%x", FatbinWrapper->getGUID());
408-
llvm::Constant *NVModuleIDConstant =
409-
makeConstantString(NVModuleID.str(), "", NVModuleIDSectionName, 32);
410-
411-
// Create an alias for the FatbinWrapper that nvcc will look for.
444+
SmallString<64> ModuleID;
445+
llvm::raw_svector_ostream OS(ModuleID);
446+
OS << ModuleIDPrefix << llvm::format("%x", FatbinWrapper->getGUID());
447+
llvm::Constant *ModuleIDConstant =
448+
makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32);
449+
450+
// Create an alias for the FatbinWrapper that nvcc or hip backend will
451+
// look for.
412452
llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
413-
Twine("__fatbinwrap") + NVModuleID,
414-
FatbinWrapper);
453+
Twine("__fatbinwrap") + ModuleID, FatbinWrapper);
415454

416-
// void __cudaRegisterLinkedBinary%NVModuleID%(void (*)(void *), void *,
455+
// void __{cuda|hip}RegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
417456
// void *, void (*)(void **))
418457
SmallString<128> RegisterLinkedBinaryName(
419458
addUnderscoredPrefixToName("RegisterLinkedBinary"));
420-
RegisterLinkedBinaryName += NVModuleID;
459+
RegisterLinkedBinaryName += ModuleID;
421460
llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
422461
getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
423462

424463
assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
425464
llvm::Value *Args[] = {RegisterGlobalsFunc,
426465
CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
427-
NVModuleIDConstant,
466+
ModuleIDConstant,
428467
makeDummyFunction(getCallbackFnTy())};
429468
CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
430469
}

clang/lib/Driver/ToolChains/CommonArgs.cpp

Lines changed: 125 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -146,12 +146,14 @@ void tools::AddLinkerInputs(const ToolChain &TC, const InputInfoList &Inputs,
146146
Args.AddAllArgValues(CmdArgs, options::OPT_Zlinker_input);
147147

148148
for (const auto &II : Inputs) {
149-
// If the current tool chain refers to an OpenMP offloading host, we should
150-
// ignore inputs that refer to OpenMP offloading devices - they will be
151-
// embedded according to a proper linker script.
149+
// If the current tool chain refers to an OpenMP or HIP offloading host, we
150+
// should ignore inputs that refer to OpenMP or HIP offloading devices -
151+
// they will be embedded according to a proper linker script.
152152
if (auto *IA = II.getAction())
153-
if (JA.isHostOffloading(Action::OFK_OpenMP) &&
154-
IA->isDeviceOffloading(Action::OFK_OpenMP))
153+
if ((JA.isHostOffloading(Action::OFK_OpenMP) &&
154+
IA->isDeviceOffloading(Action::OFK_OpenMP)) ||
155+
(JA.isHostOffloading(Action::OFK_HIP) &&
156+
IA->isDeviceOffloading(Action::OFK_HIP)))
155157
continue;
156158

157159
if (!TC.HasNativeLLVMSupport() && types::isLLVMIR(II.getType()))
@@ -1288,6 +1290,124 @@ void tools::AddOpenMPLinkerScript(const ToolChain &TC, Compilation &C,
12881290
Lksf << LksBuffer;
12891291
}
12901292

1293+
/// Add HIP linker script arguments at the end of the argument list so that
1294+
/// the fat binary is built by embedding the device images into the host. The
1295+
/// linker script also defines a symbol required by the code generation so that
1296+
/// the image can be retrieved at runtime. This should be used only in tool
1297+
/// chains that support linker scripts.
1298+
void tools::AddHIPLinkerScript(const ToolChain &TC, Compilation &C,
1299+
const InputInfo &Output,
1300+
const InputInfoList &Inputs, const ArgList &Args,
1301+
ArgStringList &CmdArgs, const JobAction &JA,
1302+
const Tool &T) {
1303+
1304+
// If this is not a HIP host toolchain, we don't need to do anything.
1305+
if (!JA.isHostOffloading(Action::OFK_HIP))
1306+
return;
1307+
1308+
// Create temporary linker script. Keep it if save-temps is enabled.
1309+
const char *LKS;
1310+
SmallString<256> Name = llvm::sys::path::filename(Output.getFilename());
1311+
if (C.getDriver().isSaveTempsEnabled()) {
1312+
llvm::sys::path::replace_extension(Name, "lk");
1313+
LKS = C.getArgs().MakeArgString(Name.c_str());
1314+
} else {
1315+
llvm::sys::path::replace_extension(Name, "");
1316+
Name = C.getDriver().GetTemporaryPath(Name, "lk");
1317+
LKS = C.addTempFile(C.getArgs().MakeArgString(Name.c_str()));
1318+
}
1319+
1320+
// Add linker script option to the command.
1321+
CmdArgs.push_back("-T");
1322+
CmdArgs.push_back(LKS);
1323+
1324+
// Create a buffer to write the contents of the linker script.
1325+
std::string LksBuffer;
1326+
llvm::raw_string_ostream LksStream(LksBuffer);
1327+
1328+
// Get the HIP offload tool chain.
1329+
auto *HIPTC = static_cast<const toolchains::CudaToolChain *>(
1330+
C.getSingleOffloadToolChain<Action::OFK_HIP>());
1331+
assert(HIPTC->getTriple().getArch() == llvm::Triple::amdgcn &&
1332+
"Wrong platform");
1333+
1334+
// Construct clang-offload-bundler command to bundle object files for
1335+
// different GPU archs.
1336+
ArgStringList BundlerArgs;
1337+
BundlerArgs.push_back(Args.MakeArgString("-type=o"));
1338+
1339+
// ToDo: Remove the dummy host binary entry which is required by
1340+
// clang-offload-bundler.
1341+
std::string BundlerTargetArg = "-targets=host-x86_64-unknown-linux";
1342+
std::string BundlerInputArg = "-inputs=/dev/null";
1343+
1344+
for (const auto &II : Inputs) {
1345+
const Action *A = II.getAction();
1346+
// Is this a device linking action?
1347+
if (A && isa<LinkJobAction>(A) && A->isDeviceOffloading(Action::OFK_HIP)) {
1348+
BundlerTargetArg = BundlerTargetArg + ",hip-amdgcn-amd-amdhsa-" +
1349+
StringRef(A->getOffloadingArch()).str();
1350+
BundlerInputArg = BundlerInputArg + "," + II.getFilename();
1351+
}
1352+
}
1353+
BundlerArgs.push_back(Args.MakeArgString(BundlerTargetArg));
1354+
BundlerArgs.push_back(Args.MakeArgString(BundlerInputArg));
1355+
1356+
std::string BundleFileName = C.getDriver().GetTemporaryPath("BUNDLE", "o");
1357+
const char *BundleFile =
1358+
C.addTempFile(C.getArgs().MakeArgString(BundleFileName.c_str()));
1359+
auto BundlerOutputArg =
1360+
Args.MakeArgString(std::string("-outputs=").append(BundleFile));
1361+
BundlerArgs.push_back(BundlerOutputArg);
1362+
1363+
SmallString<128> BundlerPath(C.getDriver().Dir);
1364+
llvm::sys::path::append(BundlerPath, "clang-offload-bundler");
1365+
const char *Bundler = Args.MakeArgString(BundlerPath);
1366+
C.addCommand(llvm::make_unique<Command>(JA, T, Bundler, BundlerArgs, Inputs));
1367+
1368+
// Add commands to embed target binaries. We ensure that each section and
1369+
// image is 16-byte aligned. This is not mandatory, but increases the
1370+
// likelihood of data to be aligned with a cache block in several main host
1371+
// machines.
1372+
LksStream << "/*\n";
1373+
LksStream << " HIP Offload Linker Script\n";
1374+
LksStream << " *** Automatically generated by Clang ***\n";
1375+
LksStream << "*/\n";
1376+
LksStream << "TARGET(binary)\n";
1377+
LksStream << "INPUT(" << BundleFileName << ")\n";
1378+
LksStream << "SECTIONS\n";
1379+
LksStream << "{\n";
1380+
LksStream << " .hip_fatbin :\n";
1381+
LksStream << " ALIGN(0x10)\n";
1382+
LksStream << " {\n";
1383+
LksStream << " PROVIDE_HIDDEN(__hip_fatbin = .);\n";
1384+
LksStream << " " << BundleFileName << "\n";
1385+
LksStream << " }\n";
1386+
LksStream << "}\n";
1387+
LksStream << "INSERT BEFORE .data\n";
1388+
LksStream.flush();
1389+
1390+
// Dump the contents of the linker script if the user requested that. We
1391+
// support this option to enable testing of behavior with -###.
1392+
if (C.getArgs().hasArg(options::OPT_fhip_dump_offload_linker_script))
1393+
llvm::errs() << LksBuffer;
1394+
1395+
// If this is a dry run, do not create the linker script file.
1396+
if (C.getArgs().hasArg(options::OPT__HASH_HASH_HASH))
1397+
return;
1398+
1399+
// Open script file and write the contents.
1400+
std::error_code EC;
1401+
llvm::raw_fd_ostream Lksf(LKS, EC, llvm::sys::fs::F_None);
1402+
1403+
if (EC) {
1404+
C.getDriver().Diag(clang::diag::err_unable_to_make_temp) << EC.message();
1405+
return;
1406+
}
1407+
1408+
Lksf << LksBuffer;
1409+
}
1410+
12911411
SmallString<128> tools::getStatsFileName(const llvm::opt::ArgList &Args,
12921412
const InputInfo &Output,
12931413
const InputInfo &Input,

clang/lib/Driver/ToolChains/CommonArgs.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,12 @@ void AddOpenMPLinkerScript(const ToolChain &TC, Compilation &C,
5252
llvm::opt::ArgStringList &CmdArgs,
5353
const JobAction &JA);
5454

55+
void AddHIPLinkerScript(const ToolChain &TC, Compilation &C,
56+
const InputInfo &Output, const InputInfoList &Inputs,
57+
const llvm::opt::ArgList &Args,
58+
llvm::opt::ArgStringList &CmdArgs, const JobAction &JA,
59+
const Tool &T);
60+
5561
const char *SplitDebugName(const llvm::opt::ArgList &Args,
5662
const InputInfo &Input);
5763

clang/lib/Driver/ToolChains/Gnu.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -535,6 +535,10 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA,
535535
// Add OpenMP offloading linker script args if required.
536536
AddOpenMPLinkerScript(getToolChain(), C, Output, Inputs, Args, CmdArgs, JA);
537537

538+
// Add HIP offloading linker script args if required.
539+
AddHIPLinkerScript(getToolChain(), C, Output, Inputs, Args, CmdArgs, JA,
540+
*this);
541+
538542
C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
539543
}
540544

0 commit comments

Comments
 (0)