Skip to content

Commit d54c28b

Browse files
authored
[HIP] use offload wrapper for non-device-only non-rdc (#132869)
Currently, HIP still uses the offload bundler in non-RDC mode with the new offload driver. This patch switches to the offload wrapper for non-device-only non-RDC mode when the new offload driver is enabled. This makes RDC and non-RDC compilation more consistent and speeds up compilation, since the offload wrapper supports parallel compilation for different GPU architectures. It is implemented by adding a linker wrapper action for each assemble action of an input file. The linker wrapper action distinguishes this special type of work from normal linker wrapper work by the file type. This type of work produces an object instead of an image: the linker wrapper adds "-r" for it and includes only the object file as input, not the host libraries. For device-only non-RDC mode, the new driver keeps the original behavior.
1 parent 6e7c40b commit d54c28b

File tree

6 files changed

+142
-77
lines changed

6 files changed

+142
-77
lines changed

clang/lib/CodeGen/CGCUDANV.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1281,7 +1281,8 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
12811281
return nullptr;
12821282
}
12831283
if (CGM.getLangOpts().OffloadViaLLVM ||
1284-
(CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
1284+
(CGM.getLangOpts().OffloadingNewDriver &&
1285+
(CGM.getLangOpts().HIP || RelocatableDeviceCode)))
12851286
createOffloadingEntries();
12861287
else
12871288
return makeModuleCtorFunction();

clang/lib/Driver/Driver.cpp

Lines changed: 41 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4405,6 +4405,10 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
44054405
options::OPT_no_offload_new_driver,
44064406
C.isOffloadingHostKind(Action::OFK_Cuda));
44074407

4408+
bool HIPNoRDC =
4409+
C.isOffloadingHostKind(Action::OFK_HIP) &&
4410+
!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
4411+
44084412
// Builder to be used to build offloading actions.
44094413
std::unique_ptr<OffloadingActionBuilder> OffloadBuilder =
44104414
!UseNewOffloadingDriver
@@ -4538,7 +4542,7 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
45384542
// Check if this Linker Job should emit a static library.
45394543
if (ShouldEmitStaticLibrary(Args)) {
45404544
LA = C.MakeAction<StaticLibJobAction>(LinkerInputs, types::TY_Image);
4541-
} else if (UseNewOffloadingDriver ||
4545+
} else if ((UseNewOffloadingDriver && !HIPNoRDC) ||
45424546
Args.hasArg(options::OPT_offload_link)) {
45434547
LA = C.MakeAction<LinkerWrapperJobAction>(LinkerInputs, types::TY_Image);
45444548
LA->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
@@ -4849,10 +4853,28 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
48494853
const InputTy &Input, StringRef CUID,
48504854
Action *HostAction) const {
48514855
// Don't build offloading actions if explicitly disabled or we do not have a
4852-
// valid source input and compile action to embed it in. If preprocessing only
4853-
// ignore embedding.
4854-
if (offloadHostOnly() || !types::isSrcFile(Input.first) ||
4855-
!(isa<CompileJobAction>(HostAction) ||
4856+
// valid source input.
4857+
if (offloadHostOnly() || !types::isSrcFile(Input.first))
4858+
return HostAction;
4859+
4860+
bool HIPNoRDC =
4861+
C.isOffloadingHostKind(Action::OFK_HIP) &&
4862+
!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
4863+
4864+
// For HIP non-rdc non-device-only compilation, create a linker wrapper
4865+
// action for each host object to link, bundle and wrap device files in
4866+
// it.
4867+
if (isa<AssembleJobAction>(HostAction) && HIPNoRDC && !offloadDeviceOnly()) {
4868+
ActionList AL{HostAction};
4869+
HostAction = C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_Object);
4870+
HostAction->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
4871+
/*BoundArch=*/nullptr);
4872+
return HostAction;
4873+
}
4874+
4875+
// Don't build offloading actions if we do not have a compile action. If
4876+
// preprocessing only ignore embedding.
4877+
if (!(isa<CompileJobAction>(HostAction) ||
48564878
getFinalPhase(Args) == phases::Preprocess))
48574879
return HostAction;
48584880

@@ -4948,12 +4970,12 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
49484970
}
49494971
}
49504972

4951-
// Compiling HIP in non-RDC mode requires linking each action individually.
4973+
// Compiling HIP in device-only non-RDC mode requires linking each action
4974+
// individually.
49524975
for (Action *&A : DeviceActions) {
49534976
if ((A->getType() != types::TY_Object &&
49544977
A->getType() != types::TY_LTO_BC) ||
4955-
Kind != Action::OFK_HIP ||
4956-
Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false))
4978+
!HIPNoRDC || !offloadDeviceOnly())
49574979
continue;
49584980
ActionList LinkerInput = {A};
49594981
A = C.MakeAction<LinkJobAction>(LinkerInput, types::TY_Image);
@@ -4977,12 +4999,12 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
49774999
}
49785000
}
49795001

4980-
// HIP code in non-RDC mode will bundle the output if it invoked the linker.
5002+
// HIP code in device-only non-RDC mode will bundle the output if it invoked
5003+
// the linker.
49815004
bool ShouldBundleHIP =
4982-
C.isOffloadingHostKind(Action::OFK_HIP) &&
5005+
HIPNoRDC && offloadDeviceOnly() &&
49835006
Args.hasFlag(options::OPT_gpu_bundle_output,
49845007
options::OPT_no_gpu_bundle_output, true) &&
4985-
!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false) &&
49865008
!llvm::any_of(OffloadActions,
49875009
[](Action *A) { return A->getType() != types::TY_Image; });
49885010

@@ -5002,11 +5024,9 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
50025024
C.MakeAction<LinkJobAction>(OffloadActions, types::TY_CUDA_FATBIN);
50035025
DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_Cuda>(),
50045026
nullptr, Action::OFK_Cuda);
5005-
} else if (C.isOffloadingHostKind(Action::OFK_HIP) &&
5006-
!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
5007-
false)) {
5008-
// If we are not in RDC-mode we just emit the final HIP fatbinary for each
5009-
// translation unit, linking each input individually.
5027+
} else if (HIPNoRDC && offloadDeviceOnly()) {
5028+
// If we are in device-only non-RDC-mode we just emit the final HIP
5029+
// fatbinary for each translation unit, linking each input individually.
50105030
Action *FatbinAction =
50115031
C.MakeAction<LinkJobAction>(OffloadActions, types::TY_HIP_FATBIN);
50125032
DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_HIP>(),
@@ -5159,8 +5179,11 @@ Action *Driver::ConstructPhaseAction(
51595179
(((Input->getOffloadingToolChain() &&
51605180
Input->getOffloadingToolChain()->getTriple().isAMDGPU()) ||
51615181
TargetDeviceOffloadKind == Action::OFK_HIP) &&
5162-
(Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
5163-
false) ||
5182+
((Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
5183+
false) ||
5184+
(Args.hasFlag(options::OPT_offload_new_driver,
5185+
options::OPT_no_offload_new_driver, false) &&
5186+
!offloadDeviceOnly())) ||
51645187
TargetDeviceOffloadKind == Action::OFK_OpenMP))) {
51655188
types::ID Output =
51665189
Args.hasArg(options::OPT_S) &&

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7832,7 +7832,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
78327832
CmdArgs.push_back("-fcuda-include-gpubinary");
78337833
CmdArgs.push_back(CudaDeviceInput->getFilename());
78347834
} else if (!HostOffloadingInputs.empty()) {
7835-
if ((IsCuda || IsHIP) && !IsRDCMode) {
7835+
if (IsCuda && !IsRDCMode) {
78367836
assert(HostOffloadingInputs.size() == 1 && "Only one input expected");
78377837
CmdArgs.push_back("-fcuda-include-gpubinary");
78387838
CmdArgs.push_back(HostOffloadingInputs.front().getFilename());
@@ -9344,11 +9344,22 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
93449344
// Add the linker arguments to be forwarded by the wrapper.
93459345
CmdArgs.push_back(Args.MakeArgString(Twine("--linker-path=") +
93469346
LinkCommand->getExecutable()));
9347-
for (const char *LinkArg : LinkCommand->getArguments())
9348-
CmdArgs.push_back(LinkArg);
93499347

9350-
addOffloadCompressArgs(Args, CmdArgs);
9348+
// We use action type to differentiate two use cases of the linker wrapper.
9349+
// TY_Image for normal linker wrapper work.
9350+
// TY_Object for HIP fno-gpu-rdc embedding device binary in a relocatable
9351+
// object.
9352+
assert(JA.getType() == types::TY_Object || JA.getType() == types::TY_Image);
9353+
if (JA.getType() == types::TY_Object) {
9354+
CmdArgs.append({"-o", Output.getFilename()});
9355+
for (auto Input : Inputs)
9356+
CmdArgs.push_back(Input.getFilename());
9357+
CmdArgs.push_back("-r");
9358+
} else
9359+
for (const char *LinkArg : LinkCommand->getArguments())
9360+
CmdArgs.push_back(LinkArg);
93519361

9362+
addOffloadCompressArgs(Args, CmdArgs);
93529363
const char *Exec =
93539364
Args.MakeArgString(getToolChain().GetProgramPath("clang-linker-wrapper"));
93549365

clang/test/Driver/hip-binding.hip

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@
9393
// RUN: -nogpulib -nogpuinc -foffload-lto --offload-arch=gfx90a --offload-arch=gfx908 -c %s 2>&1 \
9494
// RUN: | FileCheck -check-prefix=LTO-NO-RDC %s
9595
// LTO-NO-RDC: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[LTO_908:.+]]"
96-
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_908]]"], output: "[[OBJ_908:.+]]"
9796
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]"], output: "[[LTO_90A:.+]]"
98-
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_90A]]"], output: "[[OBJ_90A:.+]]"
99-
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[OBJ_908]]", "[[OBJ_90A]]"], output: "[[HIPFB:.+]]"
97+
// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]"
98+
// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[PKG]]"], output: "[[OBJ:.+]]"
99+
// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[OBJ]]"], output: "hip-binding.o"

clang/test/Driver/hip-phases.hip

Lines changed: 31 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -8,39 +8,50 @@
88
//
99
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
1010
// RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
11-
// RUN: | FileCheck -check-prefixes=BIN,NRD,OLD %s
11+
// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDN %s
1212
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
1313
// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
14-
// RUN: | FileCheck -check-prefixes=BIN,NRD,NEW %s
14+
// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWN %s
1515
//
1616
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
1717
// RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
18-
// RUN: | FileCheck -check-prefixes=BIN,RDC %s
18+
// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDR %s
19+
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
20+
// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
21+
// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWR %s
1922
//
2023
// BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (host-[[T]])
2124
// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
2225
// BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
23-
// RDC-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
24-
// RDC-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
26+
// OLDR-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
27+
// OLDR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
2528

2629
// BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH:gfx803]])
2730
// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
2831
// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]])
29-
// NRD-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
30-
// NRD-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
31-
// RDC-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
32-
// BIN-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
33-
// BIN-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
34-
// NRD-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
35-
// RDC-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
36-
37-
// NRD-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
38-
// RDC-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
39-
// NRD-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
40-
// NRD-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
41-
// OLD-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
42-
// NEW-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
43-
// RDC-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
32+
// OLDN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
33+
// NEW-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
34+
// OLDN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
35+
// OLDR-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
36+
// OLD-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
37+
// OLD-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
38+
// NEW-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P6]]}, ir
39+
// OLDN-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
40+
// NEW-DAG: [[P10:[0-9]+]]: clang-offload-packager, {[[P9]]}, image, (device-[[T]])
41+
// OLDR-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
42+
43+
// OLDN-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
44+
// NEW-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P10]]}, ir
45+
// OLDR-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
46+
// OLDN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
47+
// OLDN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
48+
// NEW-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
49+
// NEW-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
50+
// OLDN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
51+
// NEWN-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
52+
// OLDR-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
53+
// NEWR-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
54+
// NEWN-DAG: [[P15:[0-9]+]]: linker, {[[P14]]}, image
4455

4556
//
4657
// Test single gpu architecture up to the assemble phase.

0 commit comments

Comments
 (0)