Skip to content

Commit 1246339

Browse files
committed
[HIP] use offload wrapper for non-device-only non-rdc
Currently HIP still uses the offload bundler for non-RDC mode with the new offload driver. This patch switches to the offload wrapper for non-device-only non-RDC mode when the new offload driver is enabled. This makes RDC and non-RDC compilation more consistent and speeds up compilation, since the offload wrapper supports parallel compilation for different GPU architectures. It is implemented by adding a linker wrapper action for each assemble action of an input file. The linker wrapper action differentiates this special type of work from normal linker wrapper work by the file type. This type of work results in an object instead of an image. The linker wrapper adds "-r" for it and only includes the object file as input, not the host libraries. For device-only non-RDC mode, the new driver keeps the original behavior.
1 parent a6d3662 commit 1246339

File tree

6 files changed

+132
-69
lines changed

6 files changed

+132
-69
lines changed

clang/lib/CodeGen/CGCUDANV.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1281,7 +1281,8 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
12811281
return nullptr;
12821282
}
12831283
if (CGM.getLangOpts().OffloadViaLLVM ||
1284-
(CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
1284+
(CGM.getLangOpts().OffloadingNewDriver &&
1285+
(CGM.getLangOpts().HIP || RelocatableDeviceCode)))
12851286
createOffloadingEntries();
12861287
else
12871288
return makeModuleCtorFunction();

clang/lib/Driver/Driver.cpp

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4402,6 +4402,10 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
44024402
options::OPT_no_offload_new_driver,
44034403
C.isOffloadingHostKind(Action::OFK_Cuda));
44044404

4405+
bool HIPNoRDC =
4406+
C.isOffloadingHostKind(Action::OFK_HIP) &&
4407+
!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
4408+
44054409
// Builder to be used to build offloading actions.
44064410
std::unique_ptr<OffloadingActionBuilder> OffloadBuilder =
44074411
!UseNewOffloadingDriver
@@ -4502,6 +4506,16 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
45024506
InputArg))
45034507
break;
45044508

4509+
// For HIP non-rdc non-device-only compilation, create a linker wrapper
4510+
// action for each host object to link, bundle and wrap device files in
4511+
// it.
4512+
if (Phase == phases::Assemble && UseNewOffloadingDriver && HIPNoRDC &&
4513+
!offloadDeviceOnly()) {
4514+
ActionList AL{Current};
4515+
Current = C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_Object);
4516+
Current->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
4517+
/*BoundArch=*/nullptr);
4518+
}
45054519
if (Current->getType() == types::TY_Nothing)
45064520
break;
45074521
}
@@ -4535,7 +4549,7 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
45354549
// Check if this Linker Job should emit a static library.
45364550
if (ShouldEmitStaticLibrary(Args)) {
45374551
LA = C.MakeAction<StaticLibJobAction>(LinkerInputs, types::TY_Image);
4538-
} else if (UseNewOffloadingDriver ||
4552+
} else if ((UseNewOffloadingDriver && !HIPNoRDC) ||
45394553
Args.hasArg(options::OPT_offload_link)) {
45404554
LA = C.MakeAction<LinkerWrapperJobAction>(LinkerInputs, types::TY_Image);
45414555
LA->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
@@ -4945,12 +4959,15 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
49454959
}
49464960
}
49474961

4948-
// Compiling HIP in non-RDC mode requires linking each action individually.
4962+
// Compiling HIP in device-only non-RDC mode requires linking each action
4963+
// individually.
49494964
for (Action *&A : DeviceActions) {
49504965
if ((A->getType() != types::TY_Object &&
49514966
A->getType() != types::TY_LTO_BC) ||
49524967
Kind != Action::OFK_HIP ||
4953-
Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false))
4968+
Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
4969+
false) ||
4970+
!offloadDeviceOnly())
49544971
continue;
49554972
ActionList LinkerInput = {A};
49564973
A = C.MakeAction<LinkJobAction>(LinkerInput, types::TY_Image);
@@ -4974,9 +4991,10 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
49744991
}
49754992
}
49764993

4977-
// HIP code in non-RDC mode will bundle the output if it invoked the linker.
4994+
// HIP code in device-only non-RDC mode will bundle the output if it invoked
4995+
// the linker.
49784996
bool ShouldBundleHIP =
4979-
C.isOffloadingHostKind(Action::OFK_HIP) &&
4997+
C.isOffloadingHostKind(Action::OFK_HIP) && offloadDeviceOnly() &&
49804998
Args.hasFlag(options::OPT_gpu_bundle_output,
49814999
options::OPT_no_gpu_bundle_output, true) &&
49825000
!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false) &&
@@ -4999,11 +5017,11 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
49995017
C.MakeAction<LinkJobAction>(OffloadActions, types::TY_CUDA_FATBIN);
50005018
DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_Cuda>(),
50015019
nullptr, Action::OFK_Cuda);
5002-
} else if (C.isOffloadingHostKind(Action::OFK_HIP) &&
5020+
} else if (C.isOffloadingHostKind(Action::OFK_HIP) && offloadDeviceOnly() &&
50035021
!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
50045022
false)) {
5005-
// If we are not in RDC-mode we just emit the final HIP fatbinary for each
5006-
// translation unit, linking each input individually.
5023+
// If we are in device-only non-RDC-mode we just emit the final HIP
5024+
// fatbinary for each translation unit, linking each input individually.
50075025
Action *FatbinAction =
50085026
C.MakeAction<LinkJobAction>(OffloadActions, types::TY_HIP_FATBIN);
50095027
DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_HIP>(),
@@ -5156,8 +5174,11 @@ Action *Driver::ConstructPhaseAction(
51565174
(((Input->getOffloadingToolChain() &&
51575175
Input->getOffloadingToolChain()->getTriple().isAMDGPU()) ||
51585176
TargetDeviceOffloadKind == Action::OFK_HIP) &&
5159-
(Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
5160-
false) ||
5177+
((Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
5178+
false) ||
5179+
(Args.hasFlag(options::OPT_offload_new_driver,
5180+
options::OPT_no_offload_new_driver, false) &&
5181+
!offloadDeviceOnly())) ||
51615182
TargetDeviceOffloadKind == Action::OFK_OpenMP))) {
51625183
types::ID Output =
51635184
Args.hasArg(options::OPT_S) &&

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7838,7 +7838,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
78387838
CmdArgs.push_back("-fcuda-include-gpubinary");
78397839
CmdArgs.push_back(CudaDeviceInput->getFilename());
78407840
} else if (!HostOffloadingInputs.empty()) {
7841-
if ((IsCuda || IsHIP) && !IsRDCMode) {
7841+
if (IsCuda && !IsRDCMode) {
78427842
assert(HostOffloadingInputs.size() == 1 && "Only one input expected");
78437843
CmdArgs.push_back("-fcuda-include-gpubinary");
78447844
CmdArgs.push_back(HostOffloadingInputs.front().getFilename());
@@ -9348,11 +9348,22 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
93489348
// Add the linker arguments to be forwarded by the wrapper.
93499349
CmdArgs.push_back(Args.MakeArgString(Twine("--linker-path=") +
93509350
LinkCommand->getExecutable()));
9351-
for (const char *LinkArg : LinkCommand->getArguments())
9352-
CmdArgs.push_back(LinkArg);
93539351

9354-
addOffloadCompressArgs(Args, CmdArgs);
9352+
// We use action type to differentiate two use cases of the linker wrapper.
9353+
// TY_Image for normal linker wrapper work.
9354+
// TY_Object for HIP fno-gpu-rdc embedding device binary in a relocatable
9355+
// object.
9356+
assert(JA.getType() == types::TY_Object || JA.getType() == types::TY_Image);
9357+
if (JA.getType() == types::TY_Object) {
9358+
CmdArgs.append({"-o", Output.getFilename()});
9359+
for (auto Input : Inputs)
9360+
CmdArgs.push_back(Input.getFilename());
9361+
CmdArgs.push_back(Args.MakeArgString("-r"));
9362+
} else
9363+
for (const char *LinkArg : LinkCommand->getArguments())
9364+
CmdArgs.push_back(LinkArg);
93559365

9366+
addOffloadCompressArgs(Args, CmdArgs);
93569367
const char *Exec =
93579368
Args.MakeArgString(getToolChain().GetProgramPath("clang-linker-wrapper"));
93589369

clang/test/Driver/hip-binding.hip

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@
9393
// RUN: -nogpulib -nogpuinc -foffload-lto --offload-arch=gfx90a --offload-arch=gfx908 -c %s 2>&1 \
9494
// RUN: | FileCheck -check-prefix=LTO-NO-RDC %s
9595
// LTO-NO-RDC: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[LTO_908:.+]]"
96-
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_908]]"], output: "[[OBJ_908:.+]]"
9796
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]"], output: "[[LTO_90A:.+]]"
98-
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_90A]]"], output: "[[OBJ_90A:.+]]"
99-
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[OBJ_908]]", "[[OBJ_90A]]"], output: "[[HIPFB:.+]]"
97+
// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]"
98+
// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[PKG]]"], output: "[[OBJ:.+]]"
99+
// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[OBJ]]"], output: "hip-binding.o"

clang/test/Driver/hip-phases.hip

Lines changed: 31 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -8,39 +8,50 @@
88
//
99
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
1010
// RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
11-
// RUN: | FileCheck -check-prefixes=BIN,NRD,OLD %s
11+
// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDN %s
1212
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
1313
// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
14-
// RUN: | FileCheck -check-prefixes=BIN,NRD,NEW %s
14+
// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWN %s
1515
//
1616
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
1717
// RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
18-
// RUN: | FileCheck -check-prefixes=BIN,RDC %s
18+
// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDR %s
19+
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
20+
// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
21+
// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWR %s
1922
//
2023
// BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (host-[[T]])
2124
// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
2225
// BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
23-
// RDC-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
24-
// RDC-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
26+
// OLDR-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
27+
// OLDR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
2528

2629
// BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH:gfx803]])
2730
// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
2831
// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]])
29-
// NRD-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
30-
// NRD-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
31-
// RDC-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
32-
// BIN-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
33-
// BIN-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
34-
// NRD-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
35-
// RDC-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
36-
37-
// NRD-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
38-
// RDC-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
39-
// NRD-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
40-
// NRD-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
41-
// OLD-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
42-
// NEW-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
43-
// RDC-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
32+
// OLDN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
33+
// NEW-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
34+
// OLDN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
35+
// OLDR-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
36+
// OLD-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
37+
// OLD-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
38+
// NEW-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P6]]}, ir
39+
// OLDN-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
40+
// NEW-DAG: [[P10:[0-9]+]]: clang-offload-packager, {[[P9]]}, image, (device-[[T]])
41+
// OLDR-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
42+
43+
// OLDN-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
44+
// NEW-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P10]]}, ir
45+
// OLDR-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
46+
// OLDN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
47+
// OLDN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
48+
// NEW-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
49+
// NEW-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
50+
// OLDN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
51+
// NEWN-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
52+
// OLDR-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
53+
// NEWR-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
54+
// NEWN-DAG: [[P15:[0-9]+]]: linker, {[[P14]]}, image
4455

4556
//
4657
// Test single gpu architecture up to the assemble phase.

0 commit comments

Comments
 (0)