-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[HIP] use offload wrapper for non-device-only non-rdc #132869
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-clang-codegen Author: Yaxun (Sam) Liu (yxsamliu) ChangesCurrently HIP still uses offload bundler for non-rdc mode for the new offload driver. This patch switches to use offload wrapper for non-device-only non-rdc mode when new offload driver is enabled. This makes the rdc and non-rdc compilation more consistent and speeds up compilation since the offload wrapper supports parallel compilation for different GPU arch's. It is implemented by adding a linker wrapper action for each assemble action of input file. Linker wrapper action differentiates this special type of work vs normal linker wrapper work by the fle type. This type of work results in object instead of image. The linker wrapper adds "-r" for it and only includes the object file as input, not the host libraries. For device-only non-RDC mode, the new driver keeps the original behavior. Patch is 23.22 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/132869.diff 6 Files Affected:
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 5445a9278596d..395263331d141 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -1281,7 +1281,8 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
return nullptr;
}
if (CGM.getLangOpts().OffloadViaLLVM ||
- (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
+ (CGM.getLangOpts().OffloadingNewDriver &&
+ (CGM.getLangOpts().HIP || RelocatableDeviceCode)))
createOffloadingEntries();
else
return makeModuleCtorFunction();
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 056bfcf1b739a..6d63a55918687 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -4402,6 +4402,10 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
options::OPT_no_offload_new_driver,
C.isOffloadingHostKind(Action::OFK_Cuda));
+ bool HIPNoRDC =
+ C.isOffloadingHostKind(Action::OFK_HIP) &&
+ !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
+
// Builder to be used to build offloading actions.
std::unique_ptr<OffloadingActionBuilder> OffloadBuilder =
!UseNewOffloadingDriver
@@ -4502,6 +4506,16 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
InputArg))
break;
+ // For HIP non-rdc non-device-only compilation, create a linker wrapper
+ // action for each host object to link, bundle and wrap device files in
+ // it.
+ if (Phase == phases::Assemble && UseNewOffloadingDriver && HIPNoRDC &&
+ !offloadDeviceOnly()) {
+ ActionList AL{Current};
+ Current = C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_Object);
+ Current->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
+ /*BoundArch=*/nullptr);
+ }
if (Current->getType() == types::TY_Nothing)
break;
}
@@ -4535,7 +4549,7 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
// Check if this Linker Job should emit a static library.
if (ShouldEmitStaticLibrary(Args)) {
LA = C.MakeAction<StaticLibJobAction>(LinkerInputs, types::TY_Image);
- } else if (UseNewOffloadingDriver ||
+ } else if ((UseNewOffloadingDriver && !HIPNoRDC) ||
Args.hasArg(options::OPT_offload_link)) {
LA = C.MakeAction<LinkerWrapperJobAction>(LinkerInputs, types::TY_Image);
LA->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
@@ -4945,12 +4959,15 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
}
}
- // Compiling HIP in non-RDC mode requires linking each action individually.
+ // Compiling HIP in device-only non-RDC mode requires linking each action
+ // individually.
for (Action *&A : DeviceActions) {
if ((A->getType() != types::TY_Object &&
A->getType() != types::TY_LTO_BC) ||
Kind != Action::OFK_HIP ||
- Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false))
+ Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
+ false) ||
+ !offloadDeviceOnly())
continue;
ActionList LinkerInput = {A};
A = C.MakeAction<LinkJobAction>(LinkerInput, types::TY_Image);
@@ -4974,9 +4991,10 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
}
}
- // HIP code in non-RDC mode will bundle the output if it invoked the linker.
+ // HIP code in device-only non-RDC mode will bundle the output if it invoked
+ // the linker.
bool ShouldBundleHIP =
- C.isOffloadingHostKind(Action::OFK_HIP) &&
+ C.isOffloadingHostKind(Action::OFK_HIP) && offloadDeviceOnly() &&
Args.hasFlag(options::OPT_gpu_bundle_output,
options::OPT_no_gpu_bundle_output, true) &&
!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false) &&
@@ -4999,11 +5017,11 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
C.MakeAction<LinkJobAction>(OffloadActions, types::TY_CUDA_FATBIN);
DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_Cuda>(),
nullptr, Action::OFK_Cuda);
- } else if (C.isOffloadingHostKind(Action::OFK_HIP) &&
+ } else if (C.isOffloadingHostKind(Action::OFK_HIP) && offloadDeviceOnly() &&
!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
false)) {
- // If we are not in RDC-mode we just emit the final HIP fatbinary for each
- // translation unit, linking each input individually.
+ // If we are in device-only non-RDC-mode we just emit the final HIP
+ // fatbinary for each translation unit, linking each input individually.
Action *FatbinAction =
C.MakeAction<LinkJobAction>(OffloadActions, types::TY_HIP_FATBIN);
DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_HIP>(),
@@ -5156,8 +5174,11 @@ Action *Driver::ConstructPhaseAction(
(((Input->getOffloadingToolChain() &&
Input->getOffloadingToolChain()->getTriple().isAMDGPU()) ||
TargetDeviceOffloadKind == Action::OFK_HIP) &&
- (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
- false) ||
+ ((Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
+ false) ||
+ (Args.hasFlag(options::OPT_offload_new_driver,
+ options::OPT_no_offload_new_driver, false) &&
+ !offloadDeviceOnly())) ||
TargetDeviceOffloadKind == Action::OFK_OpenMP))) {
types::ID Output =
Args.hasArg(options::OPT_S) &&
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 7aa2b32acc235..5570da2b18017 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7838,7 +7838,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-fcuda-include-gpubinary");
CmdArgs.push_back(CudaDeviceInput->getFilename());
} else if (!HostOffloadingInputs.empty()) {
- if ((IsCuda || IsHIP) && !IsRDCMode) {
+ if (IsCuda && !IsRDCMode) {
assert(HostOffloadingInputs.size() == 1 && "Only one input expected");
CmdArgs.push_back("-fcuda-include-gpubinary");
CmdArgs.push_back(HostOffloadingInputs.front().getFilename());
@@ -9348,11 +9348,22 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
// Add the linker arguments to be forwarded by the wrapper.
CmdArgs.push_back(Args.MakeArgString(Twine("--linker-path=") +
LinkCommand->getExecutable()));
- for (const char *LinkArg : LinkCommand->getArguments())
- CmdArgs.push_back(LinkArg);
- addOffloadCompressArgs(Args, CmdArgs);
+ // We use action type to differentiate two use cases of the linker wrapper.
+ // TY_Image for normal linker wrapper work.
+ // TY_Object for HIP fno-gpu-rdc embedding device binary in a relocatable
+ // object.
+ assert(JA.getType() == types::TY_Object || JA.getType() == types::TY_Image);
+ if (JA.getType() == types::TY_Object) {
+ CmdArgs.append({"-o", Output.getFilename()});
+ for (auto Input : Inputs)
+ CmdArgs.push_back(Input.getFilename());
+ CmdArgs.push_back(Args.MakeArgString("-r"));
+ } else
+ for (const char *LinkArg : LinkCommand->getArguments())
+ CmdArgs.push_back(LinkArg);
+ addOffloadCompressArgs(Args, CmdArgs);
const char *Exec =
Args.MakeArgString(getToolChain().GetProgramPath("clang-linker-wrapper"));
diff --git a/clang/test/Driver/hip-binding.hip b/clang/test/Driver/hip-binding.hip
index 57e57194ec87b..d8b3f1e242018 100644
--- a/clang/test/Driver/hip-binding.hip
+++ b/clang/test/Driver/hip-binding.hip
@@ -93,7 +93,7 @@
// RUN: -nogpulib -nogpuinc -foffload-lto --offload-arch=gfx90a --offload-arch=gfx908 -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=LTO-NO-RDC %s
// LTO-NO-RDC: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[LTO_908:.+]]"
-// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_908]]"], output: "[[OBJ_908:.+]]"
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]"], output: "[[LTO_90A:.+]]"
-// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_90A]]"], output: "[[OBJ_90A:.+]]"
-// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[OBJ_908]]", "[[OBJ_90A]]"], output: "[[HIPFB:.+]]"
+// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]"
+// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[PKG]]"], output: "[[OBJ:.+]]"
+// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[OBJ]]"], output: "hip-binding.o"
diff --git a/clang/test/Driver/hip-phases.hip b/clang/test/Driver/hip-phases.hip
index 5fd2c0216ccc3..996d72e58755a 100644
--- a/clang/test/Driver/hip-phases.hip
+++ b/clang/test/Driver/hip-phases.hip
@@ -8,39 +8,50 @@
//
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
// RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,NRD,OLD %s
+// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDN %s
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,NRD,NEW %s
+// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWN %s
//
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
// RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,RDC %s
+// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDR %s
+// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
+// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWR %s
//
// BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (host-[[T]])
// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
// BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
-// RDC-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
-// RDC-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// OLDR-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// OLDR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
// BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH:gfx803]])
// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]])
-// NRD-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
-// NRD-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
-// RDC-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
-// BIN-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
-// BIN-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
-// NRD-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
-// RDC-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
-
-// NRD-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
-// RDC-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
-// NRD-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
-// NRD-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
-// OLD-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
-// NEW-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
-// RDC-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
+// OLDN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
+// NEW-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
+// OLDN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
+// OLDR-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
+// OLD-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
+// OLD-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
+// NEW-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P6]]}, ir
+// OLDN-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
+// NEW-DAG: [[P10:[0-9]+]]: clang-offload-packager, {[[P9]]}, image, (device-[[T]])
+// OLDR-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
+
+// OLDN-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
+// NEW-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P10]]}, ir
+// OLDR-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
+// OLDN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
+// OLDN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// NEW-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
+// NEW-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// OLDN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
+// NEWN-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
+// OLDR-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
+// NEWR-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
+// NEWN-DAG: [[P15:[0-9]+]]: linker, {[[P14]]}, image
//
// Test single gpu architecture up to the assemble phase.
diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip
index 054db261d8e57..8205169be493f 100644
--- a/clang/test/Driver/hip-toolchain-no-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-no-rdc.hip
@@ -7,7 +7,7 @@
// RUN: -fuse-ld=lld -B%S/Inputs/lld -nogpuinc \
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LINK %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LINK,OLD %s
// RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc \
// RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
@@ -17,7 +17,7 @@
// RUN: -fuse-ld=lld -B%S/Inputs/lld -nogpuinc -c \
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,OLD %s
// RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc \
// RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
@@ -27,7 +27,7 @@
// RUN: -fuse-ld=lld -B%S/Inputs/lld -nogpuinc --offload-new-driver -c \
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,NEW %s
// RUN: touch %t/a.o %t/b.o
// RUN: %clang -### --target=x86_64-linux-gnu \
@@ -47,22 +47,23 @@
// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-obj"
+// OLD-SAME: "-emit-obj"
+// NEW-SAME: "-emit-llvm-bc"
// CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
// CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
// CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
// CHECK-SAME: "-fapply-global-visibility-to-externs"
// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
// CHECK-SAME: "-target-cpu" "gfx803"
-// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_A_803:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_A_803:.*(o|bc)]]" "-x" "hip"
// CHECK-SAME: {{.*}} [[A_SRC:".*a.cu"]]
// CHECK-NOT: {{".*llvm-link"}}
// CHECK-NOT: {{".*opt"}}
// CHECK-NOT: {{".*llc"}}
-// CHECK: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// CHECK-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" [[OBJ_DEV_A_803]]
+// OLD: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// OLD-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" "[[OBJ_DEV_A_803]]"
//
// Compile device code in a.cu to code object for gfx900.
@@ -70,62 +71,71 @@
// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-obj"
+// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
// CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
// CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
// CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
// CHECK-SAME: "-fapply-global-visibility-to-externs"
// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
// CHECK-SAME: "-target-cpu" "gfx900"
-// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_A_900:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_A_900:.*(o|bc)]]" "-x" "hip"
// CHECK-SAME: {{.*}} [[A_SRC]]
// CHECK-NOT: {{".*llvm-link"}}
// CHECK-NOT: {{".*opt"}}
// CHECK-NOT: {{".*llc"}}
-// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// CHECK-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" [[OBJ_DEV_A_900]]
+// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// OLD-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" "[[OBJ_DEV_A_900]]"
//
// Bundle and embed device code in host object for a.cu.
//
-// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// CHECK-SAME: "-bundle-align=4096"
-// CHECK-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
-// CHECK-SAME: "-input={{.*}}" "-input=[[IMG_DEV_A_803]]" "-input=[[IMG_DEV_A_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
+// OLD: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// OLD-SAME: "-bundle-align=4096"
+// OLD-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
+// OLD-SAME: "-input={{.*}}" "-input=[[IMG_DEV_A_803]]" "-input=[[IMG_DEV_A_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
+
+// NEW: [[PACKAGER:".*clang-offload-packager"]] "-o" "[[PACKAGE_A:.*.out]]"
+// NEW-SAME: "--image=file=[[OBJ_DEV_A_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
+// NEW-SAME: "--image=file=[[OBJ_DEV_A_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
// CHECK-SAME: "-emit-obj"
// CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
-// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
-// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
+// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
+// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_A]]"
+// OLD-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
+// NEW-SAME: {{.*}} "-o" [[A_OBJ_HOST_TMP:".*o"]] "-x" "hip"
// CHECK-SAME: {{.*}} [[A_SRC]]
+// NEW: [[WRAPPER:".*clang-linker-wrapper]]" "--host-triple=x86_64-unknown-linux-gnu"
+// NEW: "--linker-path={{.*}}" "-o" [[A_OBJ_HOST:".*o"]] [[A_OBJ_HOST_TMP]] "-r"
+
//
// Compile device code in b.hip to code object for gfx803.
//
// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-obj"
+// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
// CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
// CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
// CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
// CHECK-SAME: "-fapply-global-visibility-to-externs"
// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
// CHECK-SAME: "-target-cpu" "gfx803"
-// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_B_803:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_B_803:.*(o|bc)]]" "-x" "hip"
// CHECK-SAME: {{.*}} [[B_SRC:".*b.hip"]]
// CHECK-NOT: {{".*llvm-link"...
[truncated]
|
@llvm/pr-subscribers-clang-driver Author: Yaxun (Sam) Liu (yxsamliu) ChangesCurrently HIP still uses offload bundler for non-rdc mode for the new offload driver. This patch switches to use offload wrapper for non-device-only non-rdc mode when new offload driver is enabled. This makes the rdc and non-rdc compilation more consistent and speeds up compilation since the offload wrapper supports parallel compilation for different GPU arch's. It is implemented by adding a linker wrapper action for each assemble action of input file. Linker wrapper action differentiates this special type of work vs normal linker wrapper work by the fle type. This type of work results in object instead of image. The linker wrapper adds "-r" for it and only includes the object file as input, not the host libraries. For device-only non-RDC mode, the new driver keeps the original behavior. Patch is 23.22 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/132869.diff 6 Files Affected:
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 5445a9278596d..395263331d141 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -1281,7 +1281,8 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
return nullptr;
}
if (CGM.getLangOpts().OffloadViaLLVM ||
- (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
+ (CGM.getLangOpts().OffloadingNewDriver &&
+ (CGM.getLangOpts().HIP || RelocatableDeviceCode)))
createOffloadingEntries();
else
return makeModuleCtorFunction();
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 056bfcf1b739a..6d63a55918687 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -4402,6 +4402,10 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
options::OPT_no_offload_new_driver,
C.isOffloadingHostKind(Action::OFK_Cuda));
+ bool HIPNoRDC =
+ C.isOffloadingHostKind(Action::OFK_HIP) &&
+ !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
+
// Builder to be used to build offloading actions.
std::unique_ptr<OffloadingActionBuilder> OffloadBuilder =
!UseNewOffloadingDriver
@@ -4502,6 +4506,16 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
InputArg))
break;
+ // For HIP non-rdc non-device-only compilation, create a linker wrapper
+ // action for each host object to link, bundle and wrap device files in
+ // it.
+ if (Phase == phases::Assemble && UseNewOffloadingDriver && HIPNoRDC &&
+ !offloadDeviceOnly()) {
+ ActionList AL{Current};
+ Current = C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_Object);
+ Current->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
+ /*BoundArch=*/nullptr);
+ }
if (Current->getType() == types::TY_Nothing)
break;
}
@@ -4535,7 +4549,7 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
// Check if this Linker Job should emit a static library.
if (ShouldEmitStaticLibrary(Args)) {
LA = C.MakeAction<StaticLibJobAction>(LinkerInputs, types::TY_Image);
- } else if (UseNewOffloadingDriver ||
+ } else if ((UseNewOffloadingDriver && !HIPNoRDC) ||
Args.hasArg(options::OPT_offload_link)) {
LA = C.MakeAction<LinkerWrapperJobAction>(LinkerInputs, types::TY_Image);
LA->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
@@ -4945,12 +4959,15 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
}
}
- // Compiling HIP in non-RDC mode requires linking each action individually.
+ // Compiling HIP in device-only non-RDC mode requires linking each action
+ // individually.
for (Action *&A : DeviceActions) {
if ((A->getType() != types::TY_Object &&
A->getType() != types::TY_LTO_BC) ||
Kind != Action::OFK_HIP ||
- Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false))
+ Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
+ false) ||
+ !offloadDeviceOnly())
continue;
ActionList LinkerInput = {A};
A = C.MakeAction<LinkJobAction>(LinkerInput, types::TY_Image);
@@ -4974,9 +4991,10 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
}
}
- // HIP code in non-RDC mode will bundle the output if it invoked the linker.
+ // HIP code in device-only non-RDC mode will bundle the output if it invoked
+ // the linker.
bool ShouldBundleHIP =
- C.isOffloadingHostKind(Action::OFK_HIP) &&
+ C.isOffloadingHostKind(Action::OFK_HIP) && offloadDeviceOnly() &&
Args.hasFlag(options::OPT_gpu_bundle_output,
options::OPT_no_gpu_bundle_output, true) &&
!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false) &&
@@ -4999,11 +5017,11 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
C.MakeAction<LinkJobAction>(OffloadActions, types::TY_CUDA_FATBIN);
DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_Cuda>(),
nullptr, Action::OFK_Cuda);
- } else if (C.isOffloadingHostKind(Action::OFK_HIP) &&
+ } else if (C.isOffloadingHostKind(Action::OFK_HIP) && offloadDeviceOnly() &&
!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
false)) {
- // If we are not in RDC-mode we just emit the final HIP fatbinary for each
- // translation unit, linking each input individually.
+ // If we are in device-only non-RDC-mode we just emit the final HIP
+ // fatbinary for each translation unit, linking each input individually.
Action *FatbinAction =
C.MakeAction<LinkJobAction>(OffloadActions, types::TY_HIP_FATBIN);
DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_HIP>(),
@@ -5156,8 +5174,11 @@ Action *Driver::ConstructPhaseAction(
(((Input->getOffloadingToolChain() &&
Input->getOffloadingToolChain()->getTriple().isAMDGPU()) ||
TargetDeviceOffloadKind == Action::OFK_HIP) &&
- (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
- false) ||
+ ((Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
+ false) ||
+ (Args.hasFlag(options::OPT_offload_new_driver,
+ options::OPT_no_offload_new_driver, false) &&
+ !offloadDeviceOnly())) ||
TargetDeviceOffloadKind == Action::OFK_OpenMP))) {
types::ID Output =
Args.hasArg(options::OPT_S) &&
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 7aa2b32acc235..5570da2b18017 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7838,7 +7838,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-fcuda-include-gpubinary");
CmdArgs.push_back(CudaDeviceInput->getFilename());
} else if (!HostOffloadingInputs.empty()) {
- if ((IsCuda || IsHIP) && !IsRDCMode) {
+ if (IsCuda && !IsRDCMode) {
assert(HostOffloadingInputs.size() == 1 && "Only one input expected");
CmdArgs.push_back("-fcuda-include-gpubinary");
CmdArgs.push_back(HostOffloadingInputs.front().getFilename());
@@ -9348,11 +9348,22 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
// Add the linker arguments to be forwarded by the wrapper.
CmdArgs.push_back(Args.MakeArgString(Twine("--linker-path=") +
LinkCommand->getExecutable()));
- for (const char *LinkArg : LinkCommand->getArguments())
- CmdArgs.push_back(LinkArg);
- addOffloadCompressArgs(Args, CmdArgs);
+ // We use action type to differentiate two use cases of the linker wrapper.
+ // TY_Image for normal linker wrapper work.
+ // TY_Object for HIP fno-gpu-rdc embedding device binary in a relocatable
+ // object.
+ assert(JA.getType() == types::TY_Object || JA.getType() == types::TY_Image);
+ if (JA.getType() == types::TY_Object) {
+ CmdArgs.append({"-o", Output.getFilename()});
+ for (auto Input : Inputs)
+ CmdArgs.push_back(Input.getFilename());
+ CmdArgs.push_back(Args.MakeArgString("-r"));
+ } else
+ for (const char *LinkArg : LinkCommand->getArguments())
+ CmdArgs.push_back(LinkArg);
+ addOffloadCompressArgs(Args, CmdArgs);
const char *Exec =
Args.MakeArgString(getToolChain().GetProgramPath("clang-linker-wrapper"));
diff --git a/clang/test/Driver/hip-binding.hip b/clang/test/Driver/hip-binding.hip
index 57e57194ec87b..d8b3f1e242018 100644
--- a/clang/test/Driver/hip-binding.hip
+++ b/clang/test/Driver/hip-binding.hip
@@ -93,7 +93,7 @@
// RUN: -nogpulib -nogpuinc -foffload-lto --offload-arch=gfx90a --offload-arch=gfx908 -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=LTO-NO-RDC %s
// LTO-NO-RDC: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[LTO_908:.+]]"
-// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_908]]"], output: "[[OBJ_908:.+]]"
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]"], output: "[[LTO_90A:.+]]"
-// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_90A]]"], output: "[[OBJ_90A:.+]]"
-// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[OBJ_908]]", "[[OBJ_90A]]"], output: "[[HIPFB:.+]]"
+// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]"
+// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[PKG]]"], output: "[[OBJ:.+]]"
+// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[OBJ]]"], output: "hip-binding.o"
diff --git a/clang/test/Driver/hip-phases.hip b/clang/test/Driver/hip-phases.hip
index 5fd2c0216ccc3..996d72e58755a 100644
--- a/clang/test/Driver/hip-phases.hip
+++ b/clang/test/Driver/hip-phases.hip
@@ -8,39 +8,50 @@
//
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
// RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,NRD,OLD %s
+// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDN %s
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,NRD,NEW %s
+// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWN %s
//
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
// RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,RDC %s
+// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDR %s
+// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
+// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWR %s
//
// BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (host-[[T]])
// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
// BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
-// RDC-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
-// RDC-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// OLDR-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// OLDR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
// BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH:gfx803]])
// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]])
-// NRD-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
-// NRD-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
-// RDC-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
-// BIN-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
-// BIN-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
-// NRD-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
-// RDC-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
-
-// NRD-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
-// RDC-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
-// NRD-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
-// NRD-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
-// OLD-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
-// NEW-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
-// RDC-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
+// OLDN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
+// NEW-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
+// OLDN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
+// OLDR-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
+// OLD-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
+// OLD-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
+// NEW-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P6]]}, ir
+// OLDN-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
+// NEW-DAG: [[P10:[0-9]+]]: clang-offload-packager, {[[P9]]}, image, (device-[[T]])
+// OLDR-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
+
+// OLDN-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
+// NEW-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P10]]}, ir
+// OLDR-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
+// OLDN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
+// OLDN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// NEW-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
+// NEW-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// OLDN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
+// NEWN-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
+// OLDR-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
+// NEWR-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
+// NEWN-DAG: [[P15:[0-9]+]]: linker, {[[P14]]}, image
//
// Test single gpu architecture up to the assemble phase.
diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip
index 054db261d8e57..8205169be493f 100644
--- a/clang/test/Driver/hip-toolchain-no-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-no-rdc.hip
@@ -7,7 +7,7 @@
// RUN: -fuse-ld=lld -B%S/Inputs/lld -nogpuinc \
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LINK %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LINK,OLD %s
// RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc \
// RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
@@ -17,7 +17,7 @@
// RUN: -fuse-ld=lld -B%S/Inputs/lld -nogpuinc -c \
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,OLD %s
// RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc \
// RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
@@ -27,7 +27,7 @@
// RUN: -fuse-ld=lld -B%S/Inputs/lld -nogpuinc --offload-new-driver -c \
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,NEW %s
// RUN: touch %t/a.o %t/b.o
// RUN: %clang -### --target=x86_64-linux-gnu \
@@ -47,22 +47,23 @@
// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-obj"
+// OLD-SAME: "-emit-obj"
+// NEW-SAME: "-emit-llvm-bc"
// CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
// CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
// CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
// CHECK-SAME: "-fapply-global-visibility-to-externs"
// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
// CHECK-SAME: "-target-cpu" "gfx803"
-// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_A_803:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_A_803:.*(o|bc)]]" "-x" "hip"
// CHECK-SAME: {{.*}} [[A_SRC:".*a.cu"]]
// CHECK-NOT: {{".*llvm-link"}}
// CHECK-NOT: {{".*opt"}}
// CHECK-NOT: {{".*llc"}}
-// CHECK: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// CHECK-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" [[OBJ_DEV_A_803]]
+// OLD: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// OLD-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" "[[OBJ_DEV_A_803]]"
//
// Compile device code in a.cu to code object for gfx900.
@@ -70,62 +71,71 @@
// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-obj"
+// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
// CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
// CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
// CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
// CHECK-SAME: "-fapply-global-visibility-to-externs"
// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
// CHECK-SAME: "-target-cpu" "gfx900"
-// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_A_900:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_A_900:.*(o|bc)]]" "-x" "hip"
// CHECK-SAME: {{.*}} [[A_SRC]]
// CHECK-NOT: {{".*llvm-link"}}
// CHECK-NOT: {{".*opt"}}
// CHECK-NOT: {{".*llc"}}
-// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// CHECK-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" [[OBJ_DEV_A_900]]
+// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// OLD-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" "[[OBJ_DEV_A_900]]"
//
// Bundle and embed device code in host object for a.cu.
//
-// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// CHECK-SAME: "-bundle-align=4096"
-// CHECK-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
-// CHECK-SAME: "-input={{.*}}" "-input=[[IMG_DEV_A_803]]" "-input=[[IMG_DEV_A_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
+// OLD: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// OLD-SAME: "-bundle-align=4096"
+// OLD-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
+// OLD-SAME: "-input={{.*}}" "-input=[[IMG_DEV_A_803]]" "-input=[[IMG_DEV_A_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
+
+// NEW: [[PACKAGER:".*clang-offload-packager"]] "-o" "[[PACKAGE_A:.*.out]]"
+// NEW-SAME: "--image=file=[[OBJ_DEV_A_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
+// NEW-SAME: "--image=file=[[OBJ_DEV_A_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
// CHECK-SAME: "-emit-obj"
// CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
-// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
-// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
+// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
+// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_A]]"
+// OLD-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
+// NEW-SAME: {{.*}} "-o" [[A_OBJ_HOST_TMP:".*o"]] "-x" "hip"
// CHECK-SAME: {{.*}} [[A_SRC]]
+// NEW: [[WRAPPER:".*clang-linker-wrapper]]" "--host-triple=x86_64-unknown-linux-gnu"
+// NEW: "--linker-path={{.*}}" "-o" [[A_OBJ_HOST:".*o"]] [[A_OBJ_HOST_TMP]] "-r"
+
//
// Compile device code in b.hip to code object for gfx803.
//
// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-obj"
+// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
// CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
// CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
// CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
// CHECK-SAME: "-fapply-global-visibility-to-externs"
// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
// CHECK-SAME: "-target-cpu" "gfx803"
-// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_B_803:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_B_803:.*(o|bc)]]" "-x" "hip"
// CHECK-SAME: {{.*}} [[B_SRC:".*b.hip"]]
// CHECK-NOT: {{".*llvm-link"...
[truncated]
|
@llvm/pr-subscribers-clang Author: Yaxun (Sam) Liu (yxsamliu) ChangesCurrently HIP still uses offload bundler for non-rdc mode for the new offload driver. This patch switches to use offload wrapper for non-device-only non-rdc mode when new offload driver is enabled. This makes the rdc and non-rdc compilation more consistent and speeds up compilation since the offload wrapper supports parallel compilation for different GPU arch's. It is implemented by adding a linker wrapper action for each assemble action of input file. Linker wrapper action differentiates this special type of work vs normal linker wrapper work by the fle type. This type of work results in object instead of image. The linker wrapper adds "-r" for it and only includes the object file as input, not the host libraries. For device-only non-RDC mode, the new driver keeps the original behavior. Patch is 23.22 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/132869.diff 6 Files Affected:
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 5445a9278596d..395263331d141 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -1281,7 +1281,8 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
return nullptr;
}
if (CGM.getLangOpts().OffloadViaLLVM ||
- (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
+ (CGM.getLangOpts().OffloadingNewDriver &&
+ (CGM.getLangOpts().HIP || RelocatableDeviceCode)))
createOffloadingEntries();
else
return makeModuleCtorFunction();
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 056bfcf1b739a..6d63a55918687 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -4402,6 +4402,10 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
options::OPT_no_offload_new_driver,
C.isOffloadingHostKind(Action::OFK_Cuda));
+ bool HIPNoRDC =
+ C.isOffloadingHostKind(Action::OFK_HIP) &&
+ !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
+
// Builder to be used to build offloading actions.
std::unique_ptr<OffloadingActionBuilder> OffloadBuilder =
!UseNewOffloadingDriver
@@ -4502,6 +4506,16 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
InputArg))
break;
+ // For HIP non-rdc non-device-only compilation, create a linker wrapper
+ // action for each host object to link, bundle and wrap device files in
+ // it.
+ if (Phase == phases::Assemble && UseNewOffloadingDriver && HIPNoRDC &&
+ !offloadDeviceOnly()) {
+ ActionList AL{Current};
+ Current = C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_Object);
+ Current->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
+ /*BoundArch=*/nullptr);
+ }
if (Current->getType() == types::TY_Nothing)
break;
}
@@ -4535,7 +4549,7 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
// Check if this Linker Job should emit a static library.
if (ShouldEmitStaticLibrary(Args)) {
LA = C.MakeAction<StaticLibJobAction>(LinkerInputs, types::TY_Image);
- } else if (UseNewOffloadingDriver ||
+ } else if ((UseNewOffloadingDriver && !HIPNoRDC) ||
Args.hasArg(options::OPT_offload_link)) {
LA = C.MakeAction<LinkerWrapperJobAction>(LinkerInputs, types::TY_Image);
LA->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
@@ -4945,12 +4959,15 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
}
}
- // Compiling HIP in non-RDC mode requires linking each action individually.
+ // Compiling HIP in device-only non-RDC mode requires linking each action
+ // individually.
for (Action *&A : DeviceActions) {
if ((A->getType() != types::TY_Object &&
A->getType() != types::TY_LTO_BC) ||
Kind != Action::OFK_HIP ||
- Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false))
+ Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
+ false) ||
+ !offloadDeviceOnly())
continue;
ActionList LinkerInput = {A};
A = C.MakeAction<LinkJobAction>(LinkerInput, types::TY_Image);
@@ -4974,9 +4991,10 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
}
}
- // HIP code in non-RDC mode will bundle the output if it invoked the linker.
+ // HIP code in device-only non-RDC mode will bundle the output if it invoked
+ // the linker.
bool ShouldBundleHIP =
- C.isOffloadingHostKind(Action::OFK_HIP) &&
+ C.isOffloadingHostKind(Action::OFK_HIP) && offloadDeviceOnly() &&
Args.hasFlag(options::OPT_gpu_bundle_output,
options::OPT_no_gpu_bundle_output, true) &&
!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false) &&
@@ -4999,11 +5017,11 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
C.MakeAction<LinkJobAction>(OffloadActions, types::TY_CUDA_FATBIN);
DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_Cuda>(),
nullptr, Action::OFK_Cuda);
- } else if (C.isOffloadingHostKind(Action::OFK_HIP) &&
+ } else if (C.isOffloadingHostKind(Action::OFK_HIP) && offloadDeviceOnly() &&
!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
false)) {
- // If we are not in RDC-mode we just emit the final HIP fatbinary for each
- // translation unit, linking each input individually.
+ // If we are in device-only non-RDC-mode we just emit the final HIP
+ // fatbinary for each translation unit, linking each input individually.
Action *FatbinAction =
C.MakeAction<LinkJobAction>(OffloadActions, types::TY_HIP_FATBIN);
DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_HIP>(),
@@ -5156,8 +5174,11 @@ Action *Driver::ConstructPhaseAction(
(((Input->getOffloadingToolChain() &&
Input->getOffloadingToolChain()->getTriple().isAMDGPU()) ||
TargetDeviceOffloadKind == Action::OFK_HIP) &&
- (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
- false) ||
+ ((Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
+ false) ||
+ (Args.hasFlag(options::OPT_offload_new_driver,
+ options::OPT_no_offload_new_driver, false) &&
+ !offloadDeviceOnly())) ||
TargetDeviceOffloadKind == Action::OFK_OpenMP))) {
types::ID Output =
Args.hasArg(options::OPT_S) &&
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 7aa2b32acc235..5570da2b18017 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7838,7 +7838,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-fcuda-include-gpubinary");
CmdArgs.push_back(CudaDeviceInput->getFilename());
} else if (!HostOffloadingInputs.empty()) {
- if ((IsCuda || IsHIP) && !IsRDCMode) {
+ if (IsCuda && !IsRDCMode) {
assert(HostOffloadingInputs.size() == 1 && "Only one input expected");
CmdArgs.push_back("-fcuda-include-gpubinary");
CmdArgs.push_back(HostOffloadingInputs.front().getFilename());
@@ -9348,11 +9348,22 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
// Add the linker arguments to be forwarded by the wrapper.
CmdArgs.push_back(Args.MakeArgString(Twine("--linker-path=") +
LinkCommand->getExecutable()));
- for (const char *LinkArg : LinkCommand->getArguments())
- CmdArgs.push_back(LinkArg);
- addOffloadCompressArgs(Args, CmdArgs);
+ // We use action type to differentiate two use cases of the linker wrapper.
+ // TY_Image for normal linker wrapper work.
+ // TY_Object for HIP fno-gpu-rdc embedding device binary in a relocatable
+ // object.
+ assert(JA.getType() == types::TY_Object || JA.getType() == types::TY_Image);
+ if (JA.getType() == types::TY_Object) {
+ CmdArgs.append({"-o", Output.getFilename()});
+ for (auto Input : Inputs)
+ CmdArgs.push_back(Input.getFilename());
+ CmdArgs.push_back(Args.MakeArgString("-r"));
+ } else
+ for (const char *LinkArg : LinkCommand->getArguments())
+ CmdArgs.push_back(LinkArg);
+ addOffloadCompressArgs(Args, CmdArgs);
const char *Exec =
Args.MakeArgString(getToolChain().GetProgramPath("clang-linker-wrapper"));
diff --git a/clang/test/Driver/hip-binding.hip b/clang/test/Driver/hip-binding.hip
index 57e57194ec87b..d8b3f1e242018 100644
--- a/clang/test/Driver/hip-binding.hip
+++ b/clang/test/Driver/hip-binding.hip
@@ -93,7 +93,7 @@
// RUN: -nogpulib -nogpuinc -foffload-lto --offload-arch=gfx90a --offload-arch=gfx908 -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=LTO-NO-RDC %s
// LTO-NO-RDC: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[LTO_908:.+]]"
-// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_908]]"], output: "[[OBJ_908:.+]]"
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]"], output: "[[LTO_90A:.+]]"
-// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_90A]]"], output: "[[OBJ_90A:.+]]"
-// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[OBJ_908]]", "[[OBJ_90A]]"], output: "[[HIPFB:.+]]"
+// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]"
+// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[PKG]]"], output: "[[OBJ:.+]]"
+// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[OBJ]]"], output: "hip-binding.o"
diff --git a/clang/test/Driver/hip-phases.hip b/clang/test/Driver/hip-phases.hip
index 5fd2c0216ccc3..996d72e58755a 100644
--- a/clang/test/Driver/hip-phases.hip
+++ b/clang/test/Driver/hip-phases.hip
@@ -8,39 +8,50 @@
//
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
// RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,NRD,OLD %s
+// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDN %s
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,NRD,NEW %s
+// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWN %s
//
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
// RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,RDC %s
+// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDR %s
+// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
+// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWR %s
//
// BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (host-[[T]])
// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
// BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
-// RDC-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
-// RDC-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// OLDR-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// OLDR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
// BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH:gfx803]])
// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]])
-// NRD-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
-// NRD-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
-// RDC-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
-// BIN-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
-// BIN-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
-// NRD-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
-// RDC-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
-
-// NRD-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
-// RDC-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
-// NRD-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
-// NRD-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
-// OLD-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
-// NEW-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
-// RDC-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
+// OLDN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
+// NEW-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
+// OLDN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
+// OLDR-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
+// OLD-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
+// OLD-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
+// NEW-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P6]]}, ir
+// OLDN-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
+// NEW-DAG: [[P10:[0-9]+]]: clang-offload-packager, {[[P9]]}, image, (device-[[T]])
+// OLDR-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
+
+// OLDN-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
+// NEW-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P10]]}, ir
+// OLDR-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
+// OLDN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
+// OLDN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// NEW-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
+// NEW-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// OLDN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
+// NEWN-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
+// OLDR-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
+// NEWR-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
+// NEWN-DAG: [[P15:[0-9]+]]: linker, {[[P14]]}, image
//
// Test single gpu architecture up to the assemble phase.
diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip
index 054db261d8e57..8205169be493f 100644
--- a/clang/test/Driver/hip-toolchain-no-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-no-rdc.hip
@@ -7,7 +7,7 @@
// RUN: -fuse-ld=lld -B%S/Inputs/lld -nogpuinc \
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LINK %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LINK,OLD %s
// RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc \
// RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
@@ -17,7 +17,7 @@
// RUN: -fuse-ld=lld -B%S/Inputs/lld -nogpuinc -c \
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,OLD %s
// RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc \
// RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
@@ -27,7 +27,7 @@
// RUN: -fuse-ld=lld -B%S/Inputs/lld -nogpuinc --offload-new-driver -c \
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,NEW %s
// RUN: touch %t/a.o %t/b.o
// RUN: %clang -### --target=x86_64-linux-gnu \
@@ -47,22 +47,23 @@
// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-obj"
+// OLD-SAME: "-emit-obj"
+// NEW-SAME: "-emit-llvm-bc"
// CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
// CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
// CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
// CHECK-SAME: "-fapply-global-visibility-to-externs"
// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
// CHECK-SAME: "-target-cpu" "gfx803"
-// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_A_803:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_A_803:.*(o|bc)]]" "-x" "hip"
// CHECK-SAME: {{.*}} [[A_SRC:".*a.cu"]]
// CHECK-NOT: {{".*llvm-link"}}
// CHECK-NOT: {{".*opt"}}
// CHECK-NOT: {{".*llc"}}
-// CHECK: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// CHECK-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" [[OBJ_DEV_A_803]]
+// OLD: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// OLD-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" "[[OBJ_DEV_A_803]]"
//
// Compile device code in a.cu to code object for gfx900.
@@ -70,62 +71,71 @@
// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-obj"
+// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
// CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
// CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
// CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
// CHECK-SAME: "-fapply-global-visibility-to-externs"
// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
// CHECK-SAME: "-target-cpu" "gfx900"
-// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_A_900:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_A_900:.*(o|bc)]]" "-x" "hip"
// CHECK-SAME: {{.*}} [[A_SRC]]
// CHECK-NOT: {{".*llvm-link"}}
// CHECK-NOT: {{".*opt"}}
// CHECK-NOT: {{".*llc"}}
-// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// CHECK-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" [[OBJ_DEV_A_900]]
+// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// OLD-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" "[[OBJ_DEV_A_900]]"
//
// Bundle and embed device code in host object for a.cu.
//
-// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// CHECK-SAME: "-bundle-align=4096"
-// CHECK-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
-// CHECK-SAME: "-input={{.*}}" "-input=[[IMG_DEV_A_803]]" "-input=[[IMG_DEV_A_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
+// OLD: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// OLD-SAME: "-bundle-align=4096"
+// OLD-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
+// OLD-SAME: "-input={{.*}}" "-input=[[IMG_DEV_A_803]]" "-input=[[IMG_DEV_A_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
+
+// NEW: [[PACKAGER:".*clang-offload-packager"]] "-o" "[[PACKAGE_A:.*.out]]"
+// NEW-SAME: "--image=file=[[OBJ_DEV_A_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
+// NEW-SAME: "--image=file=[[OBJ_DEV_A_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
// CHECK-SAME: "-emit-obj"
// CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
-// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
-// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
+// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
+// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_A]]"
+// OLD-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
+// NEW-SAME: {{.*}} "-o" [[A_OBJ_HOST_TMP:".*o"]] "-x" "hip"
// CHECK-SAME: {{.*}} [[A_SRC]]
+// NEW: [[WRAPPER:".*clang-linker-wrapper]]" "--host-triple=x86_64-unknown-linux-gnu"
+// NEW: "--linker-path={{.*}}" "-o" [[A_OBJ_HOST:".*o"]] [[A_OBJ_HOST_TMP]] "-r"
+
//
// Compile device code in b.hip to code object for gfx803.
//
// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-obj"
+// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
// CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
// CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
// CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
// CHECK-SAME: "-fapply-global-visibility-to-externs"
// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
// CHECK-SAME: "-target-cpu" "gfx803"
-// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_B_803:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_B_803:.*(o|bc)]]" "-x" "hip"
// CHECK-SAME: {{.*}} [[B_SRC:".*b.hip"]]
// CHECK-NOT: {{".*llvm-link"...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Very interesting. So, instead of having clang
create a link job for each target individually we basically handle them all at once? The -r
support was basically intended as linkable RDC-mode, so it makes sense.
1246339
to
987007b
Compare
✅ With the latest revision this PR passed the C/C++ code formatter. |
a7b74b8
to
766d86b
Compare
Currently HIP still uses offload bundler for non-rdc mode for the new offload driver. This patch switches to use offload wrapper for non-device-only non-rdc mode when new offload driver is enabled. This makes the rdc and non-rdc compilation more consistent and speeds up compilation since the offload wrapper supports parallel compilation for different GPU arch's. It is implemented by adding a linker wrapper action for each assemble action of input file. Linker wrapper action differentiates this special type of work vs normal linker wrapper work by the fle type. This type of work results in object instead of image. The linker wrapper adds "-r" for it and only includes the object file as input, not the host libraries. For device-only non-RDC mode, the new driver keeps the original behavior.
766d86b
to
205999e
Compare
ping |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So, the main benefit of this is that is can parallelize the linker jobs? Doing that requires a special flag passed to the linker wrapper.
I can add a clang option |
Currently HIP still uses offload bundler for non-rdc mode for the new offload driver. This patch switches to use offload wrapper for non-device-only non-rdc mode when new offload driver is enabled. This makes the rdc and non-rdc compilation more consistent and speeds up compilation since the offload wrapper supports parallel compilation for different GPU arch's. It is implemented by adding a linker wrapper action for each assemble action of input file. Linker wrapper action differentiates this special type of work vs normal linker wrapper work by the fle type. This type of work results in object instead of image. The linker wrapper adds "-r" for it and only includes the object file as input, not the host libraries. For device-only non-RDC mode, the new driver keeps the original behavior.
Currently HIP still uses offload bundler for non-rdc mode for the new offload driver. This patch switches to use offload wrapper for non-device-only non-rdc mode when new offload driver is enabled. This makes the rdc and non-rdc compilation more consistent and speeds up compilation since the offload wrapper supports parallel compilation for different GPU arch's. It is implemented by adding a linker wrapper action for each assemble action of input file. Linker wrapper action differentiates this special type of work vs normal linker wrapper work by the fle type. This type of work results in object instead of image. The linker wrapper adds "-r" for it and only includes the object file as input, not the host libraries. For device-only non-RDC mode, the new driver keeps the original behavior.
…32869)" This breaks a lot of new driver HIP compilation. We should probably revert this for now until we can make a fixed version. ```c++ static __global__ void print() { printf("%s\n", "foo"); } void b(); int main() { hipLaunchKernelGGL(print, dim3(1), dim3(1), 0, 0); auto y = hipDeviceSynchronize(); b(); } ``` ```c++ static __global__ void print() { printf("%s\n", "bar"); } void b() { hipLaunchKernelGGL(print, dim3(1), dim3(1), 0, 0); auto y = hipDeviceSynchronize(); } ``` ```console $ clang++ a.hip b.hip --offload-arch=gfx1030 --offload-new-driver $ ./a.out foo foo ``` ```console $ clang++ a.hip b.hip --offload-arch=gfx1030 --offload-new-driver -flto <crash> ``` This reverts commit d54c28b.
…" (#143432) This breaks a lot of new driver HIP compilation. We should probably revert this for now until we can make a fixed version. ```c++ static __global__ void print() { printf("%s\n", "foo"); } void b(); int main() { hipLaunchKernelGGL(print, dim3(1), dim3(1), 0, 0); auto y = hipDeviceSynchronize(); b(); } ``` ```c++ static __global__ void print() { printf("%s\n", "bar"); } void b() { hipLaunchKernelGGL(print, dim3(1), dim3(1), 0, 0); auto y = hipDeviceSynchronize(); } ``` ```console $ clang++ a.hip b.hip --offload-arch=gfx1030 --offload-new-driver $ ./a.out foo foo ``` ```console $ clang++ a.hip b.hip --offload-arch=gfx1030 --offload-new-driver -flto <crash> ``` This reverts commit d54c28b.
…32869)" (llvm#143432) This breaks a lot of new driver HIP compilation. We should probably revert this for now until we can make a fixed version. ```c++ static __global__ void print() { printf("%s\n", "foo"); } void b(); int main() { hipLaunchKernelGGL(print, dim3(1), dim3(1), 0, 0); auto y = hipDeviceSynchronize(); b(); } ``` ```c++ static __global__ void print() { printf("%s\n", "bar"); } void b() { hipLaunchKernelGGL(print, dim3(1), dim3(1), 0, 0); auto y = hipDeviceSynchronize(); } ``` ```console $ clang++ a.hip b.hip --offload-arch=gfx1030 --offload-new-driver $ ./a.out foo foo ``` ```console $ clang++ a.hip b.hip --offload-arch=gfx1030 --offload-new-driver -flto <crash> ``` This reverts commit d54c28b.
…32869)" (llvm#143432) This breaks a lot of new driver HIP compilation. We should probably revert this for now until we can make a fixed version. ```c++ static __global__ void print() { printf("%s\n", "foo"); } void b(); int main() { hipLaunchKernelGGL(print, dim3(1), dim3(1), 0, 0); auto y = hipDeviceSynchronize(); b(); } ``` ```c++ static __global__ void print() { printf("%s\n", "bar"); } void b() { hipLaunchKernelGGL(print, dim3(1), dim3(1), 0, 0); auto y = hipDeviceSynchronize(); } ``` ```console $ clang++ a.hip b.hip --offload-arch=gfx1030 --offload-new-driver $ ./a.out foo foo ``` ```console $ clang++ a.hip b.hip --offload-arch=gfx1030 --offload-new-driver -flto <crash> ``` This reverts commit d54c28b.
…llvm#132869)" (llvm#143432)" This reverts commit f5e499a.
…llvm#132869)" (llvm#143432)" This reverts commit f5e499a.
#143964) Fixed two issues: 1. assertion with -flto. the linker wrapper action is missing for wrapping the device binary. Added it for -flto. 2. when there are two HIP files, the kernels in the second file were not found. This is because the -r option of linker wrapper assumes offload entries section of HIP to be hip_offloading_entries but it is actually llvm_offload_entries, causing the offload entries sections not made unique for different object files. Fixed and tested working for both -fgpu-rdc and -fno-gpu-rdc case with and without -r
…32869)" (llvm#143432) This breaks a lot of new driver HIP compilation. We should probably revert this for now until we can make a fixed version. ```c++ static __global__ void print() { printf("%s\n", "foo"); } void b(); int main() { hipLaunchKernelGGL(print, dim3(1), dim3(1), 0, 0); auto y = hipDeviceSynchronize(); b(); } ``` ```c++ static __global__ void print() { printf("%s\n", "bar"); } void b() { hipLaunchKernelGGL(print, dim3(1), dim3(1), 0, 0); auto y = hipDeviceSynchronize(); } ``` ```console $ clang++ a.hip b.hip --offload-arch=gfx1030 --offload-new-driver $ ./a.out foo foo ``` ```console $ clang++ a.hip b.hip --offload-arch=gfx1030 --offload-new-driver -flto <crash> ``` This reverts commit d54c28b.
…2869) (llvm#143964) Fixed two issues: 1. assertion with -flto. the linker wrapper action is missing for wrapping the device binary. Added it for -flto. 2. when there are two HIP files, the kernels in the second file were not found. This is because the -r option of linker wrapper assumes offload entries section of HIP to be hip_offloading_entries but it is actually llvm_offload_entries, causing the offload entries sections not made unique for different object files. Fixed and tested working for both -fgpu-rdc and -fno-gpu-rdc case with and without -r
…lvm#132869) (llvm#143964)" This reverts commit 22f9b4a.
…2869) (llvm#143964) Fixed two issues: 1. assertion with -flto. the linker wrapper action is missing for wrapping the device binary. Added it for -flto. 2. when there are two HIP files, the kernels in the second file were not found. This is because the -r option of linker wrapper assumes offload entries section of HIP to be hip_offloading_entries but it is actually llvm_offload_entries, causing the offload entries sections not made unique for different object files. Fixed and tested working for both -fgpu-rdc and -fno-gpu-rdc case with and without -r
…lvm#132869) (llvm#143964)" This reverts commit 22f9b4a.
Currently HIP still uses offload bundler for non-rdc mode for the new offload driver.
This patch switches to use offload wrapper for non-device-only non-rdc mode when new offload driver is enabled.
This makes the rdc and non-rdc compilation more consistent and speeds up compilation since the offload wrapper supports parallel compilation for different GPU arch's.
It is implemented by adding a linker wrapper action for each assemble action of input file. Linker wrapper action differentiates this special type of work vs normal linker wrapper work by the fle type. This type of work results in object instead of image. The linker wrapper adds "-r" for it and only includes the object file as input, not the host libraries.
For device-only non-RDC mode, the new driver keeps the original behavior.