Skip to content

Commit c79cbf2

Browse files
jhuber6rorth
authored andcommitted
Revert "[HIP] use offload wrapper for non-device-only non-rdc (llvm#132869)" (llvm#143432)
This breaks a lot of new driver HIP compilation. We should probably revert this for now until we can make a fixed version. ```c++ static __global__ void print() { printf("%s\n", "foo"); } void b(); int main() { hipLaunchKernelGGL(print, dim3(1), dim3(1), 0, 0); auto y = hipDeviceSynchronize(); b(); } ``` ```c++ static __global__ void print() { printf("%s\n", "bar"); } void b() { hipLaunchKernelGGL(print, dim3(1), dim3(1), 0, 0); auto y = hipDeviceSynchronize(); } ``` ```console $ clang++ a.hip b.hip --offload-arch=gfx1030 --offload-new-driver $ ./a.out foo foo ``` ```console $ clang++ a.hip b.hip --offload-arch=gfx1030 --offload-new-driver -flto <crash> ``` This reverts commit d54c28b.
1 parent 6bb6553 commit c79cbf2

File tree

6 files changed

+76
-142
lines changed

6 files changed

+76
-142
lines changed

clang/lib/CodeGen/CGCUDANV.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1280,8 +1280,7 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
12801280
return nullptr;
12811281
}
12821282
if (CGM.getLangOpts().OffloadViaLLVM ||
1283-
(CGM.getLangOpts().OffloadingNewDriver &&
1284-
(CGM.getLangOpts().HIP || RelocatableDeviceCode)))
1283+
(CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
12851284
createOffloadingEntries();
12861285
else
12871286
return makeModuleCtorFunction();

clang/lib/Driver/Driver.cpp

Lines changed: 18 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -4424,10 +4424,6 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
44244424
options::OPT_no_offload_new_driver,
44254425
C.isOffloadingHostKind(Action::OFK_Cuda));
44264426

4427-
bool HIPNoRDC =
4428-
C.isOffloadingHostKind(Action::OFK_HIP) &&
4429-
!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
4430-
44314427
// Builder to be used to build offloading actions.
44324428
std::unique_ptr<OffloadingActionBuilder> OffloadBuilder =
44334429
!UseNewOffloadingDriver
@@ -4561,7 +4557,7 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
45614557
// Check if this Linker Job should emit a static library.
45624558
if (ShouldEmitStaticLibrary(Args)) {
45634559
LA = C.MakeAction<StaticLibJobAction>(LinkerInputs, types::TY_Image);
4564-
} else if ((UseNewOffloadingDriver && !HIPNoRDC) ||
4560+
} else if (UseNewOffloadingDriver ||
45654561
Args.hasArg(options::OPT_offload_link)) {
45664562
LA = C.MakeAction<LinkerWrapperJobAction>(LinkerInputs, types::TY_Image);
45674563
LA->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
@@ -4872,28 +4868,10 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
48724868
const InputTy &Input, StringRef CUID,
48734869
Action *HostAction) const {
48744870
// Don't build offloading actions if explicitly disabled or we do not have a
4875-
// valid source input.
4876-
if (offloadHostOnly() || !types::isSrcFile(Input.first))
4877-
return HostAction;
4878-
4879-
bool HIPNoRDC =
4880-
C.isOffloadingHostKind(Action::OFK_HIP) &&
4881-
!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
4882-
4883-
// For HIP non-rdc non-device-only compilation, create a linker wrapper
4884-
// action for each host object to link, bundle and wrap device files in
4885-
// it.
4886-
if (isa<AssembleJobAction>(HostAction) && HIPNoRDC && !offloadDeviceOnly()) {
4887-
ActionList AL{HostAction};
4888-
HostAction = C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_Object);
4889-
HostAction->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
4890-
/*BoundArch=*/nullptr);
4891-
return HostAction;
4892-
}
4893-
4894-
// Don't build offloading actions if we do not have a compile action. If
4895-
// preprocessing only ignore embedding.
4896-
if (!(isa<CompileJobAction>(HostAction) ||
4871+
// valid source input and compile action to embed it in. If preprocessing only
4872+
// ignore embedding.
4873+
if (offloadHostOnly() || !types::isSrcFile(Input.first) ||
4874+
!(isa<CompileJobAction>(HostAction) ||
48974875
getFinalPhase(Args) == phases::Preprocess))
48984876
return HostAction;
48994877

@@ -4989,12 +4967,12 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
49894967
}
49904968
}
49914969

4992-
// Compiling HIP in device-only non-RDC mode requires linking each action
4993-
// individually.
4970+
// Compiling HIP in non-RDC mode requires linking each action individually.
49944971
for (Action *&A : DeviceActions) {
49954972
if ((A->getType() != types::TY_Object &&
49964973
A->getType() != types::TY_LTO_BC) ||
4997-
!HIPNoRDC || !offloadDeviceOnly())
4974+
Kind != Action::OFK_HIP ||
4975+
Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false))
49984976
continue;
49994977
ActionList LinkerInput = {A};
50004978
A = C.MakeAction<LinkJobAction>(LinkerInput, types::TY_Image);
@@ -5018,12 +4996,12 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
50184996
}
50194997
}
50204998

5021-
// HIP code in device-only non-RDC mode will bundle the output if it invoked
5022-
// the linker.
4999+
// HIP code in non-RDC mode will bundle the output if it invoked the linker.
50235000
bool ShouldBundleHIP =
5024-
HIPNoRDC && offloadDeviceOnly() &&
5001+
C.isOffloadingHostKind(Action::OFK_HIP) &&
50255002
Args.hasFlag(options::OPT_gpu_bundle_output,
50265003
options::OPT_no_gpu_bundle_output, true) &&
5004+
!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false) &&
50275005
!llvm::any_of(OffloadActions,
50285006
[](Action *A) { return A->getType() != types::TY_Image; });
50295007

@@ -5043,9 +5021,11 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
50435021
C.MakeAction<LinkJobAction>(OffloadActions, types::TY_CUDA_FATBIN);
50445022
DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_Cuda>(),
50455023
nullptr, Action::OFK_Cuda);
5046-
} else if (HIPNoRDC && offloadDeviceOnly()) {
5047-
// If we are in device-only non-RDC-mode we just emit the final HIP
5048-
// fatbinary for each translation unit, linking each input individually.
5024+
} else if (C.isOffloadingHostKind(Action::OFK_HIP) &&
5025+
!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
5026+
false)) {
5027+
// If we are not in RDC-mode we just emit the final HIP fatbinary for each
5028+
// translation unit, linking each input individually.
50495029
Action *FatbinAction =
50505030
C.MakeAction<LinkJobAction>(OffloadActions, types::TY_HIP_FATBIN);
50515031
DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_HIP>(),
@@ -5198,11 +5178,8 @@ Action *Driver::ConstructPhaseAction(
51985178
(((Input->getOffloadingToolChain() &&
51995179
Input->getOffloadingToolChain()->getTriple().isAMDGPU()) ||
52005180
TargetDeviceOffloadKind == Action::OFK_HIP) &&
5201-
((Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
5202-
false) ||
5203-
(Args.hasFlag(options::OPT_offload_new_driver,
5204-
options::OPT_no_offload_new_driver, false) &&
5205-
!offloadDeviceOnly())) ||
5181+
(Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
5182+
false) ||
52065183
TargetDeviceOffloadKind == Action::OFK_OpenMP))) {
52075184
types::ID Output =
52085185
Args.hasArg(options::OPT_S) &&

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -7821,7 +7821,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
78217821
CmdArgs.push_back("-fcuda-include-gpubinary");
78227822
CmdArgs.push_back(CudaDeviceInput->getFilename());
78237823
} else if (!HostOffloadingInputs.empty()) {
7824-
if (IsCuda && !IsRDCMode) {
7824+
if ((IsCuda || IsHIP) && !IsRDCMode) {
78257825
assert(HostOffloadingInputs.size() == 1 && "Only one input expected");
78267826
CmdArgs.push_back("-fcuda-include-gpubinary");
78277827
CmdArgs.push_back(HostOffloadingInputs.front().getFilename());
@@ -9368,20 +9368,8 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
93689368
// Add the linker arguments to be forwarded by the wrapper.
93699369
CmdArgs.push_back(Args.MakeArgString(Twine("--linker-path=") +
93709370
LinkCommand->getExecutable()));
9371-
9372-
// We use action type to differentiate two use cases of the linker wrapper.
9373-
// TY_Image for normal linker wrapper work.
9374-
// TY_Object for HIP fno-gpu-rdc embedding device binary in a relocatable
9375-
// object.
9376-
assert(JA.getType() == types::TY_Object || JA.getType() == types::TY_Image);
9377-
if (JA.getType() == types::TY_Object) {
9378-
CmdArgs.append({"-o", Output.getFilename()});
9379-
for (auto Input : Inputs)
9380-
CmdArgs.push_back(Input.getFilename());
9381-
CmdArgs.push_back("-r");
9382-
} else
9383-
for (const char *LinkArg : LinkCommand->getArguments())
9384-
CmdArgs.push_back(LinkArg);
9371+
for (const char *LinkArg : LinkCommand->getArguments())
9372+
CmdArgs.push_back(LinkArg);
93859373

93869374
addOffloadCompressArgs(Args, CmdArgs);
93879375

clang/test/Driver/hip-binding.hip

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@
9393
// RUN: -nogpulib -nogpuinc -foffload-lto --offload-arch=gfx90a --offload-arch=gfx908 -c %s 2>&1 \
9494
// RUN: | FileCheck -check-prefix=LTO-NO-RDC %s
9595
// LTO-NO-RDC: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[LTO_908:.+]]"
96+
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_908]]"], output: "[[OBJ_908:.+]]"
9697
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]"], output: "[[LTO_90A:.+]]"
97-
// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]"
98-
// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[PKG]]"], output: "[[OBJ:.+]]"
99-
// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[OBJ]]"], output: "hip-binding.o"
98+
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_90A]]"], output: "[[OBJ_90A:.+]]"
99+
// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[OBJ_908]]", "[[OBJ_90A]]"], output: "[[HIPFB:.+]]"

clang/test/Driver/hip-phases.hip

Lines changed: 20 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -8,50 +8,39 @@
88
//
99
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
1010
// RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
11-
// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDN %s
11+
// RUN: | FileCheck -check-prefixes=BIN,NRD,OLD %s
1212
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
1313
// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
14-
// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWN %s
14+
// RUN: | FileCheck -check-prefixes=BIN,NRD,NEW %s
1515
//
1616
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
1717
// RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
18-
// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDR %s
19-
// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
20-
// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
21-
// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWR %s
18+
// RUN: | FileCheck -check-prefixes=BIN,RDC %s
2219
//
2320
// BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (host-[[T]])
2421
// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
2522
// BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
26-
// OLDR-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
27-
// OLDR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
23+
// RDC-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
24+
// RDC-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
2825

2926
// BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH:gfx803]])
3027
// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
3128
// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]])
32-
// OLDN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
33-
// NEW-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
34-
// OLDN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
35-
// OLDR-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
36-
// OLD-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
37-
// OLD-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
38-
// NEW-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P6]]}, ir
39-
// OLDN-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
40-
// NEW-DAG: [[P10:[0-9]+]]: clang-offload-packager, {[[P9]]}, image, (device-[[T]])
41-
// OLDR-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
42-
43-
// OLDN-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
44-
// NEW-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P10]]}, ir
45-
// OLDR-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
46-
// OLDN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
47-
// OLDN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
48-
// NEW-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
49-
// NEW-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
50-
// OLDN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
51-
// NEWN-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
52-
// OLDR-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
53-
// NEWR-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
54-
// NEWN-DAG: [[P15:[0-9]+]]: linker, {[[P14]]}, image
29+
// NRD-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
30+
// NRD-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
31+
// RDC-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
32+
// BIN-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
33+
// BIN-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
34+
// NRD-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
35+
// RDC-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
36+
37+
// NRD-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
38+
// RDC-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
39+
// NRD-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
40+
// NRD-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
41+
// OLD-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
42+
// NEW-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
43+
// RDC-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
5544

5645
//
5746
// Test single gpu architecture up to the assemble phase.

0 commit comments

Comments
 (0)