Skip to content

Commit 2e08d0e

Browse files
npmillerArtem GindinsonNaghasan
authored
[SYCL][ROCm] Use offload-arch instead of mcpu for AMD arch (#4239)
This patch changes using `-mcpu` for SYCL applications targeting AMD to `-Xsycl-target-backend --offload-arch`. Before this patch the offloading arch wasn't set correctly for AMD architectures. This is fixing an issue with HIP that was talked about in #4133, regarding having `v4` in the hip part of the triple, without the `v4` HIP seems to be ignoring the fact that the offloading arch is missing from the triple, which is why there was a workaround orignally to force not using `v4` with SYCL. By fixing the offloading arch this patch fixes the issue properly and now the triple with `v4` works because it also contains the offloading architecture. Co-authored-by: Artem Gindinson <[email protected]> Co-authored-by: Victor Lomuller <[email protected]>
1 parent 7c8af77 commit 2e08d0e

File tree

6 files changed

+116
-44
lines changed

6 files changed

+116
-44
lines changed

clang/include/clang/Basic/DiagnosticDriverKinds.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,8 @@ def err_drv_expecting_fsycl_with_sycl_opt : Error<
303303
"'%0' must be used in conjunction with '-fsycl' to enable offloading">;
304304
def err_drv_fsycl_with_c_type : Error<
305305
"'%0' must not be used in conjunction with '-fsycl', which expects C++ source">;
306+
def err_drv_sycl_missing_amdgpu_arch : Error<
307+
"missing AMDGPU architecture for SYCL offloading; specify it with '-Xsycl-target-backend --offload-arch'">;
306308
def warn_drv_sycl_offload_target_duplicate : Warning<
307309
"SYCL offloading target '%0' is similar to target '%1' already specified; "
308310
"will be ignored">, InGroup<SyclTarget>;

clang/lib/Driver/Driver.cpp

Lines changed: 81 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -3945,8 +3945,9 @@ class OffloadingActionBuilder final {
39453945
/// List of static archives to extract FPGA dependency info from
39463946
ActionList FPGAArchiveInputs;
39473947

3948-
/// List of CUDA architectures to use in this compilation with NVPTX targets.
3949-
SmallVector<CudaArch, 8> GpuArchList;
3948+
/// List of GPU architectures to use in this compilation with NVPTX/AMDGCN
3949+
/// targets.
3950+
SmallVector<std::pair<llvm::Triple, std::string>, 8> GpuArchList;
39503951

39513952
/// Build the last steps for CUDA after all BC files have been linked.
39523953
JobAction *finalizeNVPTXDependences(Action *Input, const llvm::Triple &TT) {
@@ -3983,13 +3984,17 @@ class OffloadingActionBuilder final {
39833984
const Driver::InputList &Inputs)
39843985
: DeviceActionBuilder(C, Args, Inputs, Action::OFK_SYCL) {}
39853986

3986-
void withBoundArchForToolChain(const ToolChain* TC,
3987+
void withBoundArchForToolChain(const ToolChain *TC,
39873988
llvm::function_ref<void(const char *)> Op) {
3988-
if (TC->getTriple().isNVPTX())
3989-
for (CudaArch A : GpuArchList)
3990-
Op(CudaArchToString(A));
3991-
else
3992-
Op(nullptr);
3989+
for (auto &A : GpuArchList) {
3990+
if (TC->getTriple() == A.first) {
3991+
Op(Args.MakeArgString(A.second.c_str()));
3992+
return;
3993+
}
3994+
}
3995+
3996+
// no bound arch for this toolchain
3997+
Op(nullptr);
39933998
}
39943999

39954000
ActionBuilderReturnCode
@@ -4043,8 +4048,8 @@ class OffloadingActionBuilder final {
40434048
}
40444049
const auto *TC = ToolChains.front();
40454050
const char *BoundArch = nullptr;
4046-
if (TC->getTriple().isNVPTX())
4047-
BoundArch = CudaArchToString(GpuArchList.front());
4051+
if (TC->getTriple().isNVPTX() || TC->getTriple().isAMDGCN())
4052+
BoundArch = GpuArchList.front().second.c_str();
40484053
DA.add(*DeviceCompilerInput, *TC, BoundArch, Action::OFK_SYCL);
40494054
// Clear the input file, it is already a dependence to a host
40504055
// action.
@@ -4627,39 +4632,94 @@ class OffloadingActionBuilder final {
46274632
}
46284633
}
46294634

4630-
/// Initialize the GPU architecture list from arguments - this populates `GpuArchList` from
4631-
/// `--cuda-gpu-arch` flags. Only relevant if compiling to CUDA. Return true if any
4632-
/// initialization errors are found.
4635+
/// Initialize the GPU architecture list from arguments - this populates
4636+
/// `GpuArchList` from `--offload-arch` flags. Only relevant if compiling to
4637+
/// CUDA or AMDGCN. Return true if any initialization errors are found.
4638+
/// FIXME: "offload-arch" and the BoundArch mechanism should also be
4639+
// used in the SYCLToolChain for SPIR-V AOT to track the offload
4640+
// architecture instead of the Triple sub-arch it currently uses.
46334641
bool initializeGpuArchMap() {
46344642
const OptTable &Opts = C.getDriver().getOpts();
46354643
for (auto *A : Args) {
46364644
unsigned Index;
4645+
llvm::Triple *TargetBE = nullptr;
46374646

4638-
if (A->getOption().matches(options::OPT_Xsycl_backend_EQ))
4647+
auto GetTripleIt = [&, this](llvm::StringRef Triple) {
4648+
llvm::Triple TargetTriple{Triple};
4649+
auto TripleIt = llvm::find_if(SYCLTripleList, [&](auto &SYCLTriple) {
4650+
return SYCLTriple == TargetTriple;
4651+
});
4652+
return TripleIt != SYCLTripleList.end() ? &*TripleIt : nullptr;
4653+
};
4654+
4655+
if (A->getOption().matches(options::OPT_Xsycl_backend_EQ)) {
4656+
TargetBE = GetTripleIt(A->getValue(0));
46394657
// Passing device args: -Xsycl-target-backend=<triple> -opt=val.
4640-
if (llvm::Triple(A->getValue(0)).isNVPTX())
4658+
if (TargetBE)
46414659
Index = Args.getBaseArgs().MakeIndex(A->getValue(1));
46424660
else
46434661
continue;
4644-
else if (A->getOption().matches(options::OPT_Xsycl_backend))
4662+
} else if (A->getOption().matches(options::OPT_Xsycl_backend)) {
4663+
if (SYCLTripleList.size() > 1) {
4664+
C.getDriver().Diag(diag::err_drv_Xsycl_target_missing_triple)
4665+
<< A->getSpelling();
4666+
continue;
4667+
}
46454668
// Passing device args: -Xsycl-target-backend -opt=val.
4669+
TargetBE = &SYCLTripleList.front();
46464670
Index = Args.getBaseArgs().MakeIndex(A->getValue(0));
4647-
else
4671+
} else
46484672
continue;
46494673

46504674
A->claim();
46514675
auto ParsedArg = Opts.ParseOneArg(Args, Index);
4676+
46524677
// TODO: Support --no-cuda-gpu-arch, --{,no-}cuda-gpu-arch=all.
46534678
if (ParsedArg &&
46544679
ParsedArg->getOption().matches(options::OPT_offload_arch_EQ)) {
4680+
llvm::StringRef ArchStr = ParsedArg->getValue(0);
4681+
if (TargetBE->isNVPTX()) {
4682+
// CUDA arch also applies to AMDGCN ...
4683+
CudaArch Arch = StringToCudaArch(ArchStr);
4684+
if (Arch == CudaArch::UNKNOWN || !IsNVIDIAGpuArch(Arch)) {
4685+
C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch)
4686+
<< ArchStr;
4687+
continue;
4688+
}
4689+
ArchStr = CudaArchToString(Arch);
4690+
} else if (TargetBE->isAMDGCN()) {
4691+
llvm::StringMap<bool> Features;
4692+
auto Arch =
4693+
parseTargetID(getHIPOffloadTargetTriple(), ArchStr, &Features);
4694+
if (!Arch) {
4695+
C.getDriver().Diag(clang::diag::err_drv_bad_target_id) << ArchStr;
4696+
continue;
4697+
}
4698+
auto CanId = getCanonicalTargetID(Arch.getValue(), Features);
4699+
ArchStr = Args.MakeArgStringRef(CanId);
4700+
}
46554701
ParsedArg->claim();
4656-
GpuArchList.push_back(StringToCudaArch(ParsedArg->getValue(0)));
4702+
GpuArchList.emplace_back(*TargetBE, ArchStr);
46574703
}
46584704
}
46594705

4660-
// If there are no CUDA architectures provided then default to SM_50.
4661-
if (GpuArchList.empty()) {
4662-
GpuArchList.push_back(CudaArch::SM_50);
4706+
// Handle defaults architectures
4707+
for (auto &Triple : SYCLTripleList) {
4708+
// For NVIDIA use SM_50 as a default
4709+
if (Triple.isNVPTX() && llvm::none_of(GpuArchList, [&](auto &P) {
4710+
return P.first.isNVPTX();
4711+
})) {
4712+
llvm::StringRef DefaultArch = CudaArchToString(CudaArch::SM_50);
4713+
GpuArchList.emplace_back(Triple, DefaultArch);
4714+
}
4715+
4716+
// For AMD require the architecture to be set by the user
4717+
if (Triple.isAMDGCN() && llvm::none_of(GpuArchList, [&](auto &P) {
4718+
return P.first.isAMDGCN();
4719+
})) {
4720+
C.getDriver().Diag(clang::diag::err_drv_sycl_missing_amdgpu_arch);
4721+
return true;
4722+
}
46634723
}
46644724

46654725
return false;

clang/lib/Driver/ToolChain.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1174,8 +1174,9 @@ llvm::opt::DerivedArgList *ToolChain::TranslateOffloadTargetArgs(
11741174
// at all, target and host share a toolchain.
11751175
if (A->getOption().matches(options::OPT_m_Group)) {
11761176
// AMD GPU is a special case, as -mcpu is required for the device
1177-
// compilation.
1178-
if (SameTripleAsHost || getTriple().getArch() == llvm::Triple::amdgcn)
1177+
// compilation, except for SYCL which uses --offload-arch.
1178+
if (SameTripleAsHost || (getTriple().getArch() == llvm::Triple::amdgcn &&
1179+
DeviceOffloadKind != Action::OFK_SYCL))
11791180
DAL->append(A);
11801181
else
11811182
Modified = true;

clang/test/Driver/sycl-offload-amdgcn.cpp

Lines changed: 24 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,38 +3,44 @@
33

44
// UNSUPPORTED: system-windows
55

6+
// Check that the offload arch is required
7+
// RUN: %clangxx -### -std=c++11 -target x86_64-unknown-linux-gnu -fsycl \
8+
// RUN: -fsycl-targets=amdgcn-amd-amdhsa-sycldevice %s 2>&1 \
9+
// RUN: | FileCheck -check-prefix=CHK-ARCH %s
10+
// CHK-ARCH: error: missing AMDGPU architecture for SYCL offloading; specify it with '-Xsycl-target-backend --offload-arch'
11+
612
/// Check action graph.
713
// RUN: %clangxx -### -std=c++11 -target x86_64-unknown-linux-gnu -fsycl \
8-
// RUN: -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -mcpu=gfx906 \
14+
// RUN: -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -Xsycl-target-backend --offload-arch=gfx906 \
915
// RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc %s 2>&1 \
1016
// RUN: | FileCheck -check-prefix=CHK-ACTIONS %s
1117
// CHK-ACTIONS: "-cc1" "-triple" "amdgcn-amd-amdhsa-sycldevice" "-aux-triple" "x86_64-unknown-linux-gnu"{{.*}} "-fsycl-is-device"{{.*}} "-Wno-sycl-strict"{{.*}} "-sycl-std=2020" {{.*}} "-internal-isystem" "{{.*}}bin{{[/\\]+}}..{{[/\\]+}}include{{[/\\]+}}sycl"{{.*}} "-mlink-builtin-bitcode" "{{.*}}libspirv.bc"{{.*}} "-target-cpu" "gfx906"{{.*}} "-std=c++11"{{.*}}
1218
// CHK-ACTIONS-NOT: "-mllvm -sycl-opt"
13-
// CHK-ACTIONS: clang-offload-wrapper"{{.*}} "-host=x86_64-unknown-linux-gnu" "-target=amdgcn" "-kind=sycl"{{.*}}
19+
// CHK-ACTIONS: clang-offload-wrapper"{{.*}} "-host=x86_64-unknown-linux-gnu" "-compile-opts=--offload-arch=gfx906" "-target=amdgcn" "-kind=sycl"{{.*}}
1420

1521
/// Check phases w/out specifying a compute capability.
1622
// RUN: %clangxx -ccc-print-phases -std=c++11 -target x86_64-unknown-linux-gnu -fsycl \
17-
// RUN: -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -mcpu=gfx906 %s 2>&1 \
23+
// RUN: -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -Xsycl-target-backend --offload-arch=gfx906 %s 2>&1 \
1824
// RUN: | FileCheck -check-prefix=CHK-PHASES-NO-CC %s
1925
// CHK-PHASES-NO-CC: 0: input, "{{.*}}", c++, (host-sycl)
2026
// CHK-PHASES-NO-CC: 1: append-footer, {0}, c++, (host-sycl)
2127
// CHK-PHASES-NO-CC: 2: preprocessor, {1}, c++-cpp-output, (host-sycl)
22-
// CHK-PHASES-NO-CC: 3: input, "{{.*}}", c++, (device-sycl)
23-
// CHK-PHASES-NO-CC: 4: preprocessor, {3}, c++-cpp-output, (device-sycl)
24-
// CHK-PHASES-NO-CC: 5: compiler, {4}, ir, (device-sycl)
25-
// CHK-PHASES-NO-CC: 6: offload, "host-sycl (x86_64-unknown-linux-gnu)" {2}, "device-sycl (amdgcn-amd-amdhsa-sycldevice)" {5}, c++-cpp-output
28+
// CHK-PHASES-NO-CC: 3: input, "{{.*}}", c++, (device-sycl, gfx906)
29+
// CHK-PHASES-NO-CC: 4: preprocessor, {3}, c++-cpp-output, (device-sycl, gfx906)
30+
// CHK-PHASES-NO-CC: 5: compiler, {4}, ir, (device-sycl, gfx906)
31+
// CHK-PHASES-NO-CC: 6: offload, "host-sycl (x86_64-unknown-linux-gnu)" {2}, "device-sycl (amdgcn-amd-amdhsa-sycldevice:gfx906)" {5}, c++-cpp-output
2632
// CHK-PHASES-NO-CC: 7: compiler, {6}, ir, (host-sycl)
2733
// CHK-PHASES-NO-CC: 8: backend, {7}, assembler, (host-sycl)
2834
// CHK-PHASES-NO-CC: 9: assembler, {8}, object, (host-sycl)
2935
// CHK-PHASES-NO-CC: 10: linker, {9}, image, (host-sycl)
30-
// CHK-PHASES-NO-CC: 11: linker, {5}, ir, (device-sycl)
31-
// CHK-PHASES-NO-CC: 12: sycl-post-link, {11}, ir, (device-sycl)
32-
// CHK-PHASES-NO-CC: 13: file-table-tform, {12}, ir, (device-sycl)
33-
// CHK-PHASES-NO-CC: 14: backend, {13}, assembler, (device-sycl)
34-
// CHK-PHASES-NO-CC: 15: assembler, {14}, object, (device-sycl)
35-
// CHK-PHASES-NO-CC: 16: linker, {15}, image, (device-sycl)
36-
// CHK-PHASES-NO-CC: 17: linker, {16}, hip-fatbin, (device-sycl)
37-
// CHK-PHASES-NO-CC: 18: foreach, {13, 17}, hip-fatbin, (device-sycl)
38-
// CHK-PHASES-NO-CC: 19: file-table-tform, {12, 18}, tempfiletable, (device-sycl)
39-
// CHK-PHASES-NO-CC: 20: clang-offload-wrapper, {19}, object, (device-sycl)
40-
// CHK-PHASES-NO-CC: 21: offload, "host-sycl (x86_64-unknown-linux-gnu)" {10}, "device-sycl (amdgcn-amd-amdhsa-sycldevice)" {20}, image
36+
// CHK-PHASES-NO-CC: 11: linker, {5}, ir, (device-sycl, gfx906)
37+
// CHK-PHASES-NO-CC: 12: sycl-post-link, {11}, ir, (device-sycl, gfx906)
38+
// CHK-PHASES-NO-CC: 13: file-table-tform, {12}, ir, (device-sycl, gfx906)
39+
// CHK-PHASES-NO-CC: 14: backend, {13}, assembler, (device-sycl, gfx906)
40+
// CHK-PHASES-NO-CC: 15: assembler, {14}, object, (device-sycl, gfx906)
41+
// CHK-PHASES-NO-CC: 16: linker, {15}, image, (device-sycl, gfx906)
42+
// CHK-PHASES-NO-CC: 17: linker, {16}, hip-fatbin, (device-sycl, gfx906)
43+
// CHK-PHASES-NO-CC: 18: foreach, {13, 17}, hip-fatbin, (device-sycl, gfx906)
44+
// CHK-PHASES-NO-CC: 19: file-table-tform, {12, 18}, tempfiletable, (device-sycl, gfx906)
45+
// CHK-PHASES-NO-CC: 20: clang-offload-wrapper, {19}, object, (device-sycl, gfx906)
46+
// CHK-PHASES-NO-CC: 21: offload, "host-sycl (x86_64-unknown-linux-gnu)" {10}, "device-sycl (amdgcn-amd-amdhsa-sycldevice:gfx906)" {20}, image

clang/test/Driver/sycl-triple-dae-flags.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// RUN: %clangxx -### -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -fsycl-dead-args-optimization %s 2> %t.cuda.out
22
// RUN: FileCheck %s --input-file %t.cuda.out
33
//
4-
// RUN: %clangxx -### -fsycl -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -fsycl-dead-args-optimization %s 2> %t.rocm.out
4+
// RUN: %clangxx -### -fsycl -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -Xsycl-target-backend --offload-arch=gfx906 -fsycl-dead-args-optimization %s 2> %t.rocm.out
55
// RUN: FileCheck %s --input-file %t.rocm.out
66
// CHECK-NOT: -fenable-sycl-dae
77
// CHECK-NOT: -emit-param-info

sycl/doc/GetStartedGuide.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -547,11 +547,14 @@ clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice \
547547
simple-sycl-app.cpp -o simple-sycl-app-cuda.exe
548548
```
549549
550-
When building for ROCm, please note that the option `mcpu` must be specified, use the ROCm target triple as follows:
550+
When building for ROCm, use the ROCm target triple and specify the
551+
target architecture with `-Xsycl-target-backend --offload-arch=<arch>`
552+
as follows:
551553
552554
```bash
553555
clang++ -fsycl -fsycl-targets=amdgcn-amd-amdhsa-sycldevice \
554-
-mcpu=gfx906 simple-sycl-app.cpp -o simple-sycl-app-cuda.exe
556+
-Xsycl-target-backend --offload-arch=gfx906 \
557+
simple-sycl-app.cpp -o simple-sycl-app-amd.exe
555558
```
556559
557560
To build simple-sycl-app ahead of time for GPU, CPU or Accelerator devices,

0 commit comments

Comments
 (0)