[SYCL][ROCm] Use offload-arch instead of mcpu for AMD arch (#4239)

npmiller · Artem Gindinson · Naghasan · web-flow · commit 2e08d0ee39c8 · 2021-08-19T18:21:00.000+03:00
This patch changes using `-mcpu` for SYCL applications targeting AMD to `-Xsycl-target-backend --offload-arch`. Before this patch the offloading arch wasn't set correctly for AMD architectures. This is fixing an issue with HIP that was talked about in #4133, regarding having `v4` in the hip part of the triple, without the `v4` HIP seems to be ignoring the fact that the offloading arch is missing from the triple, which is why there was a workaround orignally to force not using `v4` with SYCL. By fixing the offloading arch this patch fixes the issue properly and now the triple with `v4` works because it also contains the offloading architecture. Co-authored-by: Artem Gindinson <artem.gindinson@intel.com> Co-authored-by: Victor Lomuller <victor@codeplay.com>
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -303,6 +303,8 @@ def err_drv_expecting_fsycl_with_sycl_opt : Error<
   "'%0' must be used in conjunction with '-fsycl' to enable offloading">;
 def err_drv_fsycl_with_c_type : Error<
   "'%0' must not be used in conjunction with '-fsycl', which expects C++ source">;
+def err_drv_sycl_missing_amdgpu_arch : Error<
+  "missing AMDGPU architecture for SYCL offloading; specify it with '-Xsycl-target-backend --offload-arch'">;
 def warn_drv_sycl_offload_target_duplicate : Warning<
   "SYCL offloading target '%0' is similar to target '%1' already specified; "
   "will be ignored">, InGroup<SyclTarget>;
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
@@ -3945,8 +3945,9 @@ class OffloadingActionBuilder final {
     /// List of static archives to extract FPGA dependency info from
     ActionList FPGAArchiveInputs;
 
-    /// List of CUDA architectures to use in this compilation with NVPTX targets.
-    SmallVector<CudaArch, 8> GpuArchList;
+    /// List of GPU architectures to use in this compilation with NVPTX/AMDGCN
+    /// targets.
+    SmallVector<std::pair<llvm::Triple, std::string>, 8> GpuArchList;
 
     /// Build the last steps for CUDA after all BC files have been linked.
     JobAction *finalizeNVPTXDependences(Action *Input, const llvm::Triple &TT) {
@@ -3983,13 +3984,17 @@ class OffloadingActionBuilder final {
                       const Driver::InputList &Inputs)
         : DeviceActionBuilder(C, Args, Inputs, Action::OFK_SYCL) {}
 
-    void withBoundArchForToolChain(const ToolChain* TC,
+    void withBoundArchForToolChain(const ToolChain *TC,
                                    llvm::function_ref<void(const char *)> Op) {
-      if (TC->getTriple().isNVPTX())
-        for (CudaArch A : GpuArchList)
-          Op(CudaArchToString(A));
-      else
-        Op(nullptr);
+      for (auto &A : GpuArchList) {
+        if (TC->getTriple() == A.first) {
+          Op(Args.MakeArgString(A.second.c_str()));
+          return;
+        }
+      }
+
+      // no bound arch for this toolchain
+      Op(nullptr);
     }
 
     ActionBuilderReturnCode
@@ -4043,8 +4048,8 @@ class OffloadingActionBuilder final {
         }
         const auto *TC = ToolChains.front();
         const char *BoundArch = nullptr;
-        if (TC->getTriple().isNVPTX())
-          BoundArch = CudaArchToString(GpuArchList.front());
+        if (TC->getTriple().isNVPTX() || TC->getTriple().isAMDGCN())
+          BoundArch = GpuArchList.front().second.c_str();
         DA.add(*DeviceCompilerInput, *TC, BoundArch, Action::OFK_SYCL);
         // Clear the input file, it is already a dependence to a host
         // action.
@@ -4627,39 +4632,94 @@ class OffloadingActionBuilder final {
       }
     }
 
-    /// Initialize the GPU architecture list from arguments - this populates `GpuArchList` from
-    /// `--cuda-gpu-arch` flags. Only relevant if compiling to CUDA. Return true if any
-    /// initialization errors are found.
+    /// Initialize the GPU architecture list from arguments - this populates
+    /// `GpuArchList` from `--offload-arch` flags. Only relevant if compiling to
+    /// CUDA or AMDGCN. Return true if any initialization errors are found.
+    /// FIXME: "offload-arch" and the BoundArch mechanism should also be
+    // used in the SYCLToolChain for SPIR-V AOT to track the offload
+    // architecture instead of the Triple sub-arch it currently uses.
     bool initializeGpuArchMap() {
       const OptTable &Opts = C.getDriver().getOpts();
       for (auto *A : Args) {
         unsigned Index;
+        llvm::Triple *TargetBE = nullptr;
 
-        if (A->getOption().matches(options::OPT_Xsycl_backend_EQ))
+        auto GetTripleIt = [&, this](llvm::StringRef Triple) {
+          llvm::Triple TargetTriple{Triple};
+          auto TripleIt = llvm::find_if(SYCLTripleList, [&](auto &SYCLTriple) {
+            return SYCLTriple == TargetTriple;
+          });
+          return TripleIt != SYCLTripleList.end() ? &*TripleIt : nullptr;
+        };
+
+        if (A->getOption().matches(options::OPT_Xsycl_backend_EQ)) {
+          TargetBE = GetTripleIt(A->getValue(0));
           // Passing device args: -Xsycl-target-backend=<triple> -opt=val.
-          if (llvm::Triple(A->getValue(0)).isNVPTX())
+          if (TargetBE)
             Index = Args.getBaseArgs().MakeIndex(A->getValue(1));
           else
             continue;
-        else if (A->getOption().matches(options::OPT_Xsycl_backend))
+        } else if (A->getOption().matches(options::OPT_Xsycl_backend)) {
+          if (SYCLTripleList.size() > 1) {
+            C.getDriver().Diag(diag::err_drv_Xsycl_target_missing_triple)
+                << A->getSpelling();
+            continue;
+          }
           // Passing device args: -Xsycl-target-backend -opt=val.
+          TargetBE = &SYCLTripleList.front();
           Index = Args.getBaseArgs().MakeIndex(A->getValue(0));
-        else
+        } else
           continue;
 
         A->claim();
         auto ParsedArg = Opts.ParseOneArg(Args, Index);
+
         // TODO: Support --no-cuda-gpu-arch, --{,no-}cuda-gpu-arch=all.
         if (ParsedArg &&
             ParsedArg->getOption().matches(options::OPT_offload_arch_EQ)) {
+          llvm::StringRef ArchStr = ParsedArg->getValue(0);
+          if (TargetBE->isNVPTX()) {
+            // CUDA arch also applies to AMDGCN ...
+            CudaArch Arch = StringToCudaArch(ArchStr);
+            if (Arch == CudaArch::UNKNOWN || !IsNVIDIAGpuArch(Arch)) {
+              C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch)
+                  << ArchStr;
+              continue;
+            }
+            ArchStr = CudaArchToString(Arch);
+          } else if (TargetBE->isAMDGCN()) {
+            llvm::StringMap<bool> Features;
+            auto Arch =
+                parseTargetID(getHIPOffloadTargetTriple(), ArchStr, &Features);
+            if (!Arch) {
+              C.getDriver().Diag(clang::diag::err_drv_bad_target_id) << ArchStr;
+              continue;
+            }
+            auto CanId = getCanonicalTargetID(Arch.getValue(), Features);
+            ArchStr = Args.MakeArgStringRef(CanId);
+          }
           ParsedArg->claim();
-          GpuArchList.push_back(StringToCudaArch(ParsedArg->getValue(0)));
+          GpuArchList.emplace_back(*TargetBE, ArchStr);
         }
       }
 
-      // If there are no CUDA architectures provided then default to SM_50.
-      if (GpuArchList.empty()) {
-        GpuArchList.push_back(CudaArch::SM_50);
+      // Handle defaults architectures
+      for (auto &Triple : SYCLTripleList) {
+        // For NVIDIA use SM_50 as a default
+        if (Triple.isNVPTX() && llvm::none_of(GpuArchList, [&](auto &P) {
+              return P.first.isNVPTX();
+            })) {
+          llvm::StringRef DefaultArch = CudaArchToString(CudaArch::SM_50);
+          GpuArchList.emplace_back(Triple, DefaultArch);
+        }
+
+        // For AMD require the architecture to be set by the user
+        if (Triple.isAMDGCN() && llvm::none_of(GpuArchList, [&](auto &P) {
+              return P.first.isAMDGCN();
+            })) {
+          C.getDriver().Diag(clang::diag::err_drv_sycl_missing_amdgpu_arch);
+          return true;
+        }
       }
 
       return false;
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
@@ -1174,8 +1174,9 @@ llvm::opt::DerivedArgList *ToolChain::TranslateOffloadTargetArgs(
     // at all, target and host share a toolchain.
     if (A->getOption().matches(options::OPT_m_Group)) {
       // AMD GPU is a special case, as -mcpu is required for the device
-      // compilation.
-      if (SameTripleAsHost || getTriple().getArch() == llvm::Triple::amdgcn)
+      // compilation, except for SYCL which uses --offload-arch.
+      if (SameTripleAsHost || (getTriple().getArch() == llvm::Triple::amdgcn &&
+                               DeviceOffloadKind != Action::OFK_SYCL))
         DAL->append(A);
       else
         Modified = true;
diff --git a/clang/test/Driver/sycl-offload-amdgcn.cpp b/clang/test/Driver/sycl-offload-amdgcn.cpp
@@ -3,38 +3,44 @@
 
 // UNSUPPORTED: system-windows
 
+// Check that the offload arch is required
+// RUN: %clangxx -### -std=c++11 -target x86_64-unknown-linux-gnu -fsycl \
+// RUN: -fsycl-targets=amdgcn-amd-amdhsa-sycldevice %s 2>&1 \
+// RUN: | FileCheck -check-prefix=CHK-ARCH %s
+// CHK-ARCH: error: missing AMDGPU architecture for SYCL offloading; specify it with '-Xsycl-target-backend --offload-arch'
+
 /// Check action graph.
 // RUN: %clangxx -### -std=c++11 -target x86_64-unknown-linux-gnu -fsycl \
-// RUN: -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -mcpu=gfx906 \
+// RUN: -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -Xsycl-target-backend --offload-arch=gfx906 \
 // RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc %s 2>&1 \
 // RUN: | FileCheck -check-prefix=CHK-ACTIONS %s
 // CHK-ACTIONS: "-cc1" "-triple" "amdgcn-amd-amdhsa-sycldevice" "-aux-triple" "x86_64-unknown-linux-gnu"{{.*}} "-fsycl-is-device"{{.*}} "-Wno-sycl-strict"{{.*}} "-sycl-std=2020" {{.*}} "-internal-isystem" "{{.*}}bin{{[/\\]+}}..{{[/\\]+}}include{{[/\\]+}}sycl"{{.*}} "-mlink-builtin-bitcode" "{{.*}}libspirv.bc"{{.*}} "-target-cpu" "gfx906"{{.*}} "-std=c++11"{{.*}}
 // CHK-ACTIONS-NOT: "-mllvm -sycl-opt"
-// CHK-ACTIONS: clang-offload-wrapper"{{.*}} "-host=x86_64-unknown-linux-gnu" "-target=amdgcn" "-kind=sycl"{{.*}}
+// CHK-ACTIONS: clang-offload-wrapper"{{.*}} "-host=x86_64-unknown-linux-gnu" "-compile-opts=--offload-arch=gfx906" "-target=amdgcn" "-kind=sycl"{{.*}}
 
 /// Check phases w/out specifying a compute capability.
 // RUN: %clangxx -ccc-print-phases -std=c++11 -target x86_64-unknown-linux-gnu -fsycl \
-// RUN: -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -mcpu=gfx906 %s 2>&1 \
+// RUN: -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -Xsycl-target-backend --offload-arch=gfx906 %s 2>&1 \
 // RUN: | FileCheck -check-prefix=CHK-PHASES-NO-CC %s
 // CHK-PHASES-NO-CC: 0: input, "{{.*}}", c++, (host-sycl)
 // CHK-PHASES-NO-CC: 1: append-footer, {0}, c++, (host-sycl)
 // CHK-PHASES-NO-CC: 2: preprocessor, {1}, c++-cpp-output, (host-sycl)
-// CHK-PHASES-NO-CC: 3: input, "{{.*}}", c++, (device-sycl)
-// CHK-PHASES-NO-CC: 4: preprocessor, {3}, c++-cpp-output, (device-sycl)
-// CHK-PHASES-NO-CC: 5: compiler, {4}, ir, (device-sycl)
-// CHK-PHASES-NO-CC: 6: offload, "host-sycl (x86_64-unknown-linux-gnu)" {2}, "device-sycl (amdgcn-amd-amdhsa-sycldevice)" {5}, c++-cpp-output
+// CHK-PHASES-NO-CC: 3: input, "{{.*}}", c++, (device-sycl, gfx906)
+// CHK-PHASES-NO-CC: 4: preprocessor, {3}, c++-cpp-output, (device-sycl, gfx906)
+// CHK-PHASES-NO-CC: 5: compiler, {4}, ir, (device-sycl, gfx906)
+// CHK-PHASES-NO-CC: 6: offload, "host-sycl (x86_64-unknown-linux-gnu)" {2}, "device-sycl (amdgcn-amd-amdhsa-sycldevice:gfx906)" {5}, c++-cpp-output
 // CHK-PHASES-NO-CC: 7: compiler, {6}, ir, (host-sycl)
 // CHK-PHASES-NO-CC: 8: backend, {7}, assembler, (host-sycl)
 // CHK-PHASES-NO-CC: 9: assembler, {8}, object, (host-sycl)
 // CHK-PHASES-NO-CC: 10: linker, {9}, image, (host-sycl)
-// CHK-PHASES-NO-CC: 11: linker, {5}, ir, (device-sycl)
-// CHK-PHASES-NO-CC: 12: sycl-post-link, {11}, ir, (device-sycl)
-// CHK-PHASES-NO-CC: 13: file-table-tform, {12}, ir, (device-sycl)
-// CHK-PHASES-NO-CC: 14: backend, {13}, assembler, (device-sycl)
-// CHK-PHASES-NO-CC: 15: assembler, {14}, object, (device-sycl)
-// CHK-PHASES-NO-CC: 16: linker, {15}, image, (device-sycl)
-// CHK-PHASES-NO-CC: 17: linker, {16}, hip-fatbin, (device-sycl)
-// CHK-PHASES-NO-CC: 18: foreach, {13, 17}, hip-fatbin, (device-sycl)
-// CHK-PHASES-NO-CC: 19: file-table-tform, {12, 18}, tempfiletable, (device-sycl)
-// CHK-PHASES-NO-CC: 20: clang-offload-wrapper, {19}, object, (device-sycl)
-// CHK-PHASES-NO-CC: 21: offload, "host-sycl (x86_64-unknown-linux-gnu)" {10}, "device-sycl (amdgcn-amd-amdhsa-sycldevice)" {20}, image
+// CHK-PHASES-NO-CC: 11: linker, {5}, ir, (device-sycl, gfx906)
+// CHK-PHASES-NO-CC: 12: sycl-post-link, {11}, ir, (device-sycl, gfx906)
+// CHK-PHASES-NO-CC: 13: file-table-tform, {12}, ir, (device-sycl, gfx906)
+// CHK-PHASES-NO-CC: 14: backend, {13}, assembler, (device-sycl, gfx906)
+// CHK-PHASES-NO-CC: 15: assembler, {14}, object, (device-sycl, gfx906)
+// CHK-PHASES-NO-CC: 16: linker, {15}, image, (device-sycl, gfx906)
+// CHK-PHASES-NO-CC: 17: linker, {16}, hip-fatbin, (device-sycl, gfx906)
+// CHK-PHASES-NO-CC: 18: foreach, {13, 17}, hip-fatbin, (device-sycl, gfx906)
+// CHK-PHASES-NO-CC: 19: file-table-tform, {12, 18}, tempfiletable, (device-sycl, gfx906)
+// CHK-PHASES-NO-CC: 20: clang-offload-wrapper, {19}, object, (device-sycl, gfx906)
+// CHK-PHASES-NO-CC: 21: offload, "host-sycl (x86_64-unknown-linux-gnu)" {10}, "device-sycl (amdgcn-amd-amdhsa-sycldevice:gfx906)" {20}, image
diff --git a/clang/test/Driver/sycl-triple-dae-flags.cpp b/clang/test/Driver/sycl-triple-dae-flags.cpp
@@ -1,7 +1,7 @@
 // RUN: %clangxx -### -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -fsycl-dead-args-optimization %s 2> %t.cuda.out
 // RUN: FileCheck %s --input-file %t.cuda.out
 //
-// RUN: %clangxx -### -fsycl -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -fsycl-dead-args-optimization %s 2> %t.rocm.out
+// RUN: %clangxx -### -fsycl -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -Xsycl-target-backend --offload-arch=gfx906 -fsycl-dead-args-optimization %s 2> %t.rocm.out
 // RUN: FileCheck %s --input-file %t.rocm.out
 // CHECK-NOT: -fenable-sycl-dae
 // CHECK-NOT: -emit-param-info
diff --git a/sycl/doc/GetStartedGuide.md b/sycl/doc/GetStartedGuide.md
@@ -547,11 +547,14 @@ clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice \
   simple-sycl-app.cpp -o simple-sycl-app-cuda.exe
 ```
 
-When building for ROCm, please note that the option `mcpu` must be specified, use the ROCm target triple as follows:
+When building for ROCm, use the ROCm target triple and specify the
+target architecture with `-Xsycl-target-backend --offload-arch=<arch>`
+as follows:
 
 ```bash
 clang++ -fsycl -fsycl-targets=amdgcn-amd-amdhsa-sycldevice \
-  -mcpu=gfx906 simple-sycl-app.cpp -o simple-sycl-app-cuda.exe
+  -Xsycl-target-backend --offload-arch=gfx906              \
+  simple-sycl-app.cpp -o simple-sycl-app-amd.exe
 ```
 
 To build simple-sycl-app ahead of time for GPU, CPU or Accelerator devices,