[SYCL][CUDA] Update program manager and queue to resolve multi-targeting issues (#4921)

AidanBeltonS · web-flow · commit a346c08182f9 · 2021-11-30T15:45:10.000+03:00
This PR makes two changes. First, it moves the macro check that prevents `__devicelib_assert_read` from being used for `nvptx64` devices. This resolves an issue where the binary images of `spir64` and `nvptx64` are neither identical nor disjoint (i.e. having no kernels in common). The program manager requires binary images to be either identical or disjoint, and fails otherwise. With this change, building for `nvptx64` creates a kernel of the same name as when building for `spir64`, but that kernel does not use `__devicelib_assert_read`.
Second, it prevents errors from being thrown in the program manager when the binary compatibility check returns false. This allows multi-targeting to be used together with module splitting.
A CUDA- and HIP-only regression test is added to check that compilation succeeds with the multi-targeting and module-splitting options.
diff --git a/sycl/include/CL/sycl/queue.hpp b/sycl/include/CL/sycl/queue.hpp
@@ -67,7 +67,7 @@
 
 // Helper macro to identify if fallback assert is needed
 // FIXME remove __NVPTX__ condition once devicelib supports CUDA
-#if !defined(SYCL_DISABLE_FALLBACK_ASSERT) && !defined(__NVPTX__)
+#if !defined(SYCL_DISABLE_FALLBACK_ASSERT)
 #define __SYCL_USE_FALLBACK_ASSERT 1
 #else
 #define __SYCL_USE_FALLBACK_ASSERT 0
@@ -1187,11 +1187,11 @@ event submitAssertCapture(queue &Self, event &Event, queue *SecondaryQueue,
     auto Acc = Buffer.get_access<access::mode::write>(CGH);
 
     CGH.single_task<__sycl_service_kernel__::AssertInfoCopier>([Acc] {
-#ifdef __SYCL_DEVICE_ONLY__
+#if defined(__SYCL_DEVICE_ONLY__) && !defined(__NVPTX__)
       __devicelib_assert_read(&Acc[0]);
 #else
       (void)Acc;
-#endif // __SYCL_DEVICE_ONLY__
+#endif // defined(__SYCL_DEVICE_ONLY__) && !defined(__NVPTX__)
     });
   };
   auto CheckerCGF = [&CopierEv, &Buffer](handler &CGH) {
diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
@@ -1313,9 +1313,12 @@ static bool compatibleWithDevice(RTDeviceBinaryImage *BinImage,
   pi_uint32 SuitableImageID = std::numeric_limits<pi_uint32>::max();
   pi_device_binary DevBin =
       const_cast<pi_device_binary>(&BinImage->getRawData());
-  Plugin.call<PiApiKind::piextDeviceSelectBinary>(
+  RT::PiResult Error = Plugin.call_nocheck<PiApiKind::piextDeviceSelectBinary>(
       PIDeviceHandle, &DevBin,
       /*num bin images = */ (cl_uint)1, &SuitableImageID);
+  if (Error != PI_SUCCESS && Error != PI_INVALID_BINARY)
+    throw runtime_error("Invalid binary image or device", PI_INVALID_VALUE);
+
   return (0 == SuitableImageID);
 }
 
diff --git a/sycl/test/regression/multi_targeting.cpp b/sycl/test/regression/multi_targeting.cpp
@@ -0,0 +1,40 @@
+// REQUIRES: cuda || hip_be
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple,spir64 %s -o -
+// RUN: %clangxx -fsycl -fsycl-targets=spir64,%sycl_triple %s -o -
+//
+// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=%sycl_triple,spir64 %s -o -
+// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64,%sycl_triple %s -o -
+//
+// Test checks that compiling for multiple devices works regardless of target
+// order.
+
+#include <sycl/sycl.hpp>
+
+using namespace cl::sycl;
+
+int main() {
+  sycl::queue q;
+
+  float A_Data[5] = {1.1};
+  float B_Data[5] = {0};
+  int C_Data[10] = {0};
+
+  {
+    buffer<float, 1> A_buff(A_Data, range<1>(5));
+    buffer<float, 1> B_buff(B_Data, range<1>(5));
+    q.submit([&](handler &cgh) {
+       auto A_acc = A_buff.get_access<access::mode::read>(cgh);
+       auto B_acc = B_buff.get_access<access::mode::write>(cgh);
+       cgh.parallel_for(range<1>{5},
+                        [=](id<1> index) { B_acc[index] = A_acc[index]; });
+     }).wait();
+  }
+
+  {
+    buffer<int, 1> C_buff(C_Data, range<1>(10));
+    q.submit([&](handler &cgh) {
+       auto C_acc = C_buff.get_access<access::mode::write>(cgh);
+       cgh.parallel_for(range<1>{10}, [=](id<1> index) { C_acc[index] = 15; });
+     }).wait();
+  }
+}