intel · bader · Nov 30, 2021 · Nov 9, 2021 · s-kanaev · Nov 9, 2021
@@ -67,7 +67,7 @@
 
 // Helper macro to identify if fallback assert is needed
 // FIXME remove __NVPTX__ condition once devicelib supports CUDA
-#if !defined(SYCL_DISABLE_FALLBACK_ASSERT) && !defined(__NVPTX__)
+#if !defined(SYCL_DISABLE_FALLBACK_ASSERT)
 #define __SYCL_USE_FALLBACK_ASSERT 1
 #else
 #define __SYCL_USE_FALLBACK_ASSERT 0
@@ -1188,11 +1188,11 @@ event submitAssertCapture(queue &Self, event &Event, queue *SecondaryQueue,
     auto Acc = Buffer.get_access<access::mode::write>(CGH);
 
     CGH.single_task<__sycl_service_kernel__::AssertInfoCopier>([Acc] {
-#ifdef __SYCL_DEVICE_ONLY__
+#if defined(__SYCL_DEVICE_ONLY__) && !defined(__NVPTX__)
       __devicelib_assert_read(&Acc[0]);
 #else
       (void)Acc;
-#endif // __SYCL_DEVICE_ONLY__
+#endif // defined(__SYCL_DEVICE_ONLY__) && !defined(__NVPTX__)
     });
   };
   auto CheckerCGF = [&CopierEv, &Buffer](handler &CGH) {

@@ -1313,9 +1313,12 @@ static bool compatibleWithDevice(RTDeviceBinaryImage *BinImage,
   pi_uint32 SuitableImageID = std::numeric_limits<pi_uint32>::max();
   pi_device_binary DevBin =
       const_cast<pi_device_binary>(&BinImage->getRawData());
-  Plugin.call<PiApiKind::piextDeviceSelectBinary>(
+  RT::PiResult Error = Plugin.call_nocheck<PiApiKind::piextDeviceSelectBinary>(
       PIDeviceHandle, &DevBin,
       /*num bin images = */ (cl_uint)1, &SuitableImageID);
+  if (Error != PI_SUCCESS && Error != PI_INVALID_BINARY)
+    throw runtime_error("Invalid binary image or device", PI_INVALID_VALUE);
+
   return (0 == SuitableImageID);
 }
 

@@ -0,0 +1,40 @@
+// REQUIRES: cuda || hip_be
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple,spir64 %s -o -
+// RUN: %clangxx -fsycl -fsycl-targets=spir64,%sycl_triple %s -o -
+//
+// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=%sycl_triple,spir64 %s -o -
+// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64,%sycl_triple %s -o -
+//
+// Checks that compiling for multiple targets succeeds regardless of the order
+// they appear in -fsycl-targets, both with and without per-kernel code split.
+
+#include <sycl/sycl.hpp>
+
+using namespace cl::sycl;
+
+int main() { // Never executed by lit: the RUN lines above only compile it.
+  sycl::queue q; // Default-constructed; both kernels are submitted to it.
+
+  float A_Data[5] = {1.1}; // Only element 0 is set; the rest are zero.
+  float B_Data[5] = {0};
+  int C_Data[10] = {0};
+
+  { // Kernel 1: copies A_buff into B_buff element by element, range of 5.
+    buffer<float, 1> A_buff(A_Data, range<1>(5));
+    buffer<float, 1> B_buff(B_Data, range<1>(5));
+    q.submit([&](handler &cgh) {
+       auto A_acc = A_buff.get_access<access::mode::read>(cgh);
+       auto B_acc = B_buff.get_access<access::mode::write>(cgh);
+       cgh.parallel_for(range<1>{5},
+                        [=](id<1> index) { B_acc[index] = A_acc[index]; });
+     }).wait(); // Waits on the event returned by submit.
+  }
+
+  { // Kernel 2: writes 15 to each of the 10 elements of C_buff.
+    buffer<int, 1> C_buff(C_Data, range<1>(10));
+    q.submit([&](handler &cgh) {
+       auto C_acc = C_buff.get_access<access::mode::write>(cgh);
+       cgh.parallel_for(range<1>{10}, [=](id<1> index) { C_acc[index] = 15; });
+     }).wait(); // Waits on the event returned by submit.
+  }
+}