[SYCL][Graph][CUDA] Skip unsupported Windows E2E tests (#13764)

EwanC · web-flow · commit c541c2298478 · 2024-05-16T10:15:04.000+02:00
On a CUDA & Windows setup, when shared USM is used, there is an issue with using the allocations concurrently in both device commands and host-tasks. This is based on an underlying CUDA restriction: https://forums.developer.nvidia.com/t/cudamallocmanaged-clarification-needed/67611 > “Applications running on Windows (whether in TCC or WDDM mode) or macOS will use the basic Unified Memory model as on pre-6.x architectures even when they are running on hardware with compute capability 6.x or higher.” > “Simultaneous access to managed memory on devices of compute capability lower than 6.x is not possible.” Therefore, simultaneous access to managed memory on Windows is not possible. This appears in SYCL-Graph tests where the graph has multiple roots, allowing host-tasks branching from one root to run concurrently with device commands from the other root. The issue manifests as a page fault in the host-task when it tries to access a USM allocation. I've created a more minimal test, `sycl/test-e2e/USM/host_task.cpp`, which exhibits the same issue.
diff --git a/sycl/test-e2e/Graph/Explicit/host_task2_multiple_roots.cpp b/sycl/test-e2e/Graph/Explicit/host_task2_multiple_roots.cpp
@@ -7,6 +7,10 @@
 
 // REQUIRES: aspect-usm_shared_allocations
 
+// Concurrent access to shared USM allocations is not supported by CUDA on
+// Windows
+// UNSUPPORTED: cuda && windows
+
 #define GRAPH_E2E_EXPLICIT
 
 #include "../Inputs/host_task2_multiple_roots.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/host_task_multiple_roots.cpp b/sycl/test-e2e/Graph/Explicit/host_task_multiple_roots.cpp
@@ -7,6 +7,10 @@
 
 // REQUIRES: aspect-usm_shared_allocations
 
+// Concurrent access to shared USM allocations is not supported by CUDA on
+// Windows
+// UNSUPPORTED: cuda && windows
+
 #define GRAPH_E2E_EXPLICIT
 
 #include "../Inputs/host_task_multiple_roots.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task2_multiple_roots.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task2_multiple_roots.cpp
@@ -7,6 +7,10 @@
 
 // REQUIRES: aspect-usm_shared_allocations
 
+// Concurrent access to shared USM allocations is not supported by CUDA on
+// Windows
+// UNSUPPORTED: cuda && windows
+
 #define GRAPH_E2E_RECORD_REPLAY
 
 #include "../Inputs/host_task2_multiple_roots.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task_multiple_roots.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task_multiple_roots.cpp
@@ -7,6 +7,10 @@
 
 // REQUIRES: aspect-usm_shared_allocations
 
+// Concurrent access to shared USM allocations is not supported by CUDA on
+// Windows
+// UNSUPPORTED: cuda && windows
+
 #define GRAPH_E2E_RECORD_REPLAY
 
 #include "../Inputs/host_task_multiple_roots.cpp"
diff --git a/sycl/test-e2e/USM/host_task.cpp b/sycl/test-e2e/USM/host_task.cpp
@@ -0,0 +1,45 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// Concurrent access to shared USM allocations is not supported by CUDA on
+// Windows. The conflict occurs when the host-task and device kernel both
+// access USM without a dependency between the commands.
+// UNSUPPORTED: cuda && windows
+
+// REQUIRES: aspect-usm_shared_allocations
+
+#include <sycl/sycl.hpp>
+
+int main() { // Kernel and host-task each write their own shared USM buffer.
+  using namespace sycl;
+  queue Queue{}; // No in_order property is passed to the queue.
+
+  constexpr size_t Size = 1024; // Element count of each allocation.
+  int *PtrA = malloc_shared<int>(Size, Queue); // Written by the kernel.
+  int *PtrB = malloc_shared<int>(Size, Queue); // Written by the host-task.
+
+  Queue.submit([&](handler &CGH) { // Device command: PtrA[i] = i.
+    CGH.parallel_for(range<1>(Size), [=](item<1> id) { PtrA[id] = id; });
+  });
+
+  const int ConstValue = 42; // Fill value the host-task stores into PtrB.
+  Queue.submit([&](handler &CGH) { // No depends_on() on the kernel above.
+    CGH.host_task([=]() { // Captures PtrB and ConstValue by copy.
+      for (size_t i = 0; i < Size; i++) {
+        PtrB[i] = ConstValue;
+      }
+    });
+  });
+
+  Queue.wait_and_throw(); // Wait on the queue before the host-side checks.
+
+  for (size_t i = 0; i < Size; i++) { // Verify both buffers on the host.
+    assert(i == PtrA[i]); // NOTE(review): compares size_t with int.
+    assert(ConstValue == PtrB[i]); // NOTE(review): no <cassert> include here.
+  }
+
+  free(PtrA, Queue); // Release both USM allocations.
+  free(PtrB, Queue);
+
+  return 0;
+}