[SYCL] Add tests for reuse of l0 events in plugin (#1263)

againull · web-flow · commit f63f13ebe824 · 2022-09-15T11:33:58.000-07:00
diff --git a/SYCL/Plugin/level-zero-event-leak.cpp b/SYCL/Plugin/level-zero-event-leak.cpp
@@ -3,7 +3,12 @@
 // UNSUPPORTED: windows
 //
 // RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %level_zero_options %s -o %t.out
-// RUN: env SYCL_DEVICE_FILTER=level_zero ZE_DEBUG=4 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_DEVICE_FILTER=level_zero ZE_DEBUG=4 %GPU_RUN_PLACEHOLDER %t.out wait 2>&1 %GPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_DEVICE_FILTER=level_zero ZE_DEBUG=4 %GPU_RUN_PLACEHOLDER %t.out nowait 2>&1 %GPU_CHECK_PLACEHOLDER
+//
+// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %level_zero_options %s -DCHECK_INORDER -o %t.inorder.out
+// RUN: env SYCL_DEVICE_FILTER=level_zero ZE_DEBUG=4 %GPU_RUN_PLACEHOLDER %t.inorder.out wait 2>&1 %GPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_DEVICE_FILTER=level_zero ZE_DEBUG=4 %GPU_RUN_PLACEHOLDER %t.inorder.out nowait 2>&1 %GPU_CHECK_PLACEHOLDER
 //
 // CHECK-NOT: LEAK
 
@@ -36,10 +41,31 @@
 #include <sycl/sycl.hpp>
 
 int main(int argc, char **argv) {
+  assert(argc == 2 && "Invalid number of arguments");
+  std::string use_queue_finish(argv[1]);
+
+  bool Use = false;
+  if (use_queue_finish == "wait") {
+    Use = true;
+    std::cerr << "Use queue::wait" << std::endl;
+  } else if (use_queue_finish == "nowait") {
+    std::cerr << "No wait. Ensure resources are released anyway" << std::endl;
+  } else {
+    assert(0 && "Unsupported parameter value");
+  }
+
+#ifdef CHECK_INORDER
+  sycl::queue Q({sycl::property::queue::in_order{}});
+#else
   sycl::queue Q;
+#endif
+
   const unsigned n_chunk = 1000;
   for (int i = 0; i < n_chunk; i++)
     Q.single_task([=]() {});
-  Q.wait();
+
+  if (Use)
+    Q.wait();
+
   return 0;
 }
diff --git a/SYCL/Plugin/level_zero_inorder.cpp b/SYCL/Plugin/level_zero_inorder.cpp
@@ -0,0 +1,128 @@
+// REQUIRES: level_zero
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+//
+// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=0 SYCL_DEVICE_FILTER=level_zero %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=1 SYCL_DEVICE_FILTER=level_zero %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=2 SYCL_DEVICE_FILTER=level_zero %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=3 SYCL_DEVICE_FILTER=level_zero %GPU_RUN_PLACEHOLDER %t.out
+//
+// The test checks that the kernels are executed in-order, regardless of
+// batching. IMPORTANT NOTE: this is a critical test, double-check if your
+// changes are related to L0 events and links between commands.
+
+#include <CL/sycl.hpp>
+#include <cassert>
+#include <iostream>
+#include <numeric>
+
+static constexpr int MAGIC_NUM1 = 2;
+static constexpr int buffer_size = 100;
+sycl::usm::alloc AllocType = sycl::usm::alloc::shared;
+
+const size_t PartSize = 5;
+const bool PartiallyPrint = buffer_size > 2 * PartSize;
+
+void ValidationPrint(const std::string &vectName, int *vect,
+                     const std::function<int(size_t)> &ExpectedVal) {
+  std::cerr << vectName;
+  if (!PartiallyPrint) {
+    for (size_t i = 0u; i < buffer_size; ++i) {
+      std::cerr << " " << vect[i];
+    }
+  } else {
+    for (size_t i = 0u; i < PartSize; ++i) {
+      std::cerr << " " << vect[i];
+    }
+    std::cerr << " ... ";
+    for (size_t i = buffer_size - PartSize; i < buffer_size; ++i) {
+      std::cerr << " " << vect[i];
+    }
+  }
+
+  std::cerr << std::endl << "expected[] = ";
+  if (!PartiallyPrint) {
+    for (size_t i = 0u; i < buffer_size; ++i) {
+      std::cerr << " " << ExpectedVal(i);
+    }
+  } else {
+    for (size_t i = 0u; i < PartSize; ++i) {
+      std::cerr << " " << ExpectedVal(i);
+    }
+    std::cerr << " ... ";
+    for (size_t i = buffer_size - PartSize; i < buffer_size; ++i) {
+      std::cerr << " " << ExpectedVal(i);
+    }
+  }
+  std::cerr << std::endl;
+  for (int i = 0; i < buffer_size; ++i) {
+    if (vect[i] != ExpectedVal(i)) {
+      std::cerr << "i = " << i << " is wrong!!! " << std::endl;
+      break;
+    }
+  }
+  std::cerr << std::endl;
+}
+
+void IfTrueIncrementByValue(sycl::queue Q, sycl::range<1> Range, int *Harray,
+                            int ValueToCheck, int ValueToIncrement) {
+  Q.submit([&](sycl::handler &CGH) {
+    CGH.parallel_for<class increment_usm>(Range, [=](sycl::item<1> itemID) {
+      size_t i = itemID.get_id(0);
+      if (Harray[i] == ValueToCheck) {
+        Harray[i] += ValueToIncrement;
+      }
+    });
+  });
+}
+
+void RunCalculation(sycl::queue Q) {
+  sycl::range<1> Range(buffer_size);
+  auto Dev = Q.get_device();
+  if (!Dev.has(sycl::aspect::usm_shared_allocations))
+    return;
+
+  int *values = sycl::malloc<int>(buffer_size, Dev, Q.get_context(), AllocType);
+
+  try {
+    Q.submit([&](sycl::handler &cgh) {
+      cgh.parallel_for(Range, [=](sycl::item<1> itemID) {
+        size_t i = itemID.get_id(0);
+        values[i] = 1;
+      });
+    });
+
+    IfTrueIncrementByValue(Q, Range, values, 1, 10);
+
+    IfTrueIncrementByValue(Q, Range, values, 11, 100);
+
+    IfTrueIncrementByValue(Q, Range, values, 111, 1000);
+
+    IfTrueIncrementByValue(Q, Range, values, 1111, 10000);
+
+    IfTrueIncrementByValue(Q, Range, values, 11111, 100000);
+
+    Q.wait();
+
+    ValidationPrint("vector[]  = ", values, [&](size_t i) { return 111111; });
+
+    for (int i = 0; i < buffer_size; ++i) {
+      int expected = 111111;
+      assert(values[i] == expected);
+    }
+
+  } catch (sycl::exception &e) {
+    std::cout << "Exception: " << std::string(e.what()) << std::endl;
+  }
+
+  free(values, Q);
+}
+
+int main(int argc, char *argv[]) {
+  sycl::queue Q({sycl::property::queue::in_order{}});
+
+  RunCalculation(Q);
+
+  std::cout << "The test passed." << std::endl;
+  return 0;
+}
diff --git a/SYCL/Plugin/level_zero_inorder_interleaving_kernel_copy.cpp b/SYCL/Plugin/level_zero_inorder_interleaving_kernel_copy.cpp
@@ -0,0 +1,159 @@
+// REQUIRES: level_zero
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+//
+// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=0 SYCL_DEVICE_FILTER=level_zero %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=1 SYCL_DEVICE_FILTER=level_zero %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=2 SYCL_DEVICE_FILTER=level_zero %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=3 SYCL_DEVICE_FILTER=level_zero %GPU_RUN_PLACEHOLDER %t.out
+//
+// The test checks that interleaving using copy and kernel operations are
+// performed in-order, regardless of batching. IMPORTANT NOTE: this is a
+// critical test, double-check if your changes are related to L0 events and
+// links between commands.
+
+#include <CL/sycl.hpp>
+#include <cassert>
+#include <iostream>
+#include <numeric>
+
+static constexpr int MAGIC_NUM1 = 2;
+static constexpr int buffer_size = 100;
+sycl::usm::alloc AllocType = sycl::usm::alloc::device;
+
+const size_t PartSize = 5;
+const bool PartiallyPrint = buffer_size > 2 * PartSize;
+
+void ValidationPrint(const std::string &vectName, const std::vector<int> &vect,
+                     const std::function<int(size_t)> &ExpectedVal) {
+  std::cerr << vectName;
+  if (!PartiallyPrint) {
+    for (size_t i = 0u; i < buffer_size; ++i) {
+      std::cerr << " " << vect[i];
+    }
+  } else {
+    for (size_t i = 0u; i < PartSize; ++i) {
+      std::cerr << " " << vect[i];
+    }
+    std::cerr << " ... ";
+    for (size_t i = buffer_size - PartSize; i < buffer_size; ++i) {
+      std::cerr << " " << vect[i];
+    }
+  }
+
+  std::cerr << std::endl << "expected[] = ";
+  if (!PartiallyPrint) {
+    for (size_t i = 0u; i < buffer_size; ++i) {
+      std::cerr << " " << ExpectedVal(i);
+    }
+  } else {
+    for (size_t i = 0u; i < PartSize; ++i) {
+      std::cerr << " " << ExpectedVal(i);
+    }
+    std::cerr << " ... ";
+    for (size_t i = buffer_size - PartSize; i < buffer_size; ++i) {
+      std::cerr << " " << ExpectedVal(i);
+    }
+  }
+  std::cerr << std::endl;
+  for (int i = 0; i < buffer_size; ++i) {
+    if (vect[i] != ExpectedVal(i)) {
+      std::cerr << "i = " << i << " is wrong!!! " << std::endl;
+      break;
+    }
+  }
+  std::cerr << std::endl;
+}
+
+void RunCalculation(sycl::queue Q) {
+  sycl::range<1> Range(buffer_size);
+  auto Dev = Q.get_device();
+  if (!Dev.has(sycl::aspect::usm_device_allocations))
+    return;
+
+  int *Dvalues =
+      sycl::malloc<int>(buffer_size, Dev, Q.get_context(), AllocType);
+  int *DvaluesTmp =
+      sycl::malloc<int>(buffer_size, Dev, Q.get_context(), AllocType);
+
+  std::vector<int> Hvalues1(buffer_size, 0);
+  std::vector<int> HvaluesTmp(buffer_size, 0);
+  std::iota(Hvalues1.begin(), Hvalues1.end(), 0);
+
+  try {
+    Q.memcpy(Dvalues, Hvalues1.data(), buffer_size * sizeof(int));
+
+    Q.submit([&](sycl::handler &cgh) {
+      cgh.parallel_for(Range, [=](sycl::item<1> itemID) {
+        size_t i = itemID.get_id(0);
+        if (Dvalues[i] == i)
+          Dvalues[i] += 1;
+      });
+    });
+
+    Q.submit([&](sycl::handler &cgh) {
+      cgh.parallel_for(Range, [=](sycl::item<1> itemID) {
+        size_t i = itemID.get_id(0);
+        if (Dvalues[i] == i + 1)
+          Dvalues[i] += 10;
+      });
+    });
+
+    Q.memcpy(Hvalues1.data(), Dvalues, buffer_size * sizeof(int));
+    Q.memcpy(DvaluesTmp, Hvalues1.data(), buffer_size * sizeof(int));
+
+    Q.submit([&](sycl::handler &cgh) {
+      cgh.parallel_for(Range, [=](sycl::item<1> itemID) {
+        size_t i = itemID.get_id(0);
+        if (Dvalues[i] == i + 11)
+          if (DvaluesTmp[i] == i + 11)
+            Dvalues[i] += 100;
+      });
+    });
+
+    Q.submit([&](sycl::handler &cgh) {
+      cgh.parallel_for(Range, [=](sycl::item<1> itemID) {
+        size_t i = itemID.get_id(0);
+        if (Dvalues[i] == i + 111)
+          Dvalues[i] += 1000;
+      });
+    });
+
+    Q.submit([&](sycl::handler &cgh) {
+      cgh.parallel_for(Range, [=](sycl::item<1> itemID) {
+        size_t i = itemID.get_id(0);
+        if (Dvalues[i] == i + 1111)
+          Dvalues[i] += 10000;
+      });
+    });
+
+    Q.memcpy(Hvalues1.data(), Dvalues, buffer_size * sizeof(int));
+    Q.memcpy(HvaluesTmp.data(), DvaluesTmp, buffer_size * sizeof(int));
+    Q.wait();
+
+    ValidationPrint("vector1[]  = ", Hvalues1,
+                    [&](size_t i) { return i + 11111; });
+    ValidationPrint("vector2[]  = ", HvaluesTmp,
+                    [&](size_t i) { return i + 11; });
+
+    for (int i = 0; i < buffer_size; ++i) {
+      int expected = i + 11111;
+      assert(Hvalues1[i] == expected);
+    }
+
+  } catch (sycl::exception &e) {
+    std::cout << "Exception: " << std::string(e.what()) << std::endl;
+  }
+
+  free(Dvalues, Q);
+  free(DvaluesTmp, Q);
+}
+
+int main(int argc, char *argv[]) {
+  sycl::queue Q({sycl::property::queue::in_order{}});
+
+  RunCalculation(Q);
+
+  std::cout << "The test passed." << std::endl;
+  return 0;
+}