[SYCL] Emulating OOO execution for in-order queues.

MrSidims · vladimirlaz · commit c701af3422ea · 2019-02-11T22:14:59.000+03:00
For devices that do not support out-of-order queues,
out-of-order execution is emulated by creating multiple
in-order queues and dispatching kernels to these queues
in parallel.

Signed-off-by: Dmitry Sidorov <dmitry.sidorov@intel.com>
Signed-off-by: Vladimir Lazarev <vladimir.lazarev@intel.com>
diff --git a/sycl/include/CL/sycl/detail/queue_impl.hpp b/sycl/include/CL/sycl/detail/queue_impl.hpp
@@ -20,6 +20,9 @@ namespace cl {
 namespace sycl {
 namespace detail {
 
+// Upper bound on OpenCL queues created per SYCL queue (limit of the FPGA RT).
+const size_t MaxNumQueues = 256;
+
 class queue_impl {
 public:
   queue_impl(const device &SyclDevice, async_handler AsyncHandler,
@@ -28,26 +31,7 @@ class queue_impl {
         m_PropList(PropList), m_HostQueue(m_Device.is_host()) {
     m_OpenCLInterop = !m_HostQueue;
     if (!m_HostQueue) {
-      cl_command_queue_properties CreationFlags =
-          CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
-
-      if (m_PropList.has_property<property::queue::enable_profiling>()) {
-        CreationFlags |= CL_QUEUE_PROFILING_ENABLE;
-      }
-
-      cl_int Error = CL_SUCCESS;
-#ifdef CL_VERSION_2_0
-      vector_class<cl_queue_properties> CreationFlagProperties = {
-          CL_QUEUE_PROPERTIES, CreationFlags, 0};
-      m_CommandQueue = clCreateCommandQueueWithProperties(
-          m_Context.get(), m_Device.get(), CreationFlagProperties.data(),
-          &Error);
-#else
-      m_CommandQueue = clCreateCommandQueue(m_Context.get(), m_Device.get(),
-                                            CreationFlags, &Error);
-#endif
-      CHECK_OCL_CODE(Error);
-      // TODO catch an exception and put it to list of asynchronous exceptions
+      m_CommandQueue = createQueue();
     }
   }
 
@@ -132,7 +116,61 @@ class queue_impl {
     m_Exceptions.clear();
   }
 
-  cl_command_queue &getHandleRef() { return m_CommandQueue; }
+  // Creates a new OpenCL command queue on this queue's context and device.
+  // Accelerator devices get an in-order queue; every other device gets an
+  // out-of-order one. Profiling is enabled when the enable_profiling property
+  // is present. The creation status is passed to CHECK_OCL_CODE.
+  cl_command_queue createQueue() const {
+    // FPGA RT can't handle out of order queue - create in order queue instead
+    cl_command_queue_properties Flags =
+        m_Device.is_accelerator() ? 0 : CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
+
+    const bool WantsProfiling =
+        m_PropList.has_property<property::queue::enable_profiling>();
+    if (WantsProfiling)
+      Flags |= CL_QUEUE_PROFILING_ENABLE;
+
+    cl_int Status = CL_SUCCESS;
+#ifdef CL_VERSION_2_0
+    // Zero-terminated (name, value) property list.
+    cl_queue_properties Props[] = {CL_QUEUE_PROPERTIES, Flags, 0};
+    cl_command_queue NewQueue = clCreateCommandQueueWithProperties(
+        m_Context.get(), m_Device.get(), Props, &Status);
+#else
+    cl_command_queue NewQueue =
+        clCreateCommandQueue(m_Context.get(), m_Device.get(), Flags, &Status);
+#endif
+    CHECK_OCL_CODE(Status);
+    // TODO catch an exception and put it to list of asynchronous exceptions
+
+    return NewQueue;
+  }
+
+  // Returns the OpenCL queue the next command should be enqueued on.
+  // Non-accelerator devices always use m_CommandQueue. Accelerator queues are
+  // in-order, so each call hands out another queue (up to MaxNumQueues) to let
+  // kernels run in parallel; past the limit, queues are reused round-robin.
+  cl_command_queue &getHandleRef() {
+    if (!m_Device.is_accelerator())
+      return m_CommandQueue;
+
+    if (m_Queues.empty()) {
+      // References to elements are handed out below; reserving the full
+      // capacity now keeps them valid when more queues are appended later.
+      m_Queues.reserve(MaxNumQueues);
+      m_Queues.push_back(m_CommandQueue);
+      return m_CommandQueue;
+    }
+    if (m_Queues.size() < MaxNumQueues) {
+      m_Queues.push_back(createQueue());
+      return m_Queues.back();
+    }
+    // Limit reached: wait for the earliest used queue to finish, then reuse it.
+    m_QueueNumber %= MaxNumQueues;
+    cl_command_queue &Reused = m_Queues[m_QueueNumber++];
+    CHECK_OCL_CODE(clFinish(Reused));
+    return Reused;
+  }
 
   template <typename propertyT> bool has_property() const {
     return m_PropList.has_property<propertyT>();
@@ -161,6 +199,12 @@ class queue_impl {
   property_list m_PropList;
 
   cl_command_queue m_CommandQueue = nullptr;
+
+  // List of OpenCL queues created for FPGA device from a single SYCL queue.
+  vector_class<cl_command_queue> m_Queues;
+  // Index in m_Queues of the next queue to reuse once MaxNumQueues is reached.
+  size_t m_QueueNumber = 0;
+
   bool m_OpenCLInterop = false;
   bool m_HostQueue = false;
 };
diff --git a/sycl/test/fpga_tests/fpga_queue.cpp b/sycl/test/fpga_tests/fpga_queue.cpp
@@ -0,0 +1,148 @@
+// RUN: %clang -std=c++11 -fsycl %s -o %t.out -lstdc++ -lOpenCL -lsycl
+// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+#include <CL/sycl.hpp>
+#include <iostream>
+
+using namespace cl::sycl;
+
+const int dataSize = 32;
+const int maxNumQueues = 256;
+
+// Inserts sycl_event's OpenCL queue into cl_queues; logs invalid_object_error.
+void GetCLQueue(event sycl_event, std::set<cl_command_queue>& cl_queues) {
+  try {
+    cl_command_queue cl_queue = nullptr;
+    cl_event ocl_event = sycl_event.get();
+    cl_int error = clGetEventInfo(ocl_event, CL_EVENT_COMMAND_QUEUE,
+                                  sizeof(cl_queue), &cl_queue, nullptr);
+    assert(CL_SUCCESS == error && "Failed to obtain queue from OpenCL event");
+    cl_queues.insert(cl_queue);
+  } catch (const invalid_object_error& e) {
+    std::cout << "Failed to get OpenCL queue from SYCL event: " << e.what()
+              << std::endl;
+  }
+}
+
+// Submits kernels through a single SYCL queue and checks how many distinct
+// OpenCL queues served them: one per kernel on accelerators (capped at
+// maxNumQueues), one on other OpenCL devices and none on the host device.
+int main() {
+  int data[dataSize] = {0};
+
+  {
+    queue Queue;
+    std::set<cl_command_queue> cl_queues;
+    event sycl_event;
+
+    // Purpose of this test is to check how many OpenCL queues are being
+    // created from 1 SYCL queue for FPGA device. For that we submit 3 kernels
+    // expecting 3 OpenCL queues created as a result.
+    buffer<int, 1> bufA (data, range<1>(dataSize));
+    buffer<int, 1> bufB (data, range<1>(dataSize));
+    buffer<int, 1> bufC (data, range<1>(dataSize));
+
+    sycl_event = Queue.submit([&](handler& cgh) {
+      auto writeBuffer = bufA.get_access<access::mode::write>(cgh);
+
+      auto myRange = range<1>(dataSize);
+
+      // Create a kernel.
+      auto myKernel = ([=](id<1> idx) {
+        writeBuffer[idx] = idx[0];
+      });
+
+      cgh.parallel_for<class fpga_writer_1>(myRange, myKernel);
+    });
+    GetCLQueue(sycl_event, cl_queues);
+
+    sycl_event = Queue.submit([&](handler& cgh) {
+      auto writeBuffer = bufB.get_access<access::mode::write>(cgh);
+
+      auto myRange = range<1>(dataSize);
+
+      // Create a kernel.
+      auto myKernel = ([=](id<1> idx) {
+        writeBuffer[idx] = idx[0];
+      });
+
+      cgh.parallel_for<class fpga_writer_2>(myRange, myKernel);
+    });
+    GetCLQueue(sycl_event, cl_queues);
+
+    sycl_event = Queue.submit([&](handler& cgh) {
+      auto readBufferA = bufA.get_access<access::mode::read>(cgh);
+      auto readBufferB = bufB.get_access<access::mode::read>(cgh);
+      auto writeBuffer = bufC.get_access<access::mode::write>(cgh);
+
+      auto myRange = range<1>(dataSize);
+
+      // Create a kernel.
+      auto myKernel = ([=](id<1> idx) {
+        writeBuffer[idx] = readBufferA[idx] + readBufferB[idx];
+      });
+
+      cgh.parallel_for<class fpga_calculator>(myRange, myKernel);
+    });
+    GetCLQueue(sycl_event, cl_queues);
+
+    int result = cl_queues.size();
+    device dev = Queue.get_device();
+    int expected_result = dev.is_accelerator() ? 3 : dev.is_host() ? 0 : 1;
+
+    if (expected_result != result) {
+      std::cout << "Result Num of queues = " << result << std::endl
+                << "Expected Num of queues = " << expected_result << std::endl;
+      return -1;
+    }
+
+    auto readBufferC = bufC.get_access<access::mode::read>();
+    for (size_t i = 0; i != dataSize; ++i) {
+      if (readBufferC[i] != 2 * i) {
+        std::cout << "Result mismatches " << readBufferC[i] << " Vs expected "
+                  << 2 * i << " for index " << i << std::endl;
+        return -1; // A wrong result must fail the test, not just be printed.
+      }
+    }
+  }
+
+  {
+    queue Queue;
+    std::set<cl_command_queue> cl_queues;
+    event sycl_event;
+
+    // Check limits of OpenCL queues creation for accelerator device.
+    buffer<int, 1> buf (&data[0], range<1>(1));
+
+    for (size_t i = 0; i != maxNumQueues + 1; ++i) {
+      sycl_event = Queue.submit([&](handler& cgh) {
+        auto Buffer = buf.get_access<access::mode::write>(cgh);
+
+        // Create a kernel.
+        auto myKernel = ([=]() {
+          Buffer[0] = 0;
+        });
+
+        cgh.single_task<class fpga_kernel>(myKernel);
+      });
+      GetCLQueue(sycl_event, cl_queues);
+    }
+
+    int result = cl_queues.size();
+    device dev = Queue.get_device();
+    int expected_result = dev.is_accelerator() ? maxNumQueues :
+                          dev.is_host() ? 0 : 1;
+
+    if (expected_result != result) {
+      std::cout << "Result Num of queues = " << result << std::endl
+                << "Expected Num of queues = " << expected_result << std::endl;
+
+      return -1;
+    }
+  }
+
+  return 0;
+}