Skip to content

Commit c701af3

Browse files
MrSidimsvladimirlaz
authored andcommitted
[SYCL] Emulating OOO execution for in-order queues.
For devices which do not support out-of-order queues they are emulated by creating multiple in-order queues and dispatching kernels to these queues in parallel. Signed-off-by: Dmitry Sidorov <[email protected]> Signed-off-by: Vladimir Lazarev <[email protected]>
1 parent 36d4d1d commit c701af3

File tree

2 files changed

+213
-21
lines changed

2 files changed

+213
-21
lines changed

sycl/include/CL/sycl/detail/queue_impl.hpp

Lines changed: 65 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ namespace cl {
2020
namespace sycl {
2121
namespace detail {
2222

23+
// Set max number of queues supported by FPGA RT.
24+
const size_t MaxNumQueues = 256;
25+
2326
class queue_impl {
2427
public:
2528
queue_impl(const device &SyclDevice, async_handler AsyncHandler,
@@ -28,26 +31,7 @@ class queue_impl {
2831
m_PropList(PropList), m_HostQueue(m_Device.is_host()) {
2932
m_OpenCLInterop = !m_HostQueue;
3033
if (!m_HostQueue) {
31-
cl_command_queue_properties CreationFlags =
32-
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
33-
34-
if (m_PropList.has_property<property::queue::enable_profiling>()) {
35-
CreationFlags |= CL_QUEUE_PROFILING_ENABLE;
36-
}
37-
38-
cl_int Error = CL_SUCCESS;
39-
#ifdef CL_VERSION_2_0
40-
vector_class<cl_queue_properties> CreationFlagProperties = {
41-
CL_QUEUE_PROPERTIES, CreationFlags, 0};
42-
m_CommandQueue = clCreateCommandQueueWithProperties(
43-
m_Context.get(), m_Device.get(), CreationFlagProperties.data(),
44-
&Error);
45-
#else
46-
m_CommandQueue = clCreateCommandQueue(m_Context.get(), m_Device.get(),
47-
CreationFlags, &Error);
48-
#endif
49-
CHECK_OCL_CODE(Error);
50-
// TODO catch an exception and put it to list of asynchronous exceptions
34+
m_CommandQueue = createQueue();
5135
}
5236
}
5337

@@ -132,7 +116,61 @@ class queue_impl {
132116
m_Exceptions.clear();
133117
}
134118

135-
cl_command_queue &getHandleRef() { return m_CommandQueue; }
119+
cl_command_queue createQueue() const {
120+
cl_command_queue_properties CreationFlags = 0;
121+
122+
// FPGA RT can't handle out of order queue - create in order queue instead
123+
if (!m_Device.is_accelerator()) {
124+
CreationFlags = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
125+
}
126+
127+
if (m_PropList.has_property<property::queue::enable_profiling>()) {
128+
CreationFlags |= CL_QUEUE_PROFILING_ENABLE;
129+
}
130+
131+
cl_int Error = CL_SUCCESS;
132+
cl_command_queue Queue;
133+
#ifdef CL_VERSION_2_0
134+
cl_queue_properties CreationFlagProperties[] = {
135+
CL_QUEUE_PROPERTIES, CreationFlags, 0};
136+
Queue = clCreateCommandQueueWithProperties(
137+
m_Context.get(), m_Device.get(), CreationFlagProperties,
138+
&Error);
139+
#else
140+
Queue = clCreateCommandQueue(m_Context.get(), m_Device.get(),
141+
CreationFlags, &Error);
142+
#endif
143+
CHECK_OCL_CODE(Error);
144+
// TODO catch an exception and put it to list of asynchronous exceptions
145+
146+
return Queue;
147+
}
148+
149+
cl_command_queue &getHandleRef() {
150+
if (!m_Device.is_accelerator()) {
151+
return m_CommandQueue;
152+
}
153+
154+
// To achive parallelism for FPGA with in order execution model with
155+
// possibility of two kernels to share data with each other we shall
156+
// create a queue for every kernel enqueued.
157+
if (m_Queues.empty()) {
158+
m_Queues.push_back(m_CommandQueue);
159+
return m_CommandQueue;
160+
}
161+
else if (m_Queues.size() < MaxNumQueues) {
162+
m_Queues.push_back(createQueue());
163+
return m_Queues.back();
164+
}
165+
166+
// If the limit of OpenCL queues is going to be exceeded - take the earliest
167+
// used queue, wait until it finished and then reuse it.
168+
m_QueueNumber %= MaxNumQueues;
169+
size_t FreeQueueNum = m_QueueNumber++;
170+
171+
CHECK_OCL_CODE(clFinish(m_Queues[FreeQueueNum]));
172+
return m_Queues[FreeQueueNum];
173+
}
136174

137175
template <typename propertyT> bool has_property() const {
138176
return m_PropList.has_property<propertyT>();
@@ -161,6 +199,12 @@ class queue_impl {
161199
property_list m_PropList;
162200

163201
cl_command_queue m_CommandQueue = nullptr;
202+
203+
// List of OpenCL queues created for FPGA device from a single SYCL queue.
204+
vector_class<cl_command_queue> m_Queues;
205+
// Iterator through m_Queues.
206+
size_t m_QueueNumber = 0;
207+
164208
bool m_OpenCLInterop = false;
165209
bool m_HostQueue = false;
166210
};

sycl/test/fpga_tests/fpga_queue.cpp

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
// RUN: %clang -std=c++11 -fsycl %s -o %t.out -lstdc++ -lOpenCL -lsycl
2+
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
3+
// RUN: %ACC_RUN_PLACEHOLDER %t.out
4+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
5+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
6+
7+
#include <CL/sycl.hpp>
8+
#include <iostream>
9+
10+
using namespace cl::sycl;
11+
12+
const int dataSize = 32;
13+
const int maxNumQueues = 256;
14+
15+
void GetCLQueue(event sycl_event, std::set<cl_command_queue>& cl_queues) {
16+
try {
17+
cl_command_queue cl_queue;
18+
cl_event cl_event = sycl_event.get();
19+
cl_int error = clGetEventInfo(cl_event, CL_EVENT_COMMAND_QUEUE,
20+
sizeof(cl_queue), &cl_queue, nullptr);
21+
assert(CL_SUCCESS == error && "Failed to obtain queue from OpenCL event");
22+
23+
cl_queues.insert(cl_queue);
24+
} catch (invalid_object_error e) {
25+
std::cout << "Failed to get OpenCL queue from SYCL event: " << e.what()
26+
<< std::endl;
27+
}
28+
}
29+
30+
int main() {
31+
int data[dataSize] = {0};
32+
33+
{
34+
queue Queue;
35+
std::set<cl_command_queue> cl_queues;
36+
event sycl_event;
37+
38+
// Purpose of this test is to check how many OpenCL queues are being
39+
// created from 1 SYCL queue for FPGA device. For that we submit 3 kernels
40+
// expecting 3 OpenCL queues created as a result.
41+
buffer<int, 1> bufA (data, range<1>(dataSize));
42+
buffer<int, 1> bufB (data, range<1>(dataSize));
43+
buffer<int, 1> bufC (data, range<1>(dataSize));
44+
45+
sycl_event = Queue.submit([&](handler& cgh) {
46+
auto writeBuffer = bufA.get_access<access::mode::write>(cgh);
47+
48+
// Create a range.
49+
auto myRange = range<1>(dataSize);
50+
51+
// Create a kernel.
52+
auto myKernel = ([=](id<1> idx) {
53+
writeBuffer[idx] = idx[0];
54+
});
55+
56+
cgh.parallel_for<class fpga_writer_1>(myRange, myKernel);
57+
});
58+
GetCLQueue(sycl_event, cl_queues);
59+
60+
sycl_event = Queue.submit([&](handler& cgh) {
61+
auto writeBuffer = bufB.get_access<access::mode::write>(cgh);
62+
63+
// Create a range.
64+
auto myRange = range<1>(dataSize);
65+
66+
// Create a kernel.
67+
auto myKernel = ([=](id<1> idx) {
68+
writeBuffer[idx] = idx[0];
69+
});
70+
71+
cgh.parallel_for<class fpga_writer_2>(myRange, myKernel);
72+
});
73+
GetCLQueue(sycl_event, cl_queues);
74+
75+
sycl_event = Queue.submit([&](handler& cgh) {
76+
auto readBufferA = bufA.get_access<access::mode::read>(cgh);
77+
auto readBufferB = bufB.get_access<access::mode::read>(cgh);
78+
auto writeBuffer = bufC.get_access<access::mode::write>(cgh);
79+
80+
// Create a range.
81+
auto myRange = range<1>(dataSize);
82+
83+
// Create a kernel.
84+
auto myKernel = ([=](id<1> idx) {
85+
writeBuffer[idx] = readBufferA[idx] + readBufferB[idx];
86+
});
87+
88+
cgh.parallel_for<class fpga_calculator>(myRange, myKernel);
89+
});
90+
GetCLQueue(sycl_event, cl_queues);
91+
92+
int result = cl_queues.size();
93+
device dev = Queue.get_device();
94+
int expected_result = dev.is_accelerator() ? 3 : dev.is_host() ? 0 : 1;
95+
96+
if (expected_result != result) {
97+
std::cout << "Result Num of queues = " << result << std::endl
98+
<< "Expected Num of queues = 3" << std::endl;
99+
100+
return -1;
101+
}
102+
103+
auto readBufferC = bufC.get_access<access::mode::read>();
104+
for (size_t i = 0; i != dataSize; ++i) {
105+
if (readBufferC[i] != 2 * i) {
106+
std::cout << "Result mismatches " << readBufferC[i] << " Vs expected "
107+
<< 2 * i << " for index " << i << std::endl;
108+
}
109+
}
110+
}
111+
112+
{
113+
queue Queue;
114+
std::set<cl_command_queue> cl_queues;
115+
event sycl_event;
116+
117+
// Check limits of OpenCL queues creation for accelerator device.
118+
buffer<int, 1> buf (&data[0], range<1>(1));
119+
120+
for (size_t i = 0; i != maxNumQueues + 1; ++i) {
121+
sycl_event = Queue.submit([&](handler& cgh) {
122+
auto Buffer = buf.get_access<access::mode::write>(cgh);
123+
124+
// Create a kernel.
125+
auto myKernel = ([=]() {
126+
Buffer[0] = 0;
127+
});
128+
129+
cgh.single_task<class fpga_kernel>(myKernel);
130+
});
131+
GetCLQueue(sycl_event, cl_queues);
132+
}
133+
134+
int result = cl_queues.size();
135+
device dev = Queue.get_device();
136+
int expected_result = dev.is_accelerator() ? maxNumQueues :
137+
dev.is_host() ? 0 : 1;
138+
139+
if (expected_result != result) {
140+
std::cout << "Result Num of queues = " << result << std::endl
141+
<< "Expected Num of queues = " << maxNumQueues << std::endl;
142+
143+
return -1;
144+
}
145+
}
146+
147+
return 0;
148+
}

0 commit comments

Comments
 (0)