Skip to content

Commit da6bfd0

Browse files
[SYCL] Implement SYCL_INTEL_enqueue_barrier DPC++ extension (#1836)
This patch adds the implementation of SYCL_INTEL_enqueue_barrier DPC++ extension. There are situations where defining dependencies based on events is more explicit than desired or required by an application. For instance, the user may know that a given task depends on all previously submitted tasks. Instead of explicitly adding all the required handler::depends_on calls, the user could express this intent via a single call, making the program more concise and explicit. To simplify the interface, this extension adds two new members to the handler class, and two new members to the queue class: handler::barrier : void barrier() and void barrier(const vector_class<event> &WaitList) queue::submit_barrier : event submit_barrier() and event submit_barrier(const vector_class<event> &WaitList)
1 parent bc5be46 commit da6bfd0

File tree

15 files changed

+271
-20
lines changed

15 files changed

+271
-20
lines changed

sycl/doc/extensions/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ DPC++ extensions status:
1212
| [SYCL_INTEL_data_flow_pipes](DataFlowPipes/data_flow_pipes.asciidoc) | Partially supported(OpenCL: ACCELERATOR) | kernel_host_pipe_support part is not implemented |
1313
| [SYCL_INTEL_deduction_guides](deduction_guides/SYCL_INTEL_deduction_guides.asciidoc) | Supported | |
1414
| [SYCL_INTEL_device_specific_kernel_queries](DeviceSpecificKernelQueries/SYCL_INTEL_device_specific_kernel_queries.asciidoc) | Proposal | |
15-
| [SYCL_INTEL_enqueue_barrier](EnqueueBarrier/enqueue_barrier.asciidoc) | Proposal | |
15+
| [SYCL_INTEL_enqueue_barrier](EnqueueBarrier/enqueue_barrier.asciidoc) | Supported(OpenCL, Level Zero) | |
1616
| [SYCL_INTEL_extended_atomics](ExtendedAtomics/SYCL_INTEL_extended_atomics.asciidoc) | Proposal | |
1717
| [SYCL_INTEL_group_algorithms](GroupAlgorithms/SYCL_INTEL_group_algorithms.asciidoc) | Supported(OpenCL) | |
1818
| [SYCL_INTEL_group_mask](./GroupMask/SYCL_INTEL_group_mask.asciidoc) | Proposal | |

sycl/include/CL/sycl/detail/cg.hpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ class CG {
5555
COPY_ACC_TO_PTR,
5656
COPY_PTR_TO_ACC,
5757
COPY_ACC_TO_ACC,
58+
BARRIER,
59+
BARRIER_WAITLIST,
5860
FILL,
5961
UPDATE_HOST,
6062
RUN_ON_HOST_INTEL,
@@ -320,6 +322,23 @@ class CGHostTask : public CG {
320322
MHostTask(std::move(HostTask)), MArgs(std::move(Args)) {}
321323
};
322324

325+
class CGBarrier : public CG {
326+
public:
327+
vector_class<detail::EventImplPtr> MEventsWaitWithBarrier;
328+
329+
CGBarrier(vector_class<detail::EventImplPtr> EventsWaitWithBarrier,
330+
std::vector<std::vector<char>> ArgsStorage,
331+
std::vector<detail::AccessorImplPtr> AccStorage,
332+
std::vector<std::shared_ptr<const void>> SharedPtrStorage,
333+
std::vector<Requirement *> Requirements,
334+
std::vector<detail::EventImplPtr> Events, CGTYPE Type,
335+
detail::code_location loc = {})
336+
: CG(Type, std::move(ArgsStorage), std::move(AccStorage),
337+
std::move(SharedPtrStorage), std::move(Requirements),
338+
std::move(Events), std::move(loc)),
339+
MEventsWaitWithBarrier(std::move(EventsWaitWithBarrier)) {}
340+
};
341+
323342
} // namespace detail
324343
} // namespace sycl
325344
} // __SYCL_INLINE_NAMESPACE(cl)

sycl/include/CL/sycl/detail/pi.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ _PI_API(piSamplerRelease)
9797
_PI_API(piEnqueueKernelLaunch)
9898
_PI_API(piEnqueueNativeKernel)
9999
_PI_API(piEnqueueEventsWait)
100+
_PI_API(piEnqueueEventsWaitWithBarrier)
100101
_PI_API(piEnqueueMemBufferRead)
101102
_PI_API(piEnqueueMemBufferReadRect)
102103
_PI_API(piEnqueueMemBufferWrite)

sycl/include/CL/sycl/detail/pi.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1226,6 +1226,10 @@ __SYCL_EXPORT pi_result piEnqueueEventsWait(pi_queue command_queue,
12261226
const pi_event *event_wait_list,
12271227
pi_event *event);
12281228

1229+
__SYCL_EXPORT pi_result piEnqueueEventsWaitWithBarrier(
1230+
pi_queue command_queue, pi_uint32 num_events_in_wait_list,
1231+
const pi_event *event_wait_list, pi_event *event);
1232+
12291233
__SYCL_EXPORT pi_result piEnqueueMemBufferRead(
12301234
pi_queue queue, pi_mem buffer, pi_bool blocking_read, size_t offset,
12311235
size_t size, void *ptr, pi_uint32 num_events_in_wait_list,

sycl/include/CL/sycl/handler.hpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1683,6 +1683,29 @@ class __SYCL_EXPORT handler {
16831683
}
16841684
}
16851685

1686+
/// Prevents any commands submitted afterward to this queue from executing
1687+
/// until all commands previously submitted to this queue have entered the
1688+
/// complete state.
1689+
void barrier() {
1690+
throwIfActionIsCreated();
1691+
MCGType = detail::CG::BARRIER;
1692+
}
1693+
1694+
/// Prevents any commands submitted afterward to this queue from executing
1695+
/// until all events in WaitList have entered the complete state. If WaitList
1696+
/// is empty, then the barrier has no effect.
1697+
///
1698+
/// \param WaitList is a vector of valid SYCL events that need to complete
1699+
/// before barrier command can be executed.
1700+
void barrier(const vector_class<event> &WaitList) {
1701+
throwIfActionIsCreated();
1702+
MCGType = detail::CG::BARRIER_WAITLIST;
1703+
MEventsWaitWithBarrier.resize(WaitList.size());
1704+
std::transform(
1705+
WaitList.begin(), WaitList.end(), MEventsWaitWithBarrier.begin(),
1706+
[](const event &Event) { return detail::getSyclObjImpl(Event); });
1707+
}
1708+
16861709
/// Copies data from one memory region to another, both pointed by
16871710
/// USM pointers.
16881711
///
@@ -1766,6 +1789,9 @@ class __SYCL_EXPORT handler {
17661789
std::unique_ptr<detail::InteropTask> MInteropTask;
17671790
/// The list of events that order this operation.
17681791
vector_class<detail::EventImplPtr> MEvents;
1792+
/// The list of valid SYCL events that need to complete
1793+
/// before barrier command can be executed
1794+
vector_class<detail::EventImplPtr> MEventsWaitWithBarrier;
17691795

17701796
bool MIsHost = false;
17711797

sycl/include/CL/sycl/queue.hpp

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,46 @@ class __SYCL_EXPORT queue {
221221
return submit_impl(CGF, SecondaryQueue, CodeLoc);
222222
}
223223

224+
/// Prevents any commands submitted afterward to this queue from executing
225+
/// until all commands previously submitted to this queue have entered the
226+
/// complete state.
227+
///
228+
/// \param CodeLoc is the code location of the submit call (default argument)
229+
/// \return a SYCL event object, which corresponds to the queue the command
230+
/// group is being enqueued on.
231+
event submit_barrier(
232+
#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
233+
const detail::code_location &CodeLoc = detail::code_location::current()
234+
#endif
235+
) {
236+
#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
237+
const detail::code_location &CodeLoc = {};
238+
#endif
239+
return submit([=](handler &CGH) { CGH.barrier(); }, CodeLoc);
240+
}
241+
242+
/// Prevents any commands submitted afterward to this queue from executing
243+
/// until all events in WaitList have entered the complete state. If WaitList
244+
/// is empty, then submit_barrier has no effect.
245+
///
246+
/// \param WaitList is a vector of valid SYCL events that need to complete
247+
/// before barrier command can be executed.
248+
/// \param CodeLoc is the code location of the submit call (default argument)
249+
/// \return a SYCL event object, which corresponds to the queue the command
250+
/// group is being enqueued on.
251+
event submit_barrier(
252+
const vector_class<event> &WaitList
253+
#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
254+
,
255+
const detail::code_location &CodeLoc = detail::code_location::current()
256+
#endif
257+
) {
258+
#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
259+
const detail::code_location &CodeLoc = {};
260+
#endif
261+
return submit([=](handler &CGH) { CGH.barrier(WaitList); }, CodeLoc);
262+
}
263+
224264
/// Performs a blocking wait for the completion of all enqueued tasks in the
225265
/// queue.
226266
///

sycl/plugins/level_zero/pi_level0.cpp

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2503,6 +2503,46 @@ pi_result piEnqueueEventsWait(pi_queue Queue, pi_uint32 NumEventsInWaitList,
25032503
return {};
25042504
}
25052505

2506+
pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue,
2507+
pi_uint32 NumEventsInWaitList,
2508+
const pi_event *EventWaitList,
2509+
pi_event *Event) {
2510+
2511+
assert(Queue);
2512+
2513+
// Get a new command list to be used on this call
2514+
ze_command_list_handle_t ZeCommandList = nullptr;
2515+
if (auto Res = Queue->Context->Device->createCommandList(&ZeCommandList))
2516+
return Res;
2517+
2518+
ze_event_handle_t ZeEvent = nullptr;
2519+
if (Event) {
2520+
auto Res = piEventCreate(Queue->Context, Event);
2521+
if (Res != PI_SUCCESS)
2522+
return Res;
2523+
2524+
(*Event)->Queue = Queue;
2525+
(*Event)->CommandType = PI_COMMAND_TYPE_USER;
2526+
(*Event)->ZeCommandList = ZeCommandList;
2527+
2528+
ZeEvent = (*Event)->ZeEvent;
2529+
}
2530+
2531+
// TODO: use unique_ptr with custom deleter in the whole Level Zero plugin for
2532+
// wrapping ze_event_handle_t *ZeEventWaitList to avoid memory leaks in case
2533+
// return will be called in ZE_CALL(ze***(...)), and thus
2534+
// _pi_event::deleteZeEventList(ZeEventWaitList) won't be called.
2535+
ze_event_handle_t *ZeEventWaitList =
2536+
_pi_event::createZeEventList(NumEventsInWaitList, EventWaitList);
2537+
2538+
ZE_CALL(zeCommandListAppendBarrier(ZeCommandList, ZeEvent,
2539+
NumEventsInWaitList, ZeEventWaitList));
2540+
2541+
_pi_event::deleteZeEventList(ZeEventWaitList);
2542+
2543+
return PI_SUCCESS;
2544+
}
2545+
25062546
pi_result piEnqueueMemBufferRead(pi_queue Queue, pi_mem Src,
25072547
pi_bool BlockingRead, size_t Offset,
25082548
size_t Size, void *Dst,

sycl/plugins/opencl/pi_opencl.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1193,6 +1193,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
11931193
_PI_CL(piEnqueueKernelLaunch, clEnqueueNDRangeKernel)
11941194
_PI_CL(piEnqueueNativeKernel, clEnqueueNativeKernel)
11951195
_PI_CL(piEnqueueEventsWait, clEnqueueMarkerWithWaitList)
1196+
_PI_CL(piEnqueueEventsWaitWithBarrier, clEnqueueBarrierWithWaitList)
11961197
_PI_CL(piEnqueueMemBufferRead, clEnqueueReadBuffer)
11971198
_PI_CL(piEnqueueMemBufferReadRect, clEnqueueReadBufferRect)
11981199
_PI_CL(piEnqueueMemBufferWrite, clEnqueueWriteBuffer)

sycl/source/detail/program_impl.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ class program_impl {
240240
return createSyclObjFromImpl<context>(MContext);
241241
}
242242

243-
// \return the Plugin associated withh the context of this program.
243+
/// \return the Plugin associated with the context of this program.
244244
const plugin &getPlugin() const {
245245
assert(!is_host() && "Plugin is not available for Host.");
246246
return MContext->getPlugin();

sycl/source/detail/scheduler/commands.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1949,6 +1949,32 @@ cl_int ExecCGCommand::enqueueImp() {
19491949

19501950
return CL_SUCCESS;
19511951
}
1952+
case CG::CGTYPE::BARRIER: {
1953+
if (MQueue->get_device().is_host()) {
1954+
// NOP for host device.
1955+
return PI_SUCCESS;
1956+
}
1957+
const detail::plugin &Plugin = MQueue->getPlugin();
1958+
Plugin.call<PiApiKind::piEnqueueEventsWaitWithBarrier>(
1959+
MQueue->getHandleRef(), 0, nullptr, &Event);
1960+
1961+
return PI_SUCCESS;
1962+
}
1963+
case CG::CGTYPE::BARRIER_WAITLIST: {
1964+
CGBarrier *Barrier = static_cast<CGBarrier *>(MCommandGroup.get());
1965+
std::vector<detail::EventImplPtr> Events = Barrier->MEventsWaitWithBarrier;
1966+
if (MQueue->get_device().is_host() || Events.empty()) {
1967+
// NOP for host device.
1968+
// If Events is empty, then the barrier has no effect.
1969+
return PI_SUCCESS;
1970+
}
1971+
std::vector<RT::PiEvent> PiEvents = getPiEvents(Events);
1972+
const detail::plugin &Plugin = MQueue->getPlugin();
1973+
Plugin.call<PiApiKind::piEnqueueEventsWaitWithBarrier>(
1974+
MQueue->getHandleRef(), PiEvents.size(), &PiEvents[0], &Event);
1975+
1976+
return PI_SUCCESS;
1977+
}
19521978
case CG::CGTYPE::NONE:
19531979
throw runtime_error("CG type not implemented.", PI_INVALID_OPERATION);
19541980
}

sycl/source/handler.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,13 @@ event handler::finalize() {
8989
std::move(MAccStorage), std::move(MSharedPtrStorage),
9090
std::move(MRequirements), std::move(MEvents), MCGType, MCodeLoc));
9191
break;
92+
case detail::CG::BARRIER:
93+
case detail::CG::BARRIER_WAITLIST:
94+
CommandGroup.reset(new detail::CGBarrier(
95+
std::move(MEventsWaitWithBarrier), std::move(MArgsStorage),
96+
std::move(MAccStorage), std::move(MSharedPtrStorage),
97+
std::move(MRequirements), std::move(MEvents), MCGType, MCodeLoc));
98+
break;
9299
case detail::CG::NONE:
93100
throw runtime_error("Command group submitted without a kernel or a "
94101
"explicit memory operation.",

sycl/test/abi/layout_handler.cpp

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -161,19 +161,27 @@ void foo() {
161161
// CHECK: 448 | std::_Vector_base<class std::shared_ptr<class cl::sycl::detail::event_impl>, class std::allocator<class std::shared_ptr<class cl::sycl::detail::event_impl> > >::pointer _M_start
162162
// CHECK-NEXT: 456 | std::_Vector_base<class std::shared_ptr<class cl::sycl::detail::event_impl>, class std::allocator<class std::shared_ptr<class cl::sycl::detail::event_impl> > >::pointer _M_finish
163163
// CHECK-NEXT: 464 | std::_Vector_base<class std::shared_ptr<class cl::sycl::detail::event_impl>, class std::allocator<class std::shared_ptr<class cl::sycl::detail::event_impl> > >::pointer _M_end_of_storage
164-
// CHECK-NEXT: 472 | _Bool MIsHost
165-
// CHECK-NEXT: 480 | struct cl::sycl::detail::code_location MCodeLoc
166-
// CHECK-NEXT: 480 | const char * MFileName
167-
// CHECK-NEXT: 488 | const char * MFunctionName
168-
// CHECK-NEXT: 496 | unsigned long MLineNo
169-
// CHECK-NEXT: 504 | unsigned long MColumnNo
170-
// CHECK-NEXT: 512 | _Bool MIsFinalized
171-
// CHECK-NEXT: 520 | class cl::sycl::event MLastEvent
172-
// CHECK-NEXT: 520 | class std::shared_ptr<class cl::sycl::detail::event_impl> impl
173-
// CHECK-NEXT: 520 | class std::__shared_ptr<class cl::sycl::detail::event_impl, __gnu_cxx::_S_atomic> (base)
174-
// CHECK-NEXT: 520 | class std::__shared_ptr_access<class cl::sycl::detail::event_impl, __gnu_cxx::_S_atomic, false, false> (base) (empty)
175-
// CHECK-NEXT: 520 | std::__shared_ptr<class cl::sycl::detail::event_impl, __gnu_cxx::_S_atomic>::element_type * _M_ptr
176-
// CHECK-NEXT: 528 | class std::__shared_count<__gnu_cxx::_S_atomic> _M_refcount
177-
// CHECK-NEXT: 528 | _Sp_counted_base<(enum __gnu_cxx::_Lock_policy)2U> * _M_pi
178-
// CHECK-NEXT: | [sizeof=536, dsize=536, align=8,
179-
// CHECK-NEXT: | nvsize=536, nvalign=8]
164+
// CHECK-NEXT: 472 | class std::vector<class std::shared_ptr<class cl::sycl::detail::event_impl>, class std::allocator<class std::shared_ptr<class cl::sycl::detail::event_impl> > > MEventsWaitWithBarrier
165+
// CHECK-NEXT: 472 | struct std::_Vector_base<class std::shared_ptr<class cl::sycl::detail::event_impl>, class std::allocator<class std::shared_ptr<class cl::sycl::detail::event_impl> > > (base)
166+
// CHECK-NEXT: 472 | struct std::_Vector_base<class std::shared_ptr<class cl::sycl::detail::event_impl>, class std::allocator<class std::shared_ptr<class cl::sycl::detail::event_impl> > >::_Vector_impl _M_impl
167+
// CHECK-NEXT: 472 | class std::allocator<class std::shared_ptr<class cl::sycl::detail::event_impl> > (base) (empty)
168+
// CHECK-NEXT: 472 | class __gnu_cxx::new_allocator<class std::shared_ptr<class cl::sycl::detail::event_impl> > (base) (empty)
169+
// CHECK-NEXT: 472 | std::_Vector_base<class std::shared_ptr<class cl::sycl::detail::event_impl>, class std::allocator<class std::shared_ptr<class cl::sycl::detail::event_impl> > >::pointer _M_start
170+
// CHECK-NEXT: 480 | std::_Vector_base<class std::shared_ptr<class cl::sycl::detail::event_impl>, class std::allocator<class std::shared_ptr<class cl::sycl::detail::event_impl> > >::pointer _M_finish
171+
// CHECK-NEXT: 488 | std::_Vector_base<class std::shared_ptr<class cl::sycl::detail::event_impl>, class std::allocator<class std::shared_ptr<class cl::sycl::detail::event_impl> > >::pointer _M_end_of_storage
172+
// CHECK-NEXT: 496 | _Bool MIsHost
173+
// CHECK-NEXT: 504 | struct cl::sycl::detail::code_location MCodeLoc
174+
// CHECK-NEXT: 504 | const char * MFileName
175+
// CHECK-NEXT: 512 | const char * MFunctionName
176+
// CHECK-NEXT: 520 | unsigned long MLineNo
177+
// CHECK-NEXT: 528 | unsigned long MColumnNo
178+
// CHECK-NEXT: 536 | _Bool MIsFinalized
179+
// CHECK-NEXT: 544 | class cl::sycl::event MLastEvent
180+
// CHECK-NEXT: 544 | class std::shared_ptr<class cl::sycl::detail::event_impl> impl
181+
// CHECK-NEXT: 544 | class std::__shared_ptr<class cl::sycl::detail::event_impl, __gnu_cxx::_S_atomic> (base)
182+
// CHECK-NEXT: 544 | class std::__shared_ptr_access<class cl::sycl::detail::event_impl, __gnu_cxx::_S_atomic, false, false> (base) (empty)
183+
// CHECK-NEXT: 544 | std::__shared_ptr<class cl::sycl::detail::event_impl, __gnu_cxx::_S_atomic>::element_type * _M_ptr
184+
// CHECK-NEXT: 552 | class std::__shared_count<__gnu_cxx::_S_atomic> _M_refcount
185+
// CHECK-NEXT: 552 | _Sp_counted_base<(enum __gnu_cxx::_Lock_policy)2U> * _M_pi
186+
// CHECK-NEXT: | [sizeof=560, dsize=560, align=8,
187+
// CHECK-NEXT: | nvsize=560, nvalign=8]

sycl/test/abi/pi_level0_symbol_check.dump

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ piMemRetain
6161
piextUSMEnqueuePrefetch
6262
piextKernelSetArgPointer
6363
piEnqueueEventsWait
64+
piEnqueueEventsWaitWithBarrier
6465
piEnqueueMemBufferCopy
6566
piQueueGetInfo
6667
piDevicePartition

sycl/test/abi/symbol_size.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,9 @@ int main() {
4444
check_size<event, 16>();
4545
check_size<gpu_selector, 8>();
4646
#ifdef _MSC_VER
47-
check_size<handler, 528>();
47+
check_size<handler, 552>();
4848
#else
49-
check_size<handler, 536>();
49+
check_size<handler, 560>();
5050
#endif
5151
check_size<image<1>, 16>();
5252
check_size<kernel, 16>();

0 commit comments

Comments
 (0)