Skip to content

[SYCL] Implement SYCL_INTEL_enqueue_barrier DPC++ extension #1836

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Jun 11, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion sycl/doc/extensions/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ DPC++ extensions status:
| [SYCL_INTEL_data_flow_pipes](DataFlowPipes/data_flow_pipes.asciidoc) | Partially supported(OpenCL: ACCELERATOR) | kernel_host_pipe_support part is not implemented |
| [SYCL_INTEL_deduction_guides](deduction_guides/SYCL_INTEL_deduction_guides.asciidoc) | Supported | |
| [SYCL_INTEL_device_specific_kernel_queries](DeviceSpecificKernelQueries/SYCL_INTEL_device_specific_kernel_queries.asciidoc) | Proposal | |
| [SYCL_INTEL_enqueue_barrier](EnqueueBarrier/enqueue_barrier.asciidoc) | Proposal | |
| [SYCL_INTEL_enqueue_barrier](EnqueueBarrier/enqueue_barrier.asciidoc) | Supported(OpenCL, Level Zero) | |
| [SYCL_INTEL_extended_atomics](ExtendedAtomics/SYCL_INTEL_extended_atomics.asciidoc) | Proposal | |
| [SYCL_INTEL_group_algorithms](GroupAlgorithms/SYCL_INTEL_group_algorithms.asciidoc) | Supported(OpenCL) | |
| [SYCL_INTEL_group_mask](./GroupMask/SYCL_INTEL_group_mask.asciidoc) | Proposal | |
Expand Down
19 changes: 19 additions & 0 deletions sycl/include/CL/sycl/detail/cg.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ class CG {
COPY_ACC_TO_PTR,
COPY_PTR_TO_ACC,
COPY_ACC_TO_ACC,
BARRIER,
BARRIER_WAITLIST,
FILL,
UPDATE_HOST,
RUN_ON_HOST_INTEL,
Expand Down Expand Up @@ -320,6 +322,23 @@ class CGHostTask : public CG {
MHostTask(std::move(HostTask)), MArgs(std::move(Args)) {}
};

class CGBarrier : public CG {
public:
vector_class<detail::EventImplPtr> MEventsWaitWithBarrier;

CGBarrier(vector_class<detail::EventImplPtr> EventsWaitWithBarrier,
std::vector<std::vector<char>> ArgsStorage,
std::vector<detail::AccessorImplPtr> AccStorage,
std::vector<std::shared_ptr<const void>> SharedPtrStorage,
std::vector<Requirement *> Requirements,
std::vector<detail::EventImplPtr> Events, CGTYPE Type,
detail::code_location loc = {})
: CG(Type, std::move(ArgsStorage), std::move(AccStorage),
std::move(SharedPtrStorage), std::move(Requirements),
std::move(Events), std::move(loc)),
MEventsWaitWithBarrier(std::move(EventsWaitWithBarrier)) {}
};

} // namespace detail
} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)
1 change: 1 addition & 0 deletions sycl/include/CL/sycl/detail/pi.def
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ _PI_API(piSamplerRelease)
_PI_API(piEnqueueKernelLaunch)
_PI_API(piEnqueueNativeKernel)
_PI_API(piEnqueueEventsWait)
_PI_API(piEnqueueEventsWaitWithBarrier)
_PI_API(piEnqueueMemBufferRead)
_PI_API(piEnqueueMemBufferReadRect)
_PI_API(piEnqueueMemBufferWrite)
Expand Down
4 changes: 4 additions & 0 deletions sycl/include/CL/sycl/detail/pi.h
Original file line number Diff line number Diff line change
Expand Up @@ -1226,6 +1226,10 @@ __SYCL_EXPORT pi_result piEnqueueEventsWait(pi_queue command_queue,
const pi_event *event_wait_list,
pi_event *event);

__SYCL_EXPORT pi_result piEnqueueEventsWaitWithBarrier(
pi_queue command_queue, pi_uint32 num_events_in_wait_list,
const pi_event *event_wait_list, pi_event *event);

__SYCL_EXPORT pi_result piEnqueueMemBufferRead(
pi_queue queue, pi_mem buffer, pi_bool blocking_read, size_t offset,
size_t size, void *ptr, pi_uint32 num_events_in_wait_list,
Expand Down
26 changes: 26 additions & 0 deletions sycl/include/CL/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1683,6 +1683,29 @@ class __SYCL_EXPORT handler {
}
}

/// Prevents any commands submitted afterward to this queue from executing
/// until all commands previously submitted to this queue have entered the
/// complete state.
void barrier() {
throwIfActionIsCreated();
MCGType = detail::CG::BARRIER;
}

/// Prevents any commands submitted afterward to this queue from executing
/// until all events in WaitList have entered the complete state. If WaitList
/// is empty, then the barrier has no effect.
///
/// \param WaitList is a vector of valid SYCL events that need to complete
/// before barrier command can be executed.
void barrier(const vector_class<event> &WaitList) {
throwIfActionIsCreated();
MCGType = detail::CG::BARRIER_WAITLIST;
MEventsWaitWithBarrier.resize(WaitList.size());
std::transform(
WaitList.begin(), WaitList.end(), MEventsWaitWithBarrier.begin(),
[](const event &Event) { return detail::getSyclObjImpl(Event); });
}

/// Copies data from one memory region to another, both pointed by
/// USM pointers.
///
Expand Down Expand Up @@ -1766,6 +1789,9 @@ class __SYCL_EXPORT handler {
std::unique_ptr<detail::InteropTask> MInteropTask;
/// The list of events that order this operation.
vector_class<detail::EventImplPtr> MEvents;
/// The list of valid SYCL events that need to complete
/// before barrier command can be executed
vector_class<detail::EventImplPtr> MEventsWaitWithBarrier;

bool MIsHost = false;

Expand Down
40 changes: 40 additions & 0 deletions sycl/include/CL/sycl/queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,46 @@ class __SYCL_EXPORT queue {
return submit_impl(CGF, SecondaryQueue, CodeLoc);
}

/// Prevents any commands submitted afterward to this queue from executing
/// until all commands previously submitted to this queue have entered the
/// complete state.
///
/// \param CodeLoc is the code location of the submit call (default argument)
/// \return a SYCL event object, which corresponds to the queue the command
/// group is being enqueued on.
event submit_barrier(
#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
const detail::code_location &CodeLoc = detail::code_location::current()
#endif
) {
#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
const detail::code_location &CodeLoc = {};
#endif
return submit([=](handler &CGH) { CGH.barrier(); }, CodeLoc);
}

/// Prevents any commands submitted afterward to this queue from executing
/// until all events in WaitList have entered the complete state. If WaitList
/// is empty, then submit_barrier has no effect.
///
/// \param WaitList is a vector of valid SYCL events that need to complete
/// before barrier command can be executed.
/// \param CodeLoc is the code location of the submit call (default argument)
/// \return a SYCL event object, which corresponds to the queue the command
/// group is being enqueued on.
event submit_barrier(
const vector_class<event> &WaitList
#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
,
const detail::code_location &CodeLoc = detail::code_location::current()
#endif
) {
#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
const detail::code_location &CodeLoc = {};
#endif
return submit([=](handler &CGH) { CGH.barrier(WaitList); }, CodeLoc);
}

/// Performs a blocking wait for the completion of all enqueued tasks in the
/// queue.
///
Expand Down
40 changes: 40 additions & 0 deletions sycl/plugins/level_zero/pi_level0.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2499,6 +2499,46 @@ pi_result piEnqueueEventsWait(pi_queue Queue, pi_uint32 NumEventsInWaitList,
return {};
}

pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue,
pi_uint32 NumEventsInWaitList,
const pi_event *EventWaitList,
pi_event *Event) {

assert(Queue);

// Get a new command list to be used on this call
ze_command_list_handle_t ZeCommandList = nullptr;
if (auto Res = Queue->Context->Device->createCommandList(&ZeCommandList))
return Res;

ze_event_handle_t ZeEvent = nullptr;
if (Event) {
auto Res = piEventCreate(Queue->Context, Event);
if (Res != PI_SUCCESS)
return Res;

(*Event)->Queue = Queue;
(*Event)->CommandType = PI_COMMAND_TYPE_USER;
(*Event)->ZeCommandList = ZeCommandList;

ZeEvent = (*Event)->ZeEvent;
}

// TODO: use unique_ptr with custom deleter in the whole Level Zero plugin for
// wrapping ze_event_handle_t *ZeEventWaitList to avoid memory leaks in case
// return will be called in ZE_CALL(ze***(...)), and thus
// _pi_event::deleteZeEventList(ZeEventWaitList) won't be called.
ze_event_handle_t *ZeEventWaitList =
_pi_event::createZeEventList(NumEventsInWaitList, EventWaitList);

ZE_CALL(zeCommandListAppendBarrier(ZeCommandList, ZeEvent,
NumEventsInWaitList, ZeEventWaitList));

_pi_event::deleteZeEventList(ZeEventWaitList);

return PI_SUCCESS;
}

pi_result piEnqueueMemBufferRead(pi_queue Queue, pi_mem Src,
pi_bool BlockingRead, size_t Offset,
size_t Size, void *Dst,
Expand Down
1 change: 1 addition & 0 deletions sycl/plugins/opencl/pi_opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1193,6 +1193,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
_PI_CL(piEnqueueKernelLaunch, clEnqueueNDRangeKernel)
_PI_CL(piEnqueueNativeKernel, clEnqueueNativeKernel)
_PI_CL(piEnqueueEventsWait, clEnqueueMarkerWithWaitList)
_PI_CL(piEnqueueEventsWaitWithBarrier, clEnqueueBarrierWithWaitList)
_PI_CL(piEnqueueMemBufferRead, clEnqueueReadBuffer)
_PI_CL(piEnqueueMemBufferReadRect, clEnqueueReadBufferRect)
_PI_CL(piEnqueueMemBufferWrite, clEnqueueWriteBuffer)
Expand Down
2 changes: 1 addition & 1 deletion sycl/source/detail/program_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ class program_impl {
return createSyclObjFromImpl<context>(MContext);
}

// \return the Plugin associated withh the context of this program.
/// \return the Plugin associated with the context of this program.
const plugin &getPlugin() const {
assert(!is_host() && "Plugin is not available for Host.");
return MContext->getPlugin();
Expand Down
26 changes: 26 additions & 0 deletions sycl/source/detail/scheduler/commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1949,6 +1949,32 @@ cl_int ExecCGCommand::enqueueImp() {

return CL_SUCCESS;
}
case CG::CGTYPE::BARRIER: {
if (MQueue->get_device().is_host()) {
// NOP for host device.
return PI_SUCCESS;
}
const detail::plugin &Plugin = MQueue->getPlugin();
Plugin.call<PiApiKind::piEnqueueEventsWaitWithBarrier>(
MQueue->getHandleRef(), 0, nullptr, &Event);

return PI_SUCCESS;
}
case CG::CGTYPE::BARRIER_WAITLIST: {
CGBarrier *Barrier = static_cast<CGBarrier *>(MCommandGroup.get());
std::vector<detail::EventImplPtr> Events = Barrier->MEventsWaitWithBarrier;
if (MQueue->get_device().is_host() || Events.empty()) {
// NOP for host device.
// If Events is empty, then the barrier has no effect.
return PI_SUCCESS;
}
std::vector<RT::PiEvent> PiEvents = getPiEvents(Events);
const detail::plugin &Plugin = MQueue->getPlugin();
Plugin.call<PiApiKind::piEnqueueEventsWaitWithBarrier>(
MQueue->getHandleRef(), PiEvents.size(), &PiEvents[0], &Event);

return PI_SUCCESS;
}
case CG::CGTYPE::NONE:
throw runtime_error("CG type not implemented.", PI_INVALID_OPERATION);
}
Expand Down
7 changes: 7 additions & 0 deletions sycl/source/handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,13 @@ event handler::finalize() {
std::move(MAccStorage), std::move(MSharedPtrStorage),
std::move(MRequirements), std::move(MEvents), MCGType, MCodeLoc));
break;
case detail::CG::BARRIER:
case detail::CG::BARRIER_WAITLIST:
CommandGroup.reset(new detail::CGBarrier(
std::move(MEventsWaitWithBarrier), std::move(MArgsStorage),
std::move(MAccStorage), std::move(MSharedPtrStorage),
std::move(MRequirements), std::move(MEvents), MCGType, MCodeLoc));
break;
case detail::CG::NONE:
throw runtime_error("Command group submitted without a kernel or a "
"explicit memory operation.",
Expand Down
40 changes: 24 additions & 16 deletions sycl/test/abi/layout_handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,19 +161,27 @@ void foo() {
// CHECK-NEXT: 448 | std::_Vector_base<class std::shared_ptr<class cl::sycl::detail::event_impl>, class std::allocator<class std::shared_ptr<class cl::sycl::detail::event_impl> > >::pointer _M_start
// CHECK-NEXT: 456 | std::_Vector_base<class std::shared_ptr<class cl::sycl::detail::event_impl>, class std::allocator<class std::shared_ptr<class cl::sycl::detail::event_impl> > >::pointer _M_finish
// CHECK-NEXT: 464 | std::_Vector_base<class std::shared_ptr<class cl::sycl::detail::event_impl>, class std::allocator<class std::shared_ptr<class cl::sycl::detail::event_impl> > >::pointer _M_end_of_storage
// CHECK-NEXT: 472 | _Bool MIsHost
// CHECK-NEXT: 480 | struct cl::sycl::detail::code_location MCodeLoc
// CHECK-NEXT: 480 | const char * MFileName
// CHECK-NEXT: 488 | const char * MFunctionName
// CHECK-NEXT: 496 | unsigned long MLineNo
// CHECK-NEXT: 504 | unsigned long MColumnNo
// CHECK-NEXT: 512 | _Bool MIsFinalized
// CHECK-NEXT: 520 | class cl::sycl::event MLastEvent
// CHECK-NEXT: 520 | class std::shared_ptr<class cl::sycl::detail::event_impl> impl
// CHECK-NEXT: 520 | class std::__shared_ptr<class cl::sycl::detail::event_impl, __gnu_cxx::_S_atomic> (base)
// CHECK-NEXT: 520 | class std::__shared_ptr_access<class cl::sycl::detail::event_impl, __gnu_cxx::_S_atomic, false, false> (base) (empty)
// CHECK-NEXT: 520 | std::__shared_ptr<class cl::sycl::detail::event_impl, __gnu_cxx::_S_atomic>::element_type * _M_ptr
// CHECK-NEXT: 528 | class std::__shared_count<__gnu_cxx::_S_atomic> _M_refcount
// CHECK-NEXT: 528 | _Sp_counted_base<(enum __gnu_cxx::_Lock_policy)2U> * _M_pi
// CHECK-NEXT: | [sizeof=536, dsize=536, align=8,
// CHECK-NEXT: | nvsize=536, nvalign=8]
// CHECK-NEXT: 472 | class std::vector<class std::shared_ptr<class cl::sycl::detail::event_impl>, class std::allocator<class std::shared_ptr<class cl::sycl::detail::event_impl> > > MEventsWaitWithBarrier
// CHECK-NEXT: 472 | struct std::_Vector_base<class std::shared_ptr<class cl::sycl::detail::event_impl>, class std::allocator<class std::shared_ptr<class cl::sycl::detail::event_impl> > > (base)
// CHECK-NEXT: 472 | struct std::_Vector_base<class std::shared_ptr<class cl::sycl::detail::event_impl>, class std::allocator<class std::shared_ptr<class cl::sycl::detail::event_impl> > >::_Vector_impl _M_impl
// CHECK-NEXT: 472 | class std::allocator<class std::shared_ptr<class cl::sycl::detail::event_impl> > (base) (empty)
// CHECK-NEXT: 472 | class __gnu_cxx::new_allocator<class std::shared_ptr<class cl::sycl::detail::event_impl> > (base) (empty)
// CHECK-NEXT: 472 | std::_Vector_base<class std::shared_ptr<class cl::sycl::detail::event_impl>, class std::allocator<class std::shared_ptr<class cl::sycl::detail::event_impl> > >::pointer _M_start
// CHECK-NEXT: 480 | std::_Vector_base<class std::shared_ptr<class cl::sycl::detail::event_impl>, class std::allocator<class std::shared_ptr<class cl::sycl::detail::event_impl> > >::pointer _M_finish
// CHECK-NEXT: 488 | std::_Vector_base<class std::shared_ptr<class cl::sycl::detail::event_impl>, class std::allocator<class std::shared_ptr<class cl::sycl::detail::event_impl> > >::pointer _M_end_of_storage
// CHECK-NEXT: 496 | _Bool MIsHost
// CHECK-NEXT: 504 | struct cl::sycl::detail::code_location MCodeLoc
// CHECK-NEXT: 504 | const char * MFileName
// CHECK-NEXT: 512 | const char * MFunctionName
// CHECK-NEXT: 520 | unsigned long MLineNo
// CHECK-NEXT: 528 | unsigned long MColumnNo
// CHECK-NEXT: 536 | _Bool MIsFinalized
// CHECK-NEXT: 544 | class cl::sycl::event MLastEvent
// CHECK-NEXT: 544 | class std::shared_ptr<class cl::sycl::detail::event_impl> impl
// CHECK-NEXT: 544 | class std::__shared_ptr<class cl::sycl::detail::event_impl, __gnu_cxx::_S_atomic> (base)
// CHECK-NEXT: 544 | class std::__shared_ptr_access<class cl::sycl::detail::event_impl, __gnu_cxx::_S_atomic, false, false> (base) (empty)
// CHECK-NEXT: 544 | std::__shared_ptr<class cl::sycl::detail::event_impl, __gnu_cxx::_S_atomic>::element_type * _M_ptr
// CHECK-NEXT: 552 | class std::__shared_count<__gnu_cxx::_S_atomic> _M_refcount
// CHECK-NEXT: 552 | _Sp_counted_base<(enum __gnu_cxx::_Lock_policy)2U> * _M_pi
// CHECK-NEXT: | [sizeof=560, dsize=560, align=8,
// CHECK-NEXT: | nvsize=560, nvalign=8]
1 change: 1 addition & 0 deletions sycl/test/abi/pi_level0_symbol_check.dump
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ piMemRetain
piextUSMEnqueuePrefetch
piextKernelSetArgPointer
piEnqueueEventsWait
piEnqueueEventsWaitWithBarrier
piEnqueueMemBufferCopy
piQueueGetInfo
piDevicePartition
Expand Down
4 changes: 2 additions & 2 deletions sycl/test/abi/symbol_size.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@ int main() {
check_size<event, 16>();
check_size<gpu_selector, 8>();
#ifdef _MSC_VER
check_size<handler, 528>();
check_size<handler, 552>();
#else
check_size<handler, 536>();
check_size<handler, 560>();
#endif
check_size<image<1>, 16>();
check_size<kernel, 16>();
Expand Down
Loading