Skip to content

[SYCL] Refactor queue to improve ABI stability #985

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Jan 31, 2020
293 changes: 198 additions & 95 deletions sycl/include/CL/sycl/detail/queue_impl.hpp

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions sycl/include/CL/sycl/event.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#pragma once

#include <CL/sycl/detail/common.hpp>
#include <CL/sycl/info/info_desc.hpp>
#include <CL/sycl/stl.hpp>

#include <memory>
Expand Down
4 changes: 1 addition & 3 deletions sycl/include/CL/sycl/info/info_desc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -276,9 +276,7 @@ template <typename T, T param> class param_traits {};

#include <CL/sycl/info/program_traits.def>

PARAM_TRAITS_SPEC(queue, reference_count, cl_uint)
PARAM_TRAITS_SPEC(queue, context, cl::sycl::context)
PARAM_TRAITS_SPEC(queue, device, cl::sycl::device)
#include <CL/sycl/info/queue_traits.def>

#undef PARAM_TRAITS_SPEC
#undef PARAM_TRAITS_SPEC_WITH_INPUT
Expand Down
4 changes: 4 additions & 0 deletions sycl/include/CL/sycl/info/queue_traits.def
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
PARAM_TRAITS_SPEC(queue, reference_count, cl_uint)
PARAM_TRAITS_SPEC(queue, context, cl::sycl::context)
PARAM_TRAITS_SPEC(queue, device, cl::sycl::device)

2 changes: 1 addition & 1 deletion sycl/include/CL/sycl/property_list.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ template <class T> class PropertyHolder {

const T &getProp() const {
assert(true == m_Initialized && "Property was not set!");
return *(T *)m_Mem;
return *(const T *)m_Mem;
}
bool isInitialized() const { return m_Initialized; }

Expand Down
275 changes: 197 additions & 78 deletions sycl/include/CL/sycl/queue.hpp

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions sycl/source/detail/queue_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ namespace detail {
template <> cl_uint queue_impl::get_info<info::queue::reference_count>() const {
RT::PiResult result = PI_SUCCESS;
if (!is_host())
PI_CALL(piQueueGetInfo)(m_CommandQueue, PI_QUEUE_INFO_REFERENCE_COUNT,
PI_CALL(piQueueGetInfo)(MCommandQueue, PI_QUEUE_INFO_REFERENCE_COUNT,
sizeof(result), &result, nullptr);
return result;
}
Expand All @@ -35,7 +35,7 @@ template <> device queue_impl::get_info<info::queue::device>() const {
return get_device();
}

event queue_impl::memset(std::shared_ptr<detail::queue_impl> Impl, void *Ptr,
event queue_impl::memset(shared_ptr_class<detail::queue_impl> Impl, void *Ptr,
int Value, size_t Count) {
context Context = get_context();
RT::PiEvent Event = nullptr;
Expand All @@ -47,7 +47,7 @@ event queue_impl::memset(std::shared_ptr<detail::queue_impl> Impl, void *Ptr,
return event(pi::cast<cl_event>(Event), Context);
}

event queue_impl::memcpy(std::shared_ptr<detail::queue_impl> Impl, void *Dest,
event queue_impl::memcpy(shared_ptr_class<detail::queue_impl> Impl, void *Dest,
const void *Src, size_t Count) {
context Context = get_context();
RT::PiEvent Event = nullptr;
Expand Down
30 changes: 15 additions & 15 deletions sycl/source/detail/scheduler/graph_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ Scheduler::GraphBuilder::getOrInsertMemObjRecord(const QueueImplPtr &Queue,
MemObject->MRecord.reset(new MemObjRecord{/*MAllocaCommands*/ {},
/*MReadLeaves*/ {},
/*MWriteLeaves*/ {},
Queue->get_context_impl(),
Queue->getContextImplPtr(),
/*MMemModified*/ false});

MMemObjs.push_back(MemObject);
Expand Down Expand Up @@ -162,7 +162,7 @@ void Scheduler::GraphBuilder::AddNodeToLeaves(MemObjRecord *Record,
UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd(
MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue) {
AllocaCommandBase *AllocaCmd =
findAllocaForReq(Record, Req, Queue->get_context_impl());
findAllocaForReq(Record, Req, Queue->getContextImplPtr());
assert(AllocaCmd && "There must be alloca for requirement!");
UpdateHostRequirementCommand *UpdateCommand =
new UpdateHostRequirementCommand(Queue, *Req, AllocaCmd, &Req->MData);
Expand All @@ -171,7 +171,7 @@ UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd(
const Requirement *StoredReq = UpdateCommand->getRequirement();

std::set<Command *> Deps =
findDepsForReq(Record, Req, Queue->get_context_impl());
findDepsForReq(Record, Req, Queue->getContextImplPtr());
for (Command *Dep : Deps) {
UpdateCommand->addDep(DepDesc{Dep, StoredReq, AllocaCmd});
Dep->addUser(UpdateCommand);
Expand Down Expand Up @@ -218,7 +218,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove(MemObjRecord *Record,
throw runtime_error("Out of host memory");

std::set<Command *> Deps =
findDepsForReq(Record, Req, Queue->get_context_impl());
findDepsForReq(Record, Req, Queue->getContextImplPtr());
Deps.insert(AllocaCmdDst);
// Get parent allocation of sub buffer to perform full copy of whole buffer
if (IsSuitableSubReq(Req)) {
Expand All @@ -237,7 +237,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove(MemObjRecord *Record,
// current context, need to find a parent alloca command for it (it must be
// there)
auto IsSuitableAlloca = [Record, Req](AllocaCommandBase *AllocaCmd) {
bool Res = sameCtx(AllocaCmd->getQueue()->get_context_impl(),
bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(),
Record->MCurContext) &&
// Looking for a parent buffer alloca command
AllocaCmd->getType() == Command::CommandType::ALLOCA;
Expand Down Expand Up @@ -279,7 +279,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove(MemObjRecord *Record,
}
UpdateLeaves(Deps, Record, access::mode::read_write);
AddNodeToLeaves(Record, NewCmd, access::mode::read_write);
Record->MCurContext = Queue->get_context_impl();
Record->MCurContext = Queue->getContextImplPtr();
return NewCmd;
}

Expand All @@ -298,7 +298,7 @@ Command *Scheduler::GraphBuilder::addCopyBack(Requirement *Req) {
return nullptr;

std::set<Command *> Deps =
findDepsForReq(Record, Req, HostQueue->get_context_impl());
findDepsForReq(Record, Req, HostQueue->getContextImplPtr());
AllocaCommandBase *SrcAllocaCmd =
findAllocaForReq(Record, Req, Record->MCurContext);

Expand Down Expand Up @@ -336,7 +336,7 @@ Command *Scheduler::GraphBuilder::addHostAccessor(Requirement *Req) {
AllocaCommandBase *HostAllocaCmd =
getOrCreateAllocaForReq(Record, Req, HostQueue);

if (!sameCtx(HostAllocaCmd->getQueue()->get_context_impl(),
if (!sameCtx(HostAllocaCmd->getQueue()->getContextImplPtr(),
Record->MCurContext))
insertMemoryMove(Record, Req, HostQueue);

Expand Down Expand Up @@ -418,7 +418,7 @@ Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record, Requirement *Req,
// Going through copying memory between contexts is not supported.
if (Dep.MDepCommand)
CanBypassDep &=
sameCtx(Context, Dep.MDepCommand->getQueue()->get_context_impl());
sameCtx(Context, Dep.MDepCommand->getQueue()->getContextImplPtr());

if (!CanBypassDep) {
RetDeps.insert(DepCmd);
Expand All @@ -441,7 +441,7 @@ Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record, Requirement *Req,
AllocaCommandBase *Scheduler::GraphBuilder::findAllocaForReq(
MemObjRecord *Record, Requirement *Req, const ContextImplPtr &Context) {
auto IsSuitableAlloca = [&Context, Req](AllocaCommandBase *AllocaCmd) {
bool Res = sameCtx(AllocaCmd->getQueue()->get_context_impl(), Context);
bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(), Context);
if (IsSuitableSubReq(Req)) {
const Requirement *TmpReq = AllocaCmd->getRequirement();
Res &= TmpReq->MOffsetInBytes == Req->MOffsetInBytes;
Expand All @@ -462,7 +462,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
MemObjRecord *Record, Requirement *Req, QueueImplPtr Queue) {

AllocaCommandBase *AllocaCmd =
findAllocaForReq(Record, Req, Queue->get_context_impl());
findAllocaForReq(Record, Req, Queue->getContextImplPtr());

if (!AllocaCmd) {
if (IsSuitableSubReq(Req)) {
Expand All @@ -477,7 +477,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
auto *ParentAlloca =
getOrCreateAllocaForReq(Record, &ParentRequirement, Queue);
AllocaCmd = new AllocaSubBufCommand(Queue, *Req, ParentAlloca);
UpdateLeaves(findDepsForReq(Record, Req, Queue->get_context_impl()),
UpdateLeaves(findDepsForReq(Record, Req, Queue->getContextImplPtr()),
Record, access::mode::read_write);
} else {

Expand Down Expand Up @@ -530,7 +530,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
AllocaCmd->MIsActive = false;
} else {
LinkedAllocaCmd->MIsActive = false;
Record->MCurContext = Queue->get_context_impl();
Record->MCurContext = Queue->getContextImplPtr();
}
}
}
Expand Down Expand Up @@ -576,7 +576,7 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
AllocaCommandBase *AllocaCmd = getOrCreateAllocaForReq(Record, Req, Queue);
// If there is alloca command we need to check if the latest memory is in
// required context.
if (!sameCtx(Queue->get_context_impl(), Record->MCurContext)) {
if (!sameCtx(Queue->getContextImplPtr(), Record->MCurContext)) {
// Cannot directly copy memory from OpenCL device to OpenCL device -
// create two copies: device->host and host->device.
if (!Queue->is_host() && !Record->MCurContext->is_host())
Expand All @@ -585,7 +585,7 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
insertMemoryMove(Record, Req, Queue);
}
std::set<Command *> Deps =
findDepsForReq(Record, Req, Queue->get_context_impl());
findDepsForReq(Record, Req, Queue->getContextImplPtr());

for (Command *Dep : Deps)
NewCmd->addDep(DepDesc{Dep, Req, AllocaCmd});
Expand Down
1 change: 1 addition & 0 deletions sycl/source/detail/usm/usm_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <CL/sycl/detail/aligned_allocator.hpp>
#include <CL/sycl/detail/os_util.hpp>
#include <CL/sycl/detail/pi.hpp>
#include <CL/sycl/detail/queue_impl.hpp>
#include <CL/sycl/device.hpp>
#include <CL/sycl/usm.hpp>

Expand Down
5 changes: 2 additions & 3 deletions sycl/source/ordered_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,8 @@ ordered_queue::ordered_queue(cl_command_queue clQueue,
throw runtime_error(
"Failed to build a sycl ordered queue from a cl OOO queue.");

impl =
std::make_shared<detail::queue_impl>(clQueue,
detail::getSyclObjImpl(syclContext), asyncHandler);
impl = std::make_shared<detail::queue_impl>(
m_CommandQueue, detail::getSyclObjImpl(syclContext), asyncHandler);
}

} // namespace sycl
Expand Down
76 changes: 73 additions & 3 deletions sycl/source/queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,12 @@
//
//===----------------------------------------------------------------------===//

#include <CL/sycl/detail/queue_impl.hpp>
#include <CL/sycl/event.hpp>
#include <CL/sycl/exception_list.hpp>
#include <CL/sycl/handler.hpp>
#include <CL/sycl/queue.hpp>
#include <CL/sycl/stl.hpp>

#include <algorithm>

Expand Down Expand Up @@ -37,10 +41,76 @@ queue::queue(const device &syclDevice, const async_handler &asyncHandler,

queue::queue(cl_command_queue clQueue, const context &syclContext,
const async_handler &asyncHandler) {
impl =
std::make_shared<detail::queue_impl>(clQueue,
detail::getSyclObjImpl(syclContext), asyncHandler);
impl = std::make_shared<detail::queue_impl>(
detail::pi::cast<detail::RT::PiQueue>(clQueue),
detail::getSyclObjImpl(syclContext), asyncHandler);
}

queue::queue(const context &syclContext, const device_selector &deviceSelector,
const property_list &propList)
: queue(syclContext, deviceSelector,
detail::getSyclObjImpl(syclContext)->get_async_handler(),
propList) {}

cl_command_queue queue::get() const { return impl->get(); }

context queue::get_context() const { return impl->get_context(); }

device queue::get_device() const { return impl->get_device(); }

bool queue::is_host() const { return impl->is_host(); }

void queue::wait() { impl->wait(); }

void queue::wait_and_throw() { impl->wait_and_throw(); }

void queue::throw_asynchronous() { impl->throw_asynchronous(); }

event queue::memset(void *ptr, int value, size_t count) {
return impl->memset(impl, ptr, value, count);
}

event queue::memcpy(void *dest, const void *src, size_t count) {
return impl->memcpy(impl, dest, src, count);
}

event queue::mem_advise(const void *ptr, size_t length, int advice) {
return impl->mem_advise(ptr, length, advice);
}

event queue::submit_impl(function_class<void(handler &)> CGH) {
return impl->submit(CGH, impl);
}

event queue::submit_impl(function_class<void(handler &)> CGH,
queue secondQueue) {
return impl->submit(CGH, impl, secondQueue.impl);
}

template <info::queue param>
typename info::param_traits<info::queue, param>::return_type
queue::get_info() const {
return impl->get_info<param>();
}

#define PARAM_TRAITS_SPEC(param_type, param, ret_type) \
template ret_type queue::get_info<info::param_type::param>() const;

#include <CL/sycl/info/queue_traits.def>

#undef PARAM_TRAITS_SPEC

template <typename propertyT> bool queue::has_property() const {
return impl->has_property<propertyT>();
}

template <typename propertyT> propertyT queue::get_property() const {
return impl->get_property<propertyT>();
}

template bool queue::has_property<property::queue::enable_profiling>() const;
template property::queue::enable_profiling
queue::get_property<property::queue::enable_profiling>() const;

} // namespace sycl
} // namespace cl