Skip to content

[SYCL][Scheduler] Rework of host accessor and host allocation #724

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Nov 22, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions sycl/include/CL/sycl/detail/accessor_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ namespace cl {
namespace sycl {
namespace detail {

class Command;

// The class describes a requirement to access a SYCL memory object such as
// sycl::buffer and sycl::image. For example, each accessor used in a kernel,
// except one with access target "local", adds such requirement for the command
Expand Down Expand Up @@ -70,10 +72,8 @@ class AccessorImplHost {
MElemSize(ElemSize), MOffsetInBytes(OffsetInBytes),
MIsSubBuffer(IsSubBuffer) {}

~AccessorImplHost() {
if (BlockingEvent)
BlockingEvent->setComplete();
}
~AccessorImplHost();

AccessorImplHost(const AccessorImplHost &Other)
: MOffset(Other.MOffset), MAccessRange(Other.MAccessRange),
MMemoryRange(Other.MMemoryRange), MAccessMode(Other.MAccessMode),
Expand All @@ -97,7 +97,7 @@ class AccessorImplHost {

void *MData = nullptr;

EventImplPtr BlockingEvent;
Command *MBlockedCmd = nullptr;
};

using AccessorImplPtr = std::shared_ptr<AccessorImplHost>;
Expand Down
12 changes: 10 additions & 2 deletions sycl/include/CL/sycl/detail/buffer_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,9 +143,17 @@ class buffer_impl final : public SYCLMemObjT<AllocatorT> {
}

void *allocateMem(ContextImplPtr Context, bool InitFromUserData,
RT::PiEvent &OutEventToWait) override {
void *HostPtr, RT::PiEvent &OutEventToWait) override {

void *UserPtr = InitFromUserData ? BaseT::getUserPtr() : nullptr;
assert(!(InitFromUserData && HostPtr) &&
"Cannot init from user data and reuse host ptr provided "
"simultaneously");

void *UserPtr = InitFromUserData ? BaseT::getUserPtr() : HostPtr;

assert(!(nullptr == UserPtr && BaseT::useHostPtr() && Context->is_host()) &&
"Internal error. Allocating memory on the host "
"while having use_host_ptr property");

return MemoryManager::allocateMemBuffer(
std::move(Context), this, UserPtr, BaseT::MHostPtrReadOnly,
Expand Down
9 changes: 7 additions & 2 deletions sycl/include/CL/sycl/detail/image_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -280,8 +280,13 @@ class image_impl final : public SYCLMemObjT<AllocatorT> {
size_t get_count() const { return MRange.size(); }

void *allocateMem(ContextImplPtr Context, bool InitFromUserData,
RT::PiEvent &OutEventToWait) override {
void *UserPtr = InitFromUserData ? BaseT::getUserPtr() : nullptr;
void *HostPtr, RT::PiEvent &OutEventToWait) override {

assert(!(InitFromUserData && HostPtr) &&
"Cannot init from user data and reuse host ptr provided "
"simultaneously");

void *UserPtr = InitFromUserData ? BaseT::getUserPtr() : HostPtr;

RT::PiMemImageDesc Desc = getImageDesc(UserPtr != nullptr);
assert(checkImageDesc(Desc, Context, UserPtr) &&
Expand Down
3 changes: 2 additions & 1 deletion sycl/include/CL/sycl/detail/memory_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ class MemoryManager {
// The following method allocates memory allocation of memory object.
// Depending on the context it allocates memory on host or on device.
static void *allocate(ContextImplPtr TargetContext, SYCLMemObjI *MemObj,
bool InitFromUserData, std::vector<RT::PiEvent> DepEvents,
bool InitFromUserData, void *HostPtr,
std::vector<RT::PiEvent> DepEvents,
RT::PiEvent &OutEvent);

// The following method creates OpenCL sub buffer for specified
Expand Down
41 changes: 28 additions & 13 deletions sycl/include/CL/sycl/detail/scheduler/commands.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@ class Command {
bool MIsBlockable = false;
// Indicates whether the command is blocked from enqueueing
std::atomic<bool> MCanEnqueue;

const char *MBlockReason = "Unknown";
};

// The command does nothing during enqueue. The task can be used to implement
Expand Down Expand Up @@ -193,8 +195,10 @@ class ReleaseCommand : public Command {

class AllocaCommandBase : public Command {
public:
AllocaCommandBase(CommandType Type, QueueImplPtr Queue, Requirement Req)
: Command(Type, Queue), MReleaseCmd(Queue, this),
AllocaCommandBase(CommandType Type, QueueImplPtr Queue, Requirement Req,
AllocaCommandBase *LinkedAllocaCmd)
: Command(Type, Queue), MLinkedAllocaCmd(LinkedAllocaCmd),
MIsLeaderAlloca(nullptr == LinkedAllocaCmd), MReleaseCmd(Queue, this),
MRequirement(std::move(Req)) {
MRequirement.MAccessMode = access::mode::read_write;
}
Expand All @@ -207,20 +211,35 @@ class AllocaCommandBase : public Command {

const Requirement *getRequirement() const final { return &MRequirement; }

void *MMemAllocation = nullptr;

// Alloca command linked with current command.
// Device and host alloca commands can be linked, so they may share the same
// memory. Only one allocation from a pair can be accessed at a time. Alloca
// commands associated with such allocation is "active". In order to switch
// "active" status between alloca commands map/unmap operations are used.
AllocaCommandBase *MLinkedAllocaCmd = nullptr;
// Indicates that current alloca is active one.
bool MIsActive = true;

// Indicates that the command owns memory allocation in case of connected
// alloca command
bool MIsLeaderAlloca = true;

protected:
ReleaseCommand MReleaseCmd;
Requirement MRequirement;
void *MMemAllocation = nullptr;
};

// The command enqueues allocation of instance of memory object on Host or
// underlying framework.
class AllocaCommand : public AllocaCommandBase {
public:
AllocaCommand(QueueImplPtr Queue, Requirement Req,
bool InitFromUserData = true)
: AllocaCommandBase(CommandType::ALLOCA, std::move(Queue),
std::move(Req)),
bool InitFromUserData = true,
AllocaCommandBase *LinkedAllocaCmd = nullptr)
: AllocaCommandBase(CommandType::ALLOCA, std::move(Queue), std::move(Req),
LinkedAllocaCmd),
MInitFromUserData(InitFromUserData) {
addDep(DepDesc(nullptr, getRequirement(), this));
}
Expand All @@ -240,7 +259,8 @@ class AllocaSubBufCommand : public AllocaCommandBase {
AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req,
AllocaCommandBase *ParentAlloca)
: AllocaCommandBase(CommandType::ALLOCA_SUB_BUF, std::move(Queue),
std::move(Req)),
std::move(Req),
/*LinkedAllocaCmd*/ nullptr),
MParentAlloca(ParentAlloca) {
addDep(DepDesc(MParentAlloca, getRequirement(), MParentAlloca));
}
Expand All @@ -251,7 +271,7 @@ class AllocaSubBufCommand : public AllocaCommandBase {
private:
cl_int enqueueImp() final;

AllocaCommandBase *MParentAlloca;
AllocaCommandBase *MParentAlloca = nullptr;
};

class MapMemObject : public Command {
Expand Down Expand Up @@ -295,10 +315,6 @@ class MemCpyCommand : public Command {
QueueImplPtr SrcQueue, QueueImplPtr DstQueue,
bool UseExclusiveQueue = false);

void setAccessorToUpdate(Requirement *AccToUpdate) {
MAccToUpdate = AccToUpdate;
}

void printDot(std::ostream &Stream) const final;
const Requirement *getRequirement() const final { return &MDstReq; }

Expand All @@ -310,7 +326,6 @@ class MemCpyCommand : public Command {
AllocaCommandBase *MSrcAllocaCmd = nullptr;
Requirement MDstReq;
AllocaCommandBase *MDstAllocaCmd = nullptr;
Requirement *MAccToUpdate = nullptr;
};

// The command enqueues memory copy between two instances of memory object.
Expand Down
25 changes: 18 additions & 7 deletions sycl/include/CL/sycl/detail/scheduler/scheduler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,17 @@ class Scheduler {
// sycl::image destructors.
void removeMemoryObject(detail::SYCLMemObjI *MemObj);

// Creates nodes in the graph, that update Req with the pointer to the host
// memory which contains the latest data of the memory object. New operations
// with the same memory object that have side effects are blocked until
// releaseHostAccessor is called.
// Returns an event which indicates when these nodes are completed and host
// accessor is ready for using.
EventImplPtr addHostAccessor(Requirement *Req);

// Unblocks operations with the memory object.
void releaseHostAccessor(Requirement *Req);

// Returns an instance of the scheduler object.
static Scheduler &getInstance();

Expand Down Expand Up @@ -101,7 +110,7 @@ class Scheduler {
QueueImplPtr HostQueue);

Command *addCopyBack(Requirement *Req);
Command *addHostAccessor(Requirement *Req, EventImplPtr &RetEvent);
Command *addHostAccessor(Requirement *Req);

// [Provisional] Optimizes the whole graph.
void optimize();
Expand Down Expand Up @@ -142,18 +151,20 @@ class Scheduler {
std::vector<SYCLMemObjI *> MMemObjs;

private:
// The method inserts memory copy operation from the context where the
// memory current lives to the context bound to Queue.
MemCpyCommand *insertMemCpyCmd(MemObjRecord *Record, Requirement *Req,
const QueueImplPtr &Queue,
bool UseExclusiveQueue = false);
// The method inserts required command to make so the latest state for the
// memory object Record refers to resides in the context which is bound to
// the Queue. Can insert copy/map/unmap operations depending on the source
// and destination.
Command *insertMemoryMove(MemObjRecord *Record, Requirement *Req,
const QueueImplPtr &Queue,
bool UseExclusiveQueue = false);

UpdateHostRequirementCommand *
insertUpdateHostReqCmd(MemObjRecord *Record, Requirement *Req,
const QueueImplPtr &Queue);

std::set<Command *> findDepsForReq(MemObjRecord *Record, Requirement *Req,
QueueImplPtr Context);
const ContextImplPtr &Context);

// Searches for suitable alloca in memory record.
AllocaCommandBase *findAllocaForReq(MemObjRecord *Record, Requirement *Req,
Expand Down
5 changes: 3 additions & 2 deletions sycl/include/CL/sycl/detail/sycl_mem_obj_i.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,12 @@ class SYCLMemObjI {
// point to event that should be waited before using the memory.
// InitFromUserData indicates that the returned memory should be intialized
// with the data provided by user(if any). Usually it should happen on the
// first allocation of memory for the buffer.
// first allocation of memory for the memory object.
// Non null HostPtr requires allocation to be made with USE_HOST_PTR property.
// Method returns a pointer to host allocation if Context is host one and
// cl_mem obect if not.
virtual void *allocateMem(ContextImplPtr Context, bool InitFromUserData,
RT::PiEvent &InteropEvent) = 0;
void *HostPtr, RT::PiEvent &InteropEvent) = 0;

// Should be used for memory object created without use_host_ptr property.
virtual void *allocateHostMem() = 0;
Expand Down
1 change: 1 addition & 0 deletions sycl/source/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ endfunction(add_sycl_rt_library)

set(SYCL_SOURCES
"${sycl_inc_dir}/CL/sycl.hpp"
"detail/accessor_impl.cpp"
"detail/builtins_common.cpp"
"detail/builtins_geometric.cpp"
"detail/builtins_integer.cpp"
Expand Down
23 changes: 23 additions & 0 deletions sycl/source/detail/accessor_impl.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
//==---------------- accessor_impl.cpp - SYCL standard source file ---------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <CL/sycl/detail/accessor_impl.hpp>
#include <CL/sycl/detail/scheduler/scheduler.hpp>

namespace cl {
namespace sycl {
namespace detail {

AccessorImplHost::~AccessorImplHost() {
if (MBlockedCmd)
detail::Scheduler::getInstance().releaseHostAccessor(this);
}
}
}
}

4 changes: 0 additions & 4 deletions sycl/source/detail/event_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,6 @@ event_impl::~event_impl() {
}
}

void event_impl::setComplete() {
PI_CALL(RT::piEventSetStatus, m_Event, CL_COMPLETE);
}

void event_impl::waitInternal() const {
if (!m_HostEvent) {
PI_CALL(RT::piEventsWait, 1, &m_Event);
Expand Down
16 changes: 11 additions & 5 deletions sycl/source/detail/memory_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,15 +54,16 @@ void MemoryManager::releaseMemObj(ContextImplPtr TargetContext,
}

void *MemoryManager::allocate(ContextImplPtr TargetContext, SYCLMemObjI *MemObj,
bool InitFromUserData,
bool InitFromUserData, void *HostPtr,
std::vector<RT::PiEvent> DepEvents,
RT::PiEvent &OutEvent) {
// There is no async API for memory allocation. Explicitly wait for all
// dependency events and return empty event.
waitForEvents(DepEvents);
OutEvent = nullptr;

return MemObj->allocateMem(TargetContext, InitFromUserData, OutEvent);
return MemObj->allocateMem(TargetContext, InitFromUserData, HostPtr,
OutEvent);
}

void *MemoryManager::allocateHostMemory(SYCLMemObjI *MemObj, void *UserPtr,
Expand Down Expand Up @@ -420,7 +421,7 @@ void *MemoryManager::map(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue,
sycl::id<3> AccessOffset, unsigned int ElementSize,
std::vector<RT::PiEvent> DepEvents,
RT::PiEvent &OutEvent) {
if (Queue->is_host() || Dim != 1) {
if (Queue->is_host()) {
assert(!"Not supported configuration of map requested");
throw runtime_error("Not supported configuration of map requested");
}
Expand All @@ -447,10 +448,15 @@ void *MemoryManager::map(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue,
AccessOffset[0] *= ElementSize;
AccessRange[0] *= ElementSize;

void *MappedPtr;
// TODO: Handle offset
assert(AccessOffset[0] == 0 && "Handle offset");

void *MappedPtr = nullptr;
const size_t BytesToMap = AccessRange[0] * AccessRange[1] * AccessRange[2];

PI_CALL(RT::piEnqueueMemBufferMap, Queue->getHandleRef(),
pi::cast<RT::PiMem>(Mem), CL_FALSE, Flags, AccessOffset[0],
AccessRange[0], DepEvents.size(),
BytesToMap, DepEvents.size(),
DepEvents.empty() ? nullptr : &DepEvents[0], &OutEvent, &MappedPtr);
return MappedPtr;
}
Expand Down
Loading