Skip to content

[SYCL] Improve memobj creation for devices w/o host unified memory #2761

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Nov 18, 2020
4 changes: 4 additions & 0 deletions sycl/include/CL/sycl/detail/sycl_mem_obj_t.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,10 @@ class __SYCL_EXPORT SYCLMemObjT : public SYCLMemObjI {
ContextImplPtr getInteropContext() const override { return MInteropContext; }

protected:
// An allocateMem helper that determines which host ptr to use
void determineHostPtr(const ContextImplPtr &Context, bool InitFromUserData,
void *&HostPtr, bool &HostPtrReadOnly);

// Allocator used for allocation memory on host.
unique_ptr_class<SYCLMemObjAllocator> MAllocator;
// Properties passed by user.
Expand Down
12 changes: 1 addition & 11 deletions sycl/source/detail/buffer_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,8 @@ namespace sycl {
namespace detail {
void *buffer_impl::allocateMem(ContextImplPtr Context, bool InitFromUserData,
void *HostPtr, RT::PiEvent &OutEventToWait) {
// The host pointer for the allocation can be provided in 2 ways:
// 1. Initialize the allocation from user data. Check if the user pointer is
// read-only.
// 2. Use a HostPtr allocated by the runtime. Assume any such pointer to be
// read-write.
bool HostPtrReadOnly = false;
if (InitFromUserData) {
assert(!HostPtr && "Cannot init from user data and reuse host ptr provided "
"simultaneously");
HostPtr = BaseT::getUserPtr();
HostPtrReadOnly = BaseT::MHostPtrReadOnly;
}
BaseT::determineHostPtr(Context, InitFromUserData, HostPtr, HostPtrReadOnly);

assert(!(nullptr == HostPtr && BaseT::useHostPtr() && Context->is_host()) &&
"Internal error. Allocating memory on the host "
Expand Down
12 changes: 1 addition & 11 deletions sycl/source/detail/image_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -307,18 +307,8 @@ template <int Dimensions>
void *image_impl<Dimensions>::allocateMem(ContextImplPtr Context,
bool InitFromUserData, void *HostPtr,
RT::PiEvent &OutEventToWait) {
// The host pointer for the allocation can be provided in 2 ways:
// 1. Initialize the allocation from user data. Check if the user pointer is
// read-only.
// 2. Use a HostPtr allocated by the runtime. Assume any such pointer to be
// read-write.
bool HostPtrReadOnly = false;
if (InitFromUserData) {
assert(!HostPtr && "Cannot init from user data and reuse host ptr provided "
"simultaneously");
HostPtr = BaseT::getUserPtr();
HostPtrReadOnly = BaseT::MHostPtrReadOnly;
}
BaseT::determineHostPtr(Context, InitFromUserData, HostPtr, HostPtrReadOnly);

RT::PiMemImageDesc Desc = getImageDesc(HostPtr != nullptr);
assert(checkImageDesc(Desc, Context, HostPtr) &&
Expand Down
43 changes: 31 additions & 12 deletions sycl/source/detail/memory_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,16 +119,38 @@ void *MemoryManager::allocateInteropMemObject(
return UserPtr;
}

RT::PiMemFlags getMemObjCreationFlags(const ContextImplPtr &TargetContext,
void *UserPtr, bool HostPtrReadOnly) {
// Create read_write mem object to handle arbitrary uses.
RT::PiMemFlags Result = PI_MEM_FLAGS_ACCESS_RW;
if (UserPtr) {
if (HostPtrReadOnly)
Result |= PI_MEM_FLAGS_HOST_PTR_COPY;
else {
// Create the memory object using the host pointer only if the devices
// support host_unified_memory to avoid potential copy overhead.
// TODO This check duplicates the one performed in the GraphBuilder during
// AllocaCommand creation. This information should be propagated here
// instead, which would be a breaking ABI change.
bool HostUnifiedMemory = true;
for (const device &Device : TargetContext->getDevices())
HostUnifiedMemory &=
Device.get_info<info::device::host_unified_memory>();
Result |= HostUnifiedMemory ? PI_MEM_FLAGS_HOST_PTR_USE
: PI_MEM_FLAGS_HOST_PTR_COPY;
}
}

return Result;
}

void *MemoryManager::allocateImageObject(ContextImplPtr TargetContext,
void *UserPtr, bool HostPtrReadOnly,
const RT::PiMemImageDesc &Desc,
const RT::PiMemImageFormat &Format,
const sycl::property_list &) {
// Create read_write mem object by default to handle arbitrary uses.
RT::PiMemFlags CreationFlags = PI_MEM_FLAGS_ACCESS_RW;
if (UserPtr)
CreationFlags |= HostPtrReadOnly ? PI_MEM_FLAGS_HOST_PTR_COPY
: PI_MEM_FLAGS_HOST_PTR_USE;
RT::PiMemFlags CreationFlags =
getMemObjCreationFlags(TargetContext, UserPtr, HostPtrReadOnly);

RT::PiMem NewMem;
const detail::plugin &Plugin = TargetContext->getPlugin();
Expand All @@ -142,13 +164,10 @@ void *
MemoryManager::allocateBufferObject(ContextImplPtr TargetContext, void *UserPtr,
bool HostPtrReadOnly, const size_t Size,
const sycl::property_list &PropsList) {
// Create read_write mem object by default to handle arbitrary uses.
RT::PiMemFlags CreationFlags = PI_MEM_FLAGS_ACCESS_RW;
if (UserPtr)
CreationFlags |= HostPtrReadOnly ? PI_MEM_FLAGS_HOST_PTR_COPY
: PI_MEM_FLAGS_HOST_PTR_USE;
else if (PropsList.has_property<
sycl::ext::oneapi::property::buffer::use_pinned_host_memory>())
RT::PiMemFlags CreationFlags =
getMemObjCreationFlags(TargetContext, UserPtr, HostPtrReadOnly);
if (PropsList.has_property<
sycl::ext::oneapi::property::buffer::use_pinned_host_memory>())
CreationFlags |= PI_MEM_FLAGS_HOST_PTR_ALLOC;

RT::PiMem NewMem = nullptr;
Expand Down
87 changes: 77 additions & 10 deletions sycl/source/detail/scheduler/graph_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,22 @@ static bool isAccessModeAllowed(access::mode Required, access::mode Current) {
return false;
}

/// Combines two access modes into a single one that allows both.
static access::mode combineAccessModes(access::mode A, access::mode B) {
if (A == B)
return A;

if (A == access::mode::discard_write &&
(B == access::mode::discard_read_write || B == access::mode::write))
return B;

if (B == access::mode::discard_write &&
(A == access::mode::discard_read_write || A == access::mode::write))
return A;

return access::mode::read_write;
}

Scheduler::GraphBuilder::GraphBuilder() {
if (const char *EnvVarCStr = SYCLConfig<SYCL_PRINT_EXECUTION_GRAPH>::get()) {
std::string GraphPrintOpts(EnvVarCStr);
Expand Down Expand Up @@ -574,6 +590,14 @@ Scheduler::GraphBuilder::findAllocaForReq(MemObjRecord *Record,
return (Record->MAllocaCommands.end() != It) ? *It : nullptr;
}

static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx) {
for (const device &Device : Ctx->getDevices()) {
if (!Device.get_info<info::device::host_unified_memory>())
return false;
}
return true;
}

// The function searches for the alloca command matching context and
// requirement. If none exists, new allocation command is created.
// Note, creation of new allocation command can lead to the current context
Expand Down Expand Up @@ -603,8 +627,18 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
Req->MMemoryRange, access::mode::read_write,
Req->MSYCLMemObj, Req->MDims, Req->MElemSize,
0 /*ReMOffsetInBytes*/, false /*MIsSubBuffer*/);
// Can reuse user data for the first allocation
const bool InitFromUserData = Record->MAllocaCommands.empty();
// Can reuse user data for the first allocation. Do so if host unified
// memory is supported regardless of the access mode (the pointer will be
// reused) or if it's not and the access mode is not discard (the pointer
// will be copied).
// TODO the case where the first alloca is made with a discard mode and
// the user pointer is read-only is still not handled: it leads to
// unnecessary copy on devices with unified host memory support.
const bool InitFromUserData =
Record->MAllocaCommands.empty() &&
(checkHostUnifiedMemory(Queue->getContextImplPtr()) ||
(Req->MAccessMode != access::mode::discard_write &&
Req->MAccessMode != access::mode::discard_read_write));

AllocaCommandBase *LinkedAllocaCmd = nullptr;
// If it is not the first allocation, try to setup a link
Expand All @@ -617,13 +651,22 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
// "not" current allocation, but it will require memory copy.
// Can setup link between cl and host allocations only
if (Queue->is_host() != Record->MCurContext->is_host()) {

AllocaCommandBase *LinkedAllocaCmdCand =
findAllocaForReq(Record, Req, Record->MCurContext);

// Cannot setup link if candidate is linked already
if (LinkedAllocaCmdCand && !LinkedAllocaCmdCand->MLinkedAllocaCmd)
LinkedAllocaCmd = LinkedAllocaCmdCand;
// Linked commands assume that the host allocation is reused by the
// plugin runtime and that can lead to unnecessary copy overhead on
// devices that do not support host unified memory. Do not link the
// allocations in this case.
const ContextImplPtr &NonHostCtx = Queue->is_host()
? Record->MCurContext
: Queue->getContextImplPtr();
if (checkHostUnifiedMemory(NonHostCtx)) {
AllocaCommandBase *LinkedAllocaCmdCand =
findAllocaForReq(Record, Req, Record->MCurContext);

// Cannot setup link if candidate is linked already
if (LinkedAllocaCmdCand && !LinkedAllocaCmdCand->MLinkedAllocaCmd) {
LinkedAllocaCmd = LinkedAllocaCmdCand;
}
}
}

AllocaCmd =
Expand Down Expand Up @@ -732,10 +775,30 @@ static bool isInteropHostTask(const std::unique_ptr<ExecCGCommand> &Cmd) {
return HT.MHostTask->isInteropTask();
}

static void combineAccessModesOfReqs(std::vector<Requirement *> &Reqs) {
std::unordered_map<SYCLMemObjI *, access::mode> CombinedModes;
bool HasDuplicateMemObjects = false;
for (const Requirement *Req : Reqs) {
auto Result = CombinedModes.insert(
std::make_pair(Req->MSYCLMemObj, Req->MAccessMode));
if (!Result.second) {
Result.first->second =
combineAccessModes(Result.first->second, Req->MAccessMode);
HasDuplicateMemObjects = true;
}
}

if (!HasDuplicateMemObjects)
return;
for (Requirement *Req : Reqs) {
Req->MAccessMode = CombinedModes[Req->MSYCLMemObj];
}
}

Command *
Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
QueueImplPtr Queue) {
const std::vector<Requirement *> &Reqs = CommandGroup->MRequirements;
std::vector<Requirement *> &Reqs = CommandGroup->MRequirements;
const std::vector<detail::EventImplPtr> &Events = CommandGroup->MEvents;
const CG::CGTYPE CGType = CommandGroup->getType();

Expand All @@ -747,6 +810,10 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
if (MPrintOptionsArray[BeforeAddCG])
printGraphAsDot("before_addCG");

// If there are multiple requirements for the same memory object, its
// AllocaCommand creation will be dependent on the access mode of the first
// requirement. Combine these access modes to take all of them into account.
combineAccessModesOfReqs(Reqs);
for (Requirement *Req : Reqs) {
MemObjRecord *Record = nullptr;
AllocaCommandBase *AllocaCmd = nullptr;
Expand Down
29 changes: 29 additions & 0 deletions sycl/source/detail/sycl_mem_obj_t.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,35 @@ size_t SYCLMemObjT::getBufSizeForContext(const ContextImplPtr &Context,
sizeof(size_t), &BufSize, nullptr);
return BufSize;
}

void SYCLMemObjT::determineHostPtr(const ContextImplPtr &Context,
bool InitFromUserData, void *&HostPtr,
bool &HostPtrReadOnly) {
// The data for the allocation can be provided via either the user pointer
// (InitFromUserData, can be read-only) or a runtime-allocated read-write
// HostPtr. We can have one of these scenarios:
// 1. The allocation is the first one and on host. InitFromUserData == true.
// 2. The allocation is the first one and isn't on host. InitFromUserData
// varies based on unified host memory support and whether or not the data can
// be discarded.
// 3. The allocation is not the first one and is on host. InitFromUserData ==
// false, HostPtr == nullptr. This can only happen if the allocation command
// is not linked since it would be a no-op otherwise. Attempt to reuse the
// user pointer if it's read-write, but do not copy its contents if it's not.
// 4. The allocation is not the first one and not on host. InitFromUserData ==
// false, HostPtr is provided if the command is linked. The host pointer is
// guaranteed to be reused in this case.
if (Context->is_host() && !MOpenCLInterop && !MHostPtrReadOnly)
InitFromUserData = true;

if (InitFromUserData) {
assert(!HostPtr && "Cannot init from user data and reuse host ptr provided "
"simultaneously");
HostPtr = getUserPtr();
HostPtrReadOnly = MHostPtrReadOnly;
} else
HostPtrReadOnly = false;
}
} // namespace detail
} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)
1 change: 1 addition & 0 deletions sycl/test/abi/sycl_symbols_linux.dump
Original file line number Diff line number Diff line change
Expand Up @@ -3714,6 +3714,7 @@ _ZN2cl4sycl6detail10image_implILi3EED1Ev
_ZN2cl4sycl6detail10image_implILi3EED2Ev
_ZN2cl4sycl6detail10waitEventsESt6vectorINS0_5eventESaIS3_EE
_ZN2cl4sycl6detail11SYCLMemObjT10releaseMemESt10shared_ptrINS1_12context_implEEPv
_ZN2cl4sycl6detail11SYCLMemObjT16determineHostPtrERKSt10shared_ptrINS1_12context_implEEbRPvRb
_ZN2cl4sycl6detail11SYCLMemObjT16updateHostMemoryEPv
_ZN2cl4sycl6detail11SYCLMemObjT16updateHostMemoryEv
_ZN2cl4sycl6detail11SYCLMemObjT20getBufSizeForContextERKSt10shared_ptrINS1_12context_implEEP7_cl_mem
Expand Down
1 change: 1 addition & 0 deletions sycl/unittests/scheduler/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,6 @@ add_sycl_unittest(SchedulerTests OBJECT
WaitAfterCleanup.cpp
LinkedAllocaDependencies.cpp
LeavesCollection.cpp
NoUnifiedHostMemory.cpp
utils.cpp
)
20 changes: 16 additions & 4 deletions sycl/unittests/scheduler/LinkedAllocaDependencies.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,18 +36,30 @@ class MemObjMock : public cl::sycl::detail::SYCLMemObjI {
detail::ContextImplPtr getInteropContext() const override { return nullptr; }
};

static cl::sycl::device getDeviceWithHostUnifiedMemory() {
for (cl::sycl::device &D : cl::sycl::device::get_devices()) {
if (!D.is_host() &&
D.get_info<cl::sycl::info::device::host_unified_memory>())
return D;
}
return {};
}

TEST_F(SchedulerTest, LinkedAllocaDependencies) {
default_selector Selector{};
if (Selector.select_device().is_host()) {
std::cerr << "Not run due to host-only environment\n";
cl::sycl::device Dev = getDeviceWithHostUnifiedMemory();
if (Dev.is_host()) {
std::cerr << "Not run: no non-host devices with host unified memory support"
<< std::endl;
return;
}

// 1. create two commands: alloca + alloca and link them
// 2. call Scheduler::GraphBuilder::getOrCreateAllocaForReq
detail::Requirement Req = getMockRequirement();

cl::sycl::queue Queue1;
// Commands are linked only if the device supports host unified memory.

cl::sycl::queue Queue1{Dev};
cl::sycl::detail::QueueImplPtr Q1 = cl::sycl::detail::getSyclObjImpl(Queue1);

sycl::device HostDevice;
Expand Down
Loading