Skip to content

Commit a4f0924

Browse files
[SYCL] Improve memobj creation for devices w/o host unified memory (#2761)
The aggressive use of USE_HOST_PTR memory object creation option led to significant performance overhead on devices without host unified memory support. Furthermore, any discard access modes were ignored during the allocation leading to unnecessary copy. This patch makes it so that allocations for such devices are made with COPY_HOST_PTR instead, and only if necessitated by the access mode. These allocations are also not linked anymore, since linked allocations rely on USE_HOST_PTR option.
1 parent cece649 commit a4f0924

File tree

11 files changed

+315
-48
lines changed

11 files changed

+315
-48
lines changed

sycl/include/CL/sycl/detail/sycl_mem_obj_t.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,10 @@ class __SYCL_EXPORT SYCLMemObjT : public SYCLMemObjI {
295295
ContextImplPtr getInteropContext() const override { return MInteropContext; }
296296

297297
protected:
298+
// An allocateMem helper that determines which host ptr to use
299+
void determineHostPtr(const ContextImplPtr &Context, bool InitFromUserData,
300+
void *&HostPtr, bool &HostPtrReadOnly);
301+
298302
// Allocator used for allocation memory on host.
299303
unique_ptr_class<SYCLMemObjAllocator> MAllocator;
300304
// Properties passed by user.

sycl/source/detail/buffer_impl.cpp

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,18 +16,8 @@ namespace sycl {
1616
namespace detail {
1717
void *buffer_impl::allocateMem(ContextImplPtr Context, bool InitFromUserData,
1818
void *HostPtr, RT::PiEvent &OutEventToWait) {
19-
// The host pointer for the allocation can be provided in 2 ways:
20-
// 1. Initialize the allocation from user data. Check if the user pointer is
21-
// read-only.
22-
// 2. Use a HostPtr allocated by the runtime. Assume any such pointer to be
23-
// read-write.
2419
bool HostPtrReadOnly = false;
25-
if (InitFromUserData) {
26-
assert(!HostPtr && "Cannot init from user data and reuse host ptr provided "
27-
"simultaneously");
28-
HostPtr = BaseT::getUserPtr();
29-
HostPtrReadOnly = BaseT::MHostPtrReadOnly;
30-
}
20+
BaseT::determineHostPtr(Context, InitFromUserData, HostPtr, HostPtrReadOnly);
3121

3222
assert(!(nullptr == HostPtr && BaseT::useHostPtr() && Context->is_host()) &&
3323
"Internal error. Allocating memory on the host "

sycl/source/detail/image_impl.cpp

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -307,18 +307,8 @@ template <int Dimensions>
307307
void *image_impl<Dimensions>::allocateMem(ContextImplPtr Context,
308308
bool InitFromUserData, void *HostPtr,
309309
RT::PiEvent &OutEventToWait) {
310-
// The host pointer for the allocation can be provided in 2 ways:
311-
// 1. Initialize the allocation from user data. Check if the user pointer is
312-
// read-only.
313-
// 2. Use a HostPtr allocated by the runtime. Assume any such pointer to be
314-
// read-write.
315310
bool HostPtrReadOnly = false;
316-
if (InitFromUserData) {
317-
assert(!HostPtr && "Cannot init from user data and reuse host ptr provided "
318-
"simultaneously");
319-
HostPtr = BaseT::getUserPtr();
320-
HostPtrReadOnly = BaseT::MHostPtrReadOnly;
321-
}
311+
BaseT::determineHostPtr(Context, InitFromUserData, HostPtr, HostPtrReadOnly);
322312

323313
RT::PiMemImageDesc Desc = getImageDesc(HostPtr != nullptr);
324314
assert(checkImageDesc(Desc, Context, HostPtr) &&

sycl/source/detail/memory_manager.cpp

Lines changed: 31 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -119,16 +119,38 @@ void *MemoryManager::allocateInteropMemObject(
119119
return UserPtr;
120120
}
121121

122+
RT::PiMemFlags getMemObjCreationFlags(const ContextImplPtr &TargetContext,
123+
void *UserPtr, bool HostPtrReadOnly) {
124+
// Create read_write mem object to handle arbitrary uses.
125+
RT::PiMemFlags Result = PI_MEM_FLAGS_ACCESS_RW;
126+
if (UserPtr) {
127+
if (HostPtrReadOnly)
128+
Result |= PI_MEM_FLAGS_HOST_PTR_COPY;
129+
else {
130+
// Create the memory object using the host pointer only if the devices
131+
// support host_unified_memory to avoid potential copy overhead.
132+
// TODO This check duplicates the one performed in the GraphBuilder during
133+
// AllocaCommand creation. This information should be propagated here
134+
// instead, which would be a breaking ABI change.
135+
bool HostUnifiedMemory = true;
136+
for (const device &Device : TargetContext->getDevices())
137+
HostUnifiedMemory &=
138+
Device.get_info<info::device::host_unified_memory>();
139+
Result |= HostUnifiedMemory ? PI_MEM_FLAGS_HOST_PTR_USE
140+
: PI_MEM_FLAGS_HOST_PTR_COPY;
141+
}
142+
}
143+
144+
return Result;
145+
}
146+
122147
void *MemoryManager::allocateImageObject(ContextImplPtr TargetContext,
123148
void *UserPtr, bool HostPtrReadOnly,
124149
const RT::PiMemImageDesc &Desc,
125150
const RT::PiMemImageFormat &Format,
126151
const sycl::property_list &) {
127-
// Create read_write mem object by default to handle arbitrary uses.
128-
RT::PiMemFlags CreationFlags = PI_MEM_FLAGS_ACCESS_RW;
129-
if (UserPtr)
130-
CreationFlags |= HostPtrReadOnly ? PI_MEM_FLAGS_HOST_PTR_COPY
131-
: PI_MEM_FLAGS_HOST_PTR_USE;
152+
RT::PiMemFlags CreationFlags =
153+
getMemObjCreationFlags(TargetContext, UserPtr, HostPtrReadOnly);
132154

133155
RT::PiMem NewMem;
134156
const detail::plugin &Plugin = TargetContext->getPlugin();
@@ -142,13 +164,10 @@ void *
142164
MemoryManager::allocateBufferObject(ContextImplPtr TargetContext, void *UserPtr,
143165
bool HostPtrReadOnly, const size_t Size,
144166
const sycl::property_list &PropsList) {
145-
// Create read_write mem object by default to handle arbitrary uses.
146-
RT::PiMemFlags CreationFlags = PI_MEM_FLAGS_ACCESS_RW;
147-
if (UserPtr)
148-
CreationFlags |= HostPtrReadOnly ? PI_MEM_FLAGS_HOST_PTR_COPY
149-
: PI_MEM_FLAGS_HOST_PTR_USE;
150-
else if (PropsList.has_property<
151-
sycl::ext::oneapi::property::buffer::use_pinned_host_memory>())
167+
RT::PiMemFlags CreationFlags =
168+
getMemObjCreationFlags(TargetContext, UserPtr, HostPtrReadOnly);
169+
if (PropsList.has_property<
170+
sycl::ext::oneapi::property::buffer::use_pinned_host_memory>())
152171
CreationFlags |= PI_MEM_FLAGS_HOST_PTR_ALLOC;
153172

154173
RT::PiMem NewMem = nullptr;

sycl/source/detail/scheduler/graph_builder.cpp

Lines changed: 77 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,22 @@ static bool isAccessModeAllowed(access::mode Required, access::mode Current) {
6969
return false;
7070
}
7171

72+
/// Combines two access modes into a single one that allows both.
73+
static access::mode combineAccessModes(access::mode A, access::mode B) {
74+
if (A == B)
75+
return A;
76+
77+
if (A == access::mode::discard_write &&
78+
(B == access::mode::discard_read_write || B == access::mode::write))
79+
return B;
80+
81+
if (B == access::mode::discard_write &&
82+
(A == access::mode::discard_read_write || A == access::mode::write))
83+
return A;
84+
85+
return access::mode::read_write;
86+
}
87+
7288
Scheduler::GraphBuilder::GraphBuilder() {
7389
if (const char *EnvVarCStr = SYCLConfig<SYCL_PRINT_EXECUTION_GRAPH>::get()) {
7490
std::string GraphPrintOpts(EnvVarCStr);
@@ -574,6 +590,14 @@ Scheduler::GraphBuilder::findAllocaForReq(MemObjRecord *Record,
574590
return (Record->MAllocaCommands.end() != It) ? *It : nullptr;
575591
}
576592

593+
static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx) {
594+
for (const device &Device : Ctx->getDevices()) {
595+
if (!Device.get_info<info::device::host_unified_memory>())
596+
return false;
597+
}
598+
return true;
599+
}
600+
577601
// The function searches for the alloca command matching context and
578602
// requirement. If none exists, new allocation command is created.
579603
// Note, creation of new allocation command can lead to the current context
@@ -603,8 +627,18 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
603627
Req->MMemoryRange, access::mode::read_write,
604628
Req->MSYCLMemObj, Req->MDims, Req->MElemSize,
605629
0 /*ReMOffsetInBytes*/, false /*MIsSubBuffer*/);
606-
// Can reuse user data for the first allocation
607-
const bool InitFromUserData = Record->MAllocaCommands.empty();
630+
// Can reuse user data for the first allocation. Do so if host unified
631+
// memory is supported regardless of the access mode (the pointer will be
632+
// reused) or if it's not and the access mode is not discard (the pointer
633+
// will be copied).
634+
// TODO the case where the first alloca is made with a discard mode and
635+
// the user pointer is read-only is still not handled: it leads to
636+
// unnecessary copy on devices with unified host memory support.
637+
const bool InitFromUserData =
638+
Record->MAllocaCommands.empty() &&
639+
(checkHostUnifiedMemory(Queue->getContextImplPtr()) ||
640+
(Req->MAccessMode != access::mode::discard_write &&
641+
Req->MAccessMode != access::mode::discard_read_write));
608642

609643
AllocaCommandBase *LinkedAllocaCmd = nullptr;
610644
// If it is not the first allocation, try to setup a link
@@ -617,13 +651,22 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
617651
// "not" current allocation, but it will require memory copy.
618652
// Can setup link between cl and host allocations only
619653
if (Queue->is_host() != Record->MCurContext->is_host()) {
620-
621-
AllocaCommandBase *LinkedAllocaCmdCand =
622-
findAllocaForReq(Record, Req, Record->MCurContext);
623-
624-
// Cannot setup link if candidate is linked already
625-
if (LinkedAllocaCmdCand && !LinkedAllocaCmdCand->MLinkedAllocaCmd)
626-
LinkedAllocaCmd = LinkedAllocaCmdCand;
654+
// Linked commands assume that the host allocation is reused by the
655+
// plugin runtime and that can lead to unnecessary copy overhead on
656+
// devices that do not support host unified memory. Do not link the
657+
// allocations in this case.
658+
const ContextImplPtr &NonHostCtx = Queue->is_host()
659+
? Record->MCurContext
660+
: Queue->getContextImplPtr();
661+
if (checkHostUnifiedMemory(NonHostCtx)) {
662+
AllocaCommandBase *LinkedAllocaCmdCand =
663+
findAllocaForReq(Record, Req, Record->MCurContext);
664+
665+
// Cannot setup link if candidate is linked already
666+
if (LinkedAllocaCmdCand && !LinkedAllocaCmdCand->MLinkedAllocaCmd) {
667+
LinkedAllocaCmd = LinkedAllocaCmdCand;
668+
}
669+
}
627670
}
628671

629672
AllocaCmd =
@@ -732,10 +775,30 @@ static bool isInteropHostTask(const std::unique_ptr<ExecCGCommand> &Cmd) {
732775
return HT.MHostTask->isInteropTask();
733776
}
734777

778+
static void combineAccessModesOfReqs(std::vector<Requirement *> &Reqs) {
779+
std::unordered_map<SYCLMemObjI *, access::mode> CombinedModes;
780+
bool HasDuplicateMemObjects = false;
781+
for (const Requirement *Req : Reqs) {
782+
auto Result = CombinedModes.insert(
783+
std::make_pair(Req->MSYCLMemObj, Req->MAccessMode));
784+
if (!Result.second) {
785+
Result.first->second =
786+
combineAccessModes(Result.first->second, Req->MAccessMode);
787+
HasDuplicateMemObjects = true;
788+
}
789+
}
790+
791+
if (!HasDuplicateMemObjects)
792+
return;
793+
for (Requirement *Req : Reqs) {
794+
Req->MAccessMode = CombinedModes[Req->MSYCLMemObj];
795+
}
796+
}
797+
735798
Command *
736799
Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
737800
QueueImplPtr Queue) {
738-
const std::vector<Requirement *> &Reqs = CommandGroup->MRequirements;
801+
std::vector<Requirement *> &Reqs = CommandGroup->MRequirements;
739802
const std::vector<detail::EventImplPtr> &Events = CommandGroup->MEvents;
740803
const CG::CGTYPE CGType = CommandGroup->getType();
741804

@@ -747,6 +810,10 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
747810
if (MPrintOptionsArray[BeforeAddCG])
748811
printGraphAsDot("before_addCG");
749812

813+
// If there are multiple requirements for the same memory object, its
814+
// AllocaCommand creation will be dependent on the access mode of the first
815+
// requirement. Combine these access modes to take all of them into account.
816+
combineAccessModesOfReqs(Reqs);
750817
for (Requirement *Req : Reqs) {
751818
MemObjRecord *Record = nullptr;
752819
AllocaCommandBase *AllocaCmd = nullptr;

sycl/source/detail/sycl_mem_obj_t.cpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,35 @@ size_t SYCLMemObjT::getBufSizeForContext(const ContextImplPtr &Context,
9999
sizeof(size_t), &BufSize, nullptr);
100100
return BufSize;
101101
}
102+
103+
void SYCLMemObjT::determineHostPtr(const ContextImplPtr &Context,
104+
bool InitFromUserData, void *&HostPtr,
105+
bool &HostPtrReadOnly) {
106+
// The data for the allocation can be provided via either the user pointer
107+
// (InitFromUserData, can be read-only) or a runtime-allocated read-write
108+
// HostPtr. We can have one of these scenarios:
109+
// 1. The allocation is the first one and on host. InitFromUserData == true.
110+
// 2. The allocation is the first one and isn't on host. InitFromUserData
111+
// varies based on unified host memory support and whether or not the data can
112+
// be discarded.
113+
// 3. The allocation is not the first one and is on host. InitFromUserData ==
114+
// false, HostPtr == nullptr. This can only happen if the allocation command
115+
// is not linked since it would be a no-op otherwise. Attempt to reuse the
116+
// user pointer if it's read-write, but do not copy its contents if it's not.
117+
// 4. The allocation is not the first one and not on host. InitFromUserData ==
118+
// false, HostPtr is provided if the command is linked. The host pointer is
119+
// guaranteed to be reused in this case.
120+
if (Context->is_host() && !MOpenCLInterop && !MHostPtrReadOnly)
121+
InitFromUserData = true;
122+
123+
if (InitFromUserData) {
124+
assert(!HostPtr && "Cannot init from user data and reuse host ptr provided "
125+
"simultaneously");
126+
HostPtr = getUserPtr();
127+
HostPtrReadOnly = MHostPtrReadOnly;
128+
} else
129+
HostPtrReadOnly = false;
130+
}
102131
} // namespace detail
103132
} // namespace sycl
104133
} // __SYCL_INLINE_NAMESPACE(cl)

sycl/test/abi/sycl_symbols_linux.dump

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3714,6 +3714,7 @@ _ZN2cl4sycl6detail10image_implILi3EED1Ev
37143714
_ZN2cl4sycl6detail10image_implILi3EED2Ev
37153715
_ZN2cl4sycl6detail10waitEventsESt6vectorINS0_5eventESaIS3_EE
37163716
_ZN2cl4sycl6detail11SYCLMemObjT10releaseMemESt10shared_ptrINS1_12context_implEEPv
3717+
_ZN2cl4sycl6detail11SYCLMemObjT16determineHostPtrERKSt10shared_ptrINS1_12context_implEEbRPvRb
37173718
_ZN2cl4sycl6detail11SYCLMemObjT16updateHostMemoryEPv
37183719
_ZN2cl4sycl6detail11SYCLMemObjT16updateHostMemoryEv
37193720
_ZN2cl4sycl6detail11SYCLMemObjT20getBufSizeForContextERKSt10shared_ptrINS1_12context_implEEP7_cl_mem

sycl/unittests/scheduler/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,6 @@ add_sycl_unittest(SchedulerTests OBJECT
88
WaitAfterCleanup.cpp
99
LinkedAllocaDependencies.cpp
1010
LeavesCollection.cpp
11+
NoUnifiedHostMemory.cpp
1112
utils.cpp
1213
)

sycl/unittests/scheduler/LinkedAllocaDependencies.cpp

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,18 +36,30 @@ class MemObjMock : public cl::sycl::detail::SYCLMemObjI {
3636
detail::ContextImplPtr getInteropContext() const override { return nullptr; }
3737
};
3838

39+
static cl::sycl::device getDeviceWithHostUnifiedMemory() {
40+
for (cl::sycl::device &D : cl::sycl::device::get_devices()) {
41+
if (!D.is_host() &&
42+
D.get_info<cl::sycl::info::device::host_unified_memory>())
43+
return D;
44+
}
45+
return {};
46+
}
47+
3948
TEST_F(SchedulerTest, LinkedAllocaDependencies) {
40-
default_selector Selector{};
41-
if (Selector.select_device().is_host()) {
42-
std::cerr << "Not run due to host-only environment\n";
49+
cl::sycl::device Dev = getDeviceWithHostUnifiedMemory();
50+
if (Dev.is_host()) {
51+
std::cerr << "Not run: no non-host devices with host unified memory support"
52+
<< std::endl;
4353
return;
4454
}
4555

4656
// 1. create two commands: alloca + alloca and link them
4757
// 2. call Scheduler::GraphBuilder::getOrCreateAllocaForReq
4858
detail::Requirement Req = getMockRequirement();
4959

50-
cl::sycl::queue Queue1;
60+
// Commands are linked only if the device supports host unified memory.
61+
62+
cl::sycl::queue Queue1{Dev};
5163
cl::sycl::detail::QueueImplPtr Q1 = cl::sycl::detail::getSyclObjImpl(Queue1);
5264

5365
sycl::device HostDevice;

0 commit comments

Comments
 (0)