Skip to content

Commit ad8c9d1

Browse files
[SYCL] Switch from COPY_HOST_PTR to write for devices w/o HUM (#3105)
Switch from initializing native memory objects with COPY_HOST_PTR to creating them without a host ptr and following up with a write operation. The reasoning behind this change is that some OpenCL runtimes (e. g. FPGA) use the queue the memory object write is enqueued to as a hint to which device from the context this memory should be copied to, as opposed to COPY_HOST_PTR where only the context itself is known. This led to unnecessary temporary host copy made by the FPGA runtime during buffer creation.
1 parent 05a805e commit ad8c9d1

File tree

4 files changed

+90
-70
lines changed

4 files changed

+90
-70
lines changed

sycl/include/CL/sycl/detail/sycl_mem_obj_t.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,8 @@ class __SYCL_EXPORT SYCLMemObjT : public SYCLMemObjI {
296296

297297
ContextImplPtr getInteropContext() const override { return MInteropContext; }
298298

299+
bool hasUserDataPtr() const { return MUserPtr != nullptr; };
300+
299301
protected:
300302
// An allocateMem helper that determines which host ptr to use
301303
void determineHostPtr(const ContextImplPtr &Context, bool InitFromUserData,

sycl/source/detail/memory_manager.cpp

Lines changed: 3 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -128,24 +128,9 @@ RT::PiMemFlags getMemObjCreationFlags(const ContextImplPtr &TargetContext,
128128
void *UserPtr, bool HostPtrReadOnly) {
129129
// Create read_write mem object to handle arbitrary uses.
130130
RT::PiMemFlags Result = PI_MEM_FLAGS_ACCESS_RW;
131-
if (UserPtr) {
132-
if (HostPtrReadOnly)
133-
Result |= PI_MEM_FLAGS_HOST_PTR_COPY;
134-
else {
135-
// Create the memory object using the host pointer only if the devices
136-
// support host_unified_memory to avoid potential copy overhead.
137-
// TODO This check duplicates the one performed in the GraphBuilder during
138-
// AllocaCommand creation. This information should be propagated here
139-
// instead, which would be a breaking ABI change.
140-
bool HostUnifiedMemory = true;
141-
for (const device &Device : TargetContext->getDevices())
142-
HostUnifiedMemory &=
143-
Device.get_info<info::device::host_unified_memory>();
144-
Result |= HostUnifiedMemory ? PI_MEM_FLAGS_HOST_PTR_USE
145-
: PI_MEM_FLAGS_HOST_PTR_COPY;
146-
}
147-
}
148-
131+
if (UserPtr)
132+
Result |= HostPtrReadOnly ? PI_MEM_FLAGS_HOST_PTR_COPY
133+
: PI_MEM_FLAGS_HOST_PTR_USE;
149134
return Result;
150135
}
151136

sycl/source/detail/scheduler/graph_builder.cpp

Lines changed: 57 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -635,45 +635,70 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
635635
0 /*ReMOffsetInBytes*/, false /*MIsSubBuffer*/);
636636
// Can reuse user data for the first allocation. Do so if host unified
637637
// memory is supported regardless of the access mode (the pointer will be
638-
// reused) or if it's not and the access mode is not discard (the pointer
639-
// will be copied).
638+
// reused). For devices without host unified memory the initialization
639+
// will be performed as a write operation.
640640
// TODO the case where the first alloca is made with a discard mode and
641641
// the user pointer is read-only is still not handled: it leads to
642642
// unnecessary copy on devices with unified host memory support.
643+
const bool HostUnifiedMemory =
644+
checkHostUnifiedMemory(Queue->getContextImplPtr());
643645
const bool InitFromUserData =
644-
Record->MAllocaCommands.empty() &&
645-
(checkHostUnifiedMemory(Queue->getContextImplPtr()) ||
646-
(Req->MAccessMode != access::mode::discard_write &&
647-
Req->MAccessMode != access::mode::discard_read_write));
648-
646+
Record->MAllocaCommands.empty() && HostUnifiedMemory;
649647
AllocaCommandBase *LinkedAllocaCmd = nullptr;
650-
// If it is not the first allocation, try to setup a link
651-
// FIXME: Temporary limitation, linked alloca commands for an image is not
652-
// supported because map operation is not implemented for an image.
653-
if (!Record->MAllocaCommands.empty() &&
654-
Req->MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::BUFFER)
655-
// Current limitation is to setup link between current allocation and
656-
// new one. There could be situations when we could setup link with
657-
// "not" current allocation, but it will require memory copy.
658-
// Can setup link between cl and host allocations only
659-
if (Queue->is_host() != Record->MCurContext->is_host()) {
660-
// Linked commands assume that the host allocation is reused by the
661-
// plugin runtime and that can lead to unnecessary copy overhead on
662-
// devices that do not support host unified memory. Do not link the
663-
// allocations in this case.
664-
const ContextImplPtr &NonHostCtx = Queue->is_host()
665-
? Record->MCurContext
666-
: Queue->getContextImplPtr();
667-
if (checkHostUnifiedMemory(NonHostCtx)) {
668-
AllocaCommandBase *LinkedAllocaCmdCand =
669-
findAllocaForReq(Record, Req, Record->MCurContext);
670-
671-
// Cannot setup link if candidate is linked already
672-
if (LinkedAllocaCmdCand && !LinkedAllocaCmdCand->MLinkedAllocaCmd) {
673-
LinkedAllocaCmd = LinkedAllocaCmdCand;
674-
}
648+
649+
// For the first allocation on a device without host unified memory we
650+
// might need to also create a host alloca right away in order to perform
651+
// the initial memory write.
652+
if (Record->MAllocaCommands.empty()) {
653+
if (!HostUnifiedMemory &&
654+
Req->MAccessMode != access::mode::discard_write &&
655+
Req->MAccessMode != access::mode::discard_read_write) {
656+
// There's no need to make a host allocation if the buffer is not
657+
// initialized with user data.
658+
// TODO casting is required here to get the necessary information
659+
// without breaking ABI, replace with the next major version.
660+
auto *MemObj = static_cast<SYCLMemObjT *>(Req->MSYCLMemObj);
661+
if (MemObj->hasUserDataPtr()) {
662+
QueueImplPtr DefaultHostQueue =
663+
Scheduler::getInstance().getDefaultHostQueue();
664+
AllocaCommand *HostAllocaCmd = new AllocaCommand(
665+
DefaultHostQueue, FullReq, true /* InitFromUserData */,
666+
nullptr /* LinkedAllocaCmd */);
667+
Record->MAllocaCommands.push_back(HostAllocaCmd);
668+
Record->MWriteLeaves.push_back(HostAllocaCmd);
669+
++(HostAllocaCmd->MLeafCounter);
670+
Record->MCurContext = DefaultHostQueue->getContextImplPtr();
675671
}
676672
}
673+
} else {
674+
// If it is not the first allocation, try to setup a link
675+
// FIXME: Temporary limitation, linked alloca commands for an image is
676+
// not supported because map operation is not implemented for an image.
677+
if (Req->MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::BUFFER)
678+
// Current limitation is to setup link between current allocation and
679+
// new one. There could be situations when we could setup link with
680+
// "not" current allocation, but it will require memory copy.
681+
// Can setup link between cl and host allocations only
682+
if (Queue->is_host() != Record->MCurContext->is_host()) {
683+
// Linked commands assume that the host allocation is reused by the
684+
// plugin runtime and that can lead to unnecessary copy overhead on
685+
// devices that do not support host unified memory. Do not link the
686+
// allocations in this case.
687+
bool HostUnifiedMemoryOnNonHostDevice =
688+
Queue->is_host() ? checkHostUnifiedMemory(Record->MCurContext)
689+
: HostUnifiedMemory;
690+
if (HostUnifiedMemoryOnNonHostDevice) {
691+
AllocaCommandBase *LinkedAllocaCmdCand =
692+
findAllocaForReq(Record, Req, Record->MCurContext);
693+
694+
// Cannot setup link if candidate is linked already
695+
if (LinkedAllocaCmdCand &&
696+
!LinkedAllocaCmdCand->MLinkedAllocaCmd) {
697+
LinkedAllocaCmd = LinkedAllocaCmdCand;
698+
}
699+
}
700+
}
701+
}
677702

678703
AllocaCmd =
679704
new AllocaCommand(Queue, FullReq, InitFromUserData, LinkedAllocaCmd);

sycl/unittests/scheduler/NoHostUnifiedMemory.cpp

Lines changed: 28 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,11 @@ static pi_result redefinedDeviceGetInfo(pi_device Device,
3030
return PI_SUCCESS;
3131
}
3232

33-
static RT::PiMemFlags ExpectedMemObjFlags;
34-
3533
static pi_result
3634
redefinedMemBufferCreate(pi_context context, pi_mem_flags flags, size_t size,
3735
void *host_ptr, pi_mem *ret_mem,
3836
const pi_mem_properties *properties = nullptr) {
39-
EXPECT_EQ(flags, ExpectedMemObjFlags);
37+
EXPECT_EQ(flags, PI_MEM_FLAGS_ACCESS_RW);
4038
return PI_SUCCESS;
4139
}
4240

@@ -85,59 +83,70 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) {
8583
new detail::queue_impl(detail::getSyclObjImpl(HostDevice), {}, {})};
8684

8785
MockScheduler MS;
88-
// Check non-host -> host alloca with non-discard access mode
86+
// Check non-host alloca with non-discard access mode
8987
{
9088
int val;
9189
buffer<int, 1> Buf(&val, range<1>(1));
9290
detail::Requirement Req = getMockRequirement(Buf);
9391

94-
// The host pointer should be copied during the non-host allocation in this
95-
// case.
96-
ExpectedMemObjFlags = PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_COPY;
97-
9892
detail::MemObjRecord *Record = MS.getOrInsertMemObjRecord(QImpl, &Req);
9993
detail::AllocaCommandBase *NonHostAllocaCmd =
10094
MS.getOrCreateAllocaForReq(Record, &Req, QImpl);
10195

102-
detail::AllocaCommandBase *HostAllocaCmd =
103-
MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue);
96+
// Both non-host and host allocations should be created in this case in
97+
// order to perform a memory move.
98+
EXPECT_EQ(Record->MAllocaCommands.size(), 2U);
99+
detail::AllocaCommandBase *HostAllocaCmd = Record->MAllocaCommands[0];
100+
EXPECT_TRUE(HostAllocaCmd->getQueue()->is_host());
104101
EXPECT_TRUE(!HostAllocaCmd->MLinkedAllocaCmd);
105102
EXPECT_TRUE(!NonHostAllocaCmd->MLinkedAllocaCmd);
103+
EXPECT_TRUE(Record->MCurContext->is_host());
106104

107-
detail::Command *MemoryMove =
108-
MS.insertMemoryMove(Record, &Req, DefaultHostQueue);
105+
detail::Command *MemoryMove = MS.insertMemoryMove(Record, &Req, QImpl);
109106
EXPECT_EQ(MemoryMove->getType(), detail::Command::COPY_MEMORY);
110107
}
111-
// Check non-host -> host alloca with discard access modes
108+
// Check non-host alloca with discard access modes
112109
{
113110
int val;
114111
buffer<int, 1> Buf(&val, range<1>(1));
115112
detail::Requirement Req = getMockRequirement(Buf);
116-
// The host pointer should be ignored due to the discard access mode.
117-
ExpectedMemObjFlags = PI_MEM_FLAGS_ACCESS_RW;
118113

119114
detail::Requirement DiscardReq = getMockRequirement(Buf);
120115
DiscardReq.MAccessMode = access::mode::discard_read_write;
121116

117+
// No need to create a host allocation in this case since the data can be
118+
// discarded.
122119
detail::MemObjRecord *Record = MS.getOrInsertMemObjRecord(QImpl, &Req);
123120
MS.getOrCreateAllocaForReq(Record, &DiscardReq, QImpl);
121+
EXPECT_EQ(Record->MAllocaCommands.size(), 1U);
122+
}
123+
// Check non-host alloca without user pointer
124+
{
125+
buffer<int, 1> Buf(range<1>(1));
126+
detail::Requirement Req = getMockRequirement(Buf);
127+
128+
// No need to create a host allocation in this case since there's no data to
129+
// initialize the buffer with.
130+
detail::MemObjRecord *Record = MS.getOrInsertMemObjRecord(QImpl, &Req);
131+
MS.getOrCreateAllocaForReq(Record, &Req, QImpl);
132+
EXPECT_EQ(Record->MAllocaCommands.size(), 1U);
124133
}
125134
// Check host -> non-host alloca
126135
{
127136
int val;
128137
buffer<int, 1> Buf(&val, range<1>(1));
129138
detail::Requirement Req = getMockRequirement(Buf);
130139

131-
// No copy expected during the second allocation, it is performed as a
132-
// separate command.
133-
ExpectedMemObjFlags = PI_MEM_FLAGS_ACCESS_RW;
134-
140+
// No special handling required: alloca commands are created one after
141+
// another and the transfer is done via a write operation.
135142
detail::MemObjRecord *Record =
136143
MS.getOrInsertMemObjRecord(DefaultHostQueue, &Req);
137144
detail::AllocaCommandBase *HostAllocaCmd =
138145
MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue);
146+
EXPECT_EQ(Record->MAllocaCommands.size(), 1U);
139147
detail::AllocaCommandBase *NonHostAllocaCmd =
140148
MS.getOrCreateAllocaForReq(Record, &Req, QImpl);
149+
EXPECT_EQ(Record->MAllocaCommands.size(), 2U);
141150
EXPECT_TRUE(!HostAllocaCmd->MLinkedAllocaCmd);
142151
EXPECT_TRUE(!NonHostAllocaCmd->MLinkedAllocaCmd);
143152

@@ -150,7 +159,6 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) {
150159
int val;
151160
buffer<int, 1> Buf(&val, range<1>(1));
152161
detail::Requirement Req = getMockRequirement(Buf);
153-
ExpectedMemObjFlags = PI_MEM_FLAGS_ACCESS_RW | PI_MEM_FLAGS_HOST_PTR_COPY;
154162

155163
detail::Requirement DiscardReq = getMockRequirement(Buf);
156164
DiscardReq.MAccessMode = access::mode::discard_read_write;

0 commit comments

Comments
 (0)