Skip to content

Commit 6733c8b

Browse files
[SYCL] Optimize discard_write access mode (#2854)
When using discard_write with a host accessor, we do not need to keep the buffer updated with the latest changes on the device. Skipping that operation results in a significant speed-up. When we have host unified memory, Map/Unmap operations are enqueued. As these must always match, it is up to the backend (or its plugin) to take advantage of any optimization. In the case of Level Zero, this can be done in the SYCL plugin interface code. This PR includes a fix for Level Zero. CUDA requires no change. OpenCL will need to be changed in the library itself. When we are not using host unified memory, instead of Map/Unmap operations, we enqueue basic Read/Write ops. When using discard_write on the host, the Read op is unnecessary. This patch schedules an empty operation instead. Signed-off-by: Chris Perkins <[email protected]>
1 parent 566c136 commit 6733c8b

File tree

3 files changed

+19
-22
lines changed

3 files changed

+19
-22
lines changed

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4196,7 +4196,8 @@ piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer, pi_bool BlockingMap,
41964196
piEventsWait(NumEventsInWaitList, EventWaitList);
41974197
if (Buffer->MapHostPtr) {
41984198
*RetMap = Buffer->MapHostPtr + Offset;
4199-
memcpy(*RetMap, pi_cast<char *>(Buffer->getZeHandle()) + Offset, Size);
4199+
if (!(MapFlags & CL_MAP_WRITE_INVALIDATE_REGION))
4200+
memcpy(*RetMap, pi_cast<char *>(Buffer->getZeHandle()) + Offset, Size);
42004201
} else {
42014202
*RetMap = pi_cast<char *>(Buffer->getZeHandle()) + Offset;
42024203
}

sycl/source/detail/scheduler/commands.cpp

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1141,21 +1141,12 @@ cl_int MemCpyCommand::enqueueImp() {
11411141

11421142
auto RawEvents = getPiEvents(EventImpls);
11431143

1144-
// Omit copying if mode is discard one.
1145-
// TODO: Handle this at the graph building time by, for example, creating
1146-
// empty node instead of memcpy.
1147-
if (MDstReq.MAccessMode == access::mode::discard_read_write ||
1148-
MDstReq.MAccessMode == access::mode::discard_write ||
1149-
MSrcAllocaCmd->getMemAllocation() == MDstAllocaCmd->getMemAllocation()) {
1150-
Command::waitForEvents(Queue, EventImpls, Event);
1151-
} else {
1152-
MemoryManager::copy(
1153-
MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(),
1154-
MSrcQueue, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
1155-
MSrcReq.MOffset, MSrcReq.MElemSize, MDstAllocaCmd->getMemAllocation(),
1156-
MQueue, MDstReq.MDims, MDstReq.MMemoryRange, MDstReq.MAccessRange,
1157-
MDstReq.MOffset, MDstReq.MElemSize, std::move(RawEvents), Event);
1158-
}
1144+
MemoryManager::copy(
1145+
MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(),
1146+
MSrcQueue, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
1147+
MSrcReq.MOffset, MSrcReq.MElemSize, MDstAllocaCmd->getMemAllocation(),
1148+
MQueue, MDstReq.MDims, MDstReq.MMemoryRange, MDstReq.MAccessRange,
1149+
MDstReq.MOffset, MDstReq.MElemSize, std::move(RawEvents), Event);
11591150

11601151
return CL_SUCCESS;
11611152
}

sycl/source/detail/scheduler/graph_builder.cpp

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -353,12 +353,17 @@ Command *Scheduler::GraphBuilder::insertMemoryMove(MemObjRecord *Record,
353353
Record->MHostAccess = MapMode;
354354
} else {
355355

356-
// Full copy of buffer is needed to avoid loss of data that may be caused
357-
// by copying specific range from host to device and backwards.
358-
NewCmd =
359-
new MemCpyCommand(*AllocaCmdSrc->getRequirement(), AllocaCmdSrc,
360-
*AllocaCmdDst->getRequirement(), AllocaCmdDst,
361-
AllocaCmdSrc->getQueue(), AllocaCmdDst->getQueue());
356+
if ((Req->MAccessMode == access::mode::discard_write) ||
357+
(Req->MAccessMode == access::mode::discard_read_write)) {
358+
return nullptr;
359+
} else {
360+
// Full copy of buffer is needed to avoid loss of data that may be caused
361+
// by copying specific range from host to device and backwards.
362+
NewCmd =
363+
new MemCpyCommand(*AllocaCmdSrc->getRequirement(), AllocaCmdSrc,
364+
*AllocaCmdDst->getRequirement(), AllocaCmdDst,
365+
AllocaCmdSrc->getQueue(), AllocaCmdDst->getQueue());
366+
}
362367
}
363368

364369
for (Command *Dep : Deps) {

0 commit comments

Comments
 (0)