Commit b13c5e1

Merge pull request #1711 from hdelan/minimize-vector-allocations
[HIP][CUDA] Several changes to kernel launch
2 parents: 8788bd1 + 61b42a3

16 files changed: +532, -514 lines

source/adapters/cuda/enqueue.cpp

Lines changed: 49 additions & 162 deletions
Large diffs are not rendered by default.
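
Since this file's diff is collapsed, here is a minimal sketch of how a launch path could drive the new enqueue-style migration API declared in memory.hpp below. This is a hypothetical illustration, not the actual enqueue.cpp change: names such as KernelMemArgs, ArgIsWritten, Queue, and CuStream are placeholders.

```cpp
// Hypothetical sketch only: the real enqueue.cpp diff is not rendered here.
// The point is that migrations are now enqueued on the queue's own CUstream
// instead of blocking the host, and writes are tracked per queue.
for (ur_mem_handle_t Mem : KernelMemArgs) {         // illustrative name
  ur_device_handle_t Dev = Queue->getDevice();
  UR_CHECK_ERROR(allocateMemObjOnDeviceIfNeeded(Mem, Dev));
  UR_CHECK_ERROR(enqueueMigrateMemoryToDeviceIfNeeded(Mem, Dev, CuStream));
  if (ArgIsWritten)                                 // illustrative flag
    Mem->setLastQueueWritingToMemObj(Queue);
}
```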

source/adapters/cuda/memory.cpp

Lines changed: 41 additions & 32 deletions
@@ -12,6 +12,7 @@
 
 #include "common.hpp"
 #include "context.hpp"
+#include "enqueue.hpp"
 #include "memory.hpp"
 
 /// Creates a UR Memory object using a CUDA memory allocation.
@@ -238,7 +239,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate(
   try {
     if (PerformInitialCopy) {
       for (const auto &Device : hContext->getDevices()) {
-        UR_CHECK_ERROR(migrateMemoryToDeviceIfNeeded(URMemObj.get(), Device));
+        // Synchronous behaviour is best in this case
+        ScopedContext Active(Device);
+        CUstream Stream{0}; // Use default stream
+        UR_CHECK_ERROR(enqueueMigrateMemoryToDeviceIfNeeded(URMemObj.get(),
+                                                            Device, Stream));
+        UR_CHECK_ERROR(cuStreamSynchronize(Stream));
       }
     }
 
@@ -496,27 +502,28 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem,
 }
 
 namespace {
-ur_result_t migrateBufferToDevice(ur_mem_handle_t Mem,
-                                  ur_device_handle_t hDevice) {
+ur_result_t enqueueMigrateBufferToDevice(ur_mem_handle_t Mem,
+                                         ur_device_handle_t hDevice,
+                                         CUstream Stream) {
   auto &Buffer = std::get<BufferMem>(Mem->Mem);
-  if (Mem->LastEventWritingToMemObj == nullptr) {
+  if (Mem->LastQueueWritingToMemObj == nullptr) {
     // Device allocation being initialized from host for the first time
     if (Buffer.HostPtr) {
-      UR_CHECK_ERROR(
-          cuMemcpyHtoD(Buffer.getPtr(hDevice), Buffer.HostPtr, Buffer.Size));
+      UR_CHECK_ERROR(cuMemcpyHtoDAsync(Buffer.getPtr(hDevice), Buffer.HostPtr,
+                                       Buffer.Size, Stream));
     }
-  } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() !=
-             hDevice) {
-    UR_CHECK_ERROR(cuMemcpyDtoD(
+  } else if (Mem->LastQueueWritingToMemObj->getDevice() != hDevice) {
+    UR_CHECK_ERROR(cuMemcpyDtoDAsync(
         Buffer.getPtr(hDevice),
-        Buffer.getPtr(Mem->LastEventWritingToMemObj->getQueue()->getDevice()),
-        Buffer.Size));
+        Buffer.getPtr(Mem->LastQueueWritingToMemObj->getDevice()), Buffer.Size,
+        Stream));
   }
   return UR_RESULT_SUCCESS;
 }
 
-ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
-                                 ur_device_handle_t hDevice) {
+ur_result_t enqueueMigrateImageToDevice(ur_mem_handle_t Mem,
+                                        ur_device_handle_t hDevice,
+                                        CUstream Stream) {
   auto &Image = std::get<SurfaceMem>(Mem->Mem);
   // When a dimension isn't used image_desc has the size set to 1
   size_t PixelSizeBytes = Image.PixelTypeSizeBytes *
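Why the switch to the Async copy variants needs no extra host synchronization on the launch path: work issued to a single CUstream executes in issue order, so a kernel enqueued after the migration on the same stream observes the copied data. A minimal sketch of that ordering guarantee, with placeholder names (Func, DevPtr, HostPtr, Size, GridX, BlockX, KernelArgs):

```cpp
// The stream serializes the copy and the launch; no cuStreamSynchronize is
// needed between them for the kernel to see the migrated bytes.
UR_CHECK_ERROR(cuMemcpyHtoDAsync(DevPtr, HostPtr, Size, Stream));
UR_CHECK_ERROR(cuLaunchKernel(Func, GridX, 1, 1, BlockX, 1, 1,
                              /*sharedMemBytes=*/0, Stream, KernelArgs,
                              /*extra=*/nullptr));
```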
@@ -547,40 +554,42 @@ ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
     CpyDesc3D.Depth = Image.ImageDesc.depth;
   }
 
-  if (Mem->LastEventWritingToMemObj == nullptr) {
+  if (Mem->LastQueueWritingToMemObj == nullptr) {
     if (Image.HostPtr) {
       if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) {
-        UR_CHECK_ERROR(
-            cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes));
+        UR_CHECK_ERROR(cuMemcpyHtoAAsync(ImageArray, 0, Image.HostPtr,
+                                         ImageSizeBytes, Stream));
       } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
         CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
         CpyDesc2D.srcHost = Image.HostPtr;
-        UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D));
+        UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc2D, Stream));
       } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
         CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
         CpyDesc3D.srcHost = Image.HostPtr;
-        UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D));
+        UR_CHECK_ERROR(cuMemcpy3DAsync(&CpyDesc3D, Stream));
       }
     }
-  } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() !=
-             hDevice) {
+  } else if (Mem->LastQueueWritingToMemObj->getDevice() != hDevice) {
     if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) {
+      // Blocking wait needed
+      UR_CHECK_ERROR(urQueueFinish(Mem->LastQueueWritingToMemObj));
       // FIXME: 1D memcpy from DtoD going through the host.
       UR_CHECK_ERROR(cuMemcpyAtoH(
           Image.HostPtr,
-          Image.getArray(
-              Mem->LastEventWritingToMemObj->getQueue()->getDevice()),
+          Image.getArray(Mem->LastQueueWritingToMemObj->getDevice()),
           0 /*srcOffset*/, ImageSizeBytes));
       UR_CHECK_ERROR(
           cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes));
     } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
-      CpyDesc2D.srcArray = Image.getArray(
-          Mem->LastEventWritingToMemObj->getQueue()->getDevice());
-      UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D));
+      CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_DEVICE;
+      CpyDesc2D.srcArray =
+          Image.getArray(Mem->LastQueueWritingToMemObj->getDevice());
+      UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc2D, Stream));
     } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
-      CpyDesc3D.srcArray = Image.getArray(
-          Mem->LastEventWritingToMemObj->getQueue()->getDevice());
-      UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D));
+      CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_DEVICE;
+      CpyDesc3D.srcArray =
+          Image.getArray(Mem->LastQueueWritingToMemObj->getDevice());
+      UR_CHECK_ERROR(cuMemcpy3DAsync(&CpyDesc3D, Stream));
     }
   }
   return UR_RESULT_SUCCESS;
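The 1D image case above is the one path that still blocks: 1D array-to-array copies are staged through the host with the synchronous cuMemcpyAtoH/cuMemcpyHtoA, so the writing queue has to be drained first. Reduced to its skeleton (SrcQueue, SrcArray, DstArray, Host, and Size are placeholders):

```cpp
UR_CHECK_ERROR(urQueueFinish(SrcQueue));                // last write visible
UR_CHECK_ERROR(cuMemcpyAtoH(Host, SrcArray, 0, Size));  // blocking DtoH
UR_CHECK_ERROR(cuMemcpyHtoA(DstArray, 0, Host, Size));  // blocking HtoD
```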
@@ -589,8 +598,8 @@ ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
 
 // If calling this entry point it is necessary to lock the memoryMigrationMutex
 // beforehand
-ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem,
-                                          const ur_device_handle_t hDevice) {
+ur_result_t enqueueMigrateMemoryToDeviceIfNeeded(
+    ur_mem_handle_t Mem, const ur_device_handle_t hDevice, CUstream Stream) {
   UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
   // Device allocation has already been initialized with most up to date
   // data in buffer
@@ -601,9 +610,9 @@ ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem,
 
   ScopedContext Active(hDevice);
   if (Mem->isBuffer()) {
-    UR_CHECK_ERROR(migrateBufferToDevice(Mem, hDevice));
+    UR_CHECK_ERROR(enqueueMigrateBufferToDevice(Mem, hDevice, Stream));
   } else {
-    UR_CHECK_ERROR(migrateImageToDevice(Mem, hDevice));
+    UR_CHECK_ERROR(enqueueMigrateImageToDevice(Mem, hDevice, Stream));
   }
 
   Mem->HaveMigratedToDeviceSinceLastWrite[Mem->getContext()->getDeviceIndex(
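The assignment this hunk ends on is the bookkeeping half of a guard; the matching check sits just above the rendered context. A sketch of that early exit, assuming (per the "already been initialized" comment above) it reads the same per-device flag:

```cpp
// Assumed shape of the guard above the rendered hunk: skip the copy when this
// device already holds the latest data since the last write.
if (Mem->HaveMigratedToDeviceSinceLastWrite[Mem->getContext()->getDeviceIndex(
        hDevice)])
  return UR_RESULT_SUCCESS;
```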

source/adapters/cuda/memory.hpp

Lines changed: 16 additions & 69 deletions
@@ -20,6 +20,12 @@
 #include "device.hpp"
 #include "event.hpp"
 
+ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t,
+                                           const ur_device_handle_t);
+ur_result_t enqueueMigrateMemoryToDeviceIfNeeded(ur_mem_handle_t,
+                                                 const ur_device_handle_t,
+                                                 CUstream);
+
 // Handler for plain, pointer-based CUDA allocations
 struct BufferMem {
 
@@ -288,7 +294,7 @@ struct SurfaceMem {
 ///
 /// The ur_mem_handle_t is responsible for memory allocation and migration
 /// across devices in the same ur_context_handle_t. If a kernel writes to a
-/// ur_mem_handle_t then it will write to LastEventWritingToMemObj. Then all
+/// ur_mem_handle_t then it will write to LastQueueWritingToMemObj. Then all
 /// subsequent operations that want to read from the ur_mem_handle_t must wait
 /// on the event referring to the last write.
 ///
@@ -308,61 +314,7 @@ struct SurfaceMem {
 ///
 /// Migrations will occur in both cases if the most recent version of data
 /// is on a different device, marked by
-/// LastEventWritingToMemObj->getQueue()->getDevice()
-///
-/// Example trace:
-/// ~~~~~~~~~~~~~~
-///
-/// =====> urContextCreate([device0, device1], ...) // associated with [q0, q1]
-///          -> OUT: hContext
-///
-/// =====> urMemBufferCreate(hContext,...);
-///          -> No native allocations made
-///          -> OUT: hBuffer
-///
-/// =====> urEnqueueMemBufferWrite(q0, hBuffer,...);
-///          -> Allocation made on q0 ie device0
-///          -> New allocation initialized with host data.
-///
-/// =====> urKernelSetArgMemObj(hKernel0, hBuffer, ...);
-///          -> ur_kernel_handle_t associated with a ur_program_handle_t,
-///             which is in turn unique to a device. So we can set the kernel
-///             arg with the ptr of the device specific allocation.
-///          -> hKernel0->getProgram()->getDevice() == device0
-///          -> allocateMemObjOnDeviceIfNeeded(device0);
-///          -> Native allocation already made on device0, continue.
-///
-/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...);
-///          -> Suppose that hKernel0 writes to hBuffer.
-///          -> Call hBuffer->setLastEventWritingToMemObj with return event
-///             from this operation
-///          -> Enqueue native kernel launch
-///
-/// =====> urKernelSetArgMemObj(hKernel1, hBuffer, ...);
-///          -> hKernel1->getProgram()->getDevice() == device1
-///          -> New allocation will be made on device1 when calling
-///             getPtr(device1)
-///          -> No native allocation on device1
-///          -> Make native allocation on device1
-///
-/// =====> urEnqueueKernelLaunch(q1, hKernel1, ...);
-///          -> Suppose hKernel1 wants to read from hBuffer and not write.
-///          -> migrateMemoryToDeviceIfNeeded(device1);
-///          -> hBuffer->LastEventWritingToMemObj is not nullptr
-///          -> Check if memory has been migrated to device1 since the
-///             last write
-///          -> Hasn't been migrated
-///          -> Wait on LastEventWritingToMemObj.
-///          -> Migrate memory from device0's native allocation to
-///             device1's native allocation.
-///          -> Enqueue native kernel launch
-///
-/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...);
-///          -> migrateMemoryToDeviceIfNeeded(device0);
-///          -> hBuffer->LastEventWritingToMemObj refers to an event
-///             from q0
-///          -> Migration not necessary
-///          -> Enqueue native kernel launch
+/// LastQueueWritingToMemObj->getDevice()
 ///
 struct ur_mem_handle_t_ {
   // Context where the memory object is accessible
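In code, the condition the surviving comment describes: migration is needed only when there is a recorded writer and it lives on another device. This restates the checks already present in the memory.cpp hunks above, with hDevice as the device about to read:

```cpp
bool NeedsMigration =
    Mem->LastQueueWritingToMemObj != nullptr &&
    Mem->LastQueueWritingToMemObj->getDevice() != hDevice;
```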
@@ -381,15 +333,13 @@ struct ur_mem_handle_t_ {
   // Has the memory been migrated to a device since the last write?
   std::vector<bool> HaveMigratedToDeviceSinceLastWrite;
 
-  // We should wait on this event prior to migrating memory across allocations
-  // in this ur_mem_handle_t_
-  ur_event_handle_t LastEventWritingToMemObj{nullptr};
+  // Queue with most up to date data of ur_mem_handle_t_
+  ur_queue_handle_t LastQueueWritingToMemObj{nullptr};
 
   // Enumerates all possible types of accesses.
   enum access_mode_t { unknown, read_write, read_only, write_only };
 
   ur_mutex MemoryAllocationMutex; // A mutex for allocations
-  ur_mutex MemoryMigrationMutex;  // A mutex for memory transfers
 
   /// A UR Memory object represents either plain memory allocations ("Buffers"
   /// in OpenCL) or typed allocations ("Images" in OpenCL).
@@ -478,20 +428,17 @@ struct ur_mem_handle_t_ {
 
   uint32_t getReferenceCount() const noexcept { return RefCount; }
 
-  void setLastEventWritingToMemObj(ur_event_handle_t NewEvent) {
-    assert(NewEvent && "Invalid event!");
-    // This entry point should only ever be called when using multi device ctx
-    assert(Context->Devices.size() > 1);
-    urEventRetain(NewEvent);
-    if (LastEventWritingToMemObj != nullptr) {
-      urEventRelease(LastEventWritingToMemObj);
+  void setLastQueueWritingToMemObj(ur_queue_handle_t WritingQueue) {
+    urQueueRetain(WritingQueue);
+    if (LastQueueWritingToMemObj != nullptr) {
+      urQueueRelease(LastQueueWritingToMemObj);
     }
-    LastEventWritingToMemObj = NewEvent;
+    LastQueueWritingToMemObj = WritingQueue;
     for (const auto &Device : Context->getDevices()) {
       // This event is never an interop event so will always have an associated
       // queue
       HaveMigratedToDeviceSinceLastWrite[Context->getDeviceIndex(Device)] =
-          Device == NewEvent->getQueue()->getDevice();
+          Device == WritingQueue->getDevice();
     }
   }
 };
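Writer-side usage, sketched from this member function; hBuffer and Queue are placeholders, and the actual call sites are in the unrendered enqueue.cpp diff:

```cpp
// After enqueuing an operation that writes hBuffer on Queue, record Queue as
// holding the freshest copy. This retains Queue and resets every
// HaveMigratedToDeviceSinceLastWrite flag to false except for Queue's own
// device, so other devices re-migrate before their next read.
hBuffer->setLastQueueWritingToMemObj(Queue);
```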
