 
 // Handler for plain, pointer-based CUDA allocations
 struct BufferMem {
-  using native_type = CUdeviceptr;
 
-  // If this allocation is a sub-buffer (i.e., a view on an existing
-  // allocation), this is the pointer to the parent handler structure
-  ur_mem_handle_t Parent;
-  // CUDA handler for the pointer
-  native_type Ptr;
+  struct BufferMap {
+    /// Size of the active mapped region.
+    size_t MapSize;
+    /// Offset of the active mapped region.
+    size_t MapOffset;
+    /// Original flags for the mapped region
+    ur_map_flags_t MapFlags;
+    /// Allocated host memory used exclusively for this map.
+    std::unique_ptr<unsigned char[]> MapMem;
 
-  /// Pointer associated with this device on the host
-  void *HostPtr;
-  /// Size of the allocation in bytes
-  size_t Size;
-  /// Size of the active mapped region.
-  size_t MapSize;
-  /// Offset of the active mapped region.
-  size_t MapOffset;
-  /// Pointer to the active mapped region, if any
-  void *MapPtr;
-  /// Original flags for the mapped region
-  ur_map_flags_t MapFlags;
+    BufferMap(size_t MapSize, size_t MapOffset, ur_map_flags_t MapFlags)
+        : MapSize(MapSize), MapOffset(MapOffset), MapFlags(MapFlags),
+          MapMem(nullptr) {}
+
+    BufferMap(size_t MapSize, size_t MapOffset, ur_map_flags_t MapFlags,
+              std::unique_ptr<unsigned char[]> &MapMem)
+        : MapSize(MapSize), MapOffset(MapOffset), MapFlags(MapFlags),
+          MapMem(std::move(MapMem)) {}
+
+    size_t getMapSize() const noexcept { return MapSize; }
+
+    size_t getMapOffset() const noexcept { return MapOffset; }
+
+    ur_map_flags_t getMapFlags() const noexcept { return MapFlags; }
+  };
 
   /** AllocMode
    * classic: Just a normal buffer allocated on the device via cuda malloc
    * use_host_ptr: Use an address on the host for the device
-   * copy_in: The data for the device comes from the host but the host
-             pointer is not available later for re-use
-   * alloc_host_ptr: Uses pinned-memory allocation
-   */
+   * copy_in: The data for the device comes from the host but the host pointer
+   * is not available later for re-use alloc_host_ptr: Uses pinned-memory
+   * allocation
+   */
   enum class AllocMode {
     Classic,
     UseHostPtr,
     CopyIn,
     AllocHostPtr,
-  } MemAllocMode;
+  };
+
+  using native_type = CUdeviceptr;
+
+  /// If this allocation is a sub-buffer (i.e., a view on an existing
+  /// allocation), this is the pointer to the parent handler structure
+  ur_mem_handle_t Parent;
+  /// CUDA handler for the pointer
+  native_type Ptr;
+  /// Pointer associated with this device on the host
+  void *HostPtr;
+  /// Size of the allocation in bytes
+  size_t Size;
+  /// A map that contains all the active mappings for this buffer.
+  std::unordered_map<void *, BufferMap> PtrToBufferMap;
+
+  AllocMode MemAllocMode;
 
   BufferMem(ur_mem_handle_t Parent, BufferMem::AllocMode Mode, CUdeviceptr Ptr,
             void *HostPtr, size_t Size)
-      : Parent{Parent}, Ptr{Ptr}, HostPtr{HostPtr}, Size{Size}, MapSize{0},
-        MapOffset{0}, MapPtr{nullptr}, MapFlags{UR_MAP_FLAG_WRITE},
-        MemAllocMode{Mode} {};
+      : Parent{Parent}, Ptr{Ptr}, HostPtr{HostPtr}, Size{Size},
+        PtrToBufferMap{}, MemAllocMode{Mode} {};
 
   native_type get() const noexcept { return Ptr; }
 
   size_t getSize() const noexcept { return Size; }
 
-  void *getMapPtr() const noexcept { return MapPtr; }
-
-  size_t getMapSize() const noexcept { return MapSize; }
-
-  size_t getMapOffset() const noexcept { return MapOffset; }
+  BufferMap *getMapDetails(void *Map) {
+    auto details = PtrToBufferMap.find(Map);
+    if (details != PtrToBufferMap.end()) {
+      return &details->second;
+    }
+    return nullptr;
+  }
 
   /// Returns a pointer to data visible on the host that contains
   /// the data on the device associated with this allocation.
   /// The offset is used to index into the CUDA allocation.
-  void *mapToPtr(size_t Size, size_t Offset, ur_map_flags_t Flags) noexcept {
-    assert(MapPtr == nullptr);
-    MapSize = Size;
-    MapOffset = Offset;
-    MapFlags = Flags;
-    if (HostPtr) {
-      MapPtr = static_cast<char *>(HostPtr) + Offset;
+  void *mapToPtr(size_t MapSize, size_t MapOffset,
+                 ur_map_flags_t MapFlags) noexcept {
+
+    void *MapPtr = nullptr;
+    if (HostPtr == nullptr) {
+      /// If HostPtr is invalid, we need to create a Mapping that owns its own
+      /// memory on the host.
+      auto MapMem = std::make_unique<unsigned char[]>(MapSize);
+      MapPtr = MapMem.get();
+      PtrToBufferMap.insert(
+          {MapPtr, BufferMap(MapSize, MapOffset, MapFlags, MapMem)});
     } else {
-      // TODO: Allocate only what is needed based on the offset
-      MapPtr = static_cast<void *>(malloc(this->getSize()));
+      /// However, if HostPtr already has valid memory (e.g. pinned allocation),
+      /// we can just use that memory for the mapping.
+      MapPtr = static_cast<char *>(HostPtr) + MapOffset;
+      PtrToBufferMap.insert({MapPtr, BufferMap(MapSize, MapOffset, MapFlags)});
     }
     return MapPtr;
   }
 
   /// Detach the allocation from the host memory.
-  void unmap(void *) noexcept {
-    assert(MapPtr != nullptr);
-
-    if (MapPtr != HostPtr) {
-      free(MapPtr);
-    }
-    MapPtr = nullptr;
-    MapSize = 0;
-    MapOffset = 0;
-  }
-
-  ur_map_flags_t getMapFlags() const noexcept {
+  void unmap(void *MapPtr) noexcept {
     assert(MapPtr != nullptr);
-    return MapFlags;
+    PtrToBufferMap.erase(MapPtr);
   }
 };
 
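The substance of this change: BufferMem previously tracked a single active mapping through MapPtr, MapSize, MapOffset, and MapFlags, so mapToPtr asserted that no mapping existed and allocated the full buffer size whenever no HostPtr was available. Each mapping now gets its own BufferMap entry in PtrToBufferMap, keyed by the host pointer returned to the caller, owns exactly MapSize bytes when it has to allocate, and unmap(MapPtr) releases only that one entry. Below is a minimal, self-contained sketch of the same bookkeeping pattern using only standard-library types; MockBuffer, Mapping, and their members are illustrative stand-ins, not part of the Unified Runtime API or this patch.

// Minimal mock of per-mapping bookkeeping (illustrative only).
#include <cassert>
#include <cstddef>
#include <memory>
#include <unordered_map>

struct MockBuffer {
  // Per-mapping metadata, analogous to BufferMem::BufferMap in the diff above.
  struct Mapping {
    size_t Size;
    size_t Offset;
    std::unique_ptr<unsigned char[]> OwnedMem; // set only when we allocate
  };

  void *HostPtr = nullptr; // pre-existing host memory, if any
  std::unordered_map<void *, Mapping> ActiveMaps;

  // Map a sub-range: reuse HostPtr when present, otherwise allocate just
  // enough host memory for this mapping.
  void *map(size_t Size, size_t Offset) {
    if (HostPtr) {
      void *Ptr = static_cast<char *>(HostPtr) + Offset;
      ActiveMaps.emplace(Ptr, Mapping{Size, Offset, nullptr});
      return Ptr;
    }
    auto Mem = std::make_unique<unsigned char[]>(Size);
    void *Ptr = Mem.get();
    ActiveMaps.emplace(Ptr, Mapping{Size, Offset, std::move(Mem)});
    return Ptr;
  }

  // Drop one mapping; any memory owned by the erased entry is freed by its
  // unique_ptr, so there is no explicit free().
  void unmap(void *Ptr) {
    assert(ActiveMaps.count(Ptr) != 0);
    ActiveMaps.erase(Ptr);
  }
};

int main() {
  MockBuffer Buf;
  // Two regions of the same buffer can be mapped concurrently, which the old
  // single-MapPtr design rejected with an assert.
  void *A = Buf.map(/*Size=*/64, /*Offset=*/0);
  void *B = Buf.map(/*Size=*/32, /*Offset=*/128);
  assert(A != B && Buf.ActiveMaps.size() == 2);
  Buf.unmap(A);
  Buf.unmap(B);
  return 0;
}

Because the owning unique_ptr lives inside the map entry, erasing the entry is enough to release any host staging memory, which is why the patched unmap no longer needs the old free(MapPtr) path.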