llvm · clementval · Apr 8, 2025 · Apr 7, 2025
diff --git a/flang-rt/include/flang-rt/runtime/allocator-registry.h b/flang-rt/include/flang-rt/runtime/allocator-registry.h
@@ -11,25 +11,27 @@
 
 #include "flang/Common/api-attrs.h"
 #include "flang/Runtime/allocator-registry-consts.h"
+#include <cstdint>
 #include <cstdlib>
 #include <vector>
 
 #define MAX_ALLOCATOR 7 // 3 bits are reserved in the descriptor.
 
 namespace Fortran::runtime {
 
-using AllocFct = void *(*)(std::size_t);
+using AllocFct = void *(*)(std::size_t, std::int64_t);
 using FreeFct = void (*)(void *);
 
 typedef struct Allocator_t {
   AllocFct alloc{nullptr};
   FreeFct free{nullptr};
 } Allocator_t;
 
-#ifdef RT_DEVICE_COMPILATION
-static RT_API_ATTRS void *MallocWrapper(std::size_t size) {
+static RT_API_ATTRS void *MallocWrapper(
+    std::size_t size, [[maybe_unused]] std::int64_t) {
   return std::malloc(size);
 }
+#ifdef RT_DEVICE_COMPILATION
 static RT_API_ATTRS void FreeWrapper(void *p) { return std::free(p); }
 #endif
 
@@ -39,7 +41,7 @@ struct AllocatorRegistry {
       : allocators{{&MallocWrapper, &FreeWrapper}} {}
 #else
   constexpr AllocatorRegistry() {
-    allocators[kDefaultAllocator] = {&std::malloc, &std::free};
+    allocators[kDefaultAllocator] = {&MallocWrapper, &std::free};
   };
 #endif
   RT_API_ATTRS void Register(int, Allocator_t);

diff --git a/flang-rt/lib/cuda/allocator.cpp b/flang-rt/lib/cuda/allocator.cpp
@@ -34,15 +34,17 @@ void RTDEF(CUFRegisterAllocator)() {
 }
 }
 
-void *CUFAllocPinned(std::size_t sizeInBytes) {
+void *CUFAllocPinned(
+    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
   void *p;
   CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
   return p;
 }
 
 void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }
 
-void *CUFAllocDevice(std::size_t sizeInBytes) {
+void *CUFAllocDevice(
+    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
   void *p;
   if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) {
     CUDA_REPORT_IF_ERROR(
@@ -55,7 +57,8 @@ void *CUFAllocDevice(std::size_t sizeInBytes) {
 
 void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
 
-void *CUFAllocManaged(std::size_t sizeInBytes) {
+void *CUFAllocManaged(
+    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
   void *p;
   CUDA_REPORT_IF_ERROR(
       cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
@@ -64,9 +67,10 @@ void *CUFAllocManaged(std::size_t sizeInBytes) {
 
 void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
 
-void *CUFAllocUnified(std::size_t sizeInBytes) {
+void *CUFAllocUnified(
+    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
   // Call alloc managed for the time being.
-  return CUFAllocManaged(sizeInBytes);
+  return CUFAllocManaged(sizeInBytes, asyncId);
 }
 
 void CUFFreeUnified(void *p) {

diff --git a/flang-rt/lib/cuda/descriptor.cpp b/flang-rt/lib/cuda/descriptor.cpp
@@ -20,7 +20,8 @@ RT_EXT_API_GROUP_BEGIN
 
 Descriptor *RTDEF(CUFAllocDescriptor)(
     std::size_t sizeInBytes, const char *sourceFile, int sourceLine) {
-  return reinterpret_cast<Descriptor *>(CUFAllocManaged(sizeInBytes));
+  return reinterpret_cast<Descriptor *>(
+      CUFAllocManaged(sizeInBytes, /*asyncId*/ -1));
 }
 
 void RTDEF(CUFFreeDescriptor)(

diff --git a/flang-rt/lib/runtime/descriptor.cpp b/flang-rt/lib/runtime/descriptor.cpp
@@ -170,7 +170,7 @@ RT_API_ATTRS int Descriptor::Allocate() {
   // Zero size allocation is possible in Fortran and the resulting
   // descriptor must be allocated/associated. Since std::malloc(0)
   // result is implementation defined, always allocate at least one byte.
-  void *p{alloc(byteSize ? byteSize : 1)};
+  void *p{alloc(byteSize ? byteSize : 1, /*asyncId=*/-1)};
   if (!p) {
     return CFI_ERROR_MEM_ALLOCATION;
   }

diff --git a/flang-rt/lib/runtime/pointer.cpp b/flang-rt/lib/runtime/pointer.cpp
@@ -129,7 +129,7 @@ RT_API_ATTRS void *AllocateValidatedPointerPayload(
   byteSize = ((byteSize + align - 1) / align) * align;
   std::size_t total{byteSize + sizeof(std::uintptr_t)};
   AllocFct alloc{allocatorRegistry.GetAllocator(allocatorIdx)};
-  void *p{alloc(total)};
+  void *p{alloc(total, /*asyncId=*/-1)};
   if (p && allocatorIdx == 0) {
     // Fill the footer word with the XOR of the ones' complement of
     // the base address, which is a value that would be highly unlikely

diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h
@@ -20,16 +20,16 @@ extern "C" {
 void RTDECL(CUFRegisterAllocator)();
 }
 
-void *CUFAllocPinned(std::size_t);
+void *CUFAllocPinned(std::size_t, std::int64_t);
 void CUFFreePinned(void *);
 
-void *CUFAllocDevice(std::size_t);
+void *CUFAllocDevice(std::size_t, std::int64_t);
 void CUFFreeDevice(void *);
 
-void *CUFAllocManaged(std::size_t);
+void *CUFAllocManaged(std::size_t, std::int64_t);
 void CUFFreeManaged(void *);
 
-void *CUFAllocUnified(std::size_t);
+void *CUFAllocUnified(std::size_t, std::int64_t);
 void CUFFreeUnified(void *);
 
 } // namespace Fortran::runtime::cuda