[flang][cuda] Use a reference for asyncObject #138186

clementval · 2025-05-01T19:25:43Z

Switch from int64_t to int64_t* to fit with the rest of the implementation.

New tentative with some fix. The previous was reverted yesterday.

Switch from `int64_t` to `int64_t*` to fit with the rest of the implementation.

llvmbot · 2025-05-01T19:26:19Z

@llvm/pr-subscribers-openacc

Author: Valentin Clement (バレンタインクレメン) (clementval)

Changes

Switch from int64_t to int64_t* to fit with the rest of the implementation.

New tentative with some fix. The previous was reverted yesterday.

Patch is 88.46 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/138186.diff

52 Files Affected:

(modified) flang-rt/include/flang-rt/runtime/allocator-registry.h (+2-2)
(modified) flang-rt/include/flang-rt/runtime/descriptor.h (+3-3)
(modified) flang-rt/include/flang-rt/runtime/reduction-templates.h (+1-1)
(modified) flang-rt/lib/cuda/allocatable.cpp (+4-4)
(modified) flang-rt/lib/cuda/allocator.cpp (+10-10)
(modified) flang-rt/lib/cuda/descriptor.cpp (+1-1)
(modified) flang-rt/lib/runtime/allocatable.cpp (+6-6)
(modified) flang-rt/lib/runtime/array-constructor.cpp (+2-2)
(modified) flang-rt/lib/runtime/assign.cpp (+2-2)
(modified) flang-rt/lib/runtime/character.cpp (+11-9)
(modified) flang-rt/lib/runtime/copy.cpp (+2-2)
(modified) flang-rt/lib/runtime/derived.cpp (+3-3)
(modified) flang-rt/lib/runtime/descriptor.cpp (+2-2)
(modified) flang-rt/lib/runtime/extrema.cpp (+2-2)
(modified) flang-rt/lib/runtime/findloc.cpp (+1-1)
(modified) flang-rt/lib/runtime/matmul-transpose.cpp (+1-1)
(modified) flang-rt/lib/runtime/matmul.cpp (+1-1)
(modified) flang-rt/lib/runtime/misc-intrinsic.cpp (+1-1)
(modified) flang-rt/lib/runtime/pointer.cpp (+1-1)
(modified) flang-rt/lib/runtime/temporary-stack.cpp (+1-1)
(modified) flang-rt/lib/runtime/tools.cpp (+1-1)
(modified) flang-rt/lib/runtime/transformational.cpp (+2-2)
(modified) flang-rt/unittests/Evaluate/reshape.cpp (+1-1)
(modified) flang-rt/unittests/Runtime/Allocatable.cpp (+2-2)
(modified) flang-rt/unittests/Runtime/CUDA/Allocatable.cpp (+8-4)
(modified) flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp (+2-2)
(modified) flang-rt/unittests/Runtime/CUDA/Memory.cpp (+2-2)
(modified) flang-rt/unittests/Runtime/CharacterTest.cpp (+1-1)
(modified) flang-rt/unittests/Runtime/CommandTest.cpp (+4-4)
(modified) flang-rt/unittests/Runtime/TemporaryStack.cpp (+2-2)
(modified) flang-rt/unittests/Runtime/tools.h (+1-1)
(modified) flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td (+5-6)
(modified) flang/include/flang/Runtime/CUDA/allocatable.h (+4-4)
(modified) flang/include/flang/Runtime/CUDA/allocator.h (+4-4)
(modified) flang/include/flang/Runtime/CUDA/pointer.h (+4-4)
(modified) flang/include/flang/Runtime/allocatable.h (+4-3)
(modified) flang/lib/Lower/Allocatable.cpp (+1-1)
(modified) flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp (+3-4)
(modified) flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp (+11-11)
(modified) flang/lib/Optimizer/Transforms/CUFOpConversion.cpp (+4-6)
(modified) flang/test/Fir/CUDA/cuda-allocate.fir (+8-10)
(modified) flang/test/Fir/cuf-invalid.fir (+2-3)
(modified) flang/test/Fir/cuf.mlir (+3-4)
(modified) flang/test/HLFIR/elemental-codegen.fir (+3-3)
(modified) flang/test/Lower/CUDA/cuda-allocatable.cuf (+4-5)
(modified) flang/test/Lower/OpenACC/acc-declare-unwrap-defaultbounds.f90 (+2-2)
(modified) flang/test/Lower/OpenACC/acc-declare.f90 (+2-2)
(modified) flang/test/Lower/allocatable-polymorphic.f90 (+13-13)
(modified) flang/test/Lower/allocatable-runtime.f90 (+2-2)
(modified) flang/test/Lower/allocate-mold.f90 (+2-2)
(modified) flang/test/Lower/polymorphic.f90 (+1-1)
(modified) flang/test/Transforms/lower-repack-arrays.fir (+4-4)

diff --git a/flang-rt/include/flang-rt/runtime/allocator-registry.h b/flang-rt/include/flang-rt/runtime/allocator-registry.h
index 33e8e2c7d7850..f0ba77a360736 100644
--- a/flang-rt/include/flang-rt/runtime/allocator-registry.h
+++ b/flang-rt/include/flang-rt/runtime/allocator-registry.h
@@ -19,7 +19,7 @@
 
 namespace Fortran::runtime {
 
-using AllocFct = void *(*)(std::size_t, std::int64_t);
+using AllocFct = void *(*)(std::size_t, std::int64_t *);
 using FreeFct = void (*)(void *);
 
 typedef struct Allocator_t {
@@ -28,7 +28,7 @@ typedef struct Allocator_t {
 } Allocator_t;
 
 static RT_API_ATTRS void *MallocWrapper(
-    std::size_t size, [[maybe_unused]] std::int64_t) {
+    std::size_t size, [[maybe_unused]] std::int64_t *) {
   return std::malloc(size);
 }
 #ifdef RT_DEVICE_COMPILATION
diff --git a/flang-rt/include/flang-rt/runtime/descriptor.h b/flang-rt/include/flang-rt/runtime/descriptor.h
index 9907e7866e7bf..c98e6b14850cb 100644
--- a/flang-rt/include/flang-rt/runtime/descriptor.h
+++ b/flang-rt/include/flang-rt/runtime/descriptor.h
@@ -29,8 +29,8 @@
 #include <cstdio>
 #include <cstring>
 
-/// Value used for asyncId when no specific stream is specified.
-static constexpr std::int64_t kNoAsyncId = -1;
+/// Value used for asyncObject when no specific stream is specified.
+static constexpr std::int64_t *kNoAsyncObject = nullptr;
 
 namespace Fortran::runtime {
 
@@ -372,7 +372,7 @@ class Descriptor {
   // before calling.  It (re)computes the byte strides after
   // allocation.  Does not allocate automatic components or
   // perform default component initialization.
-  RT_API_ATTRS int Allocate(std::int64_t asyncId);
+  RT_API_ATTRS int Allocate(std::int64_t *asyncObject);
   RT_API_ATTRS void SetByteStrides();
 
   // Deallocates storage; does not call FINAL subroutines or
diff --git a/flang-rt/include/flang-rt/runtime/reduction-templates.h b/flang-rt/include/flang-rt/runtime/reduction-templates.h
index 77f77a592a476..18412708b02c5 100644
--- a/flang-rt/include/flang-rt/runtime/reduction-templates.h
+++ b/flang-rt/include/flang-rt/runtime/reduction-templates.h
@@ -347,7 +347,7 @@ inline RT_API_ATTRS void DoMaxMinNorm2(Descriptor &result, const Descriptor &x,
     // as the element size of the source.
     result.Establish(x.type(), x.ElementBytes(), nullptr, 0, nullptr,
         CFI_attribute_allocatable);
-    if (int stat{result.Allocate(kNoAsyncId)}) {
+    if (int stat{result.Allocate(kNoAsyncObject)}) {
       terminator.Crash(
           "%s: could not allocate memory for result; STAT=%d", intrinsic, stat);
     }
diff --git a/flang-rt/lib/cuda/allocatable.cpp b/flang-rt/lib/cuda/allocatable.cpp
index 432974d18a3e3..c77819e9440d7 100644
--- a/flang-rt/lib/cuda/allocatable.cpp
+++ b/flang-rt/lib/cuda/allocatable.cpp
@@ -23,7 +23,7 @@ namespace Fortran::runtime::cuda {
 extern "C" {
 RT_EXT_API_GROUP_BEGIN
 
-int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t stream,
+int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t *stream,
     bool *pinned, bool hasStat, const Descriptor *errMsg,
     const char *sourceFile, int sourceLine) {
   int stat{RTNAME(CUFAllocatableAllocate)(
@@ -41,7 +41,7 @@ int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t stream,
   return stat;
 }
 
-int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t stream,
+int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t *stream,
     bool *pinned, bool hasStat, const Descriptor *errMsg,
     const char *sourceFile, int sourceLine) {
   if (desc.HasAddendum()) {
@@ -63,7 +63,7 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t stream,
 }
 
 int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
-    const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
+    const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
     const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
   int stat{RTNAME(CUFAllocatableAllocate)(
       alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
@@ -76,7 +76,7 @@ int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
 }
 
 int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
-    const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
+    const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
     const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
   int stat{RTNAME(CUFAllocatableAllocateSync)(
       alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
diff --git a/flang-rt/lib/cuda/allocator.cpp b/flang-rt/lib/cuda/allocator.cpp
index 51119ab251168..f4289c55bd8de 100644
--- a/flang-rt/lib/cuda/allocator.cpp
+++ b/flang-rt/lib/cuda/allocator.cpp
@@ -98,7 +98,7 @@ static unsigned findAllocation(void *ptr) {
   return allocNotFound;
 }
 
-static void insertAllocation(void *ptr, std::size_t size, std::int64_t stream) {
+static void insertAllocation(void *ptr, std::size_t size, cudaStream_t stream) {
   CriticalSection critical{lock};
   initAllocations();
   if (numDeviceAllocations >= maxDeviceAllocations) {
@@ -106,7 +106,7 @@ static void insertAllocation(void *ptr, std::size_t size, std::int64_t stream) {
   }
   deviceAllocations[numDeviceAllocations].ptr = ptr;
   deviceAllocations[numDeviceAllocations].size = size;
-  deviceAllocations[numDeviceAllocations].stream = (cudaStream_t)stream;
+  deviceAllocations[numDeviceAllocations].stream = stream;
   ++numDeviceAllocations;
   qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
       compareDeviceAlloc);
@@ -136,7 +136,7 @@ void RTDEF(CUFRegisterAllocator)() {
 }
 
 void *CUFAllocPinned(
-    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
   void *p;
   CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
   return p;
@@ -144,18 +144,18 @@ void *CUFAllocPinned(
 
 void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }
 
-void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t asyncId) {
+void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t *asyncObject) {
   void *p;
   if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) {
     CUDA_REPORT_IF_ERROR(
         cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
   } else {
-    if (asyncId == kNoAsyncId) {
+    if (asyncObject == kNoAsyncObject) {
       CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
     } else {
       CUDA_REPORT_IF_ERROR(
-          cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)asyncId));
-      insertAllocation(p, sizeInBytes, asyncId);
+          cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)*asyncObject));
+      insertAllocation(p, sizeInBytes, (cudaStream_t)*asyncObject);
     }
   }
   return p;
@@ -174,7 +174,7 @@ void CUFFreeDevice(void *p) {
 }
 
 void *CUFAllocManaged(
-    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
   void *p;
   CUDA_REPORT_IF_ERROR(
       cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
@@ -184,9 +184,9 @@ void *CUFAllocManaged(
 void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
 
 void *CUFAllocUnified(
-    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
   // Call alloc managed for the time being.
-  return CUFAllocManaged(sizeInBytes, asyncId);
+  return CUFAllocManaged(sizeInBytes, asyncObject);
 }
 
 void CUFFreeUnified(void *p) {
diff --git a/flang-rt/lib/cuda/descriptor.cpp b/flang-rt/lib/cuda/descriptor.cpp
index 175e8c0ef8438..7b768f91af29d 100644
--- a/flang-rt/lib/cuda/descriptor.cpp
+++ b/flang-rt/lib/cuda/descriptor.cpp
@@ -21,7 +21,7 @@ RT_EXT_API_GROUP_BEGIN
 Descriptor *RTDEF(CUFAllocDescriptor)(
     std::size_t sizeInBytes, const char *sourceFile, int sourceLine) {
   return reinterpret_cast<Descriptor *>(
-      CUFAllocManaged(sizeInBytes, /*asyncId*/ -1));
+      CUFAllocManaged(sizeInBytes, /*asyncObject=*/nullptr));
 }
 
 void RTDEF(CUFFreeDescriptor)(
diff --git a/flang-rt/lib/runtime/allocatable.cpp b/flang-rt/lib/runtime/allocatable.cpp
index 6acce34eb9a9e..ef18da6ea0786 100644
--- a/flang-rt/lib/runtime/allocatable.cpp
+++ b/flang-rt/lib/runtime/allocatable.cpp
@@ -133,17 +133,17 @@ void RTDEF(AllocatableApplyMold)(
   }
 }
 
-int RTDEF(AllocatableAllocate)(Descriptor &descriptor, std::int64_t asyncId,
-    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
-    int sourceLine) {
+int RTDEF(AllocatableAllocate)(Descriptor &descriptor,
+    std::int64_t *asyncObject, bool hasStat, const Descriptor *errMsg,
+    const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
   if (!descriptor.IsAllocatable()) {
     return ReturnError(terminator, StatInvalidDescriptor, errMsg, hasStat);
   } else if (descriptor.IsAllocated()) {
     return ReturnError(terminator, StatBaseNotNull, errMsg, hasStat);
   } else {
-    int stat{
-        ReturnError(terminator, descriptor.Allocate(asyncId), errMsg, hasStat)};
+    int stat{ReturnError(
+        terminator, descriptor.Allocate(asyncObject), errMsg, hasStat)};
     if (stat == StatOk) {
       if (const DescriptorAddendum * addendum{descriptor.Addendum()}) {
         if (const auto *derived{addendum->derivedType()}) {
@@ -162,7 +162,7 @@ int RTDEF(AllocatableAllocateSource)(Descriptor &alloc,
     const Descriptor &source, bool hasStat, const Descriptor *errMsg,
     const char *sourceFile, int sourceLine) {
   int stat{RTNAME(AllocatableAllocate)(
-      alloc, /*asyncId=*/-1, hasStat, errMsg, sourceFile, sourceLine)};
+      alloc, /*asyncObject=*/nullptr, hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
     DoFromSourceAssign(alloc, source, terminator);
diff --git a/flang-rt/lib/runtime/array-constructor.cpp b/flang-rt/lib/runtime/array-constructor.cpp
index 67b3b5e1e0f50..858fac7bf2b39 100644
--- a/flang-rt/lib/runtime/array-constructor.cpp
+++ b/flang-rt/lib/runtime/array-constructor.cpp
@@ -50,7 +50,7 @@ static RT_API_ATTRS void AllocateOrReallocateVectorIfNeeded(
           initialAllocationSize(fromElements, to.ElementBytes())};
       to.GetDimension(0).SetBounds(1, allocationSize);
       RTNAME(AllocatableAllocate)
-      (to, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr,
+      (to, /*asyncObject=*/nullptr, /*hasStat=*/false, /*errMsg=*/nullptr,
           vector.sourceFile, vector.sourceLine);
       to.GetDimension(0).SetBounds(1, fromElements);
       vector.actualAllocationSize = allocationSize;
@@ -59,7 +59,7 @@ static RT_API_ATTRS void AllocateOrReallocateVectorIfNeeded(
       // first value: there should be no reallocation.
       RUNTIME_CHECK(terminator, previousToElements >= fromElements);
       RTNAME(AllocatableAllocate)
-      (to, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr,
+      (to, /*asyncObject=*/nullptr, /*hasStat=*/false, /*errMsg=*/nullptr,
           vector.sourceFile, vector.sourceLine);
       vector.actualAllocationSize = previousToElements;
     }
diff --git a/flang-rt/lib/runtime/assign.cpp b/flang-rt/lib/runtime/assign.cpp
index 4a813cd489022..8a4fa36c91479 100644
--- a/flang-rt/lib/runtime/assign.cpp
+++ b/flang-rt/lib/runtime/assign.cpp
@@ -99,7 +99,7 @@ static RT_API_ATTRS int AllocateAssignmentLHS(
     toDim.SetByteStride(stride);
     stride *= toDim.Extent();
   }
-  int result{ReturnError(terminator, to.Allocate(kNoAsyncId))};
+  int result{ReturnError(terminator, to.Allocate(kNoAsyncObject))};
   if (result == StatOk && derived && !derived->noInitializationNeeded()) {
     result = ReturnError(terminator, Initialize(to, *derived, terminator));
   }
@@ -277,7 +277,7 @@ RT_API_ATTRS void Assign(Descriptor &to, const Descriptor &from,
       // entity, otherwise, the Deallocate() below will not
       // free the descriptor memory.
       newFrom.raw().attribute = CFI_attribute_allocatable;
-      auto stat{ReturnError(terminator, newFrom.Allocate(kNoAsyncId))};
+      auto stat{ReturnError(terminator, newFrom.Allocate(kNoAsyncObject))};
       if (stat == StatOk) {
         if (HasDynamicComponent(from)) {
           // If 'from' has allocatable/automatic component, we cannot
diff --git a/flang-rt/lib/runtime/character.cpp b/flang-rt/lib/runtime/character.cpp
index d1152ee1caefb..f140d202e118e 100644
--- a/flang-rt/lib/runtime/character.cpp
+++ b/flang-rt/lib/runtime/character.cpp
@@ -118,7 +118,7 @@ static RT_API_ATTRS void Compare(Descriptor &result, const Descriptor &x,
   for (int j{0}; j < rank; ++j) {
     result.GetDimension(j).SetBounds(1, ub[j]);
   }
-  if (result.Allocate(kNoAsyncId) != CFI_SUCCESS) {
+  if (result.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
     terminator.Crash("Compare: could not allocate storage for result");
   }
   std::size_t xChars{x.ElementBytes() >> shift<CHAR>};
@@ -173,7 +173,7 @@ static RT_API_ATTRS void AdjustLRHelper(Descriptor &result,
   for (int j{0}; j < rank; ++j) {
     result.GetDimension(j).SetBounds(1, ub[j]);
   }
-  if (result.Allocate(kNoAsyncId) != CFI_SUCCESS) {
+  if (result.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
     terminator.Crash("ADJUSTL/R: could not allocate storage for result");
   }
   for (SubscriptValue resultAt{0}; elements-- > 0;
@@ -227,7 +227,7 @@ static RT_API_ATTRS void LenTrim(Descriptor &result, const Descriptor &string,
   for (int j{0}; j < rank; ++j) {
     result.GetDimension(j).SetBounds(1, ub[j]);
   }
-  if (result.Allocate(kNoAsyncId) != CFI_SUCCESS) {
+  if (result.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
     terminator.Crash("LEN_TRIM: could not allocate storage for result");
   }
   std::size_t stringElementChars{string.ElementBytes() >> shift<CHAR>};
@@ -427,7 +427,7 @@ static RT_API_ATTRS void GeneralCharFunc(Descriptor &result,
   for (int j{0}; j < rank; ++j) {
     result.GetDimension(j).SetBounds(1, ub[j]);
   }
-  if (result.Allocate(kNoAsyncId) != CFI_SUCCESS) {
+  if (result.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
     terminator.Crash("SCAN/VERIFY: could not allocate storage for result");
   }
   std::size_t stringElementChars{string.ElementBytes() >> shift<CHAR>};
@@ -530,7 +530,8 @@ static RT_API_ATTRS void MaxMinHelper(Descriptor &accumulator,
     for (int j{0}; j < rank; ++j) {
       accumulator.GetDimension(j).SetBounds(1, ub[j]);
     }
-    RUNTIME_CHECK(terminator, accumulator.Allocate(kNoAsyncId) == CFI_SUCCESS);
+    RUNTIME_CHECK(
+        terminator, accumulator.Allocate(kNoAsyncObject) == CFI_SUCCESS);
   }
   for (CHAR *result{accumulator.OffsetElement<CHAR>()}; elements-- > 0;
        accumData += accumChars, result += chars, x.IncrementSubscripts(xAt)) {
@@ -606,7 +607,7 @@ void RTDEF(CharacterConcatenate)(Descriptor &accumulator,
   for (int j{0}; j < rank; ++j) {
     accumulator.GetDimension(j).SetBounds(1, ub[j]);
   }
-  if (accumulator.Allocate(kNoAsyncId) != CFI_SUCCESS) {
+  if (accumulator.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
     terminator.Crash(
         "CharacterConcatenate: could not allocate storage for result");
   }
@@ -629,7 +630,8 @@ void RTDEF(CharacterConcatenateScalar1)(
   accumulator.set_base_addr(nullptr);
   std::size_t oldLen{accumulator.ElementBytes()};
   accumulator.raw().elem_len += chars;
-  RUNTIME_CHECK(terminator, accumulator.Allocate(kNoAsyncId) == CFI_SUCCESS);
+  RUNTIME_CHECK(
+      terminator, accumulator.Allocate(kNoAsyncObject) == CFI_SUCCESS);
   std::memcpy(accumulator.OffsetElement<char>(oldLen), from, chars);
   FreeMemory(old);
 }
@@ -831,7 +833,7 @@ void RTDEF(Repeat)(Descriptor &result, const Descriptor &string,
   std::size_t origBytes{string.ElementBytes()};
   result.Establish(string.type(), origBytes * ncopies, nullptr, 0, nullptr,
       CFI_attribute_allocatable);
-  if (result.Allocate(kNoAsyncId) != CFI_SUCCESS) {
+  if (result.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
     terminator.Crash("REPEAT could not allocate storage for result");
   }
   const char *from{string.OffsetElement()};
@@ -865,7 +867,7 @@ void RTDEF(Trim)(Descriptor &result, const Descriptor &string,
   }
   result.Establish(string.type(), resultBytes, nullptr, 0, nullptr,
       CFI_attribute_allocatable);
-  RUNTIME_CHECK(terminator, result.Allocate(kNoAsyncId) == CFI_SUCCESS);
+  RUNTIME_CHECK(terminator, result.Allocate(kNoAsyncObject) == CFI_SUCCESS);
   std::memcpy(result.OffsetElement(), string.OffsetElement(), resultBytes);
 }
 
diff --git a/flang-rt/lib/runtime/copy.cpp b/flang-rt/lib/runtime/copy.cpp
index 3a0f98cf8d376..f990f46e0be66 100644
--- a/flang-rt/lib/runtime/copy.cpp
+++ b/flang-rt/lib/runtime/copy.cpp
@@ -171,8 +171,8 @@ RT_API_ATTRS void CopyElement(const Descriptor &to, const SubscriptValue toAt[],
                 *reinterpret_cast<Descriptor *>(toPtr + component->offset())};
             if (toDesc.raw().base_addr != nullptr) {
               toDesc.set_base_addr(nullptr);
-              RUNTIME_CHECK(
-                  terminator, toDesc.Allocate(/*asyncId=*/-1) == CFI_SUCCESS);
+              RUNTIME_CHECK(terminator,
+                  toDesc.Allocate(/*asyncObject=*/nullptr) == CFI_SUCCESS);
               const Descriptor &fromDesc{*reinterpret_cast<const Descriptor *>(
                   fromPtr + component->offset())};
               copyStack.emplace(toDesc, fromDesc);
diff --git a/flang-rt/lib/runtime/derived.cpp b/flang-rt/lib/runtime/derived.cpp
index c46ea806a430a..35037036f63e7 100644
--- a/flang-rt/lib/runtime/derived.cpp
+++ b/flang-rt/lib/runtime/derived.cpp
@@ -52,7 +52,7 @@ RT_API_ATTRS int Initialize(const Descriptor &instance,
         allocDesc.raw().attribute = CFI_attribute_allocatable;
         if (comp.genre() == typeInfo::Component::Genre::Automatic) {
           stat = ReturnError(
-              terminator, allocDesc.Allocate(kNoAsyncId), errMsg, hasStat);
+              terminator, allocDesc.Allocate(kNoAsyncObject), errMsg, hasStat);
           if (stat == StatOk) {
             if (const DescriptorAddendum * addendum{allocDesc.Addendum()}) {
               if (const auto *derived{addendum->derivedType()}) {
@@ -153,7 +153,7 @@ RT_API_ATTRS int InitializeClone(const Descriptor &clone,
         if (origDesc.IsAllocated()) {
           cloneDesc.ApplyMold(origDesc, origDesc.rank());
           stat = ReturnError(
-              terminator, cloneDesc.Allocate(kNoAsyncId), errMsg, hasStat);
+              terminator, cloneDesc.Allocate(kNoAsyncObject), errMsg, hasStat);
           if (stat == StatOk) {
             if (const DescriptorAddendum * addendum{cloneDesc.Addendum()}) {
               if (const typeInfo::DerivedType *
@@ -260,7 +260,7 @@ static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor,
         copy.raw().attribute = CFI_attribute_allocatable;
         Terminator stubTerminator{"CallFinalProcedure() in Fortran runtime", 0};
         RUNTIME_CHECK(terminator ? *terminator : stubTerminator,
-            copy.Allocate(kNoAsyncId) == CFI_SUCCESS);
+            copy.Allocate(kNoAsyncObject) == CFI_SUCCESS);
         ShallowCopyDiscontiguousToContiguous(copy, descriptor);
         argDescriptor = &copy;
       }
diff --git a/flang-rt/lib/runtime/descriptor.cpp b/flang-rt/lib/runtime/descriptor.cpp
index 3debf53bb5290..67336d01380e0 100644
--- a/flang-rt/lib/runtime/descriptor.cpp
+++ b/flang-rt/lib/runtime/descriptor.cpp
@@ -158,7 +158,7 @@ RT_API_ATTRS static inline int MapAllocIdx(const Descriptor &desc) {
 #endif
 }
 
-RT_API_ATTRS int Descriptor::Allocate(std::int64_t asyncId) {
+RT_API_ATTRS int Descriptor::Allocate(std::int64_t *asyncObject) {
   std::size_t elementBytes{ElementBytes()};
   if (static_cast<std::int64_t>(elementBytes) < 0) {
     // F'2023 7.4.4.2 p5: "If the character length parameter value evaluates
@@ -170,7 +170,7 @@ RT_API_ATTRS int Descriptor::Allocate(std::int64_t asyncId) {
   // Zero size allocation is possible in Fortran and the resulting
   // descriptor must be allocated/associated. Since std::malloc(0)
   // result is implementation defined, always allocate at least one byte.
-  void *p{alloc(byteSize ? byteSize : 1, asyncId)};
+  void *p{alloc(byteSize ? byteSize : 1, asyncObject)};
   if (!p) {
     return CFI_ERROR_MEM_ALLOCATION;
   }
diff --git a/flang-rt/lib/runtime/extrema.cpp b/flang-rt/lib/r...
[truncated]

llvmbot · 2025-05-01T19:26:20Z

@llvm/pr-subscribers-flang-fir-hlfir

Author: Valentin Clement (バレンタインクレメン) (clementval)

Changes

Switch from int64_t to int64_t* to fit with the rest of the implementation.

New tentative with some fix. The previous was reverted yesterday.

Patch is 88.46 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/138186.diff

52 Files Affected:

(modified) flang-rt/include/flang-rt/runtime/allocator-registry.h (+2-2)
(modified) flang-rt/include/flang-rt/runtime/descriptor.h (+3-3)
(modified) flang-rt/include/flang-rt/runtime/reduction-templates.h (+1-1)
(modified) flang-rt/lib/cuda/allocatable.cpp (+4-4)
(modified) flang-rt/lib/cuda/allocator.cpp (+10-10)
(modified) flang-rt/lib/cuda/descriptor.cpp (+1-1)
(modified) flang-rt/lib/runtime/allocatable.cpp (+6-6)
(modified) flang-rt/lib/runtime/array-constructor.cpp (+2-2)
(modified) flang-rt/lib/runtime/assign.cpp (+2-2)
(modified) flang-rt/lib/runtime/character.cpp (+11-9)
(modified) flang-rt/lib/runtime/copy.cpp (+2-2)
(modified) flang-rt/lib/runtime/derived.cpp (+3-3)
(modified) flang-rt/lib/runtime/descriptor.cpp (+2-2)
(modified) flang-rt/lib/runtime/extrema.cpp (+2-2)
(modified) flang-rt/lib/runtime/findloc.cpp (+1-1)
(modified) flang-rt/lib/runtime/matmul-transpose.cpp (+1-1)
(modified) flang-rt/lib/runtime/matmul.cpp (+1-1)
(modified) flang-rt/lib/runtime/misc-intrinsic.cpp (+1-1)
(modified) flang-rt/lib/runtime/pointer.cpp (+1-1)
(modified) flang-rt/lib/runtime/temporary-stack.cpp (+1-1)
(modified) flang-rt/lib/runtime/tools.cpp (+1-1)
(modified) flang-rt/lib/runtime/transformational.cpp (+2-2)
(modified) flang-rt/unittests/Evaluate/reshape.cpp (+1-1)
(modified) flang-rt/unittests/Runtime/Allocatable.cpp (+2-2)
(modified) flang-rt/unittests/Runtime/CUDA/Allocatable.cpp (+8-4)
(modified) flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp (+2-2)
(modified) flang-rt/unittests/Runtime/CUDA/Memory.cpp (+2-2)
(modified) flang-rt/unittests/Runtime/CharacterTest.cpp (+1-1)
(modified) flang-rt/unittests/Runtime/CommandTest.cpp (+4-4)
(modified) flang-rt/unittests/Runtime/TemporaryStack.cpp (+2-2)
(modified) flang-rt/unittests/Runtime/tools.h (+1-1)
(modified) flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td (+5-6)
(modified) flang/include/flang/Runtime/CUDA/allocatable.h (+4-4)
(modified) flang/include/flang/Runtime/CUDA/allocator.h (+4-4)
(modified) flang/include/flang/Runtime/CUDA/pointer.h (+4-4)
(modified) flang/include/flang/Runtime/allocatable.h (+4-3)
(modified) flang/lib/Lower/Allocatable.cpp (+1-1)
(modified) flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp (+3-4)
(modified) flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp (+11-11)
(modified) flang/lib/Optimizer/Transforms/CUFOpConversion.cpp (+4-6)
(modified) flang/test/Fir/CUDA/cuda-allocate.fir (+8-10)
(modified) flang/test/Fir/cuf-invalid.fir (+2-3)
(modified) flang/test/Fir/cuf.mlir (+3-4)
(modified) flang/test/HLFIR/elemental-codegen.fir (+3-3)
(modified) flang/test/Lower/CUDA/cuda-allocatable.cuf (+4-5)
(modified) flang/test/Lower/OpenACC/acc-declare-unwrap-defaultbounds.f90 (+2-2)
(modified) flang/test/Lower/OpenACC/acc-declare.f90 (+2-2)
(modified) flang/test/Lower/allocatable-polymorphic.f90 (+13-13)
(modified) flang/test/Lower/allocatable-runtime.f90 (+2-2)
(modified) flang/test/Lower/allocate-mold.f90 (+2-2)
(modified) flang/test/Lower/polymorphic.f90 (+1-1)
(modified) flang/test/Transforms/lower-repack-arrays.fir (+4-4)

diff --git a/flang-rt/include/flang-rt/runtime/allocator-registry.h b/flang-rt/include/flang-rt/runtime/allocator-registry.h
index 33e8e2c7d7850..f0ba77a360736 100644
--- a/flang-rt/include/flang-rt/runtime/allocator-registry.h
+++ b/flang-rt/include/flang-rt/runtime/allocator-registry.h
@@ -19,7 +19,7 @@
 
 namespace Fortran::runtime {
 
-using AllocFct = void *(*)(std::size_t, std::int64_t);
+using AllocFct = void *(*)(std::size_t, std::int64_t *);
 using FreeFct = void (*)(void *);
 
 typedef struct Allocator_t {
@@ -28,7 +28,7 @@ typedef struct Allocator_t {
 } Allocator_t;
 
 static RT_API_ATTRS void *MallocWrapper(
-    std::size_t size, [[maybe_unused]] std::int64_t) {
+    std::size_t size, [[maybe_unused]] std::int64_t *) {
   return std::malloc(size);
 }
 #ifdef RT_DEVICE_COMPILATION
diff --git a/flang-rt/include/flang-rt/runtime/descriptor.h b/flang-rt/include/flang-rt/runtime/descriptor.h
index 9907e7866e7bf..c98e6b14850cb 100644
--- a/flang-rt/include/flang-rt/runtime/descriptor.h
+++ b/flang-rt/include/flang-rt/runtime/descriptor.h
@@ -29,8 +29,8 @@
 #include <cstdio>
 #include <cstring>
 
-/// Value used for asyncId when no specific stream is specified.
-static constexpr std::int64_t kNoAsyncId = -1;
+/// Value used for asyncObject when no specific stream is specified.
+static constexpr std::int64_t *kNoAsyncObject = nullptr;
 
 namespace Fortran::runtime {
 
@@ -372,7 +372,7 @@ class Descriptor {
   // before calling.  It (re)computes the byte strides after
   // allocation.  Does not allocate automatic components or
   // perform default component initialization.
-  RT_API_ATTRS int Allocate(std::int64_t asyncId);
+  RT_API_ATTRS int Allocate(std::int64_t *asyncObject);
   RT_API_ATTRS void SetByteStrides();
 
   // Deallocates storage; does not call FINAL subroutines or
diff --git a/flang-rt/include/flang-rt/runtime/reduction-templates.h b/flang-rt/include/flang-rt/runtime/reduction-templates.h
index 77f77a592a476..18412708b02c5 100644
--- a/flang-rt/include/flang-rt/runtime/reduction-templates.h
+++ b/flang-rt/include/flang-rt/runtime/reduction-templates.h
@@ -347,7 +347,7 @@ inline RT_API_ATTRS void DoMaxMinNorm2(Descriptor &result, const Descriptor &x,
     // as the element size of the source.
     result.Establish(x.type(), x.ElementBytes(), nullptr, 0, nullptr,
         CFI_attribute_allocatable);
-    if (int stat{result.Allocate(kNoAsyncId)}) {
+    if (int stat{result.Allocate(kNoAsyncObject)}) {
       terminator.Crash(
           "%s: could not allocate memory for result; STAT=%d", intrinsic, stat);
     }
diff --git a/flang-rt/lib/cuda/allocatable.cpp b/flang-rt/lib/cuda/allocatable.cpp
index 432974d18a3e3..c77819e9440d7 100644
--- a/flang-rt/lib/cuda/allocatable.cpp
+++ b/flang-rt/lib/cuda/allocatable.cpp
@@ -23,7 +23,7 @@ namespace Fortran::runtime::cuda {
 extern "C" {
 RT_EXT_API_GROUP_BEGIN
 
-int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t stream,
+int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t *stream,
     bool *pinned, bool hasStat, const Descriptor *errMsg,
     const char *sourceFile, int sourceLine) {
   int stat{RTNAME(CUFAllocatableAllocate)(
@@ -41,7 +41,7 @@ int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t stream,
   return stat;
 }
 
-int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t stream,
+int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t *stream,
     bool *pinned, bool hasStat, const Descriptor *errMsg,
     const char *sourceFile, int sourceLine) {
   if (desc.HasAddendum()) {
@@ -63,7 +63,7 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t stream,
 }
 
 int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
-    const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
+    const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
     const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
   int stat{RTNAME(CUFAllocatableAllocate)(
       alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
@@ -76,7 +76,7 @@ int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
 }
 
 int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
-    const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
+    const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
     const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
   int stat{RTNAME(CUFAllocatableAllocateSync)(
       alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
diff --git a/flang-rt/lib/cuda/allocator.cpp b/flang-rt/lib/cuda/allocator.cpp
index 51119ab251168..f4289c55bd8de 100644
--- a/flang-rt/lib/cuda/allocator.cpp
+++ b/flang-rt/lib/cuda/allocator.cpp
@@ -98,7 +98,7 @@ static unsigned findAllocation(void *ptr) {
   return allocNotFound;
 }
 
-static void insertAllocation(void *ptr, std::size_t size, std::int64_t stream) {
+static void insertAllocation(void *ptr, std::size_t size, cudaStream_t stream) {
   CriticalSection critical{lock};
   initAllocations();
   if (numDeviceAllocations >= maxDeviceAllocations) {
@@ -106,7 +106,7 @@ static void insertAllocation(void *ptr, std::size_t size, std::int64_t stream) {
   }
   deviceAllocations[numDeviceAllocations].ptr = ptr;
   deviceAllocations[numDeviceAllocations].size = size;
-  deviceAllocations[numDeviceAllocations].stream = (cudaStream_t)stream;
+  deviceAllocations[numDeviceAllocations].stream = stream;
   ++numDeviceAllocations;
   qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
       compareDeviceAlloc);
@@ -136,7 +136,7 @@ void RTDEF(CUFRegisterAllocator)() {
 }
 
 void *CUFAllocPinned(
-    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
   void *p;
   CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
   return p;
@@ -144,18 +144,18 @@ void *CUFAllocPinned(
 
 void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }
 
-void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t asyncId) {
+void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t *asyncObject) {
   void *p;
   if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) {
     CUDA_REPORT_IF_ERROR(
         cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
   } else {
-    if (asyncId == kNoAsyncId) {
+    if (asyncObject == kNoAsyncObject) {
       CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
     } else {
       CUDA_REPORT_IF_ERROR(
-          cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)asyncId));
-      insertAllocation(p, sizeInBytes, asyncId);
+          cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)*asyncObject));
+      insertAllocation(p, sizeInBytes, (cudaStream_t)*asyncObject);
     }
   }
   return p;
@@ -174,7 +174,7 @@ void CUFFreeDevice(void *p) {
 }
 
 void *CUFAllocManaged(
-    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
   void *p;
   CUDA_REPORT_IF_ERROR(
       cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
@@ -184,9 +184,9 @@ void *CUFAllocManaged(
 void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
 
 void *CUFAllocUnified(
-    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
   // Call alloc managed for the time being.
-  return CUFAllocManaged(sizeInBytes, asyncId);
+  return CUFAllocManaged(sizeInBytes, asyncObject);
 }
 
 void CUFFreeUnified(void *p) {
diff --git a/flang-rt/lib/cuda/descriptor.cpp b/flang-rt/lib/cuda/descriptor.cpp
index 175e8c0ef8438..7b768f91af29d 100644
--- a/flang-rt/lib/cuda/descriptor.cpp
+++ b/flang-rt/lib/cuda/descriptor.cpp
@@ -21,7 +21,7 @@ RT_EXT_API_GROUP_BEGIN
 Descriptor *RTDEF(CUFAllocDescriptor)(
     std::size_t sizeInBytes, const char *sourceFile, int sourceLine) {
   return reinterpret_cast<Descriptor *>(
-      CUFAllocManaged(sizeInBytes, /*asyncId*/ -1));
+      CUFAllocManaged(sizeInBytes, /*asyncObject=*/nullptr));
 }
 
 void RTDEF(CUFFreeDescriptor)(
diff --git a/flang-rt/lib/runtime/allocatable.cpp b/flang-rt/lib/runtime/allocatable.cpp
index 6acce34eb9a9e..ef18da6ea0786 100644
--- a/flang-rt/lib/runtime/allocatable.cpp
+++ b/flang-rt/lib/runtime/allocatable.cpp
@@ -133,17 +133,17 @@ void RTDEF(AllocatableApplyMold)(
   }
 }
 
-int RTDEF(AllocatableAllocate)(Descriptor &descriptor, std::int64_t asyncId,
-    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
-    int sourceLine) {
+int RTDEF(AllocatableAllocate)(Descriptor &descriptor,
+    std::int64_t *asyncObject, bool hasStat, const Descriptor *errMsg,
+    const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
   if (!descriptor.IsAllocatable()) {
     return ReturnError(terminator, StatInvalidDescriptor, errMsg, hasStat);
   } else if (descriptor.IsAllocated()) {
     return ReturnError(terminator, StatBaseNotNull, errMsg, hasStat);
   } else {
-    int stat{
-        ReturnError(terminator, descriptor.Allocate(asyncId), errMsg, hasStat)};
+    int stat{ReturnError(
+        terminator, descriptor.Allocate(asyncObject), errMsg, hasStat)};
     if (stat == StatOk) {
       if (const DescriptorAddendum * addendum{descriptor.Addendum()}) {
         if (const auto *derived{addendum->derivedType()}) {
@@ -162,7 +162,7 @@ int RTDEF(AllocatableAllocateSource)(Descriptor &alloc,
     const Descriptor &source, bool hasStat, const Descriptor *errMsg,
     const char *sourceFile, int sourceLine) {
   int stat{RTNAME(AllocatableAllocate)(
-      alloc, /*asyncId=*/-1, hasStat, errMsg, sourceFile, sourceLine)};
+      alloc, /*asyncObject=*/nullptr, hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
     DoFromSourceAssign(alloc, source, terminator);
diff --git a/flang-rt/lib/runtime/array-constructor.cpp b/flang-rt/lib/runtime/array-constructor.cpp
index 67b3b5e1e0f50..858fac7bf2b39 100644
--- a/flang-rt/lib/runtime/array-constructor.cpp
+++ b/flang-rt/lib/runtime/array-constructor.cpp
@@ -50,7 +50,7 @@ static RT_API_ATTRS void AllocateOrReallocateVectorIfNeeded(
           initialAllocationSize(fromElements, to.ElementBytes())};
       to.GetDimension(0).SetBounds(1, allocationSize);
       RTNAME(AllocatableAllocate)
-      (to, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr,
+      (to, /*asyncObject=*/nullptr, /*hasStat=*/false, /*errMsg=*/nullptr,
           vector.sourceFile, vector.sourceLine);
       to.GetDimension(0).SetBounds(1, fromElements);
       vector.actualAllocationSize = allocationSize;
@@ -59,7 +59,7 @@ static RT_API_ATTRS void AllocateOrReallocateVectorIfNeeded(
       // first value: there should be no reallocation.
       RUNTIME_CHECK(terminator, previousToElements >= fromElements);
       RTNAME(AllocatableAllocate)
-      (to, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr,
+      (to, /*asyncObject=*/nullptr, /*hasStat=*/false, /*errMsg=*/nullptr,
           vector.sourceFile, vector.sourceLine);
       vector.actualAllocationSize = previousToElements;
     }
diff --git a/flang-rt/lib/runtime/assign.cpp b/flang-rt/lib/runtime/assign.cpp
index 4a813cd489022..8a4fa36c91479 100644
--- a/flang-rt/lib/runtime/assign.cpp
+++ b/flang-rt/lib/runtime/assign.cpp
@@ -99,7 +99,7 @@ static RT_API_ATTRS int AllocateAssignmentLHS(
     toDim.SetByteStride(stride);
     stride *= toDim.Extent();
   }
-  int result{ReturnError(terminator, to.Allocate(kNoAsyncId))};
+  int result{ReturnError(terminator, to.Allocate(kNoAsyncObject))};
   if (result == StatOk && derived && !derived->noInitializationNeeded()) {
     result = ReturnError(terminator, Initialize(to, *derived, terminator));
   }
@@ -277,7 +277,7 @@ RT_API_ATTRS void Assign(Descriptor &to, const Descriptor &from,
       // entity, otherwise, the Deallocate() below will not
       // free the descriptor memory.
       newFrom.raw().attribute = CFI_attribute_allocatable;
-      auto stat{ReturnError(terminator, newFrom.Allocate(kNoAsyncId))};
+      auto stat{ReturnError(terminator, newFrom.Allocate(kNoAsyncObject))};
       if (stat == StatOk) {
         if (HasDynamicComponent(from)) {
           // If 'from' has allocatable/automatic component, we cannot
diff --git a/flang-rt/lib/runtime/character.cpp b/flang-rt/lib/runtime/character.cpp
index d1152ee1caefb..f140d202e118e 100644
--- a/flang-rt/lib/runtime/character.cpp
+++ b/flang-rt/lib/runtime/character.cpp
@@ -118,7 +118,7 @@ static RT_API_ATTRS void Compare(Descriptor &result, const Descriptor &x,
   for (int j{0}; j < rank; ++j) {
     result.GetDimension(j).SetBounds(1, ub[j]);
   }
-  if (result.Allocate(kNoAsyncId) != CFI_SUCCESS) {
+  if (result.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
     terminator.Crash("Compare: could not allocate storage for result");
   }
   std::size_t xChars{x.ElementBytes() >> shift<CHAR>};
@@ -173,7 +173,7 @@ static RT_API_ATTRS void AdjustLRHelper(Descriptor &result,
   for (int j{0}; j < rank; ++j) {
     result.GetDimension(j).SetBounds(1, ub[j]);
   }
-  if (result.Allocate(kNoAsyncId) != CFI_SUCCESS) {
+  if (result.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
     terminator.Crash("ADJUSTL/R: could not allocate storage for result");
   }
   for (SubscriptValue resultAt{0}; elements-- > 0;
@@ -227,7 +227,7 @@ static RT_API_ATTRS void LenTrim(Descriptor &result, const Descriptor &string,
   for (int j{0}; j < rank; ++j) {
     result.GetDimension(j).SetBounds(1, ub[j]);
   }
-  if (result.Allocate(kNoAsyncId) != CFI_SUCCESS) {
+  if (result.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
     terminator.Crash("LEN_TRIM: could not allocate storage for result");
   }
   std::size_t stringElementChars{string.ElementBytes() >> shift<CHAR>};
@@ -427,7 +427,7 @@ static RT_API_ATTRS void GeneralCharFunc(Descriptor &result,
   for (int j{0}; j < rank; ++j) {
     result.GetDimension(j).SetBounds(1, ub[j]);
   }
-  if (result.Allocate(kNoAsyncId) != CFI_SUCCESS) {
+  if (result.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
     terminator.Crash("SCAN/VERIFY: could not allocate storage for result");
   }
   std::size_t stringElementChars{string.ElementBytes() >> shift<CHAR>};
@@ -530,7 +530,8 @@ static RT_API_ATTRS void MaxMinHelper(Descriptor &accumulator,
     for (int j{0}; j < rank; ++j) {
       accumulator.GetDimension(j).SetBounds(1, ub[j]);
     }
-    RUNTIME_CHECK(terminator, accumulator.Allocate(kNoAsyncId) == CFI_SUCCESS);
+    RUNTIME_CHECK(
+        terminator, accumulator.Allocate(kNoAsyncObject) == CFI_SUCCESS);
   }
   for (CHAR *result{accumulator.OffsetElement<CHAR>()}; elements-- > 0;
        accumData += accumChars, result += chars, x.IncrementSubscripts(xAt)) {
@@ -606,7 +607,7 @@ void RTDEF(CharacterConcatenate)(Descriptor &accumulator,
   for (int j{0}; j < rank; ++j) {
     accumulator.GetDimension(j).SetBounds(1, ub[j]);
   }
-  if (accumulator.Allocate(kNoAsyncId) != CFI_SUCCESS) {
+  if (accumulator.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
     terminator.Crash(
         "CharacterConcatenate: could not allocate storage for result");
   }
@@ -629,7 +630,8 @@ void RTDEF(CharacterConcatenateScalar1)(
   accumulator.set_base_addr(nullptr);
   std::size_t oldLen{accumulator.ElementBytes()};
   accumulator.raw().elem_len += chars;
-  RUNTIME_CHECK(terminator, accumulator.Allocate(kNoAsyncId) == CFI_SUCCESS);
+  RUNTIME_CHECK(
+      terminator, accumulator.Allocate(kNoAsyncObject) == CFI_SUCCESS);
   std::memcpy(accumulator.OffsetElement<char>(oldLen), from, chars);
   FreeMemory(old);
 }
@@ -831,7 +833,7 @@ void RTDEF(Repeat)(Descriptor &result, const Descriptor &string,
   std::size_t origBytes{string.ElementBytes()};
   result.Establish(string.type(), origBytes * ncopies, nullptr, 0, nullptr,
       CFI_attribute_allocatable);
-  if (result.Allocate(kNoAsyncId) != CFI_SUCCESS) {
+  if (result.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
     terminator.Crash("REPEAT could not allocate storage for result");
   }
   const char *from{string.OffsetElement()};
@@ -865,7 +867,7 @@ void RTDEF(Trim)(Descriptor &result, const Descriptor &string,
   }
   result.Establish(string.type(), resultBytes, nullptr, 0, nullptr,
       CFI_attribute_allocatable);
-  RUNTIME_CHECK(terminator, result.Allocate(kNoAsyncId) == CFI_SUCCESS);
+  RUNTIME_CHECK(terminator, result.Allocate(kNoAsyncObject) == CFI_SUCCESS);
   std::memcpy(result.OffsetElement(), string.OffsetElement(), resultBytes);
 }
 
diff --git a/flang-rt/lib/runtime/copy.cpp b/flang-rt/lib/runtime/copy.cpp
index 3a0f98cf8d376..f990f46e0be66 100644
--- a/flang-rt/lib/runtime/copy.cpp
+++ b/flang-rt/lib/runtime/copy.cpp
@@ -171,8 +171,8 @@ RT_API_ATTRS void CopyElement(const Descriptor &to, const SubscriptValue toAt[],
                 *reinterpret_cast<Descriptor *>(toPtr + component->offset())};
             if (toDesc.raw().base_addr != nullptr) {
               toDesc.set_base_addr(nullptr);
-              RUNTIME_CHECK(
-                  terminator, toDesc.Allocate(/*asyncId=*/-1) == CFI_SUCCESS);
+              RUNTIME_CHECK(terminator,
+                  toDesc.Allocate(/*asyncObject=*/nullptr) == CFI_SUCCESS);
               const Descriptor &fromDesc{*reinterpret_cast<const Descriptor *>(
                   fromPtr + component->offset())};
               copyStack.emplace(toDesc, fromDesc);
diff --git a/flang-rt/lib/runtime/derived.cpp b/flang-rt/lib/runtime/derived.cpp
index c46ea806a430a..35037036f63e7 100644
--- a/flang-rt/lib/runtime/derived.cpp
+++ b/flang-rt/lib/runtime/derived.cpp
@@ -52,7 +52,7 @@ RT_API_ATTRS int Initialize(const Descriptor &instance,
         allocDesc.raw().attribute = CFI_attribute_allocatable;
         if (comp.genre() == typeInfo::Component::Genre::Automatic) {
           stat = ReturnError(
-              terminator, allocDesc.Allocate(kNoAsyncId), errMsg, hasStat);
+              terminator, allocDesc.Allocate(kNoAsyncObject), errMsg, hasStat);
           if (stat == StatOk) {
             if (const DescriptorAddendum * addendum{allocDesc.Addendum()}) {
               if (const auto *derived{addendum->derivedType()}) {
@@ -153,7 +153,7 @@ RT_API_ATTRS int InitializeClone(const Descriptor &clone,
         if (origDesc.IsAllocated()) {
           cloneDesc.ApplyMold(origDesc, origDesc.rank());
           stat = ReturnError(
-              terminator, cloneDesc.Allocate(kNoAsyncId), errMsg, hasStat);
+              terminator, cloneDesc.Allocate(kNoAsyncObject), errMsg, hasStat);
           if (stat == StatOk) {
             if (const DescriptorAddendum * addendum{cloneDesc.Addendum()}) {
               if (const typeInfo::DerivedType *
@@ -260,7 +260,7 @@ static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor,
         copy.raw().attribute = CFI_attribute_allocatable;
         Terminator stubTerminator{"CallFinalProcedure() in Fortran runtime", 0};
         RUNTIME_CHECK(terminator ? *terminator : stubTerminator,
-            copy.Allocate(kNoAsyncId) == CFI_SUCCESS);
+            copy.Allocate(kNoAsyncObject) == CFI_SUCCESS);
         ShallowCopyDiscontiguousToContiguous(copy, descriptor);
         argDescriptor = &copy;
       }
diff --git a/flang-rt/lib/runtime/descriptor.cpp b/flang-rt/lib/runtime/descriptor.cpp
index 3debf53bb5290..67336d01380e0 100644
--- a/flang-rt/lib/runtime/descriptor.cpp
+++ b/flang-rt/lib/runtime/descriptor.cpp
@@ -158,7 +158,7 @@ RT_API_ATTRS static inline int MapAllocIdx(const Descriptor &desc) {
 #endif
 }
 
-RT_API_ATTRS int Descriptor::Allocate(std::int64_t asyncId) {
+RT_API_ATTRS int Descriptor::Allocate(std::int64_t *asyncObject) {
   std::size_t elementBytes{ElementBytes()};
   if (static_cast<std::int64_t>(elementBytes) < 0) {
     // F'2023 7.4.4.2 p5: "If the character length parameter value evaluates
@@ -170,7 +170,7 @@ RT_API_ATTRS int Descriptor::Allocate(std::int64_t asyncId) {
   // Zero size allocation is possible in Fortran and the resulting
   // descriptor must be allocated/associated. Since std::malloc(0)
   // result is implementation defined, always allocate at least one byte.
-  void *p{alloc(byteSize ? byteSize : 1, asyncId)};
+  void *p{alloc(byteSize ? byteSize : 1, asyncObject)};
   if (!p) {
     return CFI_ERROR_MEM_ALLOCATION;
   }
diff --git a/flang-rt/lib/runtime/extrema.cpp b/flang-rt/lib/r...
[truncated]

wangzpgi · 2025-05-01T21:17:42Z

flang-rt/unittests/Runtime/CUDA/Allocatable.cpp

  EXPECT_TRUE(b->IsAllocated());
  cudaDeviceSynchronize();
  EXPECT_EQ(cudaSuccess, cudaGetLastError());

  RTNAME(AllocatableAllocate)
-  (*c, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  (*c, /*asyncObject=*/nullptr, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,


Question: asyncId=-1 translates to asyncObject=nullptr. Here asyncId=1 also translates to asyncObject=nullptr?

Yes since it's a pointer now. This is a unittest so we don't really care which stream is used.

This reverts commit 7f922f1.

Reverts #138186