[flang][cuda] Use async id for device stream allocation #118733

clementval · 2024-12-05T02:28:41Z

When a stream is specified, use cudaMallocAsync with that stream.

llvmbot · 2024-12-05T02:29:16Z

@llvm/pr-subscribers-flang-runtime

Author: Valentin Clement (バレンタインクレメン) (clementval)

Changes

When a stream is specified, use cudaMallocAsync with that stream.

Full diff: https://github.com/llvm/llvm-project/pull/118733.diff

3 Files Affected:

(modified) flang/include/flang/Runtime/CUDA/allocator.h (+4-3)
(modified) flang/runtime/CUDA/allocator.cpp (+11-10)
(modified) flang/unittests/Runtime/CUDA/AllocatorCUF.cpp (+18)

diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h
index 40423c5ce04885..618da44c675d85 100644
--- a/flang/include/flang/Runtime/CUDA/allocator.h
+++ b/flang/include/flang/Runtime/CUDA/allocator.h
@@ -9,6 +9,7 @@
 #ifndef FORTRAN_RUNTIME_CUDA_ALLOCATOR_H_
 #define FORTRAN_RUNTIME_CUDA_ALLOCATOR_H_
 
+#include "common.h"
 #include "flang/Runtime/descriptor.h"
 #include "flang/Runtime/entry-names.h"
 
@@ -19,16 +20,16 @@ extern "C" {
 void RTDECL(CUFRegisterAllocator)();
 }
 
-void *CUFAllocPinned(std::size_t, std::int64_t);
+void *CUFAllocPinned(std::size_t, std::int64_t = kCudaNoStream);
 void CUFFreePinned(void *);
 
 void *CUFAllocDevice(std::size_t, std::int64_t);
 void CUFFreeDevice(void *);
 
-void *CUFAllocManaged(std::size_t, std::int64_t);
+void *CUFAllocManaged(std::size_t, std::int64_t = kCudaNoStream);
 void CUFFreeManaged(void *);
 
-void *CUFAllocUnified(std::size_t, std::int64_t);
+void *CUFAllocUnified(std::size_t, std::int64_t = kCudaNoStream);
 void CUFFreeUnified(void *);
 
 } // namespace Fortran::runtime::cuda
diff --git a/flang/runtime/CUDA/allocator.cpp b/flang/runtime/CUDA/allocator.cpp
index e41ed77e40ff99..d848f1811dcf3f 100644
--- a/flang/runtime/CUDA/allocator.cpp
+++ b/flang/runtime/CUDA/allocator.cpp
@@ -33,8 +33,7 @@ void RTDEF(CUFRegisterAllocator)() {
 }
 }
 
-void *CUFAllocPinned(
-    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+void *CUFAllocPinned(std::size_t sizeInBytes, std::int64_t) {
   void *p;
   CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
   return p;
@@ -42,17 +41,20 @@ void *CUFAllocPinned(
 
 void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }
 
-void *CUFAllocDevice(
-    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t stream) {
   void *p;
-  CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
+  if (stream >= 0) {
+    CUDA_REPORT_IF_ERROR(
+        cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)stream));
+  } else {
+    CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
+  }
   return p;
 }
 
 void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
 
-void *CUFAllocManaged(
-    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+void *CUFAllocManaged(std::size_t sizeInBytes, std::int64_t) {
   void *p;
   CUDA_REPORT_IF_ERROR(
       cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
@@ -61,10 +63,9 @@ void *CUFAllocManaged(
 
 void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
 
-void *CUFAllocUnified(
-    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+void *CUFAllocUnified(std::size_t sizeInBytes, std::int64_t) {
   // Call alloc managed for the time being.
-  return CUFAllocManaged(sizeInBytes, asyncId);
+  return CUFAllocManaged(sizeInBytes);
 }
 
 void CUFFreeUnified(void *p) {
diff --git a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
index 435172890472da..848093939dc57f 100644
--- a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
+++ b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
@@ -43,6 +43,24 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocate) {
   EXPECT_FALSE(a->IsAllocated());
 }
 
+TEST(AllocatableCUFTest, SimpleStreamDeviceAllocate) {
+  using Fortran::common::TypeCategory;
+  RTNAME(CUFRegisterAllocator)();
+  // REAL(4), DEVICE, ALLOCATABLE :: a(:)
+  auto a{createAllocatable(TypeCategory::Real, 4)};
+  a->SetAllocIdx(kDeviceAllocatorPos);
+  EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx());
+  EXPECT_FALSE(a->HasAddendum());
+  RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
+  RTNAME(AllocatableAllocate)
+  (*a, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,
+      __LINE__);
+  EXPECT_TRUE(a->IsAllocated());
+  RTNAME(AllocatableDeallocate)
+  (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_FALSE(a->IsAllocated());
+}
+
 TEST(AllocatableCUFTest, SimplePinnedAllocate) {
   using Fortran::common::TypeCategory;
   RTNAME(CUFRegisterAllocator)();

github-actions · 2024-12-05T02:32:06Z

✅ With the latest revision this PR passed the C/C++ code formatter.

When a stream is specified, use cudaMallocAsync with that stream.

…118713)' and #118733 (#120997) The device runtime build has been fixed. Attempt to re-land these patches, which were approved before. #118713 #118733

…criptor (#118713)' and #118733" (#121029) This still causes issues for the device runtime build.

…descriptor (#118713)' and #118733 (#120997) The device runtime build has been fixed. Attempt to re-land these patches, which were approved before. llvm/llvm-project#118713 llvm/llvm-project#118733

[flang][cuda] Use async id for stream allocation

2a61f51

clementval requested review from wangzpgi and Renaud-K December 5, 2024 02:28

llvmbot added flang:runtime flang Flang issues not falling into any other category labels Dec 5, 2024

clang-format

1d67389

wangzpgi approved these changes Dec 5, 2024

View reviewed changes

clementval merged commit 83ccaad into llvm:main Dec 5, 2024
8 checks passed

clementval deleted the cuf_allocator_async branch December 5, 2024 18:13

clementval added a commit to clementval/llvm-project that referenced this pull request Dec 23, 2024

[flang][cuda] Use async id for device stream allocation (llvm#118733)

7bf7262

When a stream is specified, use cudaMallocAsync with that stream.

clementval mentioned this pull request Dec 23, 2024

Reland '[flang] Allow to pass an async id to allocate the descriptor (#118713)' and #118733 #120997

Merged

clementval added a commit that referenced this pull request Dec 24, 2024

Revert "Reland '[flang] Allow to pass an async id to allocate the des…

4cb2a51

…criptor (#118713)' and #118733" (#121029) This still causes issues for the device runtime build.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[flang][cuda] Use async id for device stream allocation #118733

[flang][cuda] Use async id for device stream allocation #118733

Uh oh!

clementval commented Dec 5, 2024

Uh oh!

llvmbot commented Dec 5, 2024

Uh oh!

github-actions bot commented Dec 5, 2024 •

edited

Loading

Uh oh!

Uh oh!

Uh oh!

[flang][cuda] Use async id for device stream allocation #118733

[flang][cuda] Use async id for device stream allocation #118733

Uh oh!

Conversation

clementval commented Dec 5, 2024

Uh oh!

llvmbot commented Dec 5, 2024

Uh oh!

github-actions bot commented Dec 5, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

Uh oh!

github-actions bot commented Dec 5, 2024 •

edited

Loading