Skip to content

[flang][cuda] Use async id for device stream allocation #118733

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Dec 5, 2024

Conversation

clementval
Copy link
Contributor

When stream is specified use cudaMallocAsync with the specified stream

@llvmbot llvmbot added flang:runtime flang Flang issues not falling into any other category labels Dec 5, 2024
@llvmbot
Copy link
Member

llvmbot commented Dec 5, 2024

@llvm/pr-subscribers-flang-runtime

Author: Valentin Clement (バレンタイン クレメン) (clementval)

Changes

When stream is specified use cudaMallocAsync with the specified stream


Full diff: https://github.com/llvm/llvm-project/pull/118733.diff

3 Files Affected:

  • (modified) flang/include/flang/Runtime/CUDA/allocator.h (+4-3)
  • (modified) flang/runtime/CUDA/allocator.cpp (+11-10)
  • (modified) flang/unittests/Runtime/CUDA/AllocatorCUF.cpp (+18)
diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h
index 40423c5ce04885..618da44c675d85 100644
--- a/flang/include/flang/Runtime/CUDA/allocator.h
+++ b/flang/include/flang/Runtime/CUDA/allocator.h
@@ -9,6 +9,7 @@
 #ifndef FORTRAN_RUNTIME_CUDA_ALLOCATOR_H_
 #define FORTRAN_RUNTIME_CUDA_ALLOCATOR_H_
 
+#include "common.h"
 #include "flang/Runtime/descriptor.h"
 #include "flang/Runtime/entry-names.h"
 
@@ -19,16 +20,16 @@ extern "C" {
 void RTDECL(CUFRegisterAllocator)();
 }
 
-void *CUFAllocPinned(std::size_t, std::int64_t);
+void *CUFAllocPinned(std::size_t, std::int64_t = kCudaNoStream);
 void CUFFreePinned(void *);
 
 void *CUFAllocDevice(std::size_t, std::int64_t);
 void CUFFreeDevice(void *);
 
-void *CUFAllocManaged(std::size_t, std::int64_t);
+void *CUFAllocManaged(std::size_t, std::int64_t = kCudaNoStream);
 void CUFFreeManaged(void *);
 
-void *CUFAllocUnified(std::size_t, std::int64_t);
+void *CUFAllocUnified(std::size_t, std::int64_t = kCudaNoStream);
 void CUFFreeUnified(void *);
 
 } // namespace Fortran::runtime::cuda
diff --git a/flang/runtime/CUDA/allocator.cpp b/flang/runtime/CUDA/allocator.cpp
index e41ed77e40ff99..d848f1811dcf3f 100644
--- a/flang/runtime/CUDA/allocator.cpp
+++ b/flang/runtime/CUDA/allocator.cpp
@@ -33,8 +33,7 @@ void RTDEF(CUFRegisterAllocator)() {
 }
 }
 
-void *CUFAllocPinned(
-    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+void *CUFAllocPinned(std::size_t sizeInBytes, std::int64_t) {
   void *p;
   CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
   return p;
@@ -42,17 +41,20 @@ void *CUFAllocPinned(
 
 void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }
 
-void *CUFAllocDevice(
-    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t stream) {
   void *p;
-  CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
+  if (stream >= 0) {
+    CUDA_REPORT_IF_ERROR(
+        cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)stream));
+  } else {
+    CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
+  }
   return p;
 }
 
 void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
 
-void *CUFAllocManaged(
-    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+void *CUFAllocManaged(std::size_t sizeInBytes, std::int64_t) {
   void *p;
   CUDA_REPORT_IF_ERROR(
       cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
@@ -61,10 +63,9 @@ void *CUFAllocManaged(
 
 void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
 
-void *CUFAllocUnified(
-    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+void *CUFAllocUnified(std::size_t sizeInBytes, std::int64_t) {
   // Call alloc managed for the time being.
-  return CUFAllocManaged(sizeInBytes, asyncId);
+  return CUFAllocManaged(sizeInBytes);
 }
 
 void CUFFreeUnified(void *p) {
diff --git a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
index 435172890472da..848093939dc57f 100644
--- a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
+++ b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
@@ -43,6 +43,24 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocate) {
   EXPECT_FALSE(a->IsAllocated());
 }
 
+TEST(AllocatableCUFTest, SimpleStreamDeviceAllocate) {
+  using Fortran::common::TypeCategory;
+  RTNAME(CUFRegisterAllocator)();
+  // REAL(4), DEVICE, ALLOCATABLE :: a(:)
+  auto a{createAllocatable(TypeCategory::Real, 4)};
+  a->SetAllocIdx(kDeviceAllocatorPos);
+  EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx());
+  EXPECT_FALSE(a->HasAddendum());
+  RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
+  RTNAME(AllocatableAllocate)
+  (*a, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,
+      __LINE__);
+  EXPECT_TRUE(a->IsAllocated());
+  RTNAME(AllocatableDeallocate)
+  (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  EXPECT_FALSE(a->IsAllocated());
+}
+
 TEST(AllocatableCUFTest, SimplePinnedAllocate) {
   using Fortran::common::TypeCategory;
   RTNAME(CUFRegisterAllocator)();

Copy link

github-actions bot commented Dec 5, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

@clementval clementval merged commit 83ccaad into llvm:main Dec 5, 2024
8 checks passed
@clementval clementval deleted the cuf_allocator_async branch December 5, 2024 18:13
clementval added a commit to clementval/llvm-project that referenced this pull request Dec 23, 2024
When stream is specified use cudaMallocAsync with the specified stream
clementval added a commit that referenced this pull request Dec 23, 2024
…118713)' and #118733 (#120997)

Device runtime build have been fixed. Attempt to re-land these patches
that have been approved before.

#118713
#118733
clementval added a commit that referenced this pull request Dec 24, 2024
…criptor (#118713)' and #118733" (#121029)

This still cause issue for device runtime build.
github-actions bot pushed a commit to arm/arm-toolchain that referenced this pull request Jan 10, 2025
…descriptor (#118713)' and #118733 (#120997)

Device runtime build have been fixed. Attempt to re-land these patches
that have been approved before.

llvm/llvm-project#118713
llvm/llvm-project#118733
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
flang:runtime flang Flang issues not falling into any other category
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants