-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[flang][cuda] Add CUF allocator #101216
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[flang][cuda] Add CUF allocator #101216
Conversation
@llvm/pr-subscribers-flang-runtime Author: Valentin Clement (バレンタイン クレメン) (clementval) ChangesAdd allocators for CUDA fortran allocation on the device. 3 allocators are added for pinned, device and managed/unified memory allocation. Since this require CUDA, a cmake option Full diff: https://github.com/llvm/llvm-project/pull/101216.diff 8 Files Affected:
diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt
index 070c39eb6e9ab..971e5d5c93f23 100644
--- a/flang/CMakeLists.txt
+++ b/flang/CMakeLists.txt
@@ -461,6 +461,13 @@ option(FLANG_BUILD_TOOLS
if (FLANG_BUILD_TOOLS)
add_subdirectory(tools)
endif()
+
+option(FLANG_CUF_RUNTIME
+ "Compile CUDA Fortran runtime sources" OFF)
+if (FLANG_CUF_RUNTIME)
+ find_package(CUDAToolkit REQUIRED)
+endif()
+
add_subdirectory(runtime)
if (LLVM_INCLUDE_EXAMPLES)
diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h
new file mode 100644
index 0000000000000..0738d1e3a8bf3
--- /dev/null
+++ b/flang/include/flang/Runtime/CUDA/allocator.h
@@ -0,0 +1,43 @@
+//===-- include/flang/Runtime/CUDA/allocator.h ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_RUNTIME_CUDA_ALLOCATOR_H_
+#define FORTRAN_RUNTIME_CUDA_ALLOCATOR_H_
+
+#include "flang/Runtime/descriptor.h"
+
+static constexpr unsigned kPinnedAllocatorPos = 1;
+static constexpr unsigned kDeviceAllocatorPos = 2;
+static constexpr unsigned kManagedAllocatorPos = 3;
+
+#define CUDA_REPORT_IF_ERROR(expr) \
+ [](CUresult result) { \
+ if (!result) \
+ return; \
+ const char *name = nullptr; \
+ cuGetErrorName(result, &name); \
+ if (!name) \
+ name = "<unknown>"; \
+ fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \
+ }(expr)
+
+namespace Fortran::runtime::cuf {
+
+void CUFRegisterAllocator();
+
+void *CUFAllocPinned(std::size_t);
+void CUFFreePinned(void *);
+
+void *CUFAllocDevice(std::size_t);
+void CUFFreeDevice(void *);
+
+void *CUFAllocManaged(std::size_t);
+void CUFFreeManaged(void *);
+
+} // namespace Fortran::runtime::cuf
+#endif // FORTRAN_RUNTIME_CUDA_ALLOCATOR_H_
diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt
index 1f3ae23dcbf12..4537b2d059d65 100644
--- a/flang/runtime/CMakeLists.txt
+++ b/flang/runtime/CMakeLists.txt
@@ -309,3 +309,6 @@ if (TARGET flang-new AND TARGET module_files)
add_dependencies(FortranRuntime flang-new module_files)
endif()
+if (FLANG_CUF_RUNTIME)
+ add_subdirectory(CUDA)
+endif()
diff --git a/flang/runtime/CUDA/CMakeLists.txt b/flang/runtime/CUDA/CMakeLists.txt
new file mode 100644
index 0000000000000..e963b6062abc4
--- /dev/null
+++ b/flang/runtime/CUDA/CMakeLists.txt
@@ -0,0 +1,18 @@
+#===-- runtime/CUDA/CMakeLists.txt -----------------------------------------===#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===------------------------------------------------------------------------===#
+
+include_directories(${CUDAToolkit_INCLUDE_DIRS})
+find_library(CUDA_RUNTIME_LIBRARY cuda HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} REQUIRED)
+
+add_flang_library(CufRuntime
+ allocator.cpp
+)
+target_link_libraries(CufRuntime
+PRIVATE
+${CUDA_RUNTIME_LIBRARY}
+)
diff --git a/flang/runtime/CUDA/allocator.cpp b/flang/runtime/CUDA/allocator.cpp
new file mode 100644
index 0000000000000..3c913e344335b
--- /dev/null
+++ b/flang/runtime/CUDA/allocator.cpp
@@ -0,0 +1,62 @@
+//===-- runtime/CUDA/allocator.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Runtime/CUDA/allocator.h"
+#include "../allocator-registry.h"
+#include "../derived.h"
+#include "../stat.h"
+#include "../terminator.h"
+#include "../type-info.h"
+#include "flang/Common/Fortran.h"
+#include "flang/ISO_Fortran_binding_wrapper.h"
+
+#include "cuda.h"
+
+namespace Fortran::runtime::cuf {
+
+void CUFRegisterAllocator() {
+ allocatorRegistry.Register(
+ kPinnedAllocatorPos, {&CUFAllocPinned, CUFFreePinned});
+ allocatorRegistry.Register(
+ kDeviceAllocatorPos, {&CUFAllocDevice, CUFFreeDevice});
+ allocatorRegistry.Register(
+ kManagedAllocatorPos, {&CUFAllocManaged, CUFFreeManaged});
+}
+
+void *CUFAllocPinned(std::size_t sizeInBytes) {
+ void *p;
+ CUDA_REPORT_IF_ERROR(cuMemAllocHost(&p, sizeInBytes));
+ return p;
+}
+
+void CUFFreePinned(void *p) {
+ CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(p)));
+}
+
+void *CUFAllocDevice(std::size_t sizeInBytes) {
+ CUdeviceptr p = 0;
+ CUDA_REPORT_IF_ERROR(cuMemAlloc(&p, sizeInBytes));
+ return reinterpret_cast<void *>(p);
+}
+
+void CUFFreeDevice(void *p) {
+ CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(p)));
+}
+
+void *CUFAllocManaged(std::size_t sizeInBytes) {
+ CUdeviceptr p = 0;
+ CUDA_REPORT_IF_ERROR(
+ cuMemAllocManaged(&p, sizeInBytes, CU_MEM_ATTACH_GLOBAL));
+ return reinterpret_cast<void *>(p);
+}
+
+void CUFFreeManaged(void *p) {
+ CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(p)));
+}
+
+} // namespace Fortran::runtime::cuf
diff --git a/flang/unittests/Runtime/CMakeLists.txt b/flang/unittests/Runtime/CMakeLists.txt
index ed047b08ada35..2c3f8c1a9e9ac 100644
--- a/flang/unittests/Runtime/CMakeLists.txt
+++ b/flang/unittests/Runtime/CMakeLists.txt
@@ -35,3 +35,5 @@ target_link_libraries(FlangRuntimeTests
PRIVATE
FortranRuntime
)
+
+add_subdirectory(CUDA)
diff --git a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
new file mode 100644
index 0000000000000..204826d3f2a96
--- /dev/null
+++ b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
@@ -0,0 +1,87 @@
+//===-- flang/unittests/Runtime/AllocatableCUF.cpp ---------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "gtest/gtest.h"
+#include "flang/Common/Fortran.h"
+#include "flang/Runtime/CUDA/allocator.h"
+#include "flang/Runtime/allocatable.h"
+
+#include "cuda.h"
+
+using namespace Fortran::runtime;
+
+static OwningPtr<Descriptor> createAllocatable(
+ Fortran::common::TypeCategory tc, int kind, int rank = 1) {
+ return Descriptor::Create(TypeCode{tc, kind}, kind, nullptr, rank, nullptr,
+ CFI_attribute_allocatable);
+}
+
+thread_local static int32_t defaultDevice = 0;
+
+CUdevice getDefaultCuDevice() {
+ CUdevice device;
+ CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
+ return device;
+}
+
+class ScopedContext {
+public:
+ ScopedContext() {
+ // Static reference to CUDA primary context for device ordinal
+ // defaultDevice.
+ static CUcontext context = [] {
+ CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0));
+ CUcontext ctx;
+ // Note: this does not affect the current context.
+ CUDA_REPORT_IF_ERROR(
+ cuDevicePrimaryCtxRetain(&ctx, getDefaultCuDevice()));
+ return ctx;
+ }();
+
+ CUDA_REPORT_IF_ERROR(cuCtxPushCurrent(context));
+ }
+
+ ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); }
+};
+
+TEST(AllocatableCUFTest, SimpleDeviceAllocate) {
+ using Fortran::common::TypeCategory;
+ Fortran::runtime::cuf::CUFRegisterAllocator();
+ ScopedContext ctx;
+ // REAL(4), DEVICE, ALLOCATABLE :: a(:)
+ auto a{createAllocatable(TypeCategory::Real, 4)};
+ a->raw().SetAllocIdx(kDeviceAllocatorPos);
+ EXPECT_EQ((int)kDeviceAllocatorPos, a->raw().GetAllocIdx());
+ EXPECT_FALSE(a->raw().HasAddendum());
+ RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
+ RTNAME(AllocatableAllocate)
+ (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+ EXPECT_TRUE(a->IsAllocated());
+ RTNAME(AllocatableDeallocate)
+ (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+ EXPECT_FALSE(a->IsAllocated());
+}
+
+TEST(AllocatableCUFTest, SimplePinnedAllocate) {
+ using Fortran::common::TypeCategory;
+ Fortran::runtime::cuf::CUFRegisterAllocator();
+ ScopedContext ctx;
+ // INTEGER(4), PINNED, ALLOCATABLE :: a(:)
+ auto a{createAllocatable(TypeCategory::Integer, 4)};
+ EXPECT_FALSE(a->raw().HasAddendum());
+ a->raw().SetAllocIdx(kPinnedAllocatorPos);
+ EXPECT_EQ((int)kPinnedAllocatorPos, a->raw().GetAllocIdx());
+ EXPECT_FALSE(a->raw().HasAddendum());
+ RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
+ RTNAME(AllocatableAllocate)
+ (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+ EXPECT_TRUE(a->IsAllocated());
+ RTNAME(AllocatableDeallocate)
+ (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+ EXPECT_FALSE(a->IsAllocated());
+}
diff --git a/flang/unittests/Runtime/CUDA/CMakeLists.txt b/flang/unittests/Runtime/CUDA/CMakeLists.txt
new file mode 100644
index 0000000000000..14b5c788719b8
--- /dev/null
+++ b/flang/unittests/Runtime/CUDA/CMakeLists.txt
@@ -0,0 +1,15 @@
+if (FLANG_CUF_RUNTIME)
+
+add_flang_unittest(FlangCufRuntimeTests
+ AllocatorCUF.cpp
+)
+
+target_link_libraries(FlangCufRuntimeTests
+ PRIVATE
+ CufRuntime
+ FortranRuntime
+)
+
+target_include_directories(FlangCufRuntimeTests PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
+
+endif()
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
e544123
to
b739e74
Compare
Add allocators for CUDA fortran allocation on the device. 3 allocators are added for pinned, device and managed/unified memory allocation.
CUFRegisterAllocator()
is called to register the allocators in the allocator registry added in #100690.Since this require CUDA, a cmake option
FLANG_CUF_RUNTIME
is added to conditionally build these.