Skip to content

Commit d3921e4

Browse files
authored
[OpenMP] Basic BumpAllocator for (AMD)GPUs (#69806)
The patch contains a basic BumpAllocator for (AMD)GPUs to allow us to run more tests. The allocator implements `malloc`, both internally and externally, while we continue to default to the NVIDIA `malloc` when we target NVIDIA GPUs. Once we have smarter or customizable allocators we should reconsider this choice; for now, this allocator is better than none. It traps if it is out of memory, making it easy to debug. Heap size is configured via `LIBOMPTARGET_HEAP_SIZE` and defaults to 512MB. It allows tracking allocation statistics via `LIBOMPTARGET_DEVICE_RTL_DEBUG=8` (together with `-fopenmp-target-debug=8`). Two tests were added, and one was enabled. This is the next step towards fixing #66708.
1 parent d571af7 commit d3921e4

File tree

15 files changed

+367
-39
lines changed

15 files changed

+367
-39
lines changed

openmp/docs/design/Runtimes.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1465,3 +1465,4 @@ debugging features are supported.
14651465

14661466
* Enable debugging assertions in the device. ``0x01``
14671467
* Enable diagnosing common problems during offloading. ``0x4``
1468+
* Enable device malloc statistics (amdgpu only). ``0x8``

openmp/libomptarget/DeviceRTL/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ endif()
8383
list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES)
8484

8585
set(include_files
86+
${include_directory}/Allocator.h
8687
${include_directory}/Configuration.h
8788
${include_directory}/Debug.h
8889
${include_directory}/Interface.h
@@ -95,6 +96,7 @@ set(include_files
9596
)
9697

9798
set(src_files
99+
${source_directory}/Allocator.cpp
98100
${source_directory}/Configuration.cpp
99101
${source_directory}/Debug.cpp
100102
${source_directory}/Kernel.cpp
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
//===-------- Allocator.h - OpenMP memory allocator interface ---- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
#ifndef OMPTARGET_ALLOCATOR_H
13+
#define OMPTARGET_ALLOCATOR_H
14+
15+
#include "Types.h"
16+
17+
// Forward declaration.
18+
struct KernelEnvironmentTy;
19+
20+
#pragma omp begin declare target device_type(nohost)
21+
22+
namespace ompx {
23+
24+
namespace allocator {
25+
26+
static uint64_t constexpr ALIGNMENT = 16;
27+
28+
/// Initialize the allocator according to \p KernelEnvironment
29+
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment);
30+
31+
/// Allocate \p Size bytes.
32+
[[gnu::alloc_size(1), gnu::assume_aligned(ALIGNMENT), gnu::malloc]] void *
33+
alloc(uint64_t Size);
34+
35+
/// Free the allocation pointed to by \p Ptr.
36+
void free(void *Ptr);
37+
38+
} // namespace allocator
39+
40+
} // namespace ompx
41+
42+
#pragma omp end declare target
43+
44+
#endif
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
//===------ Allocator.cpp - OpenMP memory allocator -------------- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#include "Allocator.h"
12+
#include "Configuration.h"
13+
#include "Environment.h"
14+
#include "Mapping.h"
15+
#include "Synchronization.h"
16+
#include "Types.h"
17+
#include "Utils.h"
18+
19+
using namespace ompx;
20+
21+
#pragma omp begin declare target device_type(nohost)
22+
23+
[[gnu::used, gnu::retain, gnu::weak,
24+
gnu::visibility(
25+
"protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
26+
[[gnu::used, gnu::retain, gnu::weak,
27+
gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
28+
__omp_rtl_device_memory_pool_tracker;
29+
30+
/// Stateless bump allocator that uses the __omp_rtl_device_memory_pool
31+
/// directly.
32+
struct BumpAllocatorTy final {
33+
34+
void *alloc(uint64_t Size) {
35+
Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT));
36+
37+
if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) {
38+
atomic::add(&__omp_rtl_device_memory_pool_tracker.NumAllocations, 1,
39+
atomic::seq_cst);
40+
atomic::add(&__omp_rtl_device_memory_pool_tracker.AllocationTotal, Size,
41+
atomic::seq_cst);
42+
atomic::min(&__omp_rtl_device_memory_pool_tracker.AllocationMin, Size,
43+
atomic::seq_cst);
44+
atomic::max(&__omp_rtl_device_memory_pool_tracker.AllocationMax, Size,
45+
atomic::seq_cst);
46+
}
47+
48+
uint64_t *Data =
49+
reinterpret_cast<uint64_t *>(&__omp_rtl_device_memory_pool.Ptr);
50+
uint64_t End =
51+
reinterpret_cast<uint64_t>(Data) + __omp_rtl_device_memory_pool.Size;
52+
53+
uint64_t OldData = atomic::add(Data, Size, atomic::seq_cst);
54+
if (OldData + Size > End)
55+
__builtin_trap();
56+
57+
return reinterpret_cast<void *>(OldData);
58+
}
59+
60+
void free(void *) {}
61+
};
62+
63+
BumpAllocatorTy BumpAllocator;
64+
65+
/// allocator namespace implementation
66+
///
67+
///{
68+
69+
void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) {
70+
// TODO: Check KernelEnvironment for an allocator choice as soon as we have
71+
// more than one.
72+
}
73+
74+
void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); }
75+
76+
void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); }
77+
78+
///}
79+
80+
#pragma omp end declare target

openmp/libomptarget/DeviceRTL/src/Kernel.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
//
1111
//===----------------------------------------------------------------------===//
1212

13+
#include "Allocator.h"
1314
#include "Debug.h"
1415
#include "Environment.h"
1516
#include "Interface.h"
@@ -30,6 +31,7 @@ static void inititializeRuntime(bool IsSPMD,
3031
synchronize::init(IsSPMD);
3132
mapping::init(IsSPMD);
3233
state::init(IsSPMD, KernelEnvironment);
34+
allocator::init(IsSPMD, KernelEnvironment);
3335
}
3436

3537
/// Simple generic state machine for worker threads.

openmp/libomptarget/DeviceRTL/src/State.cpp

Lines changed: 22 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
//===----------------------------------------------------------------------===//
1010

1111
#include "State.h"
12+
#include "Allocator.h"
13+
#include "Configuration.h"
1214
#include "Debug.h"
1315
#include "Environment.h"
1416
#include "Interface.h"
@@ -26,48 +28,36 @@ using namespace ompx;
2628
///
2729
///{
2830

29-
/// Add worst-case padding so that future allocations are properly aligned.
30-
/// FIXME: The stack shouldn't require worst-case padding. Alignment needs to be
31-
/// passed in as an argument and the stack rewritten to support it.
32-
constexpr const uint32_t Alignment = 16;
33-
3431
/// External symbol to access dynamic shared memory.
35-
[[gnu::aligned(Alignment)]] extern unsigned char DynamicSharedBuffer[];
32+
[[gnu::aligned(
33+
allocator::ALIGNMENT)]] extern unsigned char DynamicSharedBuffer[];
3634
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
3735

3836
/// The kernel environment passed to the init method by the compiler.
3937
static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);
4038

39+
///}
40+
4141
namespace {
4242

4343
/// Fallback implementations are missing to trigger a link time error.
4444
/// Implementations for new devices, including the host, should go into a
4545
/// dedicated begin/end declare variant.
4646
///
4747
///{
48-
4948
extern "C" {
50-
[[gnu::weak, gnu::leaf]] void *malloc(uint64_t Size);
51-
[[gnu::weak, gnu::leaf]] void free(void *Ptr);
52-
}
49+
#ifdef __AMDGPU__
5350

54-
///}
51+
[[gnu::weak]] void *malloc(uint64_t Size) { return allocator::alloc(Size); }
52+
[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
5553

56-
/// AMDGCN implementations of the shuffle sync idiom.
57-
///
58-
///{
59-
#pragma omp begin declare variant match(device = {arch(amdgcn)})
54+
#else
6055

61-
extern "C" {
62-
void *malloc(uint64_t Size) {
63-
// TODO: Use some preallocated space for dynamic malloc.
64-
return nullptr;
65-
}
56+
[[gnu::weak, gnu::leaf]] void *malloc(uint64_t Size);
57+
[[gnu::weak, gnu::leaf]] void free(void *Ptr);
6658

67-
void free(void *Ptr) {}
59+
#endif
6860
}
69-
70-
#pragma omp end declare variant
7161
///}
7262

7363
/// A "smart" stack in shared memory.
@@ -96,7 +86,7 @@ struct SharedMemorySmartStackTy {
9686
uint32_t computeThreadStorageTotal() {
9787
uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
9888
return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock),
99-
Alignment);
89+
allocator::ALIGNMENT);
10090
}
10191

10292
/// Return the top address of the warp data stack, that is the first address
@@ -106,8 +96,10 @@ struct SharedMemorySmartStackTy {
10696
}
10797

10898
/// The actual storage, shared among all warps.
109-
[[gnu::aligned(Alignment)]] unsigned char Data[state::SharedScratchpadSize];
110-
[[gnu::aligned(Alignment)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
99+
[[gnu::aligned(
100+
allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize];
101+
[[gnu::aligned(
102+
allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
111103
};
112104

113105
static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
@@ -122,7 +114,9 @@ void SharedMemorySmartStackTy::init(bool IsSPMD) {
122114

123115
void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
124116
// First align the number of requested bytes.
125-
uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);
117+
/// FIXME: The stack shouldn't require worst-case padding. Alignment needs to
118+
/// be passed in as an argument and the stack rewritten to support it.
119+
uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT);
126120

127121
uint32_t StorageTotal = computeThreadStorageTotal();
128122

@@ -150,7 +144,7 @@ void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
150144
}
151145

152146
void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
153-
uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);
147+
uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT);
154148
if (utils::isSharedMemPtr(Ptr)) {
155149
int TId = mapping::getThreadIdInBlock();
156150
Usage[TId] -= AlignedBytes;

openmp/libomptarget/include/Environment.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,27 @@ struct DeviceEnvironmentTy {
4343
uint64_t HardwareParallelism;
4444
};
4545

46+
struct DeviceMemoryPoolTy {
47+
void *Ptr;
48+
uint64_t Size;
49+
};
50+
51+
struct DeviceMemoryPoolTrackingTy {
52+
uint64_t NumAllocations;
53+
uint64_t AllocationTotal;
54+
uint64_t AllocationMin;
55+
uint64_t AllocationMax;
56+
57+
void combine(DeviceMemoryPoolTrackingTy &Other) {
58+
NumAllocations += Other.NumAllocations;
59+
AllocationTotal += Other.AllocationTotal;
60+
AllocationMin = AllocationMin > Other.AllocationMin ? Other.AllocationMin
61+
: AllocationMin;
62+
AllocationMax = AllocationMax < Other.AllocationMax ? Other.AllocationMax
63+
: AllocationMax;
64+
}
65+
};
66+
4667
// NOTE: Please don't change the order of those members as their indices are
4768
// used in the middle end. Always add the new data member at the end.
4869
// Different from KernelEnvironmentTy below, this structure contains members

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2529,10 +2529,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
25292529
return Plugin::success();
25302530
}
25312531
Error getDeviceHeapSize(uint64_t &Value) override {
2532-
Value = 0;
2532+
Value = DeviceMemoryPoolSize;
2533+
return Plugin::success();
2534+
}
2535+
Error setDeviceHeapSize(uint64_t Value) override {
2536+
for (DeviceImageTy *Image : LoadedImages)
2537+
if (auto Err = setupDeviceMemoryPool(Plugin::get(), *Image, Value))
2538+
return Err;
2539+
DeviceMemoryPoolSize = Value;
25332540
return Plugin::success();
25342541
}
2535-
Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); }
25362542

25372543
/// AMDGPU-specific function to get device attributes.
25382544
template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) {
@@ -2625,6 +2631,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
26252631

26262632
/// Reference to the host device.
26272633
AMDHostDeviceTy &HostDevice;
2634+
2635+
/// The current size of the global device memory pool (managed by us).
2636+
uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */;
26282637
};
26292638

26302639
Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {

0 commit comments

Comments
 (0)