Skip to content

Commit 9819e84

Browse files
committed
[OpenMP] Basic BumpAllocator for (AMD)GPUs
The patch contains a basic BumpAllocator for (AMD)GPUs to allow us to run more tests. The allocator implements `malloc`, both internally and externally, while we continue to default to the NVIDIA `malloc` when we target NVIDIA GPUs. Once we have smarter or customizable allocators we should revisit this choice; for now, this allocator is better than none. It traps if it is out of memory, making it easy to debug. Heap size is configured via `LIBOMPTARGET_HEAP_SIZE` and defaults to 512MB. It allows tracking allocation statistics via `LIBOMPTARGET_DEVICE_RTL_DEBUG=8` (together with `-fopenmp-target-debug=8`). Two tests were added, and one was enabled. This is the next step towards fixing #66708.
1 parent 1cea309 commit 9819e84

File tree

15 files changed

+367
-39
lines changed

15 files changed

+367
-39
lines changed

openmp/docs/design/Runtimes.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1465,3 +1465,4 @@ debugging features are supported.
14651465

14661466
* Enable debugging assertions in the device. ``0x01``
14671467
* Enable diagnosing common problems during offloading. ``0x4``
1468+
* Enable device malloc statistics (amdgpu only). ``0x8``

openmp/libomptarget/DeviceRTL/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ endif()
8383
list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES)
8484

8585
set(include_files
86+
${include_directory}/Allocator.h
8687
${include_directory}/Configuration.h
8788
${include_directory}/Debug.h
8889
${include_directory}/Interface.h
@@ -95,6 +96,7 @@ set(include_files
9596
)
9697

9798
set(src_files
99+
${source_directory}/Allocator.cpp
98100
${source_directory}/Configuration.cpp
99101
${source_directory}/Debug.cpp
100102
${source_directory}/Kernel.cpp
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
//===-------- Allocator.h - OpenMP memory allocator interface ---- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
#ifndef OMPTARGET_ALLOCATOR_H
13+
#define OMPTARGET_ALLOCATOR_H
14+
15+
#include "Types.h"
16+
17+
// Forward declaration.
18+
struct KernelEnvironmentTy;
19+
20+
#pragma omp begin declare target device_type(nohost)
21+
22+
namespace ompx {
23+
24+
/// Device-side memory allocator interface: one-time initialization plus
/// malloc/free-style entry points.
namespace allocator {
25+
26+
/// Alignment, in bytes, advertised for every pointer returned by alloc() (see
/// the assume_aligned attribute on alloc below).
static uint64_t constexpr ALIGNMENT = 16;
27+
28+
/// Initialize the allocator according to \p KernelEnvironment.
29+
/// \param IsSPMD            Execution-mode flag handed in by the caller.
/// \param KernelEnvironment Kernel environment handed in by the caller.
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment);
30+
31+
/// Allocate \p Size bytes.
32+
/// The attributes tell the compiler that the result is a fresh allocation
/// (gnu::malloc) whose size is given by the first argument (gnu::alloc_size)
/// and that it is aligned to ALIGNMENT bytes (gnu::assume_aligned).
[[gnu::alloc_size(1), gnu::assume_aligned(ALIGNMENT), gnu::malloc]] void *
33+
alloc(uint64_t Size);
34+
35+
/// Free the allocation pointed to by \p Ptr.
36+
void free(void *Ptr);
37+
38+
} // namespace allocator
39+
40+
} // namespace ompx
41+
42+
#pragma omp end declare target
43+
44+
#endif
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
//===------ Allocator.cpp - OpenMP memory allocator -------------- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#include "Allocator.h"
12+
#include "Configuration.h"
13+
#include "Environment.h"
14+
#include "Mapping.h"
15+
#include "Synchronization.h"
16+
#include "Types.h"
17+
#include "Utils.h"
18+
19+
using namespace ompx;
20+
21+
#pragma omp begin declare target device_type(nohost)
22+
23+
/// Descriptor (current pointer and size) of the device memory pool that the
/// bump allocator below hands allocations out of. Declared weak with protected
/// visibility and kept alive via used/retain even if nothing references it.
/// NOTE(review): presumably written by the host plugin before kernels run
/// (see setupDeviceMemoryPool in the amdgpu plugin) -- confirm.
[[gnu::used, gnu::retain, gnu::weak,
24+
gnu::visibility(
25+
"protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
26+
/// Allocation statistics for the pool above; only updated by the allocator
/// when the AllocationTracker debug kind is enabled.
[[gnu::used, gnu::retain, gnu::weak,
27+
gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
28+
__omp_rtl_device_memory_pool_tracker;
29+
30+
/// Stateless bump allocator that uses the __omp_rtl_device_memory_pool
31+
/// directly. The type has no data members: all state lives in the global pool
/// descriptor, whose Ptr field is advanced atomically on every allocation.
/// Memory is never handed back (free is a no-op).
32+
struct BumpAllocatorTy final {
33+
34+
/// Hand out \p Size bytes from the global pool by bumping the pool pointer.
/// Traps (via __builtin_trap) when the bound check below fails.
void *alloc(uint64_t Size) {
35+
// Pad the request using the allocator alignment (presumably rounds up to a
// multiple of ALIGNMENT -- confirm against utils::roundUp).
Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT));
36+
37+
// Statistics are recorded only when the AllocationTracker debug kind is on;
// note that the padded size, not the requested size, is what gets recorded.
if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) {
38+
atomic::add(&__omp_rtl_device_memory_pool_tracker.NumAllocations, 1,
39+
atomic::seq_cst);
40+
atomic::add(&__omp_rtl_device_memory_pool_tracker.AllocationTotal, Size,
41+
atomic::seq_cst);
42+
atomic::min(&__omp_rtl_device_memory_pool_tracker.AllocationMin, Size,
43+
atomic::seq_cst);
44+
atomic::max(&__omp_rtl_device_memory_pool_tracker.AllocationMax, Size,
45+
atomic::seq_cst);
46+
}
47+
48+
// View the pool's Ptr field as a 64-bit integer so it can be bumped with an
// atomic add.
uint64_t *Data =
49+
reinterpret_cast<uint64_t *>(&__omp_rtl_device_memory_pool.Ptr);
50+
// NOTE(review): Data is the address of the Ptr field itself, so End is
// `&Ptr + Size` rather than `pool base + Size`. OldData below is the value
// stored in Ptr, i.e. an address inside the pool; comparing it against this
// End looks inconsistent -- confirm the intended upper bound. Because Ptr is
// advanced in place, the pool base is not recoverable from the descriptor
// alone.
uint64_t End =
51+
reinterpret_cast<uint64_t>(Data) + __omp_rtl_device_memory_pool.Size;
52+
53+
// Claim the range [OldData, OldData + Size). The result of the atomic add is
// used as the start of the allocation, so atomic::add presumably returns the
// value before the addition (fetch-add) -- confirm.
uint64_t OldData = atomic::add(Data, Size, atomic::seq_cst);
54+
// The pointer has already been advanced at this point; running past End is
// treated as fatal rather than being rolled back.
if (OldData + Size > End)
55+
__builtin_trap();
56+
57+
return reinterpret_cast<void *>(OldData);
58+
}
59+
60+
/// No-op: memory handed out by alloc is never reclaimed.
void free(void *) {}
61+
};
62+
63+
/// The allocator instance that allocator::alloc and allocator::free forward to.
BumpAllocatorTy BumpAllocator;
64+
65+
/// allocator namespace implementation
66+
///
67+
///{
68+
69+
/// Allocator initialization hook. Currently does nothing: both \p IsSPMD and
/// \p KernelEnvironment are unused until more than one allocator exists.
void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) {
70+
// TODO: Check KernelEnvironment for an allocator choice as soon as we have
71+
// more than one.
72+
}
73+
74+
/// Obtain \p Size bytes from the global BumpAllocator instance.
void *allocator::alloc(uint64_t Size) {
  void *Allocation = BumpAllocator.alloc(Size);
  return Allocation;
}
75+
76+
/// Forward \p Ptr to the global BumpAllocator instance for release.
void allocator::free(void *Ptr) {
  BumpAllocator.free(Ptr);
}
77+
78+
///}
79+
80+
#pragma omp end declare target

openmp/libomptarget/DeviceRTL/src/Kernel.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
//
1111
//===----------------------------------------------------------------------===//
1212

13+
#include "Allocator.h"
1314
#include "Debug.h"
1415
#include "Environment.h"
1516
#include "Interface.h"
@@ -30,6 +31,7 @@ static void inititializeRuntime(bool IsSPMD,
3031
synchronize::init(IsSPMD);
3132
mapping::init(IsSPMD);
3233
state::init(IsSPMD, KernelEnvironment);
34+
allocator::init(IsSPMD, KernelEnvironment);
3335
}
3436

3537
/// Simple generic state machine for worker threads.

openmp/libomptarget/DeviceRTL/src/State.cpp

Lines changed: 22 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
//===----------------------------------------------------------------------===//
1010

1111
#include "State.h"
12+
#include "Allocator.h"
13+
#include "Configuration.h"
1214
#include "Debug.h"
1315
#include "Environment.h"
1416
#include "Interface.h"
@@ -25,48 +27,36 @@ using namespace ompx;
2527
///
2628
///{
2729

28-
/// Add worst-case padding so that future allocations are properly aligned.
29-
/// FIXME: The stack shouldn't require worst-case padding. Alignment needs to be
30-
/// passed in as an argument and the stack rewritten to support it.
31-
constexpr const uint32_t Alignment = 16;
32-
3330
/// External symbol to access dynamic shared memory.
34-
[[gnu::aligned(Alignment)]] extern unsigned char DynamicSharedBuffer[];
31+
[[gnu::aligned(
32+
allocator::ALIGNMENT)]] extern unsigned char DynamicSharedBuffer[];
3533
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
3634

3735
/// The kernel environment passed to the init method by the compiler.
3836
static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);
3937

38+
///}
39+
4040
namespace {
4141

4242
/// Fallback implementations are missing to trigger a link time error.
4343
/// Implementations for new devices, including the host, should go into a
4444
/// dedicated begin/end declare variant.
4545
///
4646
///{
47-
4847
extern "C" {
49-
[[gnu::weak, gnu::leaf]] void *malloc(uint64_t Size);
50-
[[gnu::weak, gnu::leaf]] void free(void *Ptr);
51-
}
48+
#ifdef __AMDGPU__
5249

53-
///}
50+
[[gnu::weak]] void *malloc(uint64_t Size) { return allocator::alloc(Size); }
51+
[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
5452

55-
/// AMDGCN implementations of the shuffle sync idiom.
56-
///
57-
///{
58-
#pragma omp begin declare variant match(device = {arch(amdgcn)})
53+
#else
5954

60-
extern "C" {
61-
void *malloc(uint64_t Size) {
62-
// TODO: Use some preallocated space for dynamic malloc.
63-
return nullptr;
64-
}
55+
[[gnu::weak, gnu::leaf]] void *malloc(uint64_t Size);
56+
[[gnu::weak, gnu::leaf]] void free(void *Ptr);
6557

66-
void free(void *Ptr) {}
58+
#endif
6759
}
68-
69-
#pragma omp end declare variant
7060
///}
7161

7262
/// A "smart" stack in shared memory.
@@ -95,7 +85,7 @@ struct SharedMemorySmartStackTy {
9585
uint32_t computeThreadStorageTotal() {
9686
uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
9787
return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock),
98-
Alignment);
88+
allocator::ALIGNMENT);
9989
}
10090

10191
/// Return the top address of the warp data stack, that is the first address
@@ -105,8 +95,10 @@ struct SharedMemorySmartStackTy {
10595
}
10696

10797
/// The actual storage, shared among all warps.
108-
[[gnu::aligned(Alignment)]] unsigned char Data[state::SharedScratchpadSize];
109-
[[gnu::aligned(Alignment)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
98+
[[gnu::aligned(
99+
allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize];
100+
[[gnu::aligned(
101+
allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
110102
};
111103

112104
static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
@@ -121,7 +113,9 @@ void SharedMemorySmartStackTy::init(bool IsSPMD) {
121113

122114
void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
123115
// First align the number of requested bytes.
124-
uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);
116+
/// FIXME: The stack shouldn't require worst-case padding. Alignment needs to
117+
/// be passed in as an argument and the stack rewritten to support it.
118+
uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT);
125119

126120
uint32_t StorageTotal = computeThreadStorageTotal();
127121

@@ -149,7 +143,7 @@ void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
149143
}
150144

151145
void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
152-
uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);
146+
uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT);
153147
if (utils::isSharedMemPtr(Ptr)) {
154148
int TId = mapping::getThreadIdInBlock();
155149
Usage[TId] -= AlignedBytes;

openmp/libomptarget/include/Environment.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,27 @@ struct DeviceEnvironmentTy {
4343
uint64_t HardwareParallelism;
4444
};
4545

46+
/// Descriptor of a device memory pool: a pointer and a size.
struct DeviceMemoryPoolTy {
47+
/// Pointer into the pool. The device bump allocator in this commit advances
/// this field atomically and returns its previous value as the allocation.
void *Ptr;
48+
/// Pool size; the device bump allocator adds it to an address to form its
/// upper bound (presumably a byte count -- confirm against the host setup).
uint64_t Size;
49+
};
50+
51+
/// Allocation statistics for a device memory pool.
///
/// The device bump allocator updates these counters atomically when the
/// AllocationTracker debug kind is enabled; combine() merges the statistics of
/// two trackers.
struct DeviceMemoryPoolTrackingTy {
  /// Number of recorded allocations.
  uint64_t NumAllocations;
  /// Sum of the sizes of all recorded allocations.
  uint64_t AllocationTotal;
  /// Smallest recorded allocation size.
  uint64_t AllocationMin;
  /// Largest recorded allocation size.
  uint64_t AllocationMax;

  /// Merge the statistics of \p Other into this tracker: the counts and totals
  /// are summed, the minimum becomes the smaller of the two minima and the
  /// maximum the larger of the two maxima.
  ///
  /// \p Other is only read, so it is taken by const reference; this also
  /// allows merging from const or temporary trackers.
  ///
  /// NOTE(review): merging into a zero-initialized tracker leaves
  /// AllocationMin at 0; presumably the owner seeds AllocationMin with a large
  /// value before use -- confirm.
  void combine(const DeviceMemoryPoolTrackingTy &Other) {
    NumAllocations += Other.NumAllocations;
    AllocationTotal += Other.AllocationTotal;
    if (Other.AllocationMin < AllocationMin)
      AllocationMin = Other.AllocationMin;
    if (Other.AllocationMax > AllocationMax)
      AllocationMax = Other.AllocationMax;
  }
};
66+
4667
// NOTE: Please don't change the order of those members as their indices are
4768
// used in the middle end. Always add the new data member at the end.
4869
// Different from KernelEnvironmentTy below, this structure contains members

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2529,10 +2529,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
25292529
return Plugin::success();
25302530
}
25312531
Error getDeviceHeapSize(uint64_t &Value) override {
2532-
Value = 0;
2532+
Value = DeviceMemoryPoolSize;
2533+
return Plugin::success();
2534+
}
2535+
/// Set the device heap size to \p Value: calls setupDeviceMemoryPool with
/// \p Value for every image in LoadedImages and, once all calls succeeded,
/// records \p Value in DeviceMemoryPoolSize. The first error is returned
/// unchanged.
/// NOTE(review): if a call fails mid-loop, the images handled before it have
/// already been set up with \p Value while DeviceMemoryPoolSize keeps its old
/// value -- confirm this partial state is acceptable.
Error setDeviceHeapSize(uint64_t Value) override {
2536+
for (DeviceImageTy *Image : LoadedImages)
2537+
if (auto Err = setupDeviceMemoryPool(Plugin::get(), *Image, Value))
2538+
return Err;
2539+
// Only remember the new size after every image was set up successfully.
DeviceMemoryPoolSize = Value;
25332540
return Plugin::success();
25342541
}
2535-
Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); }
25362542

25372543
/// AMDGPU-specific function to get device attributes.
25382544
template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) {
@@ -2625,6 +2631,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
26252631

26262632
/// Reference to the host device.
26272633
AMDHostDeviceTy &HostDevice;
2634+
2635+
/// The current size of the global device memory pool (managed by us).
2636+
uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */;
26282637
};
26292638

26302639
Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {

0 commit comments

Comments
 (0)