Skip to content

Commit d841fbf

Browse files
jdoerfert authored and ronlieb committed
[OpenMP] Basic BumpAllocator for (AMD)GPUs (llvm#69806)
The patch contains a basic BumpAllocator for (AMD)GPUs to allow us to run more tests. The allocator implements `malloc`, both internally and externally, while we continue to default to the NVIDIA `malloc` when we target NVIDIA GPUs. Once we have smarter or customizable allocators we should reconsider this choice; for now, this allocator is better than none. It traps if it is out of memory, making it easy to debug. Heap size is configured via `LIBOMPTARGET_HEAP_SIZE` and defaults to 512MB. It allows tracking allocation statistics via `LIBOMPTARGET_DEVICE_RTL_DEBUG=8` (together with `-fopenmp-target-debug=8`). Two tests were added, and one was enabled. This is the next step towards fixing llvm#66708. Change-Id: I181cdca714994b285c0cd1d16dd3546809cc5dd2
1 parent 6198658 commit d841fbf

File tree

12 files changed

+172
-42
lines changed

12 files changed

+172
-42
lines changed

openmp/docs/design/Runtimes.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1456,3 +1456,4 @@ debugging features are supported.
14561456

14571457
* Enable debugging assertions in the device. ``0x01``
14581458
* Enable diagnosing common problems during offloading. ``0x4``
1459+
* Enable device malloc statistics (amdgpu only). ``0x8``

openmp/libomptarget/DeviceRTL/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ endif()
9191
list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES)
9292

9393
set(include_files
94+
${include_directory}/Allocator.h
9495
${include_directory}/Configuration.h
9596
${include_directory}/Debug.h
9697
${include_directory}/Interface.h
@@ -105,6 +106,7 @@ set(include_files
105106
)
106107

107108
set(src_files
109+
${source_directory}/Allocator.cpp
108110
${source_directory}/Configuration.cpp
109111
${source_directory}/Debug.cpp
110112
${source_directory}/Kernel.cpp

openmp/libomptarget/DeviceRTL/src/Kernel.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
//
1111
//===----------------------------------------------------------------------===//
1212

13+
#include "Allocator.h"
1314
#include "Debug.h"
1415
#include "Environment.h"
1516
#include "Interface.h"
@@ -32,6 +33,8 @@ static void inititializeRuntime(bool IsSPMD,
3233
state::init(IsSPMD, KernelEnvironment);
3334
if (__kmpc_get_hardware_thread_id_in_block() == 0)
3435
__init_ThreadDSTPtrPtr();
36+
37+
allocator::init(IsSPMD, KernelEnvironment);
3538
}
3639

3740
/// Simple generic state machine for worker threads.

openmp/libomptarget/DeviceRTL/src/State.cpp

Lines changed: 25 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
//===----------------------------------------------------------------------===//
1010

1111
#include "State.h"
12+
#include "Allocator.h"
13+
#include "Configuration.h"
1214
#include "Debug.h"
1315
#include "Environment.h"
1416
#include "Interface.h"
@@ -31,18 +33,16 @@ void internal_free(void *Ptr);
3133
///
3234
///{
3335

34-
/// Add worst-case padding so that future allocations are properly aligned.
35-
/// FIXME: The stack shouldn't require worst-case padding. Alignment needs to be
36-
/// passed in as an argument and the stack rewritten to support it.
37-
constexpr const uint32_t Alignment = 16;
38-
3936
/// External symbol to access dynamic shared memory.
40-
[[gnu::aligned(Alignment)]] extern unsigned char DynamicSharedBuffer[];
37+
[[gnu::aligned(
38+
allocator::ALIGNMENT)]] extern unsigned char DynamicSharedBuffer[];
4139
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
4240

4341
/// The kernel environment passed to the init method by the compiler.
4442
static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);
4543

44+
///}
45+
4646
namespace {
4747

4848
/// Malloc/Free API implementation
@@ -61,8 +61,18 @@ extern "C" size_t __ockl_get_local_size(uint32_t dim);
6161
extern "C" size_t __ockl_get_num_groups(uint32_t dim);
6262

6363
/// C-linkage entry points: launch-dimension queries and the device heap
/// interface (`malloc` / `free`).
extern "C" {
#ifdef __AMDGPU__

/// Forward to __ockl_get_local_size for dimension \p dim.
size_t external_get_local_size(uint32_t dim) {
  return __ockl_get_local_size(dim);
}

/// Forward to __ockl_get_num_groups for dimension \p dim.
size_t external_get_num_groups(uint32_t dim) {
  return __ockl_get_num_groups(dim);
}

/// Weak `malloc` definition that forwards the request to allocator::alloc.
[[gnu::weak]] void *malloc(uint64_t Size) {
  return allocator::alloc(Size);
}

/// Weak `free` definition that forwards the pointer to allocator::free.
[[gnu::weak]] void free(void *Ptr) {
  allocator::free(Ptr);
}

#else

// Without __AMDGPU__, `malloc` and `free` are only declared here (weak, leaf);
// this block does not provide a definition for them.
[[gnu::weak, gnu::leaf]] void *malloc(uint64_t Size);
[[gnu::weak, gnu::leaf]] void free(void *Ptr);

#endif
} // extern "C"
6777

6878
#pragma omp begin declare variant match(device = {arch(amdgcn)})
@@ -76,19 +86,6 @@ void internal_free(void *Ptr) { __ockl_dm_dealloc((uint64_t)Ptr); }
7686
}
7787
#pragma omp end declare variant
7888
///}
79-
80-
extern "C" {
81-
#ifdef __AMDGCN__
82-
void *malloc(uint64_t Size) { return internal_malloc(Size); }
83-
void free(void *Ptr) { internal_free(Ptr); }
84-
size_t external_get_local_size(uint32_t dim) { return __ockl_get_local_size(dim);}
85-
size_t external_get_num_groups(uint32_t dim) { return __ockl_get_num_groups(dim);}
86-
#else
87-
__attribute__((leaf)) void *malloc(uint64_t Size);
88-
__attribute__((leaf)) void free(void *Ptr);
89-
#endif
90-
} // extern "C"
91-
9289
/// NVPTX implementations of internal mallocs
9390
///
9491
///{
@@ -127,7 +124,7 @@ struct SharedMemorySmartStackTy {
127124
uint32_t computeThreadStorageTotal() {
128125
uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
129126
return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock),
130-
Alignment);
127+
allocator::ALIGNMENT);
131128
}
132129

133130
/// Return the top address of the warp data stack, that is the first address
@@ -137,8 +134,10 @@ struct SharedMemorySmartStackTy {
137134
}
138135

139136
/// The actual storage, shared among all warps.
140-
[[gnu::aligned(Alignment)]] unsigned char Data[state::SharedScratchpadSize];
141-
[[gnu::aligned(Alignment)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
137+
[[gnu::aligned(
138+
allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize];
139+
[[gnu::aligned(
140+
allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
142141
};
143142

144143
static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
@@ -153,7 +152,9 @@ void SharedMemorySmartStackTy::init(bool IsSPMD) {
153152

154153
void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
155154
// First align the number of requested bytes.
156-
uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);
155+
/// FIXME: The stack shouldn't require worst-case padding. Alignment needs to
156+
/// be passed in as an argument and the stack rewritten to support it.
157+
uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT);
157158

158159
uint32_t StorageTotal = computeThreadStorageTotal();
159160

@@ -181,7 +182,7 @@ void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
181182
}
182183

183184
void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
184-
uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);
185+
uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT);
185186
if (utils::isSharedMemPtr(Ptr)) {
186187
int TId = mapping::getThreadIdInBlock();
187188
Usage[TId] -= AlignedBytes;

openmp/libomptarget/include/Environment.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,27 @@ struct DeviceEnvironmentTy {
4343
uint64_t HardwareParallelism;
4444
};
4545

46+
/// Descriptor of a device memory pool: where it starts and how large it is.
struct DeviceMemoryPoolTy {
  /// Start address of the pool; null when no pool is available.
  void *Ptr;

  /// Size of the pool. NOTE(review): presumably in bytes -- confirm.
  uint64_t Size;
};
50+
51+
/// Allocation statistics of a device memory pool.
struct DeviceMemoryPoolTrackingTy {
  /// Number of allocations that were recorded.
  uint64_t NumAllocations;
  /// Sum of the sizes of all recorded allocations.
  uint64_t AllocationTotal;
  /// Smallest recorded allocation size.
  uint64_t AllocationMin;
  /// Largest recorded allocation size.
  uint64_t AllocationMax;

  /// Merge the statistics of \p Other into this object: the counters are
  /// summed and the minimum / maximum are widened to cover both objects.
  ///
  /// \p Other is only read, so it is taken by const reference; this also
  /// allows combining with const objects and temporaries.
  void combine(const DeviceMemoryPoolTrackingTy &Other) {
    NumAllocations += Other.NumAllocations;
    AllocationTotal += Other.AllocationTotal;
    if (Other.AllocationMin < AllocationMin)
      AllocationMin = Other.AllocationMin;
    if (Other.AllocationMax > AllocationMax)
      AllocationMax = Other.AllocationMax;
  }
};
66+
4667
// NOTE: Please don't change the order of those members as their indices are
4768
// used in the middle end. Always add the new data member at the end.
4869
// Different from KernelEnvironmentTy below, this structure contains members

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3176,10 +3176,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
31763176
return Plugin::success();
31773177
}
31783178
/// Report the current size of the device memory pool through \p Value.
Error getDeviceHeapSize(uint64_t &Value) override {
  Value = DeviceMemoryPoolSize;
  return Plugin::success();
}
3182+
/// Change the device heap size to \p Value.
///
/// The device memory pool is set up again with the requested size for every
/// loaded image. The new size is recorded only after all images were handled
/// without error; the first error aborts the update and is returned.
///
/// NOTE(review): setupDeviceMemoryPool is invoked once per image -- confirm
/// that rebuilding the pool for one image does not invalidate the pool that
/// was handed to a previously processed image.
Error setDeviceHeapSize(uint64_t Value) override {
  for (DeviceImageTy *LoadedImage : LoadedImages) {
    auto SetupErr = setupDeviceMemoryPool(Plugin::get(), *LoadedImage, Value);
    if (SetupErr)
      return SetupErr;
  }

  DeviceMemoryPoolSize = Value;
  return Plugin::success();
}
3182-
Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); }
31833189

31843190
/// AMDGPU-specific function to get device attributes.
31853191
template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) {
@@ -3340,6 +3346,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
33403346

33413347
/// Pointer to the preallocated device memory pool
33423348
void *PreAllocatedDeviceMemoryPool;
3349+
3350+
/// The current size of the global device memory pool (managed by us).
3351+
uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */;
33433352
};
33443353

33453354
Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {

openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp

Lines changed: 83 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -608,6 +608,35 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
608608
}
609609

610610
Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
611+
612+
if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
613+
GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
614+
for (auto *Image : LoadedImages) {
615+
DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0};
616+
GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
617+
sizeof(DeviceMemoryPoolTrackingTy),
618+
&ImageDeviceMemoryPoolTracking);
619+
if (auto Err =
620+
GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal))
621+
return Err;
622+
DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking);
623+
}
624+
625+
// TODO: Write this by default into a file.
626+
printf("\n\n|-----------------------\n"
627+
"| Device memory tracker:\n"
628+
"|-----------------------\n"
629+
"| #Allocations: %lu\n"
630+
"| Byes allocated: %lu\n"
631+
"| Minimal allocation: %lu\n"
632+
"| Maximal allocation: %lu\n"
633+
"|-----------------------\n\n\n",
634+
DeviceMemoryPoolTracking.NumAllocations,
635+
DeviceMemoryPoolTracking.AllocationTotal,
636+
DeviceMemoryPoolTracking.AllocationMin,
637+
DeviceMemoryPoolTracking.AllocationMax);
638+
}
639+
611640
// Delete the memory manager before deinitializing the device. Otherwise,
612641
// we may delete device allocations after the device is deinitialized.
613642
if (MemoryManager)
@@ -668,6 +697,17 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
668697
if (auto Err = setupDeviceEnvironment(Plugin, *Image))
669698
return std::move(Err);
670699

700+
// Setup the global device memory pool if needed.
701+
if (shouldSetupDeviceMemoryPool()) {
702+
uint64_t HeapSize;
703+
auto SizeOrErr = getDeviceHeapSize(HeapSize);
704+
if (SizeOrErr) {
705+
REPORT("No global device memory pool due to error: %s\n",
706+
toString(std::move(SizeOrErr)).data());
707+
} else if (auto Err = setupDeviceMemoryPool(Plugin, *Image, HeapSize))
708+
return std::move(Err);
709+
}
710+
671711
// Register all offload entries of the image.
672712
if (auto Err = registerOffloadEntries(*Image))
673713
return std::move(Err);
@@ -733,6 +773,45 @@ Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
733773
return Plugin::success();
734774
}
735775

776+
/// (Re)create the global device memory pool with \p PoolSize and publish it to
/// \p Image.
///
/// An existing pool is released first. A failed allocation is reported but
/// not returned as an error; in that case the pool is published as
/// {nullptr, 0}. Afterwards the tracking data and the pool descriptor are
/// written to the `__omp_rtl_device_memory_pool_tracker` and
/// `__omp_rtl_device_memory_pool` globals of \p Image.
///
/// \returns an error if releasing the old pool or writing either global fails.
Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
                                             DeviceImageTy &Image,
                                             uint64_t PoolSize) {
  // Release the previously allocated pool before creating a new one.
  if (DeviceMemoryPool.Ptr)
    if (auto DeleteErr = dataDelete(DeviceMemoryPool.Ptr,
                                    TargetAllocTy::TARGET_ALLOC_DEVICE))
      return DeleteErr;

  // Allocate the new pool; on failure fall back to an empty pool descriptor.
  DeviceMemoryPool.Size = PoolSize;
  auto PoolOrErr = dataAlloc(PoolSize, /*HostPtr=*/nullptr,
                             TargetAllocTy::TARGET_ALLOC_DEVICE);
  if (!PoolOrErr) {
    auto AllocErr = PoolOrErr.takeError();
    REPORT("Failure to allocate device memory for global memory pool: %s\n",
           toString(std::move(AllocErr)).data());
    DeviceMemoryPool.Ptr = nullptr;
    DeviceMemoryPool.Size = 0;
  } else {
    DeviceMemoryPool.Ptr = *PoolOrErr;
  }

  // Describe the tracking global and write the tracking data to the device.
  GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
                         sizeof(DeviceMemoryPoolTrackingTy),
                         &DeviceMemoryPoolTracking);
  GenericGlobalHandlerTy &GlobalHandler = Plugin.getGlobalHandler();
  if (auto WriteErr =
          GlobalHandler.writeGlobalToDevice(*this, Image, TrackerGlobal))
    return WriteErr;

  // Describe the memory pool global and write the pool descriptor to the
  // device.
  GlobalTy PoolGlobal("__omp_rtl_device_memory_pool",
                      sizeof(DeviceMemoryPoolTy), &DeviceMemoryPool);
  return GlobalHandler.writeGlobalToDevice(*this, Image, PoolGlobal);
}
814+
736815
Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
737816
DeviceImageTy &Image) {
738817
// The plugin either does not need an RPC server or it is unavailable.
@@ -1401,10 +1480,6 @@ Error GenericPluginTy::init() {
14011480
}
14021481

14031482
Error GenericPluginTy::deinit() {
1404-
// There is no global handler if no device is available.
1405-
if (GlobalHandler)
1406-
delete GlobalHandler;
1407-
14081483
// Deinitialize all active devices.
14091484
for (int32_t DeviceId = 0; DeviceId < NumDevices; ++DeviceId) {
14101485
if (Devices[DeviceId]) {
@@ -1414,6 +1489,10 @@ Error GenericPluginTy::deinit() {
14141489
assert(!Devices[DeviceId] && "Device was not deinitialized");
14151490
}
14161491

1492+
// There is no global handler if no device is available.
1493+
if (GlobalHandler)
1494+
delete GlobalHandler;
1495+
14171496
#if RPC_FIXME
14181497
if (RPCServer)
14191498
delete RPCServer;

openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -660,6 +660,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
660660
/// this behavior by overriding the shouldSetupDeviceEnvironment function.
661661
Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image);
662662

663+
/// Setup the global device memory pool, if the plugin requires one.
664+
Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image,
665+
uint64_t PoolSize);
666+
663667
// Setup the RPC server for this device if needed. This may not run on some
664668
// plugins like the CPU targets. By default, it will not be executed so it is
665669
// up to the target to override this using the shouldSetupRPCServer function.
@@ -893,6 +897,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
893897
/// setupDeviceEnvironment() function.
894898
virtual bool shouldSetupDeviceEnvironment() const { return true; }
895899

900+
/// Indicate whether the device should setup the global device memory pool. If
901+
/// false is return the value on the device will be uninitialized.
902+
virtual bool shouldSetupDeviceMemoryPool() const { return true; }
903+
896904
/// Indicate whether or not the device should setup the RPC server. This is
897905
/// only necessary for unhosted targets like the GPU.
898906
virtual bool shouldSetupRPCServer() const { return false; }
@@ -954,10 +962,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
954962
RPCHandleTy *RPCHandle;
955963

956964
private:
957-
/// Return the kernel environment object for kernel \p Name.
958-
Expected<KernelEnvironmentTy>
959-
getKernelEnvironmentForKernel(StringRef Name, DeviceImageTy &Image);
960-
961965
#ifdef OMPT_SUPPORT
962966
/// OMPT callback functions
963967
#define defineOmptCallback(Name, Type, Code) Name##_t Name##_fn = nullptr;
@@ -972,6 +976,13 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
972976
/// Internal representation for OMPT device (initialize & finalize)
973977
std::atomic<bool> OmptInitialized;
974978
#endif
979+
980+
/// Return the kernel environment object for kernel \p Name.
981+
Expected<KernelEnvironmentTy>
982+
getKernelEnvironmentForKernel(StringRef Name, DeviceImageTy &Image);
983+
984+
DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
985+
DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
975986
};
976987

977988
/// Class implementing common functionalities of offload plugins. Each plugin

openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -847,6 +847,11 @@ struct CUDADeviceTy : public GenericDeviceTy {
847847
return Plugin::success();
848848
}
849849

850+
/// Do not set up the global device memory pool for this device; the CUDA
/// malloc is used for now.
bool shouldSetupDeviceMemoryPool() const override { return false; }
854+
850855
/// Getters and setters for stack and heap sizes.
851856
Error getDeviceStackSize(uint64_t &Value) override {
852857
return getCtxLimit(CU_LIMIT_STACK_SIZE, Value);

openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -322,8 +322,9 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
322322
return Plugin::success();
323323
}
324324

325-
/// This plugin should not setup the device environment.
325+
/// This plugin should not setup the device environment or memory pool.
326326
virtual bool shouldSetupDeviceEnvironment() const override { return false; };
327+
virtual bool shouldSetupDeviceMemoryPool() const override { return false; };
327328

328329
/// Getters and setters for stack size and heap size not relevant.
329330
Error getDeviceStackSize(uint64_t &Value) override {

0 commit comments

Comments
 (0)