Skip to content

[OpenMP] Basic BumpAllocator for (AMD)GPUs #69806

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions openmp/docs/design/Runtimes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1465,3 +1465,4 @@ debugging features are supported.

* Enable debugging assertions in the device. ``0x01``
* Enable diagnosing common problems during offloading . ``0x4``
* Enable device malloc statistics (amdgpu only). ``0x8``
2 changes: 2 additions & 0 deletions openmp/libomptarget/DeviceRTL/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ endif()
list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES)

set(include_files
${include_directory}/Allocator.h
${include_directory}/Configuration.h
${include_directory}/Debug.h
${include_directory}/Interface.h
Expand All @@ -95,6 +96,7 @@ set(include_files
)

set(src_files
${source_directory}/Allocator.cpp
${source_directory}/Configuration.cpp
${source_directory}/Debug.cpp
${source_directory}/Kernel.cpp
Expand Down
44 changes: 44 additions & 0 deletions openmp/libomptarget/DeviceRTL/include/Allocator.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
//===-------- Allocator.h - OpenMP memory allocator interface ---- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//
//===----------------------------------------------------------------------===//

#ifndef OMPTARGET_ALLOCATOR_H
#define OMPTARGET_ALLOCATOR_H

#include "Types.h"

// Forward declaration.
struct KernelEnvironmentTy;

#pragma omp begin declare target device_type(nohost)

namespace ompx {

namespace allocator {

static uint64_t constexpr ALIGNMENT = 16;

/// Initialize the allocator according to \p KernelEnvironment
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment);

/// Allocate \p Size bytes.
[[gnu::alloc_size(1), gnu::assume_aligned(ALIGNMENT), gnu::malloc]] void *
alloc(uint64_t Size);

/// Free the allocation pointed to by \p Ptr.
void free(void *Ptr);

} // namespace allocator

} // namespace ompx

#pragma omp end declare target

#endif
80 changes: 80 additions & 0 deletions openmp/libomptarget/DeviceRTL/src/Allocator.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "Allocator.h"
#include "Configuration.h"
#include "Environment.h"
#include "Mapping.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace ompx;

#pragma omp begin declare target device_type(nohost)

[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility(
"protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
__omp_rtl_device_memory_pool_tracker;

/// Stateless bump allocator that uses the __omp_rtl_device_memory_pool
/// directly.
struct BumpAllocatorTy final {

void *alloc(uint64_t Size) {
Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT));

if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) {
atomic::add(&__omp_rtl_device_memory_pool_tracker.NumAllocations, 1,
atomic::seq_cst);
atomic::add(&__omp_rtl_device_memory_pool_tracker.AllocationTotal, Size,
atomic::seq_cst);
atomic::min(&__omp_rtl_device_memory_pool_tracker.AllocationMin, Size,
atomic::seq_cst);
atomic::max(&__omp_rtl_device_memory_pool_tracker.AllocationMax, Size,
atomic::seq_cst);
}

uint64_t *Data =
reinterpret_cast<uint64_t *>(&__omp_rtl_device_memory_pool.Ptr);
uint64_t End =
reinterpret_cast<uint64_t>(Data) + __omp_rtl_device_memory_pool.Size;

uint64_t OldData = atomic::add(Data, Size, atomic::seq_cst);
if (OldData + Size > End)
__builtin_trap();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not just return nullptr? Isn't that the expect failure mode for a malloc call?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This allocator is not a production allocator. Right now, we run into too many problems. And honestly, nobody checks for nullptr.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True, but I figured the inevitable dereference of the nullptr would be a good trap that's usually more easily connected with malloc.

Copy link
Contributor

@shiltian shiltian Oct 21, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Based on my experience it's not always easy to expose a dereference nullptr on a GPU, or to put another way, sometimes even dereference a nullptr will not trigger a trap. Given what we have right now in this allocator, it is better to just trap if we don't have enough memory.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess I'm just averse to traps because they currently deadlock on my machine. I still haven't tracked down the cause of that however.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True, but I figured the inevitable dereference of the nullptr would be a good trap that's usually more easily connected with malloc.

That is arguably not true, especially for "implicit"/"hidden" mallocs. People that run on AMD complain often about tests not running, or only running with O1/2/3 but don't connect it to the malloc. The problem with *nullptr is that it's UB. So we don't necessarily do it at all. I have seen many tests "pass" even though the smart stack was full but the fallback to malloc was removed and somehow writing some shared memory worked out. But that also leads to spurious fails. Let's crash loud and fast, not via silent corruption.


return reinterpret_cast<void *>(OldData);
}

void free(void *) {}
};

BumpAllocatorTy BumpAllocator;

/// allocator namespace implementation
///
///{

void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) {
// TODO: Check KernelEnvironment for an allocator choice as soon as we have
// more than one.
}

void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); }

void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); }

///}

#pragma omp end declare target
2 changes: 2 additions & 0 deletions openmp/libomptarget/DeviceRTL/src/Kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
//
//===----------------------------------------------------------------------===//

#include "Allocator.h"
#include "Debug.h"
#include "Environment.h"
#include "Interface.h"
Expand All @@ -30,6 +31,7 @@ static void inititializeRuntime(bool IsSPMD,
synchronize::init(IsSPMD);
mapping::init(IsSPMD);
state::init(IsSPMD, KernelEnvironment);
allocator::init(IsSPMD, KernelEnvironment);
}

/// Simple generic state machine for worker threads.
Expand Down
50 changes: 22 additions & 28 deletions openmp/libomptarget/DeviceRTL/src/State.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
//===----------------------------------------------------------------------===//

#include "State.h"
#include "Allocator.h"
#include "Configuration.h"
#include "Debug.h"
#include "Environment.h"
#include "Interface.h"
Expand All @@ -25,48 +27,36 @@ using namespace ompx;
///
///{

/// Add worst-case padding so that future allocations are properly aligned.
/// FIXME: The stack shouldn't require worst-case padding. Alignment needs to be
/// passed in as an argument and the stack rewritten to support it.
constexpr const uint32_t Alignment = 16;

/// External symbol to access dynamic shared memory.
[[gnu::aligned(Alignment)]] extern unsigned char DynamicSharedBuffer[];
[[gnu::aligned(
allocator::ALIGNMENT)]] extern unsigned char DynamicSharedBuffer[];
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)

/// The kernel environment passed to the init method by the compiler.
static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);

///}

namespace {

/// Fallback implementations are missing to trigger a link time error.
/// Implementations for new devices, including the host, should go into a
/// dedicated begin/end declare variant.
///
///{

extern "C" {
[[gnu::weak, gnu::leaf]] void *malloc(uint64_t Size);
[[gnu::weak, gnu::leaf]] void free(void *Ptr);
}
#ifdef __AMDGPU__

///}
[[gnu::weak]] void *malloc(uint64_t Size) { return allocator::alloc(Size); }
[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }

/// AMDGCN implementations of the shuffle sync idiom.
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})
#else

extern "C" {
void *malloc(uint64_t Size) {
// TODO: Use some preallocated space for dynamic malloc.
return nullptr;
}
[[gnu::weak, gnu::leaf]] void *malloc(uint64_t Size);
[[gnu::weak, gnu::leaf]] void free(void *Ptr);

void free(void *Ptr) {}
#endif
}

#pragma omp end declare variant
///}

/// A "smart" stack in shared memory.
Expand Down Expand Up @@ -95,7 +85,7 @@ struct SharedMemorySmartStackTy {
uint32_t computeThreadStorageTotal() {
uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock),
Alignment);
allocator::ALIGNMENT);
}

/// Return the top address of the warp data stack, that is the first address
Expand All @@ -105,8 +95,10 @@ struct SharedMemorySmartStackTy {
}

/// The actual storage, shared among all warps.
[[gnu::aligned(Alignment)]] unsigned char Data[state::SharedScratchpadSize];
[[gnu::aligned(Alignment)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
[[gnu::aligned(
allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize];
[[gnu::aligned(
allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
};

static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
Expand All @@ -121,7 +113,9 @@ void SharedMemorySmartStackTy::init(bool IsSPMD) {

void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
// First align the number of requested bytes.
uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);
/// FIXME: The stack shouldn't require worst-case padding. Alignment needs to
/// be passed in as an argument and the stack rewritten to support it.
uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT);

uint32_t StorageTotal = computeThreadStorageTotal();

Expand Down Expand Up @@ -149,7 +143,7 @@ void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
}

void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);
uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT);
if (utils::isSharedMemPtr(Ptr)) {
int TId = mapping::getThreadIdInBlock();
Usage[TId] -= AlignedBytes;
Expand Down
21 changes: 21 additions & 0 deletions openmp/libomptarget/include/Environment.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,27 @@ struct DeviceEnvironmentTy {
uint64_t HardwareParallelism;
};

struct DeviceMemoryPoolTy {
void *Ptr;
uint64_t Size;
};

struct DeviceMemoryPoolTrackingTy {
uint64_t NumAllocations;
uint64_t AllocationTotal;
uint64_t AllocationMin;
uint64_t AllocationMax;

void combine(DeviceMemoryPoolTrackingTy &Other) {
NumAllocations += Other.NumAllocations;
AllocationTotal += Other.AllocationTotal;
AllocationMin = AllocationMin > Other.AllocationMin ? Other.AllocationMin
: AllocationMin;
AllocationMax = AllocationMax < Other.AllocationMax ? Other.AllocationMax
: AllocationMax;
}
};

// NOTE: Please don't change the order of those members as their indices are
// used in the middle end. Always add the new data member at the end.
// Different from KernelEnvironmentTy below, this structure contains members
Expand Down
13 changes: 11 additions & 2 deletions openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2529,10 +2529,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Plugin::success();
}
Error getDeviceHeapSize(uint64_t &Value) override {
Value = 0;
Value = DeviceMemoryPoolSize;
return Plugin::success();
}
Error setDeviceHeapSize(uint64_t Value) override {
for (DeviceImageTy *Image : LoadedImages)
if (auto Err = setupDeviceMemoryPool(Plugin::get(), *Image, Value))
return Err;
DeviceMemoryPoolSize = Value;
return Plugin::success();
}
Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); }

/// AMDGPU-specific function to get device attributes.
template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) {
Expand Down Expand Up @@ -2625,6 +2631,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {

/// Reference to the host device.
AMDHostDeviceTy &HostDevice;

/// The current size of the global device memory pool (managed by us).
uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */;
};

Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
Expand Down
Loading