Skip to content

Commit 8ca8a22

Browse files
committed
Revert "[OpenMP] Basic BumpAllocator for (AMD)GPUs (llvm#69806)"
This reverts commit d841fbf. Change-Id: I91980da09a1af7fb8cbb1c4fc8d28c1c768d5f5f
1 parent 2a380c7 commit 8ca8a22

File tree

12 files changed

+42
-172
lines changed

12 files changed

+42
-172
lines changed

openmp/docs/design/Runtimes.rst

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1456,4 +1456,3 @@ debugging features are supported.
14561456

14571457
* Enable debugging assertions in the device. ``0x01``
14581458
* Enable diagnosing common problems during offloading . ``0x4``
1459-
* Enable device malloc statistics (amdgpu only). ``0x8``

openmp/libomptarget/DeviceRTL/CMakeLists.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,6 @@ endif()
9191
list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES)
9292

9393
set(include_files
94-
${include_directory}/Allocator.h
9594
${include_directory}/Configuration.h
9695
${include_directory}/Debug.h
9796
${include_directory}/Interface.h
@@ -106,7 +105,6 @@ set(include_files
106105
)
107106

108107
set(src_files
109-
${source_directory}/Allocator.cpp
110108
${source_directory}/Configuration.cpp
111109
${source_directory}/Debug.cpp
112110
${source_directory}/Kernel.cpp

openmp/libomptarget/DeviceRTL/src/Kernel.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
//
1111
//===----------------------------------------------------------------------===//
1212

13-
#include "Allocator.h"
1413
#include "Debug.h"
1514
#include "Environment.h"
1615
#include "Interface.h"
@@ -33,8 +32,6 @@ static void inititializeRuntime(bool IsSPMD,
3332
state::init(IsSPMD, KernelEnvironment);
3433
if (__kmpc_get_hardware_thread_id_in_block() == 0)
3534
__init_ThreadDSTPtrPtr();
36-
37-
allocator::init(IsSPMD, KernelEnvironment);
3835
}
3936

4037
/// Simple generic state machine for worker threads.

openmp/libomptarget/DeviceRTL/src/State.cpp

Lines changed: 24 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,6 @@
99
//===----------------------------------------------------------------------===//
1010

1111
#include "State.h"
12-
#include "Allocator.h"
13-
#include "Configuration.h"
1412
#include "Debug.h"
1513
#include "Environment.h"
1614
#include "Interface.h"
@@ -33,16 +31,18 @@ void internal_free(void *Ptr);
3331
///
3432
///{
3533

34+
/// Add worst-case padding so that future allocations are properly aligned.
35+
/// FIXME: The stack shouldn't require worst-case padding. Alignment needs to be
36+
/// passed in as an argument and the stack rewritten to support it.
37+
constexpr const uint32_t Alignment = 16;
38+
3639
/// External symbol to access dynamic shared memory.
37-
[[gnu::aligned(
38-
allocator::ALIGNMENT)]] extern unsigned char DynamicSharedBuffer[];
40+
[[gnu::aligned(Alignment)]] extern unsigned char DynamicSharedBuffer[];
3941
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
4042

4143
/// The kernel environment passed to the init method by the compiler.
4244
static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);
4345

44-
///}
45-
4646
namespace {
4747

4848
/// Malloc/Free API implementation
@@ -61,18 +61,8 @@ extern "C" size_t __ockl_get_local_size(uint32_t dim);
6161
extern "C" size_t __ockl_get_num_groups(uint32_t dim);
6262

6363
extern "C" {
64-
#ifdef __AMDGPU__
65-
size_t external_get_local_size(uint32_t dim) { return __ockl_get_local_size(dim);}
66-
size_t external_get_num_groups(uint32_t dim) { return __ockl_get_num_groups(dim);}
67-
[[gnu::weak]] void *malloc(uint64_t Size) { return allocator::alloc(Size); }
68-
[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
69-
70-
#else
71-
7264
[[gnu::weak, gnu::leaf]] void *malloc(uint64_t Size);
7365
[[gnu::weak, gnu::leaf]] void free(void *Ptr);
74-
75-
#endif
7666
}
7767

7868
#pragma omp begin declare variant match(device = {arch(amdgcn)})
@@ -86,6 +76,19 @@ void internal_free(void *Ptr) { __ockl_dm_dealloc((uint64_t)Ptr); }
8676
}
8777
#pragma omp end declare variant
8878
///}
79+
80+
extern "C" {
81+
#ifdef __AMDGCN__
82+
void *malloc(uint64_t Size) { return internal_malloc(Size); }
83+
void free(void *Ptr) { internal_free(Ptr); }
84+
size_t external_get_local_size(uint32_t dim) { return __ockl_get_local_size(dim);}
85+
size_t external_get_num_groups(uint32_t dim) { return __ockl_get_num_groups(dim);}
86+
#else
87+
__attribute__((leaf)) void *malloc(uint64_t Size);
88+
__attribute__((leaf)) void free(void *Ptr);
89+
#endif
90+
} // extern "C"
91+
8992
/// NVPTX implementations of internal mallocs
9093
///
9194
///{
@@ -124,7 +127,7 @@ struct SharedMemorySmartStackTy {
124127
uint32_t computeThreadStorageTotal() {
125128
uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock();
126129
return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock),
127-
allocator::ALIGNMENT);
130+
Alignment);
128131
}
129132

130133
/// Return the top address of the warp data stack, that is the first address
@@ -134,10 +137,8 @@ struct SharedMemorySmartStackTy {
134137
}
135138

136139
/// The actual storage, shared among all warps.
137-
[[gnu::aligned(
138-
allocator::ALIGNMENT)]] unsigned char Data[state::SharedScratchpadSize];
139-
[[gnu::aligned(
140-
allocator::ALIGNMENT)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
140+
[[gnu::aligned(Alignment)]] unsigned char Data[state::SharedScratchpadSize];
141+
[[gnu::aligned(Alignment)]] unsigned char Usage[mapping::MaxThreadsPerTeam];
141142
};
142143

143144
static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
@@ -152,9 +153,7 @@ void SharedMemorySmartStackTy::init(bool IsSPMD) {
152153

153154
void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
154155
// First align the number of requested bytes.
155-
/// FIXME: The stack shouldn't require worst-case padding. Alignment needs to
156-
/// be passed in as an argument and the stack rewritten to support it.
157-
uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT);
156+
uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);
158157

159158
uint32_t StorageTotal = computeThreadStorageTotal();
160159

@@ -182,7 +181,7 @@ void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
182181
}
183182

184183
void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
185-
uint64_t AlignedBytes = utils::align_up(Bytes, allocator::ALIGNMENT);
184+
uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);
186185
if (utils::isSharedMemPtr(Ptr)) {
187186
int TId = mapping::getThreadIdInBlock();
188187
Usage[TId] -= AlignedBytes;

openmp/libomptarget/include/Environment.h

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -43,27 +43,6 @@ struct DeviceEnvironmentTy {
4343
uint64_t HardwareParallelism;
4444
};
4545

46-
struct DeviceMemoryPoolTy {
47-
void *Ptr;
48-
uint64_t Size;
49-
};
50-
51-
struct DeviceMemoryPoolTrackingTy {
52-
uint64_t NumAllocations;
53-
uint64_t AllocationTotal;
54-
uint64_t AllocationMin;
55-
uint64_t AllocationMax;
56-
57-
void combine(DeviceMemoryPoolTrackingTy &Other) {
58-
NumAllocations += Other.NumAllocations;
59-
AllocationTotal += Other.AllocationTotal;
60-
AllocationMin = AllocationMin > Other.AllocationMin ? Other.AllocationMin
61-
: AllocationMin;
62-
AllocationMax = AllocationMax < Other.AllocationMax ? Other.AllocationMax
63-
: AllocationMax;
64-
}
65-
};
66-
6746
// NOTE: Please don't change the order of those members as their indices are
6847
// used in the middle end. Always add the new data member at the end.
6948
// Different from KernelEnvironmentTy below, this structure contains members

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3217,16 +3217,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
32173217
return Plugin::success();
32183218
}
32193219
Error getDeviceHeapSize(uint64_t &Value) override {
3220-
Value = DeviceMemoryPoolSize;
3221-
return Plugin::success();
3222-
}
3223-
Error setDeviceHeapSize(uint64_t Value) override {
3224-
for (DeviceImageTy *Image : LoadedImages)
3225-
if (auto Err = setupDeviceMemoryPool(Plugin::get(), *Image, Value))
3226-
return Err;
3227-
DeviceMemoryPoolSize = Value;
3220+
Value = 0;
32283221
return Plugin::success();
32293222
}
3223+
Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); }
32303224

32313225
/// AMDGPU-specific function to get device attributes.
32323226
template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) {
@@ -3387,9 +3381,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
33873381

33883382
/// Pointer to the preallocated device memory pool
33893383
void *PreAllocatedDeviceMemoryPool;
3390-
3391-
/// The current size of the global device memory pool (managed by us).
3392-
uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */;
33933384
};
33943385

33953386
Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {

openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp

Lines changed: 4 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -608,35 +608,6 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
608608
}
609609

610610
Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
611-
612-
if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
613-
GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
614-
for (auto *Image : LoadedImages) {
615-
DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0};
616-
GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
617-
sizeof(DeviceMemoryPoolTrackingTy),
618-
&ImageDeviceMemoryPoolTracking);
619-
if (auto Err =
620-
GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal))
621-
return Err;
622-
DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking);
623-
}
624-
625-
// TODO: Write this by default into a file.
626-
printf("\n\n|-----------------------\n"
627-
"| Device memory tracker:\n"
628-
"|-----------------------\n"
629-
"| #Allocations: %lu\n"
630-
"| Byes allocated: %lu\n"
631-
"| Minimal allocation: %lu\n"
632-
"| Maximal allocation: %lu\n"
633-
"|-----------------------\n\n\n",
634-
DeviceMemoryPoolTracking.NumAllocations,
635-
DeviceMemoryPoolTracking.AllocationTotal,
636-
DeviceMemoryPoolTracking.AllocationMin,
637-
DeviceMemoryPoolTracking.AllocationMax);
638-
}
639-
640611
// Delete the memory manager before deinitializing the device. Otherwise,
641612
// we may delete device allocations after the device is deinitialized.
642613
if (MemoryManager)
@@ -697,17 +668,6 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
697668
if (auto Err = setupDeviceEnvironment(Plugin, *Image))
698669
return std::move(Err);
699670

700-
// Setup the global device memory pool if needed.
701-
if (shouldSetupDeviceMemoryPool()) {
702-
uint64_t HeapSize;
703-
auto SizeOrErr = getDeviceHeapSize(HeapSize);
704-
if (SizeOrErr) {
705-
REPORT("No global device memory pool due to error: %s\n",
706-
toString(std::move(SizeOrErr)).data());
707-
} else if (auto Err = setupDeviceMemoryPool(Plugin, *Image, HeapSize))
708-
return std::move(Err);
709-
}
710-
711671
// Register all offload entries of the image.
712672
if (auto Err = registerOffloadEntries(*Image))
713673
return std::move(Err);
@@ -773,45 +733,6 @@ Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
773733
return Plugin::success();
774734
}
775735

776-
Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
777-
DeviceImageTy &Image,
778-
uint64_t PoolSize) {
779-
// Free the old pool, if any.
780-
if (DeviceMemoryPool.Ptr) {
781-
if (auto Err = dataDelete(DeviceMemoryPool.Ptr,
782-
TargetAllocTy::TARGET_ALLOC_DEVICE))
783-
return Err;
784-
}
785-
786-
DeviceMemoryPool.Size = PoolSize;
787-
auto AllocOrErr = dataAlloc(PoolSize, /*HostPtr=*/nullptr,
788-
TargetAllocTy::TARGET_ALLOC_DEVICE);
789-
if (AllocOrErr) {
790-
DeviceMemoryPool.Ptr = *AllocOrErr;
791-
} else {
792-
auto Err = AllocOrErr.takeError();
793-
REPORT("Failure to allocate device memory for global memory pool: %s\n",
794-
toString(std::move(Err)).data());
795-
DeviceMemoryPool.Ptr = nullptr;
796-
DeviceMemoryPool.Size = 0;
797-
}
798-
799-
// Create the metainfo of the device environment global.
800-
GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
801-
sizeof(DeviceMemoryPoolTrackingTy),
802-
&DeviceMemoryPoolTracking);
803-
GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
804-
if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal))
805-
return Err;
806-
807-
// Create the metainfo of the device environment global.
808-
GlobalTy DevEnvGlobal("__omp_rtl_device_memory_pool",
809-
sizeof(DeviceMemoryPoolTy), &DeviceMemoryPool);
810-
811-
// Write device environment values to the device.
812-
return GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal);
813-
}
814-
815736
Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
816737
DeviceImageTy &Image) {
817738
// The plugin either does not need an RPC server or it is unavailible.
@@ -1480,6 +1401,10 @@ Error GenericPluginTy::init() {
14801401
}
14811402

14821403
Error GenericPluginTy::deinit() {
1404+
// There is no global handler if no device is available.
1405+
if (GlobalHandler)
1406+
delete GlobalHandler;
1407+
14831408
// Deinitialize all active devices.
14841409
for (int32_t DeviceId = 0; DeviceId < NumDevices; ++DeviceId) {
14851410
if (Devices[DeviceId]) {
@@ -1489,10 +1414,6 @@ Error GenericPluginTy::deinit() {
14891414
assert(!Devices[DeviceId] && "Device was not deinitialized");
14901415
}
14911416

1492-
// There is no global handler if no device is available.
1493-
if (GlobalHandler)
1494-
delete GlobalHandler;
1495-
14961417
#if RPC_FIXME
14971418
if (RPCServer)
14981419
delete RPCServer;

openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -660,10 +660,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
660660
/// this behavior by overriding the shouldSetupDeviceEnvironment function.
661661
Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image);
662662

663-
/// Setup the global device memory pool, if the plugin requires one.
664-
Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image,
665-
uint64_t PoolSize);
666-
667663
// Setup the RPC server for this device if needed. This may not run on some
668664
// plugins like the CPU targets. By default, it will not be executed so it is
669665
// up to the target to override this using the shouldSetupRPCServer function.
@@ -897,10 +893,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
897893
/// setupDeviceEnvironment() function.
898894
virtual bool shouldSetupDeviceEnvironment() const { return true; }
899895

900-
/// Indicate whether the device should setup the global device memory pool. If
901-
/// false is return the value on the device will be uninitialized.
902-
virtual bool shouldSetupDeviceMemoryPool() const { return true; }
903-
904896
/// Indicate whether or not the device should setup the RPC server. This is
905897
/// only necessary for unhosted targets like the GPU.
906898
virtual bool shouldSetupRPCServer() const { return false; }
@@ -962,6 +954,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
962954
RPCHandleTy *RPCHandle;
963955

964956
private:
957+
/// Return the kernel environment object for kernel \p Name.
958+
Expected<KernelEnvironmentTy>
959+
getKernelEnvironmentForKernel(StringRef Name, DeviceImageTy &Image);
960+
965961
#ifdef OMPT_SUPPORT
966962
/// OMPT callback functions
967963
#define defineOmptCallback(Name, Type, Code) Name##_t Name##_fn = nullptr;
@@ -976,13 +972,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
976972
/// Internal representation for OMPT device (initialize & finalize)
977973
std::atomic<bool> OmptInitialized;
978974
#endif
979-
980-
/// Return the kernel environment object for kernel \p Name.
981-
Expected<KernelEnvironmentTy>
982-
getKernelEnvironmentForKernel(StringRef Name, DeviceImageTy &Image);
983-
984-
DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
985-
DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
986975
};
987976

988977
/// Class implementing common functionalities of offload plugins. Each plugin

openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -847,11 +847,6 @@ struct CUDADeviceTy : public GenericDeviceTy {
847847
return Plugin::success();
848848
}
849849

850-
virtual bool shouldSetupDeviceMemoryPool() const override {
851-
/// We use the CUDA malloc for now.
852-
return false;
853-
}
854-
855850
/// Getters and setters for stack and heap sizes.
856851
Error getDeviceStackSize(uint64_t &Value) override {
857852
return getCtxLimit(CU_LIMIT_STACK_SIZE, Value);

openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -322,9 +322,8 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
322322
return Plugin::success();
323323
}
324324

325-
/// This plugin should not setup the device environment or memory pool.
325+
/// This plugin should not setup the device environment.
326326
virtual bool shouldSetupDeviceEnvironment() const override { return false; };
327-
virtual bool shouldSetupDeviceMemoryPool() const override { return false; };
328327

329328
/// Getters and setters for stack size and heap size not relevant.
330329
Error getDeviceStackSize(uint64_t &Value) override {

0 commit comments

Comments
 (0)