Skip to content

Commit d6a3d6b

Browse files
authored
[openmp] Fixed Support for VA for record-replay. (#70396)
This commit was discussed on Phabricator (https://reviews.llvm.org/D157186). Record-replay currently fails on AMD GPUs because it conflicts with the heap memory allocator introduced in #69806. The workaround is to set `LIBOMPTARGET_HEAP_SIZE=0` for both the record run and the replay run.
1 parent d346c82 commit d6a3d6b

File tree

14 files changed

+356
-45
lines changed

14 files changed

+356
-45
lines changed

openmp/libomptarget/include/Utilities.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,11 @@ template <typename Ty> Ty *alignPtr(Ty *Ptr, int64_t Alignment) {
253253
return std::align(Alignment, sizeof(char), Ptr, Space);
254254
}
255255

256+
/// Round \p V up to the nearest multiple of \p Boundary.
///
/// \p Boundary must be non-zero, and the helper is intended for non-negative
/// values such as sizes and offsets. The result is derived from the remainder
/// rather than from `V + Boundary - 1`, so the intermediate computation cannot
/// wrap around when the rounded result itself is representable in \p Ty.
template <typename Ty> inline Ty roundUp(Ty V, Ty Boundary) {
  const Ty Remainder = V % Boundary;
  // Already a multiple of Boundary: nothing to add.
  return Remainder == 0 ? V : V + (Boundary - Remainder);
}
260+
256261
} // namespace target
257262
} // namespace omp
258263
} // namespace llvm

openmp/libomptarget/include/omptarget.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -439,7 +439,7 @@ void __tgt_set_info_flag(uint32_t);
439439
int __tgt_print_device_info(int64_t DeviceId);
440440

441441
int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
442-
bool IsRecord, bool SaveOutput);
442+
void *VAddr, bool IsRecord, bool SaveOutput);
443443

444444
#ifdef __cplusplus
445445
}

openmp/libomptarget/include/rtl.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,8 @@ struct RTLInfoTy {
7373
typedef int32_t(data_notify_mapped_ty)(int32_t, void *, int64_t);
7474
typedef int32_t(data_notify_unmapped_ty)(int32_t, void *);
7575
typedef int32_t(set_device_offset_ty)(int32_t);
76-
typedef int32_t(activate_record_replay_ty)(int32_t, uint64_t, bool, bool);
76+
typedef int32_t(activate_record_replay_ty)(int32_t, uint64_t, void *, bool,
77+
bool);
7778

7879
int32_t Idx = -1; // RTL index, index is the number of devices
7980
// of other RTLs that were registered before,

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2579,6 +2579,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
25792579
DeviceMemoryPoolSize = Value;
25802580
return Plugin::success();
25812581
}
2582+
/// Query the size of the first global memory pool found in AllMemoryPools and
/// store it in \p Value.
///
/// \returns the result of checking the HSA status of the query, or an error
///          if none of the pools is global.
Error getDeviceMemorySize(uint64_t &Value) override {
  for (AMDGPUMemoryPoolTy *MemPool : AllMemoryPools) {
    // Only global pools are of interest here; skip everything else.
    if (!MemPool->isGlobal())
      continue;

    hsa_status_t QueryStatus =
        MemPool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value);
    return Plugin::check(QueryStatus,
                         "Error in getting device memory size: %s");
  }
  return Plugin::error("getDeviceMemorySize:: no global pool");
}
25822592

25832593
/// AMDGPU-specific function to get device attributes.
25842594
template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) {

openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp

Lines changed: 92 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -49,40 +49,87 @@ struct RecordReplayTy {
4949
void *MemoryStart;
5050
void *MemoryPtr;
5151
size_t MemorySize;
52+
size_t TotalSize;
5253
GenericDeviceTy *Device;
5354
std::mutex AllocationLock;
5455

5556
RRStatusTy Status;
5657
bool ReplaySaveOutput;
57-
uint64_t DeviceMemorySize;
58-
59-
// Record/replay pre-allocates the largest possible device memory using the
60-
// default kind.
61-
// TODO: Expand allocation to include other kinds (device, host, shared) and
62-
// possibly use a MemoryManager to track (de-)allocations for
63-
// storing/retrieving when recording/replaying.
64-
Error preallocateDeviceMemory(uint64_t DeviceMemorySize) {
65-
// Pre-allocate memory on device. Starts with 64GB and subtracts in steps
66-
// of 1GB until allocation succeeds.
67-
const size_t MAX_MEMORY_ALLOCATION = DeviceMemorySize;
58+
59+
void *suggestAddress(uint64_t MaxMemoryAllocation) {
60+
// Get a valid pointer address for this system
61+
void *Addr =
62+
Device->allocate(1024, /* HstPtr */ nullptr, TARGET_ALLOC_DEFAULT);
63+
Device->free(Addr);
64+
// Align Address to MaxMemoryAllocation
65+
Addr = (void *)alignPtr((Addr), MaxMemoryAllocation);
66+
return Addr;
67+
}
68+
69+
/// Set up the record/replay memory region through the device's virtual
/// address (VA) mapping interface.
///
/// \param MaxMemoryAllocation number of bytes requested for the region.
/// \param VAddr address the region should be mapped at. When it is null and
///        the session is recording, an address is taken from
///        suggestAddress().
/// \returns the error produced by memoryVAMap(), an error when a replay run
///          does not obtain the requested address, or success. On success
///          MemoryStart/MemoryPtr point at the region, MemorySize is reset to
///          zero and TotalSize holds the size reported back by memoryVAMap().
Error preAllocateVAMemory(uint64_t MaxMemoryAllocation, void *VAddr) {
  size_t ASize = MaxMemoryAllocation;

  if (!VAddr && isRecording())
    VAddr = suggestAddress(MaxMemoryAllocation);

  // MaxMemoryAllocation is a uint64_t, so print it with PRIu64.
  DP("Request %" PRIu64 " bytes allocated at %p\n", MaxMemoryAllocation,
     VAddr);

  if (auto Err = Device->memoryVAMap(&MemoryStart, VAddr, &ASize))
    return Err;

  // A replay run must end up at exactly the requested (recorded) address;
  // any other address is reported as an error.
  if (isReplaying() && VAddr != MemoryStart) {
    return Plugin::error("Record-Replay cannot assign the "
                         "requested recorded address (%p, %p)",
                         VAddr, MemoryStart);
  }

  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
       "Allocated %" PRIu64 " bytes at %p for replay.\n",
       static_cast<uint64_t>(ASize), MemoryStart);

  MemoryPtr = MemoryStart;
  MemorySize = 0;
  TotalSize = ASize;
  return Plugin::success();
}
94+
95+
/// Set up the record/replay memory region with a plain device allocation.
///
/// Starting at \p MaxMemoryAllocation bytes, the request is shrunk in 1 GiB
/// steps until a default-kind device allocation succeeds.
///
/// \param MaxMemoryAllocation size in bytes of the first allocation attempt.
/// \param VAddr when non-null, the address the allocation is required to
///        land on; any other address is reported as an error.
/// \returns an error if no allocation succeeded or the address requirement
///          is not met, success otherwise. On success MemoryStart/MemoryPtr
///          point at the region, MemorySize is reset to zero and TotalSize
///          holds the size that was actually allocated.
Error preAllocateHeuristic(uint64_t MaxMemoryAllocation, void *VAddr) {
  constexpr size_t STEP = 1024 * 1024 * 1024ULL;

  MemoryStart = nullptr;
  TotalSize = MaxMemoryAllocation;
  while (TotalSize > 0) {
    MemoryStart = Device->allocate(TotalSize, /* HstPtr */ nullptr,
                                   TARGET_ALLOC_DEFAULT);
    if (MemoryStart)
      break;

    // Clamp the decrement: TotalSize is unsigned, so subtracting a full STEP
    // from a smaller value would wrap around and the loop would never reach
    // zero when the starting size is not a multiple of STEP.
    TotalSize -= (TotalSize < STEP) ? TotalSize : STEP;
  }

  if (!MemoryStart)
    return Plugin::error("Allocating record/replay memory");

  // Report the allocation only once it is known to have succeeded.
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
       "Allocated %" PRIu64 " bytes at %p for replay.\n",
       static_cast<uint64_t>(TotalSize), MemoryStart);

  if (VAddr && VAddr != MemoryStart)
    return Plugin::error("Cannot allocate recorded address");

  MemoryPtr = MemoryStart;
  MemorySize = 0;

  return Plugin::success();
}
85121

122+
/// Pre-allocate the record/replay memory region.
///
/// Devices that report VA management support go through
/// preAllocateVAMemory() with \p DeviceMemorySize. All other devices use
/// preAllocateHeuristic(), starting from the size reported by
/// Device->getDeviceMemorySize().
///
/// \param DeviceMemorySize requested region size in bytes (used on the VA
///        management path only).
/// \param ReqVAddr requested address of the region; forwarded unchanged.
/// \returns the error of the selected strategy or of the device memory size
///          query, success otherwise.
Error preallocateDeviceMemory(uint64_t DeviceMemorySize, void *ReqVAddr) {
  if (Device->supportVAManagement())
    return preAllocateVAMemory(DeviceMemorySize, ReqVAddr);

  uint64_t DevMemSize = 0;
  // Propagate the failure instead of testing the Error as a bool and dropping
  // it: a failing Error that is neither returned nor consumed is left
  // unhandled.
  if (auto Err = Device->getDeviceMemorySize(DevMemSize))
    return Err;

  return preAllocateHeuristic(DevMemSize, ReqVAddr);
}
132+
86133
void dumpDeviceMemory(StringRef Filename) {
87134
ErrorOr<std::unique_ptr<WritableMemoryBuffer>> DeviceMemoryMB =
88135
WritableMemoryBuffer::getNewUninitMemBuffer(MemorySize);
@@ -114,8 +161,7 @@ struct RecordReplayTy {
114161
bool isSaveOutputEnabled() const { return ReplaySaveOutput; }
115162

116163
RecordReplayTy()
117-
: Status(RRStatusTy::RRDeactivated), ReplaySaveOutput(false),
118-
DeviceMemorySize(-1) {}
164+
: Status(RRStatusTy::RRDeactivated), ReplaySaveOutput(false) {}
119165

120166
void saveImage(const char *Name, const DeviceImageTy &Image) {
121167
SmallString<128> ImageName = {Name, ".image"};
@@ -197,6 +243,7 @@ struct RecordReplayTy {
197243
JsonKernelInfo["LoopTripCount"] = LoopTripCount;
198244
JsonKernelInfo["DeviceMemorySize"] = MemorySize;
199245
JsonKernelInfo["DeviceId"] = Device->getDeviceId();
246+
JsonKernelInfo["BumpAllocVAStart"] = (intptr_t)MemoryStart;
200247

201248
json::Array JsonArgPtrs;
202249
for (int I = 0; I < NumArgs; ++I)
@@ -244,27 +291,33 @@ struct RecordReplayTy {
244291
return Alloc;
245292
}
246293

247-
Error init(GenericDeviceTy *Device, uint64_t MemSize, RRStatusTy Status,
248-
bool SaveOutput) {
294+
Error init(GenericDeviceTy *Device, uint64_t MemSize, void *VAddr,
295+
RRStatusTy Status, bool SaveOutput) {
249296
this->Device = Device;
250297
this->Status = Status;
251-
this->DeviceMemorySize = MemSize;
252298
this->ReplaySaveOutput = SaveOutput;
253299

254-
if (auto Err = preallocateDeviceMemory(MemSize))
300+
if (auto Err = preallocateDeviceMemory(MemSize, VAddr))
255301
return Err;
256302

257303
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
258304
"Record Replay Initialized (%p)"
259305
" as starting address, %lu Memory Size"
260306
" and set on status %s\n",
261-
MemoryStart, MemSize,
307+
MemoryStart, TotalSize,
262308
Status == RRStatusTy::RRRecording ? "Recording" : "Replaying");
263309

264310
return Plugin::success();
265311
}
266312

267-
void deinit() { Device->free(MemoryStart); }
313+
void deinit() {
314+
if (Device->supportVAManagement()) {
315+
if (auto Err = Device->memoryVAUnMap(MemoryStart, TotalSize))
316+
report_fatal_error("Error on releasing virtual memory space");
317+
} else {
318+
Device->free(MemoryStart);
319+
}
320+
}
268321

269322
} RecordReplay;
270323

@@ -1184,6 +1237,19 @@ Error GenericDeviceTy::queryAsync(__tgt_async_info *AsyncInfo) {
11841237
return queryAsyncImpl(*AsyncInfo);
11851238
}
11861239

1240+
/// Default implementation of memoryVAMap(): always returns an error stating
/// that the device does not support VA management. None of the arguments is
/// read or written.
Error GenericDeviceTy::memoryVAMap(void **Addr, void *VAddr, size_t *RSize) {
  return Plugin::error("Device does not support VA Management");
}
1243+
1244+
/// Default implementation of memoryVAUnMap(): always returns an error stating
/// that the device does not support VA management. None of the arguments is
/// used.
Error GenericDeviceTy::memoryVAUnMap(void *VAddr, size_t Size) {
  return Plugin::error("Device does not support VA Management");
}
1247+
1248+
/// Default implementation of getDeviceMemorySize(): always returns an error
/// reporting that the implementation is missing. \p DSize is left untouched.
Error GenericDeviceTy::getDeviceMemorySize(uint64_t &DSize) {
  return Plugin::error(
      "Missing getDeviceMemorySize implementation (required by RR-heuristic)");
}
1252+
11871253
Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
11881254
TargetAllocTy Kind) {
11891255
void *Alloc = nullptr;
@@ -1552,16 +1618,17 @@ int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDeviceId,
15521618
return Plugin::get().isDataExchangable(SrcDeviceId, DstDeviceId);
15531619
}
15541620

1555-
int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId,
1556-
uint64_t MemorySize, bool isRecord,
1621+
int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
1622+
void *VAddr, bool isRecord,
15571623
bool SaveOutput) {
15581624
GenericPluginTy &Plugin = Plugin::get();
15591625
GenericDeviceTy &Device = Plugin.getDevice(DeviceId);
15601626
RecordReplayTy::RRStatusTy Status =
15611627
isRecord ? RecordReplayTy::RRStatusTy::RRRecording
15621628
: RecordReplayTy::RRStatusTy::RRReplaying;
15631629

1564-
if (auto Err = RecordReplay.init(&Device, MemorySize, Status, SaveOutput)) {
1630+
if (auto Err =
1631+
RecordReplay.init(&Device, MemorySize, VAddr, Status, SaveOutput)) {
15651632
REPORT("WARNING RR did not intialize RR-properly with %lu bytes"
15661633
"(Error: %s)\n",
15671634
MemorySize, toString(std::move(Err)).data());

openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -655,6 +655,21 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
655655
Error queryAsync(__tgt_async_info *AsyncInfo);
656656
virtual Error queryAsyncImpl(__tgt_async_info &AsyncInfo) = 0;
657657

658+
/// Check whether the architecture supports VA management
659+
virtual bool supportVAManagement() const { return false; }
660+
661+
/// Get the total device memory size
662+
virtual Error getDeviceMemorySize(uint64_t &DSize);
663+
664+
/// Allocates \p RSize bytes (rounded up to page size) and hints the driver to
665+
/// map it to \p VAddr. The obtained address is stored in \p Addr. At return
666+
/// \p RSize contains the actual size which can be equal or larger than the
667+
/// requested size.
668+
virtual Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize);
669+
670+
/// De-allocates device memory and unmaps the virtual address \p VAddr
671+
virtual Error memoryVAUnMap(void *VAddr, size_t Size);
672+
658673
/// Allocate data on the device or involving the device.
659674
Expected<void *> dataAlloc(int64_t Size, void *HostPtr, TargetAllocTy Kind);
660675

openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,16 @@ DLWRAP(cuEventDestroy, 1)
8181

8282
DLWRAP_FINALIZE()
8383

84+
DLWRAP(cuMemUnmap, 2)
85+
DLWRAP(cuMemRelease, 1)
86+
DLWRAP(cuMemAddressFree, 2)
87+
DLWRAP(cuMemGetInfo, 2)
88+
DLWRAP(cuMemAddressReserve, 5)
89+
DLWRAP(cuMemMap, 5)
90+
DLWRAP(cuMemCreate, 4)
91+
DLWRAP(cuMemSetAccess, 4)
92+
DLWRAP(cuMemGetAllocationGranularity, 3)
93+
8494
#ifndef DYNAMIC_CUDA_PATH
8595
#define DYNAMIC_CUDA_PATH "libcuda.so"
8696
#endif

openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,71 @@ typedef struct CUevent_st *CUevent;
2626

2727
#define CU_DEVICE_INVALID ((CUdevice)-2)
2828

29+
// Handle type for generic memory allocations, taken by the cuMemCreate,
// cuMemMap and cuMemRelease declarations in this header.
// CU_DEVICE_INVALID is already defined directly above, so it is not defined
// a second time here.
typedef unsigned long long CUmemGenericAllocationHandle_v1;
typedef CUmemGenericAllocationHandle_v1 CUmemGenericAllocationHandle;
33+
34+
typedef enum CUmemAllocationGranularity_flags_enum {
35+
CU_MEM_ALLOC_GRANULARITY_MINIMUM = 0x0,
36+
CU_MEM_ALLOC_GRANULARITY_RECOMMENDED = 0x1
37+
} CUmemAllocationGranularity_flags;
38+
39+
typedef enum CUmemAccess_flags_enum {
40+
CU_MEM_ACCESS_FLAGS_PROT_NONE = 0x0,
41+
CU_MEM_ACCESS_FLAGS_PROT_READ = 0x1,
42+
CU_MEM_ACCESS_FLAGS_PROT_READWRITE = 0x3,
43+
CU_MEM_ACCESS_FLAGS_PROT_MAX = 0x7FFFFFFF
44+
} CUmemAccess_flags;
45+
46+
typedef enum CUmemLocationType_enum {
47+
CU_MEM_LOCATION_TYPE_INVALID = 0x0,
48+
CU_MEM_LOCATION_TYPE_DEVICE = 0x1,
49+
CU_MEM_LOCATION_TYPE_MAX = 0x7FFFFFFF
50+
} CUmemLocationType;
51+
52+
typedef struct CUmemLocation_st {
53+
CUmemLocationType type;
54+
int id;
55+
} CUmemLocation_v1;
56+
typedef CUmemLocation_v1 CUmemLocation;
57+
58+
typedef struct CUmemAccessDesc_st {
59+
CUmemLocation location;
60+
CUmemAccess_flags flags;
61+
} CUmemAccessDesc_v1;
62+
63+
typedef CUmemAccessDesc_v1 CUmemAccessDesc;
64+
65+
typedef enum CUmemAllocationType_enum {
66+
CU_MEM_ALLOCATION_TYPE_INVALID = 0x0,
67+
CU_MEM_ALLOCATION_TYPE_PINNED = 0x1,
68+
CU_MEM_ALLOCATION_TYPE_MAX = 0x7FFFFFFF
69+
} CUmemAllocationType;
70+
71+
typedef enum CUmemAllocationHandleType_enum {
72+
CU_MEM_HANDLE_TYPE_NONE = 0x0,
73+
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 0x1,
74+
CU_MEM_HANDLE_TYPE_WIN32 = 0x2,
75+
CU_MEM_HANDLE_TYPE_WIN32_KMT = 0x4,
76+
CU_MEM_HANDLE_TYPE_MAX = 0x7FFFFFFF
77+
} CUmemAllocationHandleType;
78+
79+
typedef struct CUmemAllocationProp_st {
80+
CUmemAllocationType type;
81+
CUmemAllocationHandleType requestedHandleTypes;
82+
CUmemLocation location;
83+
84+
void *win32HandleMetaData;
85+
struct {
86+
unsigned char compressionType;
87+
unsigned char gpuDirectRDMACapable;
88+
unsigned short usage;
89+
unsigned char reserved[4];
90+
} allocFlags;
91+
} CUmemAllocationProp_v1;
92+
typedef CUmemAllocationProp_v1 CUmemAllocationProp;
93+
2994
typedef enum cudaError_enum {
3095
CUDA_SUCCESS = 0,
3196
CUDA_ERROR_INVALID_VALUE = 1,
@@ -268,4 +333,21 @@ CUresult cuStreamWaitEvent(CUstream, CUevent, unsigned int);
268333
CUresult cuEventSynchronize(CUevent);
269334
CUresult cuEventDestroy(CUevent);
270335

336+
CUresult cuMemUnmap(CUdeviceptr ptr, size_t size);
337+
CUresult cuMemRelease(CUmemGenericAllocationHandle handle);
338+
CUresult cuMemAddressFree(CUdeviceptr ptr, size_t size);
339+
CUresult cuMemGetInfo(size_t *free, size_t *total);
340+
CUresult cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment,
341+
CUdeviceptr addr, unsigned long long flags);
342+
CUresult cuMemMap(CUdeviceptr ptr, size_t size, size_t offset,
343+
CUmemGenericAllocationHandle handle,
344+
unsigned long long flags);
345+
CUresult cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
346+
const CUmemAllocationProp *prop, unsigned long long flags);
347+
CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
348+
const CUmemAccessDesc *desc, size_t count);
349+
CUresult cuMemGetAllocationGranularity(size_t *granularity,
350+
const CUmemAllocationProp *prop,
351+
CUmemAllocationGranularity_flags option);
352+
271353
#endif

0 commit comments

Comments
 (0)