Skip to content

[openmp] Fixed Support for VA for record-replay. #70396

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions openmp/libomptarget/include/Utilities.h
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,11 @@ template <typename Ty> Ty *alignPtr(Ty *Ptr, int64_t Alignment) {
return std::align(Alignment, sizeof(char), Ptr, Space);
}

/// Round \p V up to the nearest multiple of \p Boundary.
///
/// A value that already is a multiple of \p Boundary is returned unchanged.
/// The result is derived from the remainder instead of evaluating
/// (V + Boundary - 1) / Boundary * Boundary, because that intermediate sum can
/// wrap around for large values even though the rounded result itself is
/// representable in \p Ty.
///
/// \param V value to round.
/// \param Boundary granularity to round to; must be greater than zero.
/// \returns the smallest multiple of \p Boundary that is not less than \p V.
template <typename Ty> inline Ty roundUp(Ty V, Ty Boundary) {
  const Ty Remainder = V % Boundary;
  // A positive remainder means V sits above a multiple: step to the next one.
  // A zero or negative remainder (signed types only) is simply removed, which
  // yields V itself or the multiple just above a negative V.
  return Remainder > 0 ? V + (Boundary - Remainder) : V - Remainder;
}

} // namespace target
} // namespace omp
} // namespace llvm
Expand Down
2 changes: 1 addition & 1 deletion openmp/libomptarget/include/omptarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,7 @@ void __tgt_set_info_flag(uint32_t);
int __tgt_print_device_info(int64_t DeviceId);

int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
bool IsRecord, bool SaveOutput);
void *VAddr, bool IsRecord, bool SaveOutput);

#ifdef __cplusplus
}
Expand Down
3 changes: 2 additions & 1 deletion openmp/libomptarget/include/rtl.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ struct RTLInfoTy {
typedef int32_t(data_notify_mapped_ty)(int32_t, void *, int64_t);
typedef int32_t(data_notify_unmapped_ty)(int32_t, void *);
typedef int32_t(set_device_offset_ty)(int32_t);
typedef int32_t(activate_record_replay_ty)(int32_t, uint64_t, bool, bool);
typedef int32_t(activate_record_replay_ty)(int32_t, uint64_t, void *, bool,
bool);

int32_t Idx = -1; // RTL index, index is the number of devices
// of other RTLs that were registered before,
Expand Down
10 changes: 10 additions & 0 deletions openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2581,6 +2581,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
DeviceMemoryPoolSize = Value;
return Plugin::success();
}
/// Query the size of the device memory.
///
/// Walks AllMemoryPools and asks the first pool that reports itself as global
/// for its HSA_AMD_MEMORY_POOL_INFO_SIZE attribute, which is written to
/// \p Value. Fails if the query fails or if no global pool is found.
Error getDeviceMemorySize(uint64_t &Value) override {
  for (AMDGPUMemoryPoolTy *MemPool : AllMemoryPools) {
    // Only a global pool is used to answer the query.
    if (!MemPool->isGlobal())
      continue;

    hsa_status_t QueryStatus =
        MemPool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value);
    return Plugin::check(QueryStatus,
                         "Error in getting device memory size: %s");
  }

  return Plugin::error("getDeviceMemorySize:: no global pool");
}

/// AMDGPU-specific function to get device attributes.
template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,40 +49,87 @@ struct RecordReplayTy {
void *MemoryStart;
void *MemoryPtr;
size_t MemorySize;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not about this patch, but we really need to add documentation here: MemorySize doesn't really convey what this variable tracks. Renaming it and adding docs would probably be adequate.

size_t TotalSize;
GenericDeviceTy *Device;
std::mutex AllocationLock;

RRStatusTy Status;
bool ReplaySaveOutput;
uint64_t DeviceMemorySize;

// Record/replay pre-allocates the largest possible device memory using the
// default kind.
// TODO: Expand allocation to include other kinds (device, host, shared) and
// possibly use a MemoryManager to track (de-)allocations for
// storing/retrieving when recording/replaying.
Error preallocateDeviceMemory(uint64_t DeviceMemorySize) {
// Pre-allocate memory on device. Starts with 64GB and subtracts in steps
// of 1GB until allocation succeeds.
const size_t MAX_MEMORY_ALLOCATION = DeviceMemorySize;

/// Propose a base address for the record/replay memory region.
///
/// A small probe allocation is made and released right away just to learn an
/// address the device hands out on this system; that address is then run
/// through alignPtr with \p MaxMemoryAllocation as the alignment.
void *suggestAddress(uint64_t MaxMemoryAllocation) {
  // Probe for a pointer value that is valid on this system.
  void *Probe =
      Device->allocate(1024, /* HstPtr */ nullptr, TARGET_ALLOC_DEFAULT);
  Device->free(Probe);

  // Align the probed address to MaxMemoryAllocation.
  return alignPtr(Probe, MaxMemoryAllocation);
}

/// Reserve the record/replay region through the device's virtual-address
/// management interface.
///
/// On success MemoryStart and MemoryPtr point at the mapped region, TotalSize
/// holds the size reported back by memoryVAMap and MemorySize is reset to 0.
///
/// \param MaxMemoryAllocation number of bytes requested for the region.
/// \param VAddr address the region should be mapped at. If it is null while
///        recording, an address is picked with suggestAddress().
/// \returns the error produced by memoryVAMap, an error when a replay did not
///          obtain the requested address, and success otherwise.
Error preAllocateVAMemory(uint64_t MaxMemoryAllocation, void *VAddr) {
  // In/out argument of memoryVAMap: the size actually mapped is written back.
  size_t ASize = MaxMemoryAllocation;

  if (!VAddr && isRecording())
    VAddr = suggestAddress(MaxMemoryAllocation);

  // MaxMemoryAllocation is a uint64_t, so print it with PRIu64.
  DP("Request %" PRIu64 " bytes allocated at %p\n", MaxMemoryAllocation,
     VAddr);

  if (auto Err = Device->memoryVAMap(&MemoryStart, VAddr, &ASize))
    return Err;

  // A replay is only meaningful if the region lands on the recorded address.
  if (isReplaying() && VAddr != MemoryStart) {
    return Plugin::error("Record-Replay cannot assign the "
                         "requested recorded address (%p, %p)",
                         VAddr, MemoryStart);
  }

  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
       "Allocated %" PRIu64 " bytes at %p for replay.\n", ASize, MemoryStart);

  MemoryPtr = MemoryStart;
  MemorySize = 0;
  TotalSize = ASize;
  return Plugin::success();
}

/// Reserve the record/replay region on devices without VA management by
/// trying successively smaller plain device allocations.
///
/// The first attempt asks for \p MaxMemoryAllocation bytes; every failed
/// attempt shrinks the request by 1GB. On success MemoryStart and MemoryPtr
/// point at the allocation, TotalSize holds its size and MemorySize is reset
/// to 0.
///
/// \param MaxMemoryAllocation size in bytes of the first allocation attempt.
/// \param VAddr if non-null, the address the allocation has to start at.
/// \returns an error if no attempt succeeds or if the allocation does not
///          start at \p VAddr, success otherwise.
Error preAllocateHeuristic(uint64_t MaxMemoryAllocation, void *VAddr) {
  constexpr size_t STEP = 1024 * 1024 * 1024ULL;

  MemoryStart = nullptr;
  TotalSize = MaxMemoryAllocation;
  while (TotalSize > 0) {
    MemoryStart = Device->allocate(TotalSize, /* HstPtr */ nullptr,
                                   TARGET_ALLOC_DEFAULT);
    if (MemoryStart)
      break;

    // TotalSize is unsigned: subtracting STEP from a smaller value would wrap
    // around to a huge size and the loop would never terminate. Clamp to zero
    // instead so that a size that is not a multiple of STEP ends the search.
    TotalSize = TotalSize > STEP ? TotalSize - STEP : 0;
  }

  if (!MemoryStart)
    return Plugin::error("Allocating record/replay memory");

  // Report the allocation only once it is known to have succeeded.
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
       "Allocated %" PRIu64 " bytes at %p for replay.\n", TotalSize,
       MemoryStart);

  // NOTE(review): a reviewer suggested also accepting the case where VAddr
  // lies inside the allocation and enough space remains behind it; only an
  // exact match is accepted here.
  if (VAddr && VAddr != MemoryStart)
    return Plugin::error("Cannot allocate recorded address");

  MemoryPtr = MemoryStart;
  MemorySize = 0;

  return Plugin::success();
}

/// Reserve the device memory used for record/replay.
///
/// Devices that report VA-management support map \p DeviceMemorySize bytes at
/// \p ReqVAddr. All other devices fall back to the shrinking-allocation
/// heuristic, which starts from the memory size reported by the device.
///
/// \param DeviceMemorySize requested size in bytes (VA-management path only).
/// \param ReqVAddr requested start address of the region; may be null.
/// \returns the error of the selected pre-allocation strategy, or an error if
///          the device memory size cannot be queried.
Error preallocateDeviceMemory(uint64_t DeviceMemorySize, void *ReqVAddr) {
  if (Device->supportVAManagement())
    return preAllocateVAMemory(DeviceMemorySize, ReqVAddr);

  uint64_t DevMemSize = 0;
  // An llvm::Error in the failure state has to be consumed before it is
  // destroyed; fold its text into the error that is returned instead of
  // testing a temporary and dropping it.
  if (auto Err = Device->getDeviceMemorySize(DevMemSize))
    return Plugin::error("Cannot determine Device Memory Size: %s",
                         toString(std::move(Err)).data());

  // NOTE(review): DeviceMemorySize is ignored on this path; a reviewer asked
  // whether the device-reported size should at least be checked against it.
  return preAllocateHeuristic(DevMemSize, ReqVAddr);
}

void dumpDeviceMemory(StringRef Filename) {
ErrorOr<std::unique_ptr<WritableMemoryBuffer>> DeviceMemoryMB =
WritableMemoryBuffer::getNewUninitMemBuffer(MemorySize);
Expand Down Expand Up @@ -114,8 +161,7 @@ struct RecordReplayTy {
/// Returns the ReplaySaveOutput flag.
bool isSaveOutputEnabled() const { return ReplaySaveOutput; }

RecordReplayTy()
: Status(RRStatusTy::RRDeactivated), ReplaySaveOutput(false),
DeviceMemorySize(-1) {}
: Status(RRStatusTy::RRDeactivated), ReplaySaveOutput(false) {}

void saveImage(const char *Name, const DeviceImageTy &Image) {
SmallString<128> ImageName = {Name, ".image"};
Expand Down Expand Up @@ -197,6 +243,7 @@ struct RecordReplayTy {
JsonKernelInfo["LoopTripCount"] = LoopTripCount;
JsonKernelInfo["DeviceMemorySize"] = MemorySize;
JsonKernelInfo["DeviceId"] = Device->getDeviceId();
JsonKernelInfo["BumpAllocVAStart"] = (intptr_t)MemoryStart;

json::Array JsonArgPtrs;
for (int I = 0; I < NumArgs; ++I)
Expand Down Expand Up @@ -244,27 +291,33 @@ struct RecordReplayTy {
return Alloc;
}

Error init(GenericDeviceTy *Device, uint64_t MemSize, RRStatusTy Status,
bool SaveOutput) {
Error init(GenericDeviceTy *Device, uint64_t MemSize, void *VAddr,
RRStatusTy Status, bool SaveOutput) {
this->Device = Device;
this->Status = Status;
this->DeviceMemorySize = MemSize;
this->ReplaySaveOutput = SaveOutput;

if (auto Err = preallocateDeviceMemory(MemSize))
if (auto Err = preallocateDeviceMemory(MemSize, VAddr))
return Err;

INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
"Record Replay Initialized (%p)"
" as starting address, %lu Memory Size"
" and set on status %s\n",
MemoryStart, MemSize,
MemoryStart, TotalSize,
Status == RRStatusTy::RRRecording ? "Recording" : "Replaying");

return Plugin::success();
}

void deinit() { Device->free(MemoryStart); }
/// Release the memory region set up for record/replay.
///
/// Without VA management the region is a plain device allocation and is
/// freed; otherwise it is unmapped, and a failure to unmap is fatal.
void deinit() {
  if (!Device->supportVAManagement()) {
    Device->free(MemoryStart);
    return;
  }

  if (auto Err = Device->memoryVAUnMap(MemoryStart, TotalSize))
    report_fatal_error("Error on releasing virtual memory space");
}

} RecordReplay;

Expand Down Expand Up @@ -1204,6 +1257,19 @@ Error GenericDeviceTy::queryAsync(__tgt_async_info *AsyncInfo) {
return queryAsyncImpl(*AsyncInfo);
}

/// Default implementation: always fails, reporting that the device has no
/// virtual-address management. None of the arguments are read or written.
Error GenericDeviceTy::memoryVAMap(void **Addr, void *VAddr, size_t *RSize) {
  return Plugin::error("Device does not support VA Management");
}

/// Default implementation: always fails, reporting that the device has no
/// virtual-address management. None of the arguments are used.
Error GenericDeviceTy::memoryVAUnMap(void *VAddr, size_t Size) {
  return Plugin::error("Device does not support VA Management");
}

/// Default implementation: always fails, since the generic device cannot
/// report a memory size. \p DSize is left untouched.
Error GenericDeviceTy::getDeviceMemorySize(uint64_t &DSize) {
  return Plugin::error(
      "Missing getDeviceMemorySize implementation (required by RR-heuristic)");
}

Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
TargetAllocTy Kind) {
void *Alloc = nullptr;
Expand Down Expand Up @@ -1572,16 +1638,17 @@ int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDeviceId,
return Plugin::get().isDataExchangable(SrcDeviceId, DstDeviceId);
}

int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId,
uint64_t MemorySize, bool isRecord,
int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
void *VAddr, bool isRecord,
bool SaveOutput) {
GenericPluginTy &Plugin = Plugin::get();
GenericDeviceTy &Device = Plugin.getDevice(DeviceId);
RecordReplayTy::RRStatusTy Status =
isRecord ? RecordReplayTy::RRStatusTy::RRRecording
: RecordReplayTy::RRStatusTy::RRReplaying;

if (auto Err = RecordReplay.init(&Device, MemorySize, Status, SaveOutput)) {
if (auto Err =
RecordReplay.init(&Device, MemorySize, VAddr, Status, SaveOutput)) {
REPORT("WARNING RR did not intialize RR-properly with %lu bytes"
"(Error: %s)\n",
MemorySize, toString(std::move(Err)).data());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -647,6 +647,21 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
Error queryAsync(__tgt_async_info *AsyncInfo);
virtual Error queryAsyncImpl(__tgt_async_info &AsyncInfo) = 0;

/// Check whether the architecture supports VA management
virtual bool supportVAManagement() const { return false; }

/// Get the total device memory size
virtual Error getDeviceMemorySize(uint64_t &DSize);

/// Allocates \p RSize bytes (rounded up to page size) and hints the driver to
/// map it to \p VAddr. The obtained address is stored in \p Addr. At return
/// \p RSize contains the actual size which can be equal or larger than the
/// requested size.
virtual Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize);

/// De-allocates device memory and unmaps the virtual address \p VAddr
virtual Error memoryVAUnMap(void *VAddr, size_t Size);

/// Allocate data on the device or involving the device.
Expected<void *> dataAlloc(int64_t Size, void *HostPtr, TargetAllocTy Kind);

Expand Down
10 changes: 10 additions & 0 deletions openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,16 @@ DLWRAP(cuEventDestroy, 1)

DLWRAP_FINALIZE()

DLWRAP(cuMemUnmap, 2)
DLWRAP(cuMemRelease, 1)
DLWRAP(cuMemAddressFree, 2)
DLWRAP(cuMemGetInfo, 2)
DLWRAP(cuMemAddressReserve, 5)
DLWRAP(cuMemMap, 5)
DLWRAP(cuMemCreate, 4)
DLWRAP(cuMemSetAccess, 4)
DLWRAP(cuMemGetAllocationGranularity, 3)

#ifndef DYNAMIC_CUDA_PATH
#define DYNAMIC_CUDA_PATH "libcuda.so"
#endif
Expand Down
82 changes: 82 additions & 0 deletions openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,71 @@ typedef struct CUevent_st *CUevent;

#define CU_DEVICE_INVALID ((CUdevice)-2)

// Types used by the virtual memory management entry points (cuMemCreate,
// cuMemMap, cuMemSetAccess, ...) declared further down in this header.
// NOTE(review): these presumably have to stay layout-compatible with the
// definitions in the vendor's cuda.h, since the functions are resolved from
// the driver library at runtime - confirm before changing any of them.

/// Handle identifying an allocation made with cuMemCreate.
typedef unsigned long long CUmemGenericAllocationHandle_v1;
typedef CUmemGenericAllocationHandle_v1 CUmemGenericAllocationHandle;

/// Selects which granularity cuMemGetAllocationGranularity reports.
typedef enum CUmemAllocationGranularity_flags_enum {
  CU_MEM_ALLOC_GRANULARITY_MINIMUM = 0x0,
  CU_MEM_ALLOC_GRANULARITY_RECOMMENDED = 0x1
} CUmemAllocationGranularity_flags;

/// Access protection requested through a CUmemAccessDesc.
typedef enum CUmemAccess_flags_enum {
  CU_MEM_ACCESS_FLAGS_PROT_NONE = 0x0,
  CU_MEM_ACCESS_FLAGS_PROT_READ = 0x1,
  CU_MEM_ACCESS_FLAGS_PROT_READWRITE = 0x3,
  CU_MEM_ACCESS_FLAGS_PROT_MAX = 0x7FFFFFFF
} CUmemAccess_flags;

/// Kind of location named by a CUmemLocation.
typedef enum CUmemLocationType_enum {
  CU_MEM_LOCATION_TYPE_INVALID = 0x0,
  CU_MEM_LOCATION_TYPE_DEVICE = 0x1,
  CU_MEM_LOCATION_TYPE_MAX = 0x7FFFFFFF
} CUmemLocationType;

/// A location: its type plus an identifier.
typedef struct CUmemLocation_st {
  CUmemLocationType type;
  int id;
} CUmemLocation_v1;
typedef CUmemLocation_v1 CUmemLocation;

/// Pairs a location with the access flags for it; consumed by cuMemSetAccess.
typedef struct CUmemAccessDesc_st {
  CUmemLocation location;
  CUmemAccess_flags flags;
} CUmemAccessDesc_v1;

typedef CUmemAccessDesc_v1 CUmemAccessDesc;

/// Allocation type stored in CUmemAllocationProp.
typedef enum CUmemAllocationType_enum {
  CU_MEM_ALLOCATION_TYPE_INVALID = 0x0,
  CU_MEM_ALLOCATION_TYPE_PINNED = 0x1,
  CU_MEM_ALLOCATION_TYPE_MAX = 0x7FFFFFFF
} CUmemAllocationType;

/// Handle types that can be requested in CUmemAllocationProp.
typedef enum CUmemAllocationHandleType_enum {
  CU_MEM_HANDLE_TYPE_NONE = 0x0,
  CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 0x1,
  CU_MEM_HANDLE_TYPE_WIN32 = 0x2,
  CU_MEM_HANDLE_TYPE_WIN32_KMT = 0x4,
  CU_MEM_HANDLE_TYPE_MAX = 0x7FFFFFFF
} CUmemAllocationHandleType;

/// Properties of an allocation; passed to cuMemCreate and
/// cuMemGetAllocationGranularity.
typedef struct CUmemAllocationProp_st {
  CUmemAllocationType type;
  CUmemAllocationHandleType requestedHandleTypes;
  CUmemLocation location;

  void *win32HandleMetaData;
  struct {
    unsigned char compressionType;
    unsigned char gpuDirectRDMACapable;
    unsigned short usage;
    unsigned char reserved[4];
  } allocFlags;
} CUmemAllocationProp_v1;
typedef CUmemAllocationProp_v1 CUmemAllocationProp;

typedef enum cudaError_enum {
CUDA_SUCCESS = 0,
CUDA_ERROR_INVALID_VALUE = 1,
Expand Down Expand Up @@ -268,4 +333,21 @@ CUresult cuStreamWaitEvent(CUstream, CUevent, unsigned int);
CUresult cuEventSynchronize(CUevent);
CUresult cuEventDestroy(CUevent);

// Prototypes of the memory-management entry points that cuda.cpp wraps with
// DLWRAP(); the parameter count of each prototype matches the arity given
// there.
CUresult cuMemUnmap(CUdeviceptr ptr, size_t size);
CUresult cuMemRelease(CUmemGenericAllocationHandle handle);
CUresult cuMemAddressFree(CUdeviceptr ptr, size_t size);
CUresult cuMemGetInfo(size_t *free, size_t *total);
CUresult cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment,
CUdeviceptr addr, unsigned long long flags);
CUresult cuMemMap(CUdeviceptr ptr, size_t size, size_t offset,
CUmemGenericAllocationHandle handle,
unsigned long long flags);
CUresult cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
const CUmemAllocationProp *prop, unsigned long long flags);
CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
const CUmemAccessDesc *desc, size_t count);
CUresult cuMemGetAllocationGranularity(size_t *granularity,
const CUmemAllocationProp *prop,
CUmemAllocationGranularity_flags option);

#endif
Loading