[Offload] Introduce the concept of "default streams" #95371

Open · wants to merge 2 commits into main

1 change: 1 addition & 0 deletions clang/lib/Headers/CMakeLists.txt
@@ -339,6 +339,7 @@ set(llvm_offload_wrapper_files
llvm_offload_wrappers/__llvm_offload.h
llvm_offload_wrappers/__llvm_offload_host.h
llvm_offload_wrappers/__llvm_offload_device.h
llvm_offload_wrappers/cuda_runtime.h
)

set(llvm_libc_wrapper_files
137 changes: 137 additions & 0 deletions clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h
@@ -0,0 +1,137 @@
/*===- cuda_runtime.h - LLVM/Offload wrappers for CUDA runtime API ---------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/

#ifndef __CUDA_RUNTIME_API__
#define __CUDA_RUNTIME_API__

#include <cstddef>
#include <cstdint>
#include <utility>

extern "C" {
int omp_get_initial_device(void);
void omp_target_free(void *Ptr, int Device);
void *omp_target_alloc(size_t Size, int Device);
int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
size_t DstOffset, size_t SrcOffset, int DstDevice,
int SrcDevice);
void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum);
int __tgt_target_synchronize_async_info_queue(void *Loc, int64_t DeviceNum,
void *AsyncInfoQueue);
}

// TODO: There are many fields missing in this enumeration.
typedef enum cudaError {
cudaSuccess = 0,
cudaErrorInvalidValue = 1,
cudaErrorMemoryAllocation = 2,
cudaErrorNoDevice = 100,
cudaErrorInvalidDevice = 101,
cudaErrorOTHER = -1,
} cudaError_t;

enum cudaMemcpyKind {
cudaMemcpyHostToHost = 0,
cudaMemcpyHostToDevice = 1,
cudaMemcpyDeviceToHost = 2,
cudaMemcpyDeviceToDevice = 3,
cudaMemcpyDefault = 4
};

typedef void *cudaStream_t;

static thread_local cudaError_t __cudaomp_last_error = cudaSuccess;

// Returns the last error that has been produced and resets it to cudaSuccess.
inline cudaError_t cudaGetLastError() {
cudaError_t TempError = __cudaomp_last_error;
__cudaomp_last_error = cudaSuccess;
return TempError;
}

// Returns the last error that has been produced without resetting it.
inline cudaError_t cudaPeekAtLastError() { return __cudaomp_last_error; }

inline cudaError_t cudaDeviceSynchronize() {
int DeviceNum = 0;
return __cudaomp_last_error =
(cudaError_t)__tgt_target_synchronize_async_info_queue(
/*Loc=*/nullptr, DeviceNum, /*AsyncInfoQueue=*/nullptr);
}

inline cudaError_t __cudaMalloc(void **devPtr, size_t size) {
int DeviceNum = 0;
*devPtr = omp_target_alloc(size, DeviceNum);
if (*devPtr == NULL)
return __cudaomp_last_error = cudaErrorMemoryAllocation;

return __cudaomp_last_error = cudaSuccess;
}

template <class T> cudaError_t cudaMalloc(T **devPtr, size_t size) {
return __cudaMalloc((void **)devPtr, size);
}

inline cudaError_t __cudaFree(void *devPtr) {
int DeviceNum = 0;
omp_target_free(devPtr, DeviceNum);
return __cudaomp_last_error = cudaSuccess;
}

template <class T> inline cudaError_t cudaFree(T *ptr) {
return __cudaFree((void *)ptr);
}

inline cudaError_t __cudaMemcpy(void *dst, const void *src, size_t count,
cudaMemcpyKind kind) {
// Get the host device number (which is the initial device).
int HostDeviceNum = omp_get_initial_device();

// Use the default device for the GPU.
int GPUDeviceNum = 0;

// Default to copying from host to device.
int DstDeviceNum = GPUDeviceNum;
int SrcDeviceNum = HostDeviceNum;

if (kind == cudaMemcpyDeviceToHost)
std::swap(DstDeviceNum, SrcDeviceNum);

// omp_target_memcpy returns 0 on success and non-zero on failure
if (omp_target_memcpy(dst, src, count, 0, 0, DstDeviceNum, SrcDeviceNum))
return __cudaomp_last_error = cudaErrorInvalidValue;
return __cudaomp_last_error = cudaSuccess;
}

template <class T>
inline cudaError_t cudaMemcpy(T *dst, const T *src, size_t count,
cudaMemcpyKind kind) {
return __cudaMemcpy((void *)dst, (const void *)src, count, kind);
}

inline cudaError_t __cudaMemset(void *devPtr, int value, size_t count,
cudaStream_t stream = 0) {
int DeviceNum = 0;
if (!omp_target_memset(devPtr, value, count, DeviceNum))
return __cudaomp_last_error = cudaErrorInvalidValue;
return __cudaomp_last_error = cudaSuccess;
}

template <class T>
inline cudaError_t cudaMemset(T *devPtr, int value, size_t count) {
return __cudaMemset((void *)devPtr, value, count);
}

inline cudaError_t cudaDeviceReset(void) {
cudaDeviceSynchronize();
// TODO: not implemented.
return __cudaomp_last_error = cudaSuccess;
}

#endif
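
For context, a minimal sketch of how the wrapper above is meant to be used once it is picked up as `cuda_runtime.h`. The `square` kernel and the build invocation are illustrative assumptions, not part of this patch; the `<<<...>>>` launch is lowered to `llvmLaunchKernel` (see offload/src/KernelLanguage/API.cpp below) and therefore lands on the device's default queue:

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical kernel; e.g. built with: clang++ -foffload-via-llvm example.cu
__global__ void square(int *A) { *A = *A * *A; }

int main() {
  int HostVal = 4;
  int *DevPtr = nullptr;
  if (cudaMalloc(&DevPtr, sizeof(int)) != cudaSuccess)
    return 1;
  // Backed by omp_target_memcpy (host -> device 0).
  cudaMemcpy(DevPtr, &HostVal, sizeof(int), cudaMemcpyHostToDevice);
  square<<<1, 1>>>(DevPtr); // Enqueued on the default queue.
  cudaDeviceSynchronize();  // Drains the default queue.
  cudaMemcpy(&HostVal, DevPtr, sizeof(int), cudaMemcpyDeviceToHost);
  cudaFree(DevPtr);
  printf("%d\n", HostVal); // Expected: 16
  return cudaGetLastError() == cudaSuccess ? 0 : 1;
}
```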
2 changes: 1 addition & 1 deletion llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
@@ -72,7 +72,7 @@ enum class IdentFlag {
#include "llvm/Frontend/OpenMP/OMPKinds.def"

// Version of the kernel argument format used by the omp runtime.
#define OMP_KERNEL_ARG_VERSION 3
#define OMP_KERNEL_ARG_VERSION 4

// Minimum version of the compiler that generates a kernel dynamic pointer.
#define OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR 3
2 changes: 1 addition & 1 deletion llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -90,7 +90,7 @@ __OMP_ARRAY_TYPE(Int32Arr3, Int32, 3)
__OMP_STRUCT_TYPE(Ident, ident_t, false, Int32, Int32, Int32, Int32, Int8Ptr)
__OMP_STRUCT_TYPE(KernelArgs, __tgt_kernel_arguments, false, Int32, Int32, VoidPtrPtr,
VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr, VoidPtrPtr,
Int64, Int64, Int32Arr3Ty, Int32Arr3Ty, Int32)
Int64, Int64, Int32Arr3Ty, Int32Arr3Ty, Int32, VoidPtr)
__OMP_STRUCT_TYPE(AsyncInfo, __tgt_async_info, false, Int8Ptr)
__OMP_STRUCT_TYPE(DependInfo, kmp_dep_info, false, SizeTy, SizeTy, Int8)
__OMP_STRUCT_TYPE(Task, kmp_task_ompbuilder_t, false, VoidPtr, VoidPtr, Int32, VoidPtr, VoidPtr)
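
For reference, the extended `__tgt_kernel_arguments` record corresponds to a host-side struct along these lines; this sketch mirrors `KernelArgsTy` from offload/include/Shared/APITypes.h and is not code added by the patch:

```cpp
struct __tgt_kernel_arguments {
  int32_t Version;      // OMP_KERNEL_ARG_VERSION, bumped to 4.
  int32_t NumArgs;
  void **ArgBasePtrs;
  void **ArgPtrs;
  int64_t *ArgSizes;
  int64_t *ArgTypes;
  void **ArgNames;
  void **ArgMappers;
  int64_t Tripcount;
  int64_t Flags;
  int32_t NumTeams[3];
  int32_t ThreadLimit[3];
  int32_t DynCGroupMem;
  void *AsyncInfoQueue; // New trailing VoidPtr: the queue for this launch.
};
```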
4 changes: 3 additions & 1 deletion llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -501,6 +501,7 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
constexpr const size_t MaxDim = 3;
Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);
Value *AsyncInfoQueue = Constant::getNullValue(Builder.getPtrTy());

assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());

@@ -529,7 +530,8 @@
Flags,
NumTeams3D,
NumThreads3D,
KernelArgs.DynCGGroupMem};
KernelArgs.DynCGGroupMem,
AsyncInfoQueue};
}

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
9 changes: 8 additions & 1 deletion offload/include/Shared/APITypes.h
@@ -85,6 +85,9 @@ struct __tgt_async_info {
/// ensure it is a valid location while the transfer to the device is
/// happening.
KernelLaunchEnvironmentTy KernelLaunchEnvironment;

/// Flag to indicate the Queue should be persistent.
bool PersistentQueue = false;
};

/// This struct contains all of the arguments to a target kernel region launch.
@@ -110,12 +113,16 @@ struct KernelArgsTy {
// The number of threads (for x,y,z dimension).
uint32_t ThreadLimit[3] = {0, 0, 0};
uint32_t DynCGroupMem = 0; // Amount of dynamic cgroup memory requested.
// A __tgt_async_info queue pointer to be used for the kernel and all
// associated device interactions. The operations are implicitly made
// non-blocking.
void *AsyncInfoQueue = nullptr;
};
static_assert(sizeof(KernelArgsTy().Flags) == sizeof(uint64_t),
"Invalid struct size");
static_assert(sizeof(KernelArgsTy) ==
(8 * sizeof(int32_t) + 3 * sizeof(int64_t) +
4 * sizeof(void **) + 2 * sizeof(int64_t *)),
5 * sizeof(void **) + 2 * sizeof(int64_t *)),
"Invalid struct size");

/// Flat array of kernel launch parameters and their total size.
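
The updated static_assert can be checked by hand. On an LP64 target the layout works out as follows (a worked check under common alignment assumptions; `DynCGroupMem` plus its tail padding accounts for the third int64-sized slot):

```cpp
// 8 x int32   : Version, NumArgs, NumTeams[3], ThreadLimit[3]       = 32 bytes
// 3 x int64   : Tripcount, Flags, DynCGroupMem + 4 bytes of padding = 24 bytes
// 5 x void ** : ArgBasePtrs, ArgPtrs, ArgNames, ArgMappers,
//               and the new AsyncInfoQueue                          = 40 bytes
// 2 x int64 * : ArgSizes, ArgTypes                                  = 16 bytes
//                                                             total = 112 bytes
static_assert(sizeof(KernelArgsTy) == 112, "expected LP64 layout");
```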
18 changes: 16 additions & 2 deletions offload/include/omptarget.h
@@ -136,8 +136,19 @@ class AsyncInfoTy {
/// Synchronization method to be used.
SyncTy SyncType;

AsyncInfoTy(DeviceTy &Device, SyncTy SyncType = SyncTy::BLOCKING)
: Device(Device), SyncType(SyncType) {}
AsyncInfoTy(DeviceTy &Device, void *AsyncInfoQueue)
: Device(Device), SyncType(AsyncInfoQueue ? SyncTy::NON_BLOCKING : SyncTy::BLOCKING) {
AsyncInfo.Queue = AsyncInfoQueue;
AsyncInfo.PersistentQueue = !!AsyncInfoQueue;
}
AsyncInfoTy(DeviceTy &Device, void *AsyncInfoQueue, SyncTy SyncType)
: Device(Device), SyncType(SyncType) {
AsyncInfo.Queue = AsyncInfoQueue;
AsyncInfo.PersistentQueue = !!AsyncInfoQueue;
}

~AsyncInfoTy() { synchronize(); }

/// Implicit conversion to the __tgt_async_info which is used in the
@@ -207,8 +218,9 @@ class TaskAsyncInfoWrapperTy {
void **TaskAsyncInfoPtr = nullptr;

public:
TaskAsyncInfoWrapperTy(DeviceTy &Device)
TaskAsyncInfoWrapperTy(DeviceTy &Device, void *AsyncInfoQueue = nullptr)
: ExecThreadID(__kmpc_global_thread_num(NULL)), LocalAsyncInfo(Device) {
assert(!AsyncInfoQueue && "Async tasks do not support predefined async queue pointers!");
// If we failed to acquired the current global thread id, we cannot
// re-enqueue the current task. Thus we should use the local blocking async
// info.
@@ -425,6 +437,8 @@ int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
void *VAddr, bool IsRecord, bool SaveOutput,
uint64_t &ReqPtrArgOffset);

void *__tgt_target_get_default_async_info_queue(void *Loc, int64_t DeviceId);

#ifdef __cplusplus
}
#endif
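
To illustrate the intended semantics of the new constructors, a sketch of runtime-internal usage (assuming a valid `DeviceTy &Device` and the declarations above; illustrative only):

```cpp
// Runtime-managed queue: synchronize() blocks, and the underlying queue is
// returned to the plugin's stream pool afterwards.
AsyncInfoTy Blocking(Device);

// Caller-provided queue: marked persistent, synchronization is non-blocking,
// and the queue survives synchronize() so later work can keep targeting it.
void *Queue =
    __tgt_target_get_default_async_info_queue(/*Loc=*/nullptr, /*DeviceId=*/0);
AsyncInfoTy Persistent(Device, Queue);
```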
21 changes: 15 additions & 6 deletions offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2215,8 +2215,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Err;

// Once the stream is synchronized, return it to stream pool and reset
// AsyncInfo. This is to make sure the synchronization only works for its
// own tasks.
// AsyncInfo if the queue is not persistent. This is to make sure the
// synchronization only works for its own tasks.
if (AsyncInfo.PersistentQueue)
return Plugin::success();

AsyncInfo.Queue = nullptr;
return AMDGPUStreamManager.returnResource(Stream);
}
@@ -2235,9 +2238,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (!(*CompletedOrErr))
return Plugin::success();

// Once the stream is completed, return it to stream pool and reset
// AsyncInfo. This is to make sure the synchronization only works for its
// own tasks.
// Once the stream is synchronized, return it to stream pool and reset
// AsyncInfo if the queue is not persistent. This is to make sure the
// synchronization only works for its own tasks.
if (AsyncInfo.PersistentQueue)
return Plugin::success();

AsyncInfo.Queue = nullptr;
return AMDGPUStreamManager.returnResource(Stream);
}
@@ -2450,7 +2456,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {

/// Initialize the async info for interoperability purposes.
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
// TODO: Implement this function.
AMDGPUStreamTy *Stream;
if (auto Err = getStream(AsyncInfoWrapper, Stream))
return Err;

return Plugin::success();
}

2 changes: 2 additions & 0 deletions offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1518,8 +1518,10 @@ Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,

Error GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) {
assert(AsyncInfoPtr && "Invalid async info");
assert(!(*AsyncInfoPtr) && "Already initialized async info");

*AsyncInfoPtr = new __tgt_async_info();
(*AsyncInfoPtr)->PersistentQueue = true;

AsyncInfoWrapperTy AsyncInfoWrapper(*this, *AsyncInfoPtr);

16 changes: 11 additions & 5 deletions offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -643,8 +643,11 @@ struct CUDADeviceTy : public GenericDeviceTy {
}

// Once the stream is synchronized, return it to stream pool and reset
// AsyncInfo. This is to make sure the synchronization only works for its
// own tasks.
// AsyncInfo if the queue is not persistent. This is to make sure the
// synchronization only works for its own tasks.
if (AsyncInfo.PersistentQueue)
return Plugin::success();

AsyncInfo.Queue = nullptr;
if (auto Err = CUDAStreamManager.returnResource(Stream))
return Err;
@@ -777,9 +780,12 @@ struct CUDADeviceTy : public GenericDeviceTy {
if (Res == CUDA_ERROR_NOT_READY)
return Plugin::success();

// Once the stream is synchronized and the operations completed (or an error
// occurs), return it to stream pool and reset AsyncInfo. This is to make
// sure the synchronization only works for its own tasks.
// Once the stream is synchronized, return it to stream pool and reset
// AsyncInfo if the queue is not persistent. This is to make sure the
// synchronization only works for its own tasks.
if (AsyncInfo.PersistentQueue)
return Plugin::success();

AsyncInfo.Queue = nullptr;
if (auto Err = CUDAStreamManager.returnResource(Stream))
return Err;
15 changes: 14 additions & 1 deletion offload/src/KernelLanguage/API.cpp
@@ -10,6 +10,9 @@

#include "Shared/APITypes.h"

#include "llvm/Frontend/OpenMP/OMPConstants.h"

#include <cstdint>
#include <cstdio>

struct dim3 {
@@ -55,10 +58,13 @@ unsigned __llvmPopCallConfiguration(dim3 *__grid_size, dim3 *__block_size,
int __tgt_target_kernel(void *Loc, int64_t DeviceId, int32_t NumTeams,
int32_t ThreadLimit, const void *HostPtr,
KernelArgsTy *Args);
void *__tgt_target_get_default_async_info_queue(void *Loc, int64_t DeviceId);

unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
void *args, size_t sharedMem, void *stream) {
int64_t DeviceNo = 0;
KernelArgsTy Args = {};
Args.Version = OMP_KERNEL_ARG_VERSION;
Args.DynCGroupMem = sharedMem;
Args.NumTeams[0] = gridDim.x;
Args.NumTeams[1] = gridDim.y;
@@ -68,6 +74,13 @@ unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
Args.ThreadLimit[2] = blockDim.z;
Args.ArgPtrs = reinterpret_cast<void **>(args);
Args.Flags.IsCUDA = true;
return __tgt_target_kernel(nullptr, 0, gridDim.x, blockDim.x, func, &Args);
if (stream)
Args.AsyncInfoQueue = stream;
else
Args.AsyncInfoQueue =
__tgt_target_get_default_async_info_queue(nullptr, DeviceNo);
int rv = __tgt_target_kernel(nullptr, DeviceNo, gridDim.x, blockDim.x, func,
&Args);
return rv;
}
}
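
The net effect of the fallback, sketched from the caller's side: two launches without an explicit stream now share the device's default queue and are therefore ordered against each other. `k1`, `k2`, and the packed argument buffers are hypothetical, and the null-queue synchronization mirrors the `cudaDeviceSynchronize` wrapper above:

```cpp
// Both launches resolve to the same default async info queue, so k2 executes
// after k1 without blocking the host in between.
llvmLaunchKernel((const void *)&k1, /*gridDim=*/{1, 1, 1},
                 /*blockDim=*/{32, 1, 1}, Args1, /*sharedMem=*/0,
                 /*stream=*/nullptr);
llvmLaunchKernel((const void *)&k2, /*gridDim=*/{1, 1, 1},
                 /*blockDim=*/{32, 1, 1}, Args2, /*sharedMem=*/0,
                 /*stream=*/nullptr);
// A null queue requests synchronization of the device's default queue, as in
// cudaDeviceSynchronize above.
__tgt_target_synchronize_async_info_queue(/*Loc=*/nullptr, /*DeviceNum=*/0,
                                          /*AsyncInfoQueue=*/nullptr);
```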
2 changes: 2 additions & 0 deletions offload/src/exports
@@ -29,6 +29,8 @@ VERS1.0 {
__tgt_target_kernel;
__tgt_target_kernel_nowait;
__tgt_target_nowait_query;
__tgt_target_get_default_async_info_queue;
__tgt_target_synchronize_async_info_queue;
__tgt_target_kernel_replay;
__tgt_activate_record_replay;
__tgt_mapper_num_components;