
Commit a7ea753

[Offload] Introduce the concept of "default streams"
The offload APIs, and the CUDA wrappers in clang, now support "default streams" per thread (and per device). It should be per context, but we don't really expose that concept yet. The KernelArguments allow an LLVM/Offload user to provide an "AsyncInfoQueue", which is plugin-dependent and can later also be created outside of, or queried from, the runtime. User-managed "queues" are kept persistent, i.e., they are not returned to the pool once synchronized. The CUDA tests now synchronize via `cudaDeviceSynchronize` before checking the results.

Based on: llvm#94821
1 parent 34c8bf7 commit a7ea753
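
For context, the pattern the updated CUDA tests rely on is a launch on the calling thread's default stream followed by an explicit synchronization. A minimal sketch (kernel, Ptr, and DevPtr stand in for the tests' own definitions):

  // The launch goes to the per-thread default stream and is non-blocking;
  // host-visible results are only guaranteed after synchronization.
  kernel<<<1, 1>>>(Ptr, DevPtr, 42);
  if (cudaDeviceSynchronize() != cudaSuccess)
    return -1;
  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); // Safe to check results here.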

File tree

20 files changed: +143 −25 lines

clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h

Lines changed: 11 additions & 5 deletions

@@ -11,6 +11,7 @@
 #define __CUDA_RUNTIME_API__
 
 #include <cstddef>
+#include <cstdint>
 #include <optional>
 
 extern "C" {
@@ -21,6 +22,8 @@ int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
                       size_t DstOffset, size_t SrcOffset, int DstDevice,
                       int SrcDevice);
 void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum);
+int __tgt_target_synchronize_async_info_queue(void *Loc, int64_t DeviceNum,
+                                              void *AsyncInfoQueue);
 }
 
 // TODO: There are many fields missing in this enumeration.
@@ -55,6 +58,13 @@ inline cudaError_t cudaGetLastError() {
 // Returns the last error that has been produced without reseting it.
 inline cudaError_t cudaPeekAtLastError() { return __cudaomp_last_error; }
 
+inline cudaError_t cudaDeviceSynchronize() {
+  int DeviceNum = 0;
+  return __cudaomp_last_error =
+             (cudaError_t)__tgt_target_synchronize_async_info_queue(
+                 /*Loc=*/nullptr, DeviceNum, /*AsyncInfoQueue=*/nullptr);
+}
+
 inline cudaError_t __cudaMalloc(void **devPtr, size_t size) {
   int DeviceNum = 0;
   *devPtr = omp_target_alloc(size, DeviceNum);
@@ -118,12 +128,8 @@ inline cudaError_t cudaMemset(T *devPtr, int value, size_t count) {
   return __cudaMemset((void *)devPtr, value, count);
 }
 
-inline cudaError_t cudaDeviceSynchronize() {
-  // TODO: not implemented, not async yet.
-  return __cudaomp_last_error = cudaSuccess;
-}
-
 inline cudaError_t cudaDeviceReset(void) {
+  cudaDeviceSynchronize();
   // TODO: not implemented.
   return __cudaomp_last_error = cudaSuccess;
 }

llvm/include/llvm/Frontend/OpenMP/OMPConstants.h

Lines changed: 1 addition & 1 deletion

@@ -72,7 +72,7 @@ enum class IdentFlag {
 #include "llvm/Frontend/OpenMP/OMPKinds.def"
 
 // Version of the kernel argument format used by the omp runtime.
-#define OMP_KERNEL_ARG_VERSION 3
+#define OMP_KERNEL_ARG_VERSION 4
 
 // Minimum version of the compiler that generates a kernel dynamic pointer.
 #define OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR 3

llvm/include/llvm/Frontend/OpenMP/OMPKinds.def

Lines changed: 1 addition & 1 deletion

@@ -90,7 +90,7 @@ __OMP_ARRAY_TYPE(Int32Arr3, Int32, 3)
 __OMP_STRUCT_TYPE(Ident, ident_t, false, Int32, Int32, Int32, Int32, Int8Ptr)
 __OMP_STRUCT_TYPE(KernelArgs, __tgt_kernel_arguments, false, Int32, Int32, VoidPtrPtr,
                   VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr, VoidPtrPtr,
-                  Int64, Int64, Int32Arr3Ty, Int32Arr3Ty, Int32)
+                  Int64, Int64, Int32Arr3Ty, Int32Arr3Ty, Int32, VoidPtr)
 __OMP_STRUCT_TYPE(AsyncInfo, __tgt_async_info, false, Int8Ptr)
 __OMP_STRUCT_TYPE(DependInfo, kmp_dep_info, false, SizeTy, SizeTy, Int8)
 __OMP_STRUCT_TYPE(Task, kmp_task_ompbuilder_t, false, VoidPtr, VoidPtr, Int32, VoidPtr, VoidPtr)

llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp

Lines changed: 3 additions & 1 deletion

@@ -501,6 +501,7 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
   constexpr const size_t MaxDim = 3;
   Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
   Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);
+  Value *AsyncInfoQueue = Constant::getNullValue(Builder.getPtrTy());
 
   assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());
 
@@ -529,7 +530,8 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                 Flags,
                 NumTeams3D,
                 NumThreads3D,
-                KernelArgs.DynCGGroupMem};
+                KernelArgs.DynCGGroupMem,
+                AsyncInfoQueue};
 }
 
 void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {

offload/include/Shared/APITypes.h

Lines changed: 8 additions & 1 deletion

@@ -85,6 +85,9 @@ struct __tgt_async_info {
   /// ensure it is a valid location while the transfer to the device is
   /// happening.
   KernelLaunchEnvironmentTy KernelLaunchEnvironment;
+
+  /// Flag to indicate the Queue should be persistent.
+  bool PersistentQueue = false;
 };
 
 /// This struct contains all of the arguments to a target kernel region launch.
@@ -110,12 +113,16 @@ struct KernelArgsTy {
   // The number of threads (for x,y,z dimension).
   uint32_t ThreadLimit[3] = {0, 0, 0};
   uint32_t DynCGroupMem = 0; // Amount of dynamic cgroup memory requested.
+  // A __tgt_async_info queue pointer to be used for the kernel and all
+  // associated device interactions. The operations are implicitly made
+  // non-blocking.
+  void *AsyncInfoQueue = nullptr;
 };
 static_assert(sizeof(KernelArgsTy().Flags) == sizeof(uint64_t),
               "Invalid struct size");
 static_assert(sizeof(KernelArgsTy) ==
                   (8 * sizeof(int32_t) + 3 * sizeof(int64_t) +
-                   4 * sizeof(void **) + 2 * sizeof(int64_t *)),
+                   5 * sizeof(void **) + 2 * sizeof(int64_t *)),
              "Invalid struct size");
 
 /// Flat array of kernel launch parameters and their total size.
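
For illustration, a direct LLVM/Offload user could route a launch through an explicit queue via the new field. A rough sketch assuming the declarations above, where Queue, HostPtr, and KernelArgPtrs are placeholders for a queue handle obtained from the runtime, the kernel's host pointer, and its argument array:

  KernelArgsTy Args = {};
  Args.Version = OMP_KERNEL_ARG_VERSION; // Version 4 carries AsyncInfoQueue.
  Args.NumTeams[0] = 1;
  Args.ThreadLimit[0] = 1;
  Args.ArgPtrs = KernelArgPtrs;
  Args.AsyncInfoQueue = Queue; // nullptr keeps the default blocking behavior.
  __tgt_target_kernel(/*Loc=*/nullptr, /*DeviceId=*/0, /*NumTeams=*/1,
                      /*ThreadLimit=*/1, HostPtr, &Args);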

offload/include/omptarget.h

Lines changed: 16 additions & 2 deletions

@@ -136,8 +136,19 @@ class AsyncInfoTy {
   /// Synchronization method to be used.
   SyncTy SyncType;
 
-  AsyncInfoTy(DeviceTy &Device, SyncTy SyncType = SyncTy::BLOCKING)
+  AsyncInfoTy(DeviceTy &Device, SyncTy SyncType = SyncTy::BLOCKING)
       : Device(Device), SyncType(SyncType) {}
+  AsyncInfoTy(DeviceTy &Device, void *AsyncInfoQueue)
+      : Device(Device), SyncType(AsyncInfoQueue ? SyncTy::NON_BLOCKING : SyncTy::BLOCKING) {
+    AsyncInfo.Queue = AsyncInfoQueue;
+    AsyncInfo.PersistentQueue = !!AsyncInfoQueue;
+  }
+  AsyncInfoTy(DeviceTy &Device, void *AsyncInfoQueue, SyncTy SyncType)
+      : Device(Device), SyncType(SyncType) {
+    AsyncInfo.Queue = AsyncInfoQueue;
+    AsyncInfo.PersistentQueue = !!AsyncInfoQueue;
+  }
+
   ~AsyncInfoTy() { synchronize(); }
 
   /// Implicit conversion to the __tgt_async_info which is used in the
@@ -207,8 +218,9 @@ class TaskAsyncInfoWrapperTy {
   void **TaskAsyncInfoPtr = nullptr;
 
 public:
-  TaskAsyncInfoWrapperTy(DeviceTy &Device)
+  TaskAsyncInfoWrapperTy(DeviceTy &Device, void *AsyncInfoQueue= nullptr)
       : ExecThreadID(__kmpc_global_thread_num(NULL)), LocalAsyncInfo(Device) {
+    assert(!AsyncInfoQueue && "Async tasks do not support predefined async queue pointers!");
     // If we failed to acquired the current global thread id, we cannot
     // re-enqueue the current task. Thus we should use the local blocking async
     // info.
@@ -425,6 +437,8 @@ int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
                                  void *VAddr, bool IsRecord, bool SaveOutput,
                                  uint64_t &ReqPtrArgOffset);
 
+void *__tgt_target_get_default_queue(void *Loc, int64_t DeviceId);
+
 #ifdef __cplusplus
 }
 #endif

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 15 additions & 6 deletions

@@ -2215,8 +2215,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
       return Err;
 
     // Once the stream is synchronized, return it to stream pool and reset
-    // AsyncInfo. This is to make sure the synchronization only works for its
-    // own tasks.
+    // AsyncInfo if the queue is not persistent. This is to make sure the
+    // synchronization only works for its own tasks.
+    if (AsyncInfo.PersistentQueue)
+      return Plugin::success();
+
     AsyncInfo.Queue = nullptr;
     return AMDGPUStreamManager.returnResource(Stream);
   }
@@ -2235,9 +2238,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     if (!(*CompletedOrErr))
       return Plugin::success();
 
-    // Once the stream is completed, return it to stream pool and reset
-    // AsyncInfo. This is to make sure the synchronization only works for its
-    // own tasks.
+    // Once the stream is synchronized, return it to stream pool and reset
+    // AsyncInfo if the queue is not persistent. This is to make sure the
+    // synchronization only works for its own tasks.
+    if (AsyncInfo.PersistentQueue)
+      return Plugin::success();
+
     AsyncInfo.Queue = nullptr;
     return AMDGPUStreamManager.returnResource(Stream);
   }
@@ -2450,7 +2456,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
   /// Initialize the async info for interoperability purposes.
   Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
-    // TODO: Implement this function.
+    AMDGPUStreamTy *Stream;
+    if (auto Err = getStream(AsyncInfoWrapper, Stream))
+      return Err;
+
     return Plugin::success();
   }
 

offload/plugins-nextgen/common/src/PluginInterface.cpp

Lines changed: 2 additions & 0 deletions

@@ -1518,8 +1518,10 @@ Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
 
 Error GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) {
   assert(AsyncInfoPtr && "Invalid async info");
+  assert(!(*AsyncInfoPtr) && "Already initialized async info");
 
   *AsyncInfoPtr = new __tgt_async_info();
+  (*AsyncInfoPtr)->PersistentQueue = true;
 
   AsyncInfoWrapperTy AsyncInfoWrapper(*this, *AsyncInfoPtr);
 

offload/plugins-nextgen/cuda/src/rtl.cpp

Lines changed: 11 additions & 5 deletions

@@ -643,8 +643,11 @@ struct CUDADeviceTy : public GenericDeviceTy {
     }
 
     // Once the stream is synchronized, return it to stream pool and reset
-    // AsyncInfo. This is to make sure the synchronization only works for its
-    // own tasks.
+    // AsyncInfo if the queue is not persistent. This is to make sure the
+    // synchronization only works for its own tasks.
+    if (AsyncInfo.PersistentQueue)
+      return Plugin::success();
+
     AsyncInfo.Queue = nullptr;
     if (auto Err = CUDAStreamManager.returnResource(Stream))
       return Err;
@@ -777,9 +780,12 @@ struct CUDADeviceTy : public GenericDeviceTy {
     if (Res == CUDA_ERROR_NOT_READY)
       return Plugin::success();
 
-    // Once the stream is synchronized and the operations completed (or an error
-    // occurs), return it to stream pool and reset AsyncInfo. This is to make
-    // sure the synchronization only works for its own tasks.
+    // Once the stream is synchronized, return it to stream pool and reset
+    // AsyncInfo if the queue is not persistent. This is to make sure the
+    // synchronization only works for its own tasks.
+    if (AsyncInfo.PersistentQueue)
+      return Plugin::success();
+
     AsyncInfo.Queue = nullptr;
     if (auto Err = CUDAStreamManager.returnResource(Stream))
       return Err;

offload/src/KernelLanguage/API.cpp

Lines changed: 14 additions & 1 deletion

@@ -10,6 +10,9 @@
 
 #include "Shared/APITypes.h"
 
+#include "llvm/Frontend/OpenMP/OMPConstants.h"
+
+#include <cstdint>
 #include <cstdio>
 
 struct dim3 {
@@ -55,10 +58,13 @@ unsigned __llvmPopCallConfiguration(dim3 *__grid_size, dim3 *__block_size,
 int __tgt_target_kernel(void *Loc, int64_t DeviceId, int32_t NumTeams,
                         int32_t ThreadLimit, const void *HostPtr,
                         KernelArgsTy *Args);
+void *__tgt_target_get_default_async_info_queue(void *Loc, int64_t DeviceId);
 
 unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
                           void *args, size_t sharedMem, void *stream) {
+  int64_t DeviceNo = 0;
   KernelArgsTy Args = {};
+  Args.Version = OMP_KERNEL_ARG_VERSION;
  Args.DynCGroupMem = sharedMem;
  Args.NumTeams[0] = gridDim.x;
  Args.NumTeams[1] = gridDim.y;
@@ -68,6 +74,13 @@ unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
   Args.ThreadLimit[2] = blockDim.z;
   Args.ArgPtrs = reinterpret_cast<void **>(args);
   Args.Flags.IsCUDA = true;
-  return __tgt_target_kernel(nullptr, 0, gridDim.x, blockDim.x, func, &Args);
+  if (stream)
+    Args.AsyncInfoQueue = stream;
+  else
+    Args.AsyncInfoQueue =
+        __tgt_target_get_default_async_info_queue(nullptr, DeviceNo);
+  int rv = __tgt_target_kernel(nullptr, DeviceNo, gridDim.x, blockDim.x, func,
+                               &Args);
+  return rv;
 }
 }

offload/src/exports

Lines changed: 2 additions & 0 deletions

@@ -29,6 +29,8 @@ VERS1.0 {
     __tgt_target_kernel;
     __tgt_target_kernel_nowait;
     __tgt_target_nowait_query;
+    __tgt_target_get_default_async_info_queue;
+    __tgt_target_synchronize_async_info_queue;
     __tgt_target_kernel_replay;
     __tgt_activate_record_replay;
     __tgt_mapper_num_components;

offload/src/interface.cpp

Lines changed: 47 additions & 1 deletion

@@ -16,6 +16,7 @@
 #include "OpenMP/OMPT/Callback.h"
 #include "OpenMP/omp.h"
 #include "PluginManager.h"
+#include "Shared/APITypes.h"
 #include "omptarget.h"
 #include "private.h"
 
@@ -352,7 +353,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
   if (!DeviceOrErr)
     FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
 
-  TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr);
+  TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr, KernelArgs->AsyncInfoQueue);
   AsyncInfoTy &AsyncInfo = TargetAsyncInfo;
   /// RAII to establish tool anchors before and after target region
   OMPT_IF_BUILT(InterfaceRAII TargetRAII(
@@ -550,3 +551,48 @@ EXTERN void __tgt_target_nowait_query(void **AsyncHandle) {
   delete AsyncInfo;
   *AsyncHandle = nullptr;
 }
+
+EXTERN void *__tgt_target_get_default_async_info_queue(void *Loc,
+                                                        int64_t DeviceId) {
+  assert(PM && "Runtime not initialized");
+
+  static thread_local void **AsyncInfoQueue = nullptr;
+
+  if (!AsyncInfoQueue)
+    AsyncInfoQueue = reinterpret_cast<void **>(
+        calloc(PM->getNumDevices(), sizeof(AsyncInfoQueue[0])));
+
+  if (!AsyncInfoQueue[DeviceId]) {
+    auto DeviceOrErr = PM->getDevice(DeviceId);
+    if (!DeviceOrErr)
+      FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
+
+    __tgt_async_info *AsyncInfo = nullptr;
+    DeviceOrErr->RTL->init_async_info(DeviceId, &AsyncInfo);
+    AsyncInfoQueue[DeviceId] = AsyncInfo->Queue;
+  }
+
+  return AsyncInfoQueue[DeviceId];
+}
+
+EXTERN int __tgt_target_synchronize_async_info_queue(void *Loc,
+                                                     int64_t DeviceId,
+                                                     void *AsyncInfoQueue) {
+  assert(PM && "Runtime not initialized");
+
+  auto DeviceOrErr = PM->getDevice(DeviceId);
+  if (!DeviceOrErr)
+    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
+  if (!AsyncInfoQueue)
+    AsyncInfoQueue = __tgt_target_get_default_async_info_queue(Loc, DeviceId);
+  AsyncInfoTy AsyncInfo(*DeviceOrErr, AsyncInfoQueue,
+                        AsyncInfoTy::SyncTy::BLOCKING);
+
+  if (AsyncInfo.synchronize())
+    FATAL_MESSAGE0(1, "Error while querying the async queue for completion.\n");
+  [[maybe_unused]] __tgt_async_info *ASI = AsyncInfo;
+  assert(ASI->Queue);
+  assert(ASI->Queue && ASI->PersistentQueue);
+
+  return 0;
+}
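
Taken together, the two new entry points let host code outside the clang CUDA wrappers obtain the calling thread's default queue for a device and later block on it. A sketch against the interface above, with error handling elided:

  // Lazily creates and returns the per-thread, per-device default queue.
  void *Queue =
      __tgt_target_get_default_async_info_queue(/*Loc=*/nullptr, /*DeviceId=*/0);

  // ... launch kernels and issue transfers that use this queue ...

  // Blocks until all work on the queue has completed; the queue is persistent
  // and is not returned to the pool. Passing nullptr synchronizes the calling
  // thread's default queue for the device instead.
  __tgt_target_synchronize_async_info_queue(/*Loc=*/nullptr, /*DeviceId=*/0,
                                            Queue);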

offload/src/omptarget.cpp

Lines changed: 1 addition & 1 deletion

@@ -49,7 +49,7 @@ int AsyncInfoTy::synchronize() {
   case SyncTy::BLOCKING:
     // If we have a queue we need to synchronize it now.
     Result = Device.synchronize(*this);
-    assert(AsyncInfo.Queue == nullptr &&
+    assert((AsyncInfo.PersistentQueue || !AsyncInfo.Queue) &&
           "The device plugin should have nulled the queue to indicate there "
           "are no outstanding actions!");
    break;

offload/test/offloading/CUDA/basic_api_malloc_free.cu

Lines changed: 1 addition & 0 deletions

@@ -32,6 +32,7 @@ int main(int argc, char **argv) {
   printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
   // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0
   kernel<<<1, 1>>>(Ptr, DevPtr, 42);
+  cudaDeviceSynchronize();
   printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
   // CHECK: Ptr [[Ptr]], *Ptr: 42
   Err = cudaFree(DevPtr);

offload/test/offloading/CUDA/basic_api_memcpy.cu

Lines changed: 1 addition & 0 deletions

@@ -31,6 +31,7 @@ int main(int argc, char **argv) {
   printf("Res: %i\n", Res);
   // CHECK: Res: 0
   kernel<<<1, 1>>>(DevPtr, 42);
+  cudaDeviceSynchronize();
   Err = cudaMemcpy(HstPtr, DevPtr, 42 * sizeof(int), cudaMemcpyDeviceToHost);
   if (Err != cudaSuccess)
     return -1;

offload/test/offloading/CUDA/basic_api_memset.cu

Lines changed: 1 addition & 0 deletions

@@ -34,6 +34,7 @@ int main(int argc, char **argv) {
   printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
   // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0
   kernel<<<1, 1>>>(Ptr, DevPtr, 42);
+  cudaDeviceSynchronize();
   printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
   // CHECK: Ptr [[Ptr]], *Ptr: 42
   Err = cudaFree(DevPtr);

offload/test/offloading/CUDA/basic_launch.cu

Lines changed: 2 additions & 0 deletions

@@ -10,6 +10,7 @@
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
 
+#include <cuda_runtime.h>
 #include <stdio.h>
 
 extern "C" {
@@ -26,6 +27,7 @@ int main(int argc, char **argv) {
   printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
   // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7
   square<<<1, 1>>>(Ptr);
+  cudaDeviceSynchronize();
   printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
   // CHECK: Ptr [[Ptr]], *Ptr: 42
   llvm_omp_target_free_shared(Ptr, DevNo);
