Skip to content

Commit f93a697

Browse files
authored
[libomptarget][OpenMP] Initial implementation of omp_target_memset() and omp_target_memset_async() (#68706)
Implement a slow-path version of omp_target_memset*(). There is a TODO to implement a fast path that uses an on-device kernel instead of the host-based memory fill operation. This may require some additional plumbing to have kernels in libomptarget.so.
1 parent 970e745 commit f93a697

File tree

10 files changed

+220
-24
lines changed

10 files changed

+220
-24
lines changed

openmp/libomptarget/include/omptarget.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,7 @@ int omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
312312
const size_t *DstDimensions,
313313
const size_t *SrcDimensions, int DstDevice,
314314
int SrcDevice);
315+
void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum);
315316
int omp_target_associate_ptr(const void *HostPtr, const void *DevicePtr,
316317
size_t Size, size_t DeviceOffset, int DeviceNum);
317318
int omp_target_disassociate_ptr(const void *HostPtr, int DeviceNum);

openmp/libomptarget/src/api.cpp

Lines changed: 108 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
210210
}
211211

212212
// The helper function that calls omp_target_memcpy or omp_target_memcpy_rect
213-
static int libomp_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
213+
static int libomp_target_memcpy_async_task(kmp_int32 Gtid, kmp_task_t *Task) {
214214
if (Task == nullptr)
215215
return OFFLOAD_FAIL;
216216

@@ -241,47 +241,129 @@ static int libomp_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
241241
return Rc;
242242
}
243243

244-
// Allocate and launch helper task
245-
static int libomp_helper_task_creation(TargetMemcpyArgsTy *Args,
246-
int DepObjCount,
247-
omp_depend_t *DepObjList) {
244+
// Task body behind omp_target_memset_async(): unpacks the TargetMemsetArgsTy
// stored in the task's `shareds` slot, performs the fill through the
// synchronous omp_target_memset(), and then releases the argument object.
// Returns OFFLOAD_FAIL when the task or its arguments are missing and
// OFFLOAD_SUCCESS otherwise.
static int libomp_target_memset_async_task(kmp_int32 Gtid, kmp_task_t *Task) {
  if (Task == nullptr)
    return OFFLOAD_FAIL;

  TargetMemsetArgsTy *MemsetArgs =
      reinterpret_cast<TargetMemsetArgsTy *>(Task->shareds);
  if (MemsetArgs == nullptr)
    return OFFLOAD_FAIL;

  // omp_target_memset() only hands back the pointer it was given, so there is
  // no status to inspect here.
  omp_target_memset(MemsetArgs->Ptr, MemsetArgs->C, MemsetArgs->N,
                    MemsetArgs->DeviceNum);

  // The task is the last user of the argument object.
  delete MemsetArgs;
  return OFFLOAD_SUCCESS;
}
259+
260+
static inline void
261+
convertDepObjVector(llvm::SmallVector<kmp_depend_info_t> &Vec, int DepObjCount,
262+
omp_depend_t *DepObjList) {
263+
for (int i = 0; i < DepObjCount; ++i) {
264+
omp_depend_t DepObj = DepObjList[i];
265+
Vec.push_back(*((kmp_depend_info_t *)DepObj));
266+
}
267+
}
268+
269+
// Shared launcher for the asynchronous target memory routines.
// NOTE: the rows below are a rendered diff; rows whose marker ends in '-'
// belong to the removed, non-template version of this function.
//
// In the added version the function:
//   * allocates a hidden-helper task that will run Fn,
//   * stores Args in the task's `shareds` slot so Fn can retrieve it,
//   * converts DepObjList into kmp_depend_info_t records, and
//   * launches the task with those dependences.
// Returns OFFLOAD_FAIL (after deleting Args) when the task cannot be
// allocated; otherwise returns the value of __kmpc_omp_task_with_deps().
template <class T>
270+
static inline int
271+
libomp_helper_task_creation(T *Args, int (*Fn)(kmp_int32, kmp_task_t *),
272+
int DepObjCount, omp_depend_t *DepObjList) {
248273
// Create global thread ID
249274
int Gtid = __kmpc_global_thread_num(nullptr);
250-
int (*Fn)(kmp_int32, kmp_task_t *) = &libomp_target_memcpy_async_helper;
251275

252-
// Setup the hidden helper flags;
276+
// Setup the hidden helper flags
253277
kmp_int32 Flags = 0;
254278
kmp_tasking_flags_t *InputFlags = (kmp_tasking_flags_t *)&Flags;
255279
InputFlags->hidden_helper = 1;
256280

257-
// Alloc helper task
258-
kmp_task_t *Ptr = __kmpc_omp_target_task_alloc(nullptr, Gtid, Flags,
259-
sizeof(kmp_task_t), 0, Fn, -1);
260-
261-
if (Ptr == nullptr) {
262-
// Task allocation failed, delete the argument object
281+
// Alloc the helper task
282+
kmp_task_t *Task = __kmpc_omp_target_task_alloc(
283+
nullptr, Gtid, Flags, sizeof(kmp_task_t), 0, Fn, -1);
284+
if (!Task) {
263285
delete Args;
264-
265286
return OFFLOAD_FAIL;
266287
}
267288

268-
// Setup the arguments passed to helper task
269-
Ptr->shareds = Args;
289+
// Setup the arguments for the helper task
290+
Task->shareds = Args;
270291

271-
// Convert the type of depend objects
292+
// Convert types of depend objects
272293
llvm::SmallVector<kmp_depend_info_t> DepObjs;
273-
for (int i = 0; i < DepObjCount; i++) {
274-
omp_depend_t DepObj = DepObjList[i];
275-
DepObjs.push_back(*((kmp_depend_info_t *)DepObj));
276-
}
294+
convertDepObjVector(DepObjs, DepObjCount, DepObjList);
277295

278296
// Launch the helper task
279-
int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Ptr, DepObjCount,
297+
int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Task, DepObjCount,
280298
DepObjs.data(), 0, nullptr);
281299

282300
return Rc;
283301
}
284302

303+
// Fills the first NumBytes bytes of the memory at Ptr, which belongs to
// device DeviceNum, with the byte value ByteVal.
//
// Ptr       - start of the region to fill; a null pointer makes the call a
//             no-op.
// ByteVal   - fill value, forwarded unchanged to memset().
// NumBytes  - number of bytes to fill; zero makes the call a no-op.
// DeviceNum - device owning Ptr; the initial device is filled directly with
//             memset(), any other device via a host staging buffer.
//
// Always returns Ptr. The routine has no way to report failure, so problems
// on the device path are only visible in the debug output.
EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes,
                               int DeviceNum) {
  TIMESCOPE();
  DP("Call to omp_target_memset, device %d, device pointer %p, size %zu\n",
     DeviceNum, Ptr, NumBytes);

  // Behave as a no-op if NumBytes==0 or if Ptr is nullptr (as a useful
  // implementation of unspecified behavior, see OpenMP spec).
  if (!Ptr || NumBytes == 0)
    return Ptr;

  const int InitialDevice = omp_get_initial_device();

  if (DeviceNum == InitialDevice) {
    DP("filling memory on host via memset\n");
    memset(Ptr, ByteVal, NumBytes); // ignore return value, memset() cannot fail
  } else {
    // TODO: replace the omp_target_memset() slow path with the fast path.
    // That will require the ability to execute a kernel from within
    // libomptarget.so (which we do not have at the moment).

    // This is a very slow path: create a filled array on the host and upload
    // it to the GPU device.
    void *Shadow = omp_target_alloc(NumBytes, InitialDevice);
    if (Shadow) {
      (void)memset(Shadow, ByteVal, NumBytes);
      int Rc = omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum,
                                 InitialDevice);
      if (Rc != OFFLOAD_SUCCESS) {
        // There is no error channel to the caller; at least leave a trace.
        DP("omp_target_memset failed to fill memory due to error with "
           "omp_target_memcpy\n");
      }
      (void)omp_target_free(Shadow, InitialDevice);
    } else {
      // If the omp_target_alloc has failed, let's just not do anything.
      // omp_target_memset does not have any good way to fail, so we
      // simply avoid a catastrophic failure of the process for now.
      DP("omp_target_memset failed to fill memory due to error with "
         "omp_target_alloc\n");
    }
  }

  DP("omp_target_memset returns %p\n", Ptr);
  return Ptr;
}
344+
345+
// Asynchronous counterpart of omp_target_memset(): packages the request into
// a TargetMemsetArgsTy and hands it to a helper task that performs the fill
// once the given dependences are satisfied.
//
// Ptr, ByteVal, NumBytes, DeviceNum - forwarded to omp_target_memset() by the
//                                     helper task.
// DepObjCount, DepObjList           - dependences the helper task waits on.
//
// Always returns Ptr; a null Ptr or a zero NumBytes makes the call a no-op.
EXTERN void *omp_target_memset_async(void *Ptr, int ByteVal, size_t NumBytes,
                                     int DeviceNum, int DepObjCount,
                                     omp_depend_t *DepObjList) {
  DP("Call to omp_target_memset_async, device %d, device pointer %p, size "
     "%zu\n",
     DeviceNum, Ptr, NumBytes);

  // Behave as a no-op if NumBytes==0 or if Ptr is nullptr (as a useful
  // implementation of unspecified behavior, see OpenMP spec).
  if (!Ptr || NumBytes == 0)
    return Ptr;

  // Create the task object to deal with the async invocation. Ownership moves
  // to libomp_helper_task_creation(): it deletes the object if the task cannot
  // be allocated, otherwise the task function deletes it after the fill.
  auto *Args = new TargetMemsetArgsTy{Ptr, ByteVal, NumBytes, DeviceNum};

  // omp_target_memset_async() cannot fail via a return code, so ignore the
  // return code of the helper function
  (void)libomp_helper_task_creation(Args, &libomp_target_memset_async_task,
                                    DepObjCount, DepObjList);

  return Ptr;
}
366+
285367
EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
286368
size_t DstOffset, size_t SrcOffset,
287369
int DstDevice, int SrcDevice,
@@ -302,7 +384,8 @@ EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
302384
Dst, Src, Length, DstOffset, SrcOffset, DstDevice, SrcDevice);
303385

304386
// Create and launch helper task
305-
int Rc = libomp_helper_task_creation(Args, DepObjCount, DepObjList);
387+
int Rc = libomp_helper_task_creation(Args, &libomp_target_memcpy_async_task,
388+
DepObjCount, DepObjList);
306389

307390
DP("omp_target_memcpy_async returns %d\n", Rc);
308391
return Rc;
@@ -399,7 +482,8 @@ EXTERN int omp_target_memcpy_rect_async(
399482
DstDimensions, SrcDimensions, DstDevice, SrcDevice);
400483

401484
// Create and launch helper task
402-
int Rc = libomp_helper_task_creation(Args, DepObjCount, DepObjList);
485+
int Rc = libomp_helper_task_creation(Args, &libomp_target_memcpy_async_task,
486+
DepObjCount, DepObjList);
403487

404488
DP("omp_target_memcpy_rect_async returns %d\n", Rc);
405489
return Rc;

openmp/libomptarget/src/exports

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ VERS1.0 {
4444
omp_target_memcpy_rect;
4545
omp_target_memcpy_async;
4646
omp_target_memcpy_rect_async;
47+
omp_target_memset;
48+
omp_target_memset_async;
4749
omp_target_associate_ptr;
4850
omp_target_disassociate_ptr;
4951
llvm_omp_target_alloc_host;

openmp/libomptarget/src/private.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,17 @@ struct TargetMemcpyArgsTy {
253253
DstOffsets(DstOffsets), SrcOffsets(SrcOffsets),
254254
DstDimensions(DstDimensions), SrcDimensions(SrcDimensions){};
255255
};
256+
257+
// Argument bundle describing a single memset request. It is handed to the
// helper task that carries out omp_target_memset_async(), which forwards the
// fields, in declaration order, to omp_target_memset().
struct TargetMemsetArgsTy {
  void *Ptr;     // start of the memory region to fill
  int C;         // fill value
  size_t N;      // number of bytes to fill
  int DeviceNum; // device number passed along with Ptr

  // Deliberately constructor-free: the type is plain old data and is
  // brace-initialized by its user.
};
266+
256267
// Invalid GTID as defined by libomp; keep in sync
257268
#define KMP_GTID_DNE (-2)
258269
#ifdef __cplusplus
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
// RUN: %libomptarget-compile-and-run-generic
2+
3+
#include "stdio.h"
4+
#include <omp.h>
5+
#include <stdlib.h>
6+
7+
// Allocates 130 ints on the default device (or on the initial device when no
// usable default device exists), zero-fills them with omp_target_memset(),
// copies 128 of them back with omp_target_memcpy_async() and aborts unless
// every copied element is zero. Exits with 0 on success, and also when the
// allocation itself fails.
int main() {
  int d = omp_get_default_device();
  int id = omp_get_initial_device();
  int q2[128], i;
  void *p;
  void *result;

  // Fall back to the initial device if the default device number is not a
  // valid target device.
  if (d < 0 || d >= omp_get_num_devices())
    d = id;

  p = omp_target_alloc(130 * sizeof(int), d);
  if (p == NULL)
    return 0;

  // omp_target_memset() must hand back the pointer it was given.
  result = omp_target_memset(p, 0, 130 * sizeof(int), d);
  if (result != p) {
    abort();
  }

  // Pre-fill the destination so that (apart from element 0) a copy that did
  // not happen is detectable below.
  for (i = 0; i < 128; ++i)
    q2[i] = i;

  // Copy 128 ints starting one int into the device buffer.
  if (omp_target_memcpy_async(q2, p, 128 * sizeof(int), 0, sizeof(int), id, d,
                              0, NULL))
    abort();

#pragma omp taskwait

  for (i = 0; i < 128; ++i)
    if (q2[i] != 0)
      abort();

  omp_target_free(p, d);

  return 0;
}

openmp/runtime/src/dllexports

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -518,6 +518,8 @@ kmp_set_warnings_off 780
518518
omp_target_memcpy_rect 887
519519
omp_target_associate_ptr 888
520520
omp_target_disassociate_ptr 889
521+
omp_target_memset 3000
522+
omp_target_memset_async 3001
521523
%endif
522524

523525
kmp_set_disp_num_buffers 890

openmp/runtime/src/include/omp.h.var

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,11 @@
236236
extern int __KAI_KMPC_CONVENTION omp_target_memcpy_rect_async(void *, const void *, size_t, int, const size_t *,
237237
const size_t *, const size_t *, const size_t *, const size_t *, int, int,
238238
int, omp_depend_t *);
239+
240+
/* OpenMP 6.0 device memory routines */
241+
extern void * __KAI_KMPC_CONVENTION omp_target_memset(void *, int, size_t, int);
242+
extern void * __KAI_KMPC_CONVENTION omp_target_memset_async(void *, int, size_t, int, int, omp_depend_t *);
243+
239244
/*!
240245
* The `omp_get_mapped_ptr` routine returns the device pointer that is associated with a host pointer for a given device.
241246
*/

openmp/runtime/src/include/omp_lib.f90.var

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -635,6 +635,28 @@
635635
integer (omp_depend_kind), optional :: depobj_list(*)
636636
end function omp_target_memcpy_rect_async
637637

638+
! Interface to the C routine omp_target_memset: fills count bytes of the
! memory at ptr on device device_num with the value val. The result is a C
! pointer.
function omp_target_memset(ptr, val, count, device_num) bind(c)
  use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
  type(c_ptr) :: omp_target_memset
  type(c_ptr), value :: ptr
  integer(c_int), value :: val
  integer(c_size_t), value :: count
  integer(c_int), value :: device_num
end function omp_target_memset
646+
647+
! Interface to the C routine omp_target_memset_async: like omp_target_memset,
! with an additional list of depobj_count dependence objects. The result is a
! C pointer.
function omp_target_memset_async(ptr, val, count, device_num, &
    depobj_count, depobj_list) bind(c)
  use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
  use omp_lib_kinds
  type(c_ptr) :: omp_target_memset_async
  type(c_ptr), value :: ptr
  integer(c_int), value :: val
  integer(c_size_t), value :: count
  integer(c_int), value :: device_num
  integer(c_int), value :: depobj_count
  integer(omp_depend_kind), optional :: depobj_list(*)
end function omp_target_memset_async
659+
638660
function omp_target_associate_ptr(host_ptr, device_ptr, size, &
639661
device_offset, device_num) bind(c)
640662
use omp_lib_kinds

openmp/runtime/src/include/omp_lib.h.var

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -732,6 +732,28 @@
732732
integer(omp_depend_kind), optional :: depobj_list(*)
733733
end function omp_target_memcpy_rect_async
734734

735+
! Interface to the C routine omp_target_memset: fills count bytes of the
! memory at ptr on device device_num with the value val. The result is a C
! pointer.
function omp_target_memset(ptr, val, count, device_num) bind(c)
  use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
  type(c_ptr) :: omp_target_memset
  type(c_ptr), value :: ptr
  integer(c_int), value :: val
  integer(c_size_t), value :: count
  integer(c_int), value :: device_num
end function omp_target_memset
743+
744+
! Interface to the C routine omp_target_memset_async: like omp_target_memset,
! with an additional list of depobj_count dependence objects. The result is a
! C pointer.
! NOTE(review): the continuation line carries a leading '&' to match the
! continuation style of the neighbouring interfaces in this file; confirm the
! column placement against the real file, whose indentation is not visible
! here.
function omp_target_memset_async(ptr, val, count, device_num, &
  & depobj_count, depobj_list) bind(c)
  use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
  use omp_lib_kinds
  type(c_ptr) :: omp_target_memset_async
  type(c_ptr), value :: ptr
  integer(c_int), value :: val
  integer(c_size_t), value :: count
  integer(c_int), value :: device_num
  integer(c_int), value :: depobj_count
  integer(omp_depend_kind), optional :: depobj_list(*)
end function omp_target_memset_async
756+
735757
function omp_target_associate_ptr(host_ptr, device_ptr, size, &
736758
& device_offset, device_num) bind(c)
737759
use, intrinsic :: iso_c_binding, only : c_ptr, c_size_t, c_int

openmp/runtime/src/kmp_ftn_os.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,8 @@
116116
#define FTN_TARGET_IS_PRESENT omp_target_is_present
117117
#define FTN_TARGET_MEMCPY omp_target_memcpy
118118
#define FTN_TARGET_MEMCPY_RECT omp_target_memcpy_rect
119+
#define FTN_TARGET_MEMSET omp_target_memset
120+
#define FTN_TARGET_MEMSET_ASYNC omp_target_memset_async
119121
#define FTN_TARGET_ASSOCIATE_PTR omp_target_associate_ptr
120122
#define FTN_TARGET_DISASSOCIATE_PTR omp_target_disassociate_ptr
121123
#endif

0 commit comments

Comments
 (0)