@@ -210,7 +210,7 @@ EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
210
210
}
211
211
212
212
// The helper function that calls omp_target_memcpy or omp_target_memcpy_rect
213
- static int libomp_target_memcpy_async_helper (kmp_int32 Gtid, kmp_task_t *Task) {
213
+ static int libomp_target_memcpy_async_task (kmp_int32 Gtid, kmp_task_t *Task) {
214
214
if (Task == nullptr )
215
215
return OFFLOAD_FAIL;
216
216
@@ -241,47 +241,129 @@ static int libomp_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
241
241
return Rc;
242
242
}
243
243
244
- // Allocate and launch helper task
245
- static int libomp_helper_task_creation (TargetMemcpyArgsTy *Args,
246
- int DepObjCount,
247
- omp_depend_t *DepObjList) {
244
+ static int libomp_target_memset_async_task (kmp_int32 Gtid, kmp_task_t *Task) {
245
+ if (!Task)
246
+ return OFFLOAD_FAIL;
247
+
248
+ auto *Args = reinterpret_cast <TargetMemsetArgsTy *>(Task->shareds );
249
+ if (!Args)
250
+ return OFFLOAD_FAIL;
251
+
252
+ // call omp_target_memset()
253
+ omp_target_memset (Args->Ptr , Args->C , Args->N , Args->DeviceNum );
254
+
255
+ delete Args;
256
+
257
+ return OFFLOAD_SUCCESS;
258
+ }
259
+
260
+ static inline void
261
+ convertDepObjVector (llvm::SmallVector<kmp_depend_info_t > &Vec, int DepObjCount,
262
+ omp_depend_t *DepObjList) {
263
+ for (int i = 0 ; i < DepObjCount; ++i) {
264
+ omp_depend_t DepObj = DepObjList[i];
265
+ Vec.push_back (*((kmp_depend_info_t *)DepObj));
266
+ }
267
+ }
268
+
269
+ template <class T >
270
+ static inline int
271
+ libomp_helper_task_creation (T *Args, int (*Fn)(kmp_int32, kmp_task_t *),
272
+ int DepObjCount, omp_depend_t *DepObjList) {
248
273
// Create global thread ID
249
274
int Gtid = __kmpc_global_thread_num (nullptr );
250
- int (*Fn)(kmp_int32, kmp_task_t *) = &libomp_target_memcpy_async_helper;
251
275
252
- // Setup the hidden helper flags;
276
+ // Setup the hidden helper flags
253
277
kmp_int32 Flags = 0 ;
254
278
kmp_tasking_flags_t *InputFlags = (kmp_tasking_flags_t *)&Flags;
255
279
InputFlags->hidden_helper = 1 ;
256
280
257
- // Alloc helper task
258
- kmp_task_t *Ptr = __kmpc_omp_target_task_alloc (nullptr , Gtid, Flags,
259
- sizeof (kmp_task_t ), 0 , Fn, -1 );
260
-
261
- if (Ptr == nullptr ) {
262
- // Task allocation failed, delete the argument object
281
+ // Alloc the helper task
282
+ kmp_task_t *Task = __kmpc_omp_target_task_alloc (
283
+ nullptr , Gtid, Flags, sizeof (kmp_task_t ), 0 , Fn, -1 );
284
+ if (!Task) {
263
285
delete Args;
264
-
265
286
return OFFLOAD_FAIL;
266
287
}
267
288
268
- // Setup the arguments passed to helper task
269
- Ptr ->shareds = Args;
289
+ // Setup the arguments for the helper task
290
+ Task ->shareds = Args;
270
291
271
- // Convert the type of depend objects
292
+ // Convert types of depend objects
272
293
llvm::SmallVector<kmp_depend_info_t > DepObjs;
273
- for (int i = 0 ; i < DepObjCount; i++) {
274
- omp_depend_t DepObj = DepObjList[i];
275
- DepObjs.push_back (*((kmp_depend_info_t *)DepObj));
276
- }
294
+ convertDepObjVector (DepObjs, DepObjCount, DepObjList);
277
295
278
296
// Launch the helper task
279
- int Rc = __kmpc_omp_task_with_deps (nullptr , Gtid, Ptr , DepObjCount,
297
+ int Rc = __kmpc_omp_task_with_deps (nullptr , Gtid, Task , DepObjCount,
280
298
DepObjs.data (), 0 , nullptr );
281
299
282
300
return Rc;
283
301
}
284
302
303
+ EXTERN void *omp_target_memset (void *Ptr, int ByteVal, size_t NumBytes,
304
+ int DeviceNum) {
305
+ TIMESCOPE ();
306
+ DP (" Call to omp_target_memset, device %d, device pointer %p, size %zu\n " ,
307
+ DeviceNum, Ptr, NumBytes);
308
+
309
+ // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation
310
+ // of unspecified behavior, see OpenMP spec).
311
+ if (!Ptr || NumBytes == 0 ) {
312
+ return Ptr;
313
+ }
314
+
315
+ if (DeviceNum == omp_get_initial_device ()) {
316
+ DP (" filling memory on host via memset" );
317
+ memset (Ptr, ByteVal, NumBytes); // ignore return value, memset() cannot fail
318
+ } else {
319
+ // TODO: replace the omp_target_memset() slow path with the fast path.
320
+ // That will require the ability to execute a kernel from within
321
+ // libomptarget.so (which we do not have at the moment).
322
+
323
+ // This is a very slow path: create a filled array on the host and upload
324
+ // it to the GPU device.
325
+ int InitialDevice = omp_get_initial_device ();
326
+ void *Shadow = omp_target_alloc (NumBytes, InitialDevice);
327
+ if (Shadow) {
328
+ (void )memset (Shadow, ByteVal, NumBytes);
329
+ (void )omp_target_memcpy (Ptr, Shadow, NumBytes, 0 , 0 , DeviceNum,
330
+ InitialDevice);
331
+ (void )omp_target_free (Shadow, InitialDevice);
332
+ } else {
333
+ // If the omp_target_alloc has failed, let's just not do anything.
334
+ // omp_target_memset does not have any good way to fail, so we
335
+ // simply avoid a catastrophic failure of the process for now.
336
+ DP (" omp_target_memset failed to fill memory due to error with "
337
+ " omp_target_alloc" );
338
+ }
339
+ }
340
+
341
+ DP (" omp_target_memset returns %p\n " , Ptr);
342
+ return Ptr;
343
+ }
344
+
345
+ EXTERN void *omp_target_memset_async (void *Ptr, int ByteVal, size_t NumBytes,
346
+ int DeviceNum, int DepObjCount,
347
+ omp_depend_t *DepObjList) {
348
+ DP (" Call to omp_target_memset_async, device %d, device pointer %p, size %zu" ,
349
+ DeviceNum, Ptr, NumBytes);
350
+
351
+ // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation
352
+ // of unspecified behavior, see OpenMP spec).
353
+ if (!Ptr || NumBytes == 0 )
354
+ return Ptr;
355
+
356
+ // Create the task object to deal with the async invocation
357
+ auto *Args = new TargetMemsetArgsTy{Ptr, ByteVal, NumBytes, DeviceNum};
358
+
359
+ // omp_target_memset_async() cannot fail via a return code, so ignore the
360
+ // return code of the helper function
361
+ (void )libomp_helper_task_creation (Args, &libomp_target_memset_async_task,
362
+ DepObjCount, DepObjList);
363
+
364
+ return Ptr;
365
+ }
366
+
285
367
EXTERN int omp_target_memcpy_async (void *Dst, const void *Src, size_t Length,
286
368
size_t DstOffset, size_t SrcOffset,
287
369
int DstDevice, int SrcDevice,
@@ -302,7 +384,8 @@ EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
302
384
Dst, Src, Length, DstOffset, SrcOffset, DstDevice, SrcDevice);
303
385
304
386
// Create and launch helper task
305
- int Rc = libomp_helper_task_creation (Args, DepObjCount, DepObjList);
387
+ int Rc = libomp_helper_task_creation (Args, &libomp_target_memcpy_async_task,
388
+ DepObjCount, DepObjList);
306
389
307
390
DP (" omp_target_memcpy_async returns %d\n " , Rc);
308
391
return Rc;
@@ -399,7 +482,8 @@ EXTERN int omp_target_memcpy_rect_async(
399
482
DstDimensions, SrcDimensions, DstDevice, SrcDevice);
400
483
401
484
// Create and launch helper task
402
- int Rc = libomp_helper_task_creation (Args, DepObjCount, DepObjList);
485
+ int Rc = libomp_helper_task_creation (Args, &libomp_target_memcpy_async_task,
486
+ DepObjCount, DepObjList);
403
487
404
488
DP (" omp_target_memcpy_rect_async returns %d\n " , Rc);
405
489
return Rc;
0 commit comments