Skip to content

Commit 9b6ea5e

Browse files
Felipe Cabarcas (fel-cab)
authored
[OpenMP] Improve omp offload profiler (#68016)
Summary: Adds information to the kernel and API calls recorded by the LIBOMPTARGET runtime profiler. Key changes: * Added details to runtime calls to give a better understanding of how the application is executing, for example the number of teams requested by the user and the size of memory transfers. * Changed the profile timer granularity from 'us' to 'ns', since 'us' was too coarse-grained to register some important details, such as the duration of key kernels. * Removed profiling of calls that are neither API nor runtime calls, to reduce the complexity of the profile for application developers. --------- Co-authored-by: Felipe Cabarcas <[email protected]> Co-authored-by: fel-cab <[email protected]>
1 parent 1ba4a45 commit 9b6ea5e

File tree

4 files changed

+50
-29
lines changed

4 files changed

+50
-29
lines changed

openmp/libomptarget/include/Shared/Profile.h

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -97,4 +97,16 @@ class Profiler {
9797
std::string RTM = RegionTypeMsg; \
9898
llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
9999

100+
/// Time spent in the current scope, assigned to the regionType
101+
/// with details from runtime
102+
#define TIMESCOPE_WITH_DETAILS_AND_IDENT(RegionTypeMsg, Details, IDENT) \
103+
SourceInfo SI(IDENT); \
104+
std::string ProfileLocation = SI.getProfileLocation(); \
105+
llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + Details)
106+
107+
/// Time spent in the current scope, assigned to the function name and source
108+
/// with details
109+
#define TIMESCOPE_WITH_DETAILS(Details) \
110+
llvm::TimeTraceScope TimeScope(__FUNCTION__, Details)
111+
100112
#endif // OMPTARGET_SHARED_PROFILE_H

openmp/libomptarget/src/OpenMP/API.cpp

Lines changed: 13 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -83,6 +83,8 @@ EXTERN int omp_get_initial_device(void) {
8383
}
8484

8585
EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
86+
TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) +
87+
";size=" + std::to_string(Size));
8688
return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
8789
}
8890

@@ -99,6 +101,7 @@ EXTERN void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum) {
99101
}
100102

101103
EXTERN void omp_target_free(void *Ptr, int DeviceNum) {
104+
TIMESCOPE();
102105
return targetFreeExplicit(Ptr, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
103106
}
104107

@@ -161,7 +164,9 @@ EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) {
161164
EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
162165
size_t DstOffset, size_t SrcOffset, int DstDevice,
163166
int SrcDevice) {
164-
TIMESCOPE();
167+
TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
168+
";src_dev=" + std::to_string(SrcDevice) +
169+
";size=" + std::to_string(Length));
165170
DP("Call to omp_target_memcpy, dst device %d, src device %d, "
166171
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
167172
"src offset %zu, length %zu\n",
@@ -400,7 +405,9 @@ EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
400405
size_t DstOffset, size_t SrcOffset,
401406
int DstDevice, int SrcDevice,
402407
int DepObjCount, omp_depend_t *DepObjList) {
403-
TIMESCOPE();
408+
TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
409+
";src_dev=" + std::to_string(SrcDevice) +
410+
";size=" + std::to_string(Length));
404411
DP("Call to omp_target_memcpy_async, dst device %d, src device %d, "
405412
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
406413
"src offset %zu, length %zu\n",
@@ -429,7 +436,6 @@ omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
429436
const size_t *DstOffsets, const size_t *SrcOffsets,
430437
const size_t *DstDimensions, const size_t *SrcDimensions,
431438
int DstDevice, int SrcDevice) {
432-
TIMESCOPE();
433439
DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, "
434440
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
435441
"src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
@@ -488,7 +494,10 @@ EXTERN int omp_target_memcpy_rect_async(
488494
const size_t *Volume, const size_t *DstOffsets, const size_t *SrcOffsets,
489495
const size_t *DstDimensions, const size_t *SrcDimensions, int DstDevice,
490496
int SrcDevice, int DepObjCount, omp_depend_t *DepObjList) {
491-
TIMESCOPE();
497+
TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
498+
";src_dev=" + std::to_string(SrcDevice) +
499+
";size=" + std::to_string(ElementSize) +
500+
";num_dims=" + std::to_string(NumDims));
492501
DP("Call to omp_target_memcpy_rect_async, dst device %d, src device %d, "
493502
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
494503
"src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "

openmp/libomptarget/src/interface.cpp

Lines changed: 14 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -33,14 +33,12 @@ using namespace llvm::omp::target::ompt;
3333
////////////////////////////////////////////////////////////////////////////////
3434
/// adds requires flags
3535
EXTERN void __tgt_register_requires(int64_t Flags) {
36-
TIMESCOPE();
3736
PM->addRequirements(Flags);
3837
}
3938

4039
////////////////////////////////////////////////////////////////////////////////
4140
/// adds a target shared library to the target execution image
4241
EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) {
43-
TIMESCOPE();
4442
if (PM->delayRegisterLib(Desc))
4543
return;
4644

@@ -54,7 +52,6 @@ EXTERN void __tgt_init_all_rtls() { PM->initAllPlugins(); }
5452
////////////////////////////////////////////////////////////////////////////////
5553
/// unloads a target shared library
5654
EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) {
57-
TIMESCOPE();
5855
PM->unregisterLib(Desc);
5956
}
6057

@@ -68,7 +65,8 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
6865
static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
6966
"TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
7067

71-
TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc);
68+
TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy",
69+
"NumArgs=" + std::to_string(ArgNum), Loc);
7270

7371
DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
7472
RegionName, DeviceId, ArgNum);
@@ -240,9 +238,6 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
240238
KernelArgsTy *KernelArgs) {
241239
static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
242240
"Target AsyncInfoTy must be convertible to AsyncInfoTy.");
243-
244-
TIMESCOPE_WITH_IDENT(Loc);
245-
246241
DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
247242
"\n",
248243
DeviceId, DPxPTR(HostPtr));
@@ -267,6 +262,11 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
267262
assert(KernelArgs->ThreadLimit[0] == static_cast<uint32_t>(ThreadLimit) &&
268263
!KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
269264
"OpenMP interface should not use multiple dimensions");
265+
TIMESCOPE_WITH_DETAILS_AND_IDENT(
266+
"Runtime: target exe",
267+
"NumTeams=" + std::to_string(NumTeams) +
268+
";NumArgs=" + std::to_string(KernelArgs->NumArgs),
269+
Loc);
270270

271271
if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
272272
printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs,
@@ -297,13 +297,14 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
297297

298298
int Rc = OFFLOAD_SUCCESS;
299299
Rc = target(Loc, *DeviceOrErr, HostPtr, *KernelArgs, AsyncInfo);
300+
{ // required to show synchronization
301+
TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: syncronize", "", Loc);
302+
if (Rc == OFFLOAD_SUCCESS)
303+
Rc = AsyncInfo.synchronize();
300304

301-
if (Rc == OFFLOAD_SUCCESS)
302-
Rc = AsyncInfo.synchronize();
303-
304-
handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
305-
assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
306-
305+
handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
306+
assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
307+
}
307308
return OMP_TGT_SUCCESS;
308309
}
309310

@@ -402,7 +403,6 @@ EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId,
402403

403404
// Get the current number of components for a user-defined mapper.
404405
EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
405-
TIMESCOPE();
406406
auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle;
407407
int64_t Size = MapperComponentsPtr->Components.size();
408408
DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n",
@@ -414,7 +414,6 @@ EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
414414
EXTERN void __tgt_push_mapper_component(void *RtMapperHandle, void *Base,
415415
void *Begin, int64_t Size, int64_t Type,
416416
void *Name) {
417-
TIMESCOPE();
418417
DP("__tgt_push_mapper_component(Handle=" DPxMOD
419418
") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
420419
", Type=0x%" PRIx64 ", Name=%s).\n",

openmp/libomptarget/src/omptarget.cpp

Lines changed: 11 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -392,7 +392,6 @@ static int32_t getParentIndex(int64_t Type) {
392392

393393
void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
394394
const char *Name) {
395-
TIMESCOPE();
396395
DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);
397396

398397
if (Size <= 0) {
@@ -419,7 +418,6 @@ void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
419418

420419
void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
421420
const char *Name) {
422-
TIMESCOPE();
423421
DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum,
424422
DPxPTR(DevicePtr));
425423

@@ -444,7 +442,6 @@ void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
444442

445443
void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
446444
const char *Name) {
447-
TIMESCOPE();
448445
DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size);
449446

450447
if (Size <= 0) {
@@ -471,7 +468,6 @@ void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
471468
}
472469

473470
void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
474-
TIMESCOPE();
475471
DP("Call to %s for device %d unlocking\n", Name, DeviceNum);
476472

477473
auto DeviceOrErr = PM->getDevice(DeviceNum);
@@ -531,14 +527,14 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
531527
int64_t *ArgTypes, map_var_info_t *ArgNames,
532528
void **ArgMappers, AsyncInfoTy &AsyncInfo,
533529
bool FromMapper) {
534-
TIMESCOPE_WITH_IDENT(Loc);
535530
// process each input.
536531
for (int32_t I = 0; I < ArgNum; ++I) {
537532
// Ignore private variables and arrays - there is no mapping for them.
538533
if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
539534
(ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
540535
continue;
541-
536+
TIMESCOPE_WITH_DETAILS_AND_IDENT(
537+
"HostToDev", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
542538
if (ArgMappers && ArgMappers[I]) {
543539
// Instead of executing the regular path of targetDataBegin, call the
544540
// targetDataMapper variant which will call targetDataBegin again
@@ -913,7 +909,8 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
913909
!TPR.Flags.IsHostPointer && DataSize != 0) {
914910
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
915911
DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
916-
912+
TIMESCOPE_WITH_DETAILS_AND_IDENT(
913+
"DevToHost", "Size=" + std::to_string(DataSize) + "B", Loc);
917914
// Wait for any previous transfer if an event is present.
918915
if (void *Event = TPR.getEntry()->getEvent()) {
919916
if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
@@ -1403,7 +1400,6 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
14031400
SmallVector<ptrdiff_t> &TgtOffsets,
14041401
PrivateArgumentManagerTy &PrivateArgumentManager,
14051402
AsyncInfoTy &AsyncInfo) {
1406-
TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", Loc);
14071403

14081404
auto DeviceOrErr = PM->getDevice(DeviceId);
14091405
if (!DeviceOrErr)
@@ -1537,7 +1533,7 @@ static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
15371533
map_var_info_t *ArgNames, void **ArgMappers,
15381534
PrivateArgumentManagerTy &PrivateArgumentManager,
15391535
AsyncInfoTy &AsyncInfo) {
1540-
TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", Loc);
1536+
15411537
auto DeviceOrErr = PM->getDevice(DeviceId);
15421538
if (!DeviceOrErr)
15431539
FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
@@ -1639,7 +1635,12 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
16391635

16401636
{
16411637
assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
1642-
TIMESCOPE_WITH_NAME_AND_IDENT("Initiate Kernel Launch", Loc);
1638+
TIMESCOPE_WITH_DETAILS_AND_IDENT(
1639+
"Kernel Target",
1640+
"NumArguments=" + std::to_string(KernelArgs.NumArgs) +
1641+
";NumTeams=" + std::to_string(KernelArgs.NumTeams[0]) +
1642+
";TripCount=" + std::to_string(KernelArgs.Tripcount),
1643+
Loc);
16431644

16441645
#ifdef OMPT_SUPPORT
16451646
assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&

0 commit comments

Comments
 (0)