Skip to content

[OpenMP] Improve omp offload profiler #68016

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Dec 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions openmp/libomptarget/include/Shared/Profile.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,4 +97,16 @@ class Profiler {
std::string RTM = RegionTypeMsg; \
llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)

/// Time spend in the current scope, assigned to the regionType
/// with details from runtime
#define TIMESCOPE_WITH_DETAILS_AND_IDENT(RegionTypeMsg, Details, IDENT) \
SourceInfo SI(IDENT); \
std::string ProfileLocation = SI.getProfileLocation(); \
llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + Details)

/// Time spend in the current scope, assigned to the function name and source
/// with details
#define TIMESCOPE_WITH_DETAILS(Details) \
llvm::TimeTraceScope TimeScope(__FUNCTION__, Details)

#endif // OMPTARGET_SHARED_PROFILE_H
17 changes: 13 additions & 4 deletions openmp/libomptarget/src/OpenMP/API.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ EXTERN int omp_get_initial_device(void) {
}

EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) +
";size=" + std::to_string(Size));
return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
}

Expand All @@ -99,6 +101,7 @@ EXTERN void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum) {
}

EXTERN void omp_target_free(void *Ptr, int DeviceNum) {
TIMESCOPE();
return targetFreeExplicit(Ptr, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
}

Expand Down Expand Up @@ -161,7 +164,9 @@ EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) {
EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
size_t DstOffset, size_t SrcOffset, int DstDevice,
int SrcDevice) {
TIMESCOPE();
TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
";src_dev=" + std::to_string(SrcDevice) +
";size=" + std::to_string(Length));
DP("Call to omp_target_memcpy, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
"src offset %zu, length %zu\n",
Expand Down Expand Up @@ -400,7 +405,9 @@ EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
size_t DstOffset, size_t SrcOffset,
int DstDevice, int SrcDevice,
int DepObjCount, omp_depend_t *DepObjList) {
TIMESCOPE();
TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
";src_dev=" + std::to_string(SrcDevice) +
";size=" + std::to_string(Length));
DP("Call to omp_target_memcpy_async, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
"src offset %zu, length %zu\n",
Expand Down Expand Up @@ -429,7 +436,6 @@ omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
const size_t *DstOffsets, const size_t *SrcOffsets,
const size_t *DstDimensions, const size_t *SrcDimensions,
int DstDevice, int SrcDevice) {
TIMESCOPE();
DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
"src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
Expand Down Expand Up @@ -488,7 +494,10 @@ EXTERN int omp_target_memcpy_rect_async(
const size_t *Volume, const size_t *DstOffsets, const size_t *SrcOffsets,
const size_t *DstDimensions, const size_t *SrcDimensions, int DstDevice,
int SrcDevice, int DepObjCount, omp_depend_t *DepObjList) {
TIMESCOPE();
TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
";src_dev=" + std::to_string(SrcDevice) +
";size=" + std::to_string(ElementSize) +
";num_dims=" + std::to_string(NumDims));
DP("Call to omp_target_memcpy_rect_async, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
"src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
Expand Down
29 changes: 14 additions & 15 deletions openmp/libomptarget/src/interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,12 @@ using namespace llvm::omp::target::ompt;
////////////////////////////////////////////////////////////////////////////////
/// adds requires flags
EXTERN void __tgt_register_requires(int64_t Flags) {
TIMESCOPE();
PM->addRequirements(Flags);
}

////////////////////////////////////////////////////////////////////////////////
/// adds a target shared library to the target execution image
EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) {
TIMESCOPE();
if (PM->delayRegisterLib(Desc))
return;

Expand All @@ -54,7 +52,6 @@ EXTERN void __tgt_init_all_rtls() { PM->initAllPlugins(); }
////////////////////////////////////////////////////////////////////////////////
/// unloads a target shared library
EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) {
TIMESCOPE();
PM->unregisterLib(Desc);
}

Expand All @@ -68,7 +65,8 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
"TargetAsyncInfoTy must be convertible to AsyncInfoTy.");

TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc);
TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy",
"NumArgs=" + std::to_string(ArgNum), Loc);

DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
RegionName, DeviceId, ArgNum);
Expand Down Expand Up @@ -240,9 +238,6 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
KernelArgsTy *KernelArgs) {
static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
"Target AsyncInfoTy must be convertible to AsyncInfoTy.");

TIMESCOPE_WITH_IDENT(Loc);

DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
"\n",
DeviceId, DPxPTR(HostPtr));
Expand All @@ -267,6 +262,11 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
assert(KernelArgs->ThreadLimit[0] == static_cast<uint32_t>(ThreadLimit) &&
!KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
"OpenMP interface should not use multiple dimensions");
TIMESCOPE_WITH_DETAILS_AND_IDENT(
"Runtime: target exe",
"NumTeams=" + std::to_string(NumTeams) +
";NumArgs=" + std::to_string(KernelArgs->NumArgs),
Loc);

if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs,
Expand Down Expand Up @@ -297,13 +297,14 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,

int Rc = OFFLOAD_SUCCESS;
Rc = target(Loc, *DeviceOrErr, HostPtr, *KernelArgs, AsyncInfo);
{ // required to show syncronization
TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: syncronize", "", Loc);
if (Rc == OFFLOAD_SUCCESS)
Rc = AsyncInfo.synchronize();

if (Rc == OFFLOAD_SUCCESS)
Rc = AsyncInfo.synchronize();

handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");

handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
}
return OMP_TGT_SUCCESS;
}

Expand Down Expand Up @@ -402,7 +403,6 @@ EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId,

// Get the current number of components for a user-defined mapper.
EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
TIMESCOPE();
auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle;
int64_t Size = MapperComponentsPtr->Components.size();
DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n",
Expand All @@ -414,7 +414,6 @@ EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
EXTERN void __tgt_push_mapper_component(void *RtMapperHandle, void *Base,
void *Begin, int64_t Size, int64_t Type,
void *Name) {
TIMESCOPE();
DP("__tgt_push_mapper_component(Handle=" DPxMOD
") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
", Type=0x%" PRIx64 ", Name=%s).\n",
Expand Down
21 changes: 11 additions & 10 deletions openmp/libomptarget/src/omptarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -392,7 +392,6 @@ static int32_t getParentIndex(int64_t Type) {

void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
const char *Name) {
TIMESCOPE();
DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);

if (Size <= 0) {
Expand All @@ -419,7 +418,6 @@ void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,

void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
const char *Name) {
TIMESCOPE();
DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum,
DPxPTR(DevicePtr));

Expand All @@ -444,7 +442,6 @@ void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,

void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
const char *Name) {
TIMESCOPE();
DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size);

if (Size <= 0) {
Expand All @@ -471,7 +468,6 @@ void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
}

void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
TIMESCOPE();
DP("Call to %s for device %d unlocking\n", Name, DeviceNum);

auto DeviceOrErr = PM->getDevice(DeviceNum);
Expand Down Expand Up @@ -531,14 +527,14 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
int64_t *ArgTypes, map_var_info_t *ArgNames,
void **ArgMappers, AsyncInfoTy &AsyncInfo,
bool FromMapper) {
TIMESCOPE_WITH_IDENT(Loc);
// process each input.
for (int32_t I = 0; I < ArgNum; ++I) {
// Ignore private variables and arrays - there is no mapping for them.
if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
(ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
continue;

TIMESCOPE_WITH_DETAILS_AND_IDENT(
"HostToDev", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
if (ArgMappers && ArgMappers[I]) {
// Instead of executing the regular path of targetDataBegin, call the
// targetDataMapper variant which will call targetDataBegin again
Expand Down Expand Up @@ -913,7 +909,8 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
!TPR.Flags.IsHostPointer && DataSize != 0) {
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));

TIMESCOPE_WITH_DETAILS_AND_IDENT(
"DevToHost", "Size=" + std::to_string(DataSize) + "B", Loc);
// Wait for any previous transfer if an event is present.
if (void *Event = TPR.getEntry()->getEvent()) {
if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
Expand Down Expand Up @@ -1403,7 +1400,6 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
SmallVector<ptrdiff_t> &TgtOffsets,
PrivateArgumentManagerTy &PrivateArgumentManager,
AsyncInfoTy &AsyncInfo) {
TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", Loc);

auto DeviceOrErr = PM->getDevice(DeviceId);
if (!DeviceOrErr)
Expand Down Expand Up @@ -1537,7 +1533,7 @@ static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
map_var_info_t *ArgNames, void **ArgMappers,
PrivateArgumentManagerTy &PrivateArgumentManager,
AsyncInfoTy &AsyncInfo) {
TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", Loc);

auto DeviceOrErr = PM->getDevice(DeviceId);
if (!DeviceOrErr)
FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
Expand Down Expand Up @@ -1639,7 +1635,12 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,

{
assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
TIMESCOPE_WITH_NAME_AND_IDENT("Initiate Kernel Launch", Loc);
TIMESCOPE_WITH_DETAILS_AND_IDENT(
"Kernel Target",
"NumArguments=" + std::to_string(KernelArgs.NumArgs) +
";NumTeams=" + std::to_string(KernelArgs.NumTeams[0]) +
";TripCount=" + std::to_string(KernelArgs.Tripcount),
Loc);

#ifdef OMPT_SUPPORT
assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&
Expand Down