Skip to content

Commit 944e354

Browse files
Kewen12 authored and ronlieb committed
[OpenMP][Offload][AMDGPU] Add support in runtime to obtain kernel duration
We are building a new component/feature in the runtime that performs post-kernel-run processing on the kernel data to facilitate performance tuning. This patch adds support for obtaining the kernel duration. I will submit a follow-up PR that adds a smoke test covering the new envar. Change-Id: I02b15041b67e43192a2af387fb81f2091763fa7d
1 parent c097067 commit 944e354

File tree

1 file changed

+52
-1
lines changed
  • offload/plugins-nextgen/amdgpu/src

1 file changed

+52
-1
lines changed

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -749,7 +749,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
749749
OMPX_BigJumpLoopOccupancyBasedOpt(
750750
"OMPX_BIGJUMPLOOP_OCCUPANCY_BASED_OPT", false),
751751
OMPX_XTeamReductionOccupancyBasedOpt(
752-
"OMPX_XTEAMREDUCTION_OCCUPANCY_BASED_OPT", false) {}
752+
"OMPX_XTEAMREDUCTION_OCCUPANCY_BASED_OPT", false),
753+
OMPX_EnableRuntimeAutotuning("OMPX_ENABLE_RUNTIME_AUTOTUNING", false) {}
753754

754755
/// Initialize the AMDGPU kernel.
755756
Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
@@ -885,6 +886,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
885886
/// Envar to enable occupancy-based optimization for cross team reduction.
886887
BoolEnvar OMPX_XTeamReductionOccupancyBasedOpt;
887888

889+
/// Envar to enable runtime tuning.
890+
BoolEnvar OMPX_EnableRuntimeAutotuning;
891+
888892
private:
889893
/// The kernel object to execute.
890894
uint64_t KernelObject;
@@ -1683,6 +1687,13 @@ struct AMDGPUStreamTy {
16831687
double TicksToTime;
16841688
};
16851689

1690+
/// Utility struct holding arguments for post kernel run processing.
1691+
struct PostKernelRunProcessingArgsTy {
1692+
hsa_agent_t Agent;
1693+
AMDGPUSignalTy *Signal;
1694+
double TicksToTime;
1695+
};
1696+
16861697
using AMDGPUStreamCallbackTy = Error(void *Data);
16871698

16881699
/// The stream is composed of N stream's slots. The struct below represents
@@ -1881,6 +1892,9 @@ struct AMDGPUStreamTy {
18811892
/// Use synchronous copy back.
18821893
bool UseSyncCopyBack;
18831894

1895+
/// Arguments for the callback function.
1896+
PostKernelRunProcessingArgsTy PostKernelRunProcessingArgs;
1897+
18841898
/// Return the current number of asynchronous operations on the stream.
18851899
uint32_t size() const { return NextSlot; }
18861900

@@ -2042,6 +2056,31 @@ struct AMDGPUStreamTy {
20422056
return Plugin::success();
20432057
}
20442058

2059+
/// Compute the duration of a kernel run from its HSA dispatch-time record.
///
/// Queries the profiling dispatch time for Args->Signal on Args->Agent and
/// scales the recorded start and end ticks by Args->TicksToTime.
///
/// \param Args post kernel run arguments; both Args and Args->Signal must be
///        non-null.
/// \returns the scaled end time minus the scaled start time, or 0 when the
///          dispatch time could not be obtained.
static uint64_t getKernelDuration(PostKernelRunProcessingArgsTy *Args) {
  assert(Args && "Invalid arguments pointer in post kernel run processing");
  assert(Args->Signal &&
         "Invalid AMDGPUSignal Pointer in post kernel run processing");

  // Zero-initialize so a failed query can never expose indeterminate values.
  hsa_amd_profiling_dispatch_time_t TimeRec = {};
  hsa_status_t Status = hsa_amd_profiling_get_dispatch_time(
      Args->Agent, Args->Signal->get(), &TimeRec);

  // HSA reports success as status value 0 (HSA_STATUS_SUCCESS). Any other
  // value means TimeRec was not filled in, so there is no duration to report.
  if (Status != 0)
    return 0;

  uint64_t StartTime = TimeRec.start * Args->TicksToTime;
  uint64_t EndTime = TimeRec.end * Args->TicksToTime;

  return EndTime - StartTime;
}
2071+
2072+
/// Callback funtion to process the data for each kernel run.
2073+
static Error postKernelRunProcessingAction(void *Data) {
2074+
assert(Data && "Invalid data pointer for post kernel run processing");
2075+
PostKernelRunProcessingArgsTy *Args =
2076+
reinterpret_cast<PostKernelRunProcessingArgsTy *>(Data);
2077+
2078+
uint64_t KernelDuration = getKernelDuration(Args);
2079+
fprintf(stderr, "Kernel Duration: %lu ns\n", KernelDuration);
2080+
2081+
return Plugin::success();
2082+
}
2083+
20452084
#ifdef OMPT_SUPPORT
20462085
static Error timeKernelInNsAsync(void *Data) {
20472086
assert(Data && "Invalid data pointer in OMPT profiling");
@@ -2124,6 +2163,18 @@ struct AMDGPUStreamTy {
21242163
}
21252164
#endif
21262165

2166+
// If runtime autotuning is enabled, setup the callback functions to process
2167+
// the data after kernel completed.
2168+
if (Kernel.OMPX_EnableRuntimeAutotuning) {
2169+
PostKernelRunProcessingArgs.Agent = Agent;
2170+
PostKernelRunProcessingArgs.Signal = OutputSignal;
2171+
PostKernelRunProcessingArgs.TicksToTime = 1.0;
2172+
2173+
if (auto Err = Slots[Curr].schedCallback(postKernelRunProcessingAction,
2174+
&PostKernelRunProcessingArgs))
2175+
return Err;
2176+
}
2177+
21272178
// Push the kernel with the output signal and an input signal (optional)
21282179
DP("Using Queue: %p with HSA Queue: %p\n", Queue, Queue->getHsaQueue());
21292180
return Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads, NumBlocks,

0 commit comments

Comments
 (0)