@@ -749,7 +749,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
749
749
OMPX_BigJumpLoopOccupancyBasedOpt(
750
750
" OMPX_BIGJUMPLOOP_OCCUPANCY_BASED_OPT" , false ),
751
751
OMPX_XTeamReductionOccupancyBasedOpt(
752
- " OMPX_XTEAMREDUCTION_OCCUPANCY_BASED_OPT" , false ) {}
752
+ " OMPX_XTEAMREDUCTION_OCCUPANCY_BASED_OPT" , false ),
753
+ OMPX_EnableRuntimeAutotuning(" OMPX_ENABLE_RUNTIME_AUTOTUNING" , false ) {}
753
754
754
755
// / Initialize the AMDGPU kernel.
755
756
Error initImpl (GenericDeviceTy &Device, DeviceImageTy &Image) override {
@@ -885,6 +886,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
885
886
// / Envar to enable occupancy-based optimization for cross team reduction.
886
887
BoolEnvar OMPX_XTeamReductionOccupancyBasedOpt;
887
888
889
+ // / Envar to enable runtime autotuning.
890
+ BoolEnvar OMPX_EnableRuntimeAutotuning;
891
+
888
892
private:
889
893
// / The kernel object to execute.
890
894
uint64_t KernelObject;
@@ -1683,6 +1687,13 @@ struct AMDGPUStreamTy {
1683
1687
double TicksToTime;
1684
1688
};
1685
1689
1690
+ // / Utility struct holding arguments for post kernel run processing.
1691
+ struct PostKernelRunProcessingArgsTy {
1692
+ hsa_agent_t Agent;
1693
+ AMDGPUSignalTy *Signal;
1694
+ double TicksToTime;
1695
+ };
1696
+
1686
1697
using AMDGPUStreamCallbackTy = Error(void *Data);
1687
1698
1688
1699
// / The stream is composed of N stream's slots. The struct below represents
@@ -1881,6 +1892,9 @@ struct AMDGPUStreamTy {
1881
1892
// / Use synchronous copy back.
1882
1893
bool UseSyncCopyBack;
1883
1894
1895
+ // / Arguments for the callback function.
1896
+ PostKernelRunProcessingArgsTy PostKernelRunProcessingArgs;
1897
+
1884
1898
// / Return the current number of asychronous operations on the stream.
1885
1899
uint32_t size () const { return NextSlot; }
1886
1900
@@ -2042,6 +2056,31 @@ struct AMDGPUStreamTy {
2042
2056
return Plugin::success ();
2043
2057
}
2044
2058
2059
+ static uint64_t getKernelDuration (PostKernelRunProcessingArgsTy *Args) {
2060
+ assert (Args->Signal &&
2061
+ " Invalid AMDGPUSignal Pointer in post kernel run processing" );
2062
+ hsa_amd_profiling_dispatch_time_t TimeRec;
2063
+ hsa_status_t Status = hsa_amd_profiling_get_dispatch_time (
2064
+ Args->Agent , Args->Signal ->get (), &TimeRec);
2065
+
2066
+ uint64_t StartTime = TimeRec.start * Args->TicksToTime ;
2067
+ uint64_t EndTime = TimeRec.end * Args->TicksToTime ;
2068
+
2069
+ return EndTime - StartTime;
2070
+ }
2071
+
2072
+ // / Callback funtion to process the data for each kernel run.
2073
+ static Error postKernelRunProcessingAction (void *Data) {
2074
+ assert (Data && " Invalid data pointer for post kernel run processing" );
2075
+ PostKernelRunProcessingArgsTy *Args =
2076
+ reinterpret_cast <PostKernelRunProcessingArgsTy *>(Data);
2077
+
2078
+ uint64_t KernelDuration = getKernelDuration (Args);
2079
+ fprintf (stderr, " Kernel Duration: %lu ns\n " , KernelDuration);
2080
+
2081
+ return Plugin::success ();
2082
+ }
2083
+
2045
2084
#ifdef OMPT_SUPPORT
2046
2085
static Error timeKernelInNsAsync (void *Data) {
2047
2086
assert (Data && " Invalid data pointer in OMPT profiling" );
@@ -2124,6 +2163,18 @@ struct AMDGPUStreamTy {
2124
2163
}
2125
2164
#endif
2126
2165
2166
+ // If runtime autotuning is enabled, set up the callback function to process
2167
+ // the data after the kernel has completed.
2168
+ if (Kernel.OMPX_EnableRuntimeAutotuning ) {
2169
+ PostKernelRunProcessingArgs.Agent = Agent;
2170
+ PostKernelRunProcessingArgs.Signal = OutputSignal;
2171
+ PostKernelRunProcessingArgs.TicksToTime = 1.0 ;
2172
+
2173
+ if (auto Err = Slots[Curr].schedCallback (postKernelRunProcessingAction,
2174
+ &PostKernelRunProcessingArgs))
2175
+ return Err;
2176
+ }
2177
+
2127
2178
// Push the kernel with the output signal and an input signal (optional)
2128
2179
DP (" Using Queue: %p with HSA Queue: %p\n " , Queue, Queue->getHsaQueue ());
2129
2180
return Queue->pushKernelLaunch (Kernel, KernelArgs, NumThreads, NumBlocks,
0 commit comments