@@ -185,10 +185,9 @@ hsa_status_t hostrpc_terminate();
185
185
__attribute__ ((weak)) hsa_status_t hostrpc_terminate() {
186
186
return HSA_STATUS_SUCCESS;
187
187
}
188
- __attribute__ ((weak)) uint64_t
189
- hostrpc_assign_buffer(hsa_agent_t , hsa_queue_t *, uint32_t DeviceId,
190
- hsa_amd_memory_pool_t HostMemoryPool,
191
- hsa_amd_memory_pool_t DevMemoryPool) {
188
+ __attribute__ ((weak)) uint64_t hostrpc_assign_buffer(
189
+ hsa_agent_t , hsa_queue_t *, uint32_t DeviceId,
190
+ hsa_amd_memory_pool_t HostMemoryPool, hsa_amd_memory_pool_t DevMemoryPool) {
192
191
// FIXME:THIS SHOULD BE HARD FAIL
193
192
DP (" Warning: Attempting to assign hostrpc to device %u, but hostrpc library "
194
193
" missing\n " ,
@@ -553,8 +552,9 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
553
552
// / Class implementing the AMDGPU device images' properties.
554
553
struct AMDGPUDeviceImageTy : public DeviceImageTy {
555
554
// / Create the AMDGPU image with the id and the target image pointer.
556
- AMDGPUDeviceImageTy (int32_t ImageId, const __tgt_device_image *TgtImage)
557
- : DeviceImageTy(ImageId, TgtImage) {}
555
+ AMDGPUDeviceImageTy (int32_t ImageId, GenericDeviceTy &Device,
556
+ const __tgt_device_image *TgtImage)
557
+ : DeviceImageTy(ImageId, Device, TgtImage) {}
558
558
559
559
// / Prepare and load the executable corresponding to the image.
560
560
Error loadExecutable (const AMDGPUDeviceTy &Device);
@@ -608,8 +608,8 @@ struct AMDGPUDeviceImageTy : public DeviceImageTy {
608
608
// / generic kernel class.
609
609
struct AMDGPUKernelTy : public GenericKernelTy {
610
610
// / Create an AMDGPU kernel with a name and an execution mode.
611
- AMDGPUKernelTy (const char *Name, OMPTgtExecModeFlags ExecutionMode )
612
- : GenericKernelTy(Name, ExecutionMode ),
611
+ AMDGPUKernelTy (const char *Name)
612
+ : GenericKernelTy(Name),
613
613
ServiceThreadDeviceBufferGlobal (" service_thread_buf" , sizeof (uint64_t )),
614
614
HostServiceBufferHandler(Plugin::createGlobalHandler()) {}
615
615
@@ -855,9 +855,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
855
855
CurrentMaxNumThreads = std::min (
856
856
static_cast <uint32_t >(TeamsThreadLimitEnvVar), CurrentMaxNumThreads);
857
857
858
- return std::min (CurrentMaxNumThreads,
859
- (ThreadLimitClause[ 0 ] > 0 ) ? ThreadLimitClause[0 ] :
860
- PreferredNumThreads);
858
+ return std::min (CurrentMaxNumThreads, (ThreadLimitClause[ 0 ] > 0 )
859
+ ? ThreadLimitClause[0 ]
860
+ : PreferredNumThreads);
861
861
}
862
862
uint64_t getNumBlocks (GenericDeviceTy &GenericDevice,
863
863
uint32_t NumTeamsClause[3 ], uint64_t LoopTripCount,
@@ -1637,7 +1637,6 @@ struct AMDGPUStreamTy {
1637
1637
assert (Slot && " Invalid slot" );
1638
1638
assert (Slot->Signal && " Invalid signal" );
1639
1639
1640
-
1641
1640
// Peform the operation.
1642
1641
if (auto Err = Slot->performAction ())
1643
1642
FATAL_MESSAGE (1 , " Error peforming post action: %s" ,
@@ -1826,21 +1825,21 @@ struct AMDGPUStreamTy {
1826
1825
// Consume stream slot and compute dependencies.
1827
1826
auto [Curr, InputSignal] = consume (OutputSignals[0 ]);
1828
1827
1829
- //
1830
- // For some reason, the kernel completion signal value gets turned to 0
1831
- // when it should be 1. The code we are commenting out causes this signal
1832
- // to be ignored below and the D2H copy process starts too soon.
1833
- // In this fix, we are not resetting the signal value to 1.
1834
- // We are just not ignoring the signal in the asyncMemCopy below.
1835
- //
1836
- // This fix does not solve the random SDMA problem.
1837
- // We need to understand how this InputSignal value which was a kernel
1838
- // completion signal became 0. More testing is needed.
1839
- //
1828
+ //
1829
+ // For some reason, the kernel completion signal value gets turned to 0
1830
+ // when it should be 1. The code we are commenting out causes this signal
1831
+ // to be ignored below and the D2H copy process starts too soon.
1832
+ // In this fix, we are not resetting the signal value to 1.
1833
+ // We are just not ignoring the signal in the asyncMemCopy below.
1834
+ //
1835
+ // This fix does not solve the random SDMA problem.
1836
+ // We need to understand how this InputSignal value which was a kernel
1837
+ // completion signal became 0. More testing is needed.
1838
+ //
1840
1839
// Avoid defining the input dependency if already satisfied.
1841
- // if (InputSignal && !InputSignal->load())
1842
- // fprintf(stderr , " Inputsignal value %ld for signal %p\n",InputSignal->load(),InputSignal);
1843
- // InputSignal = nullptr;
1840
+ // if (InputSignal && !InputSignal->load())
1841
+ // fprintf(stderr , " Inputsignal value %ld for signal
1842
+ // %p\n",InputSignal->load(),InputSignal); InputSignal = nullptr;
1844
1843
1845
1844
// Setup the post action for releasing the intermediate buffer.
1846
1845
if (auto Err = Slots[Curr].schedReleaseBuffer (Inter, MemoryManager))
@@ -1849,7 +1848,8 @@ struct AMDGPUStreamTy {
1849
1848
// Issue the first step: device to host transfer. Avoid defining the input
1850
1849
// dependency if already satisfied.
1851
1850
if (InputSignal) {
1852
- // fprintf(stderr,"calling utils::asyncMemCopy with InputSignal %p val%ld\n",InputSignal,InputSignal->load());
1851
+ // fprintf(stderr,"calling utils::asyncMemCopy with InputSignal %p
1852
+ // val%ld\n",InputSignal,InputSignal->load());
1853
1853
hsa_signal_t InputSignalRaw = InputSignal->get ();
1854
1854
if (auto Err = utils::asyncMemCopy (
1855
1855
UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1 ,
@@ -2448,7 +2448,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2448
2448
OMPX_ForceSyncRegions (" OMPX_FORCE_SYNC_REGIONS" , 0 ),
2449
2449
OMPX_StreamBusyWait (" LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT" , 2000000 ),
2450
2450
OMPX_UseMultipleSdmaEngines (
2451
- // setting default to true here appears to solve random sdma problem
2451
+ // setting default to true here appears to solve random sdma problem
2452
2452
" LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES" , true ),
2453
2453
AMDGPUStreamManager (*this , Agent), AMDGPUEventManager(*this ),
2454
2454
AMDGPUSignalManager (*this ), Agent(Agent), HostDevice(HostDevice) {}
@@ -2814,15 +2814,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2814
2814
uint64_t getClockFrequency () const override { return ClockFrequency; }
2815
2815
2816
2816
// / Allocate and construct an AMDGPU kernel.
2817
- Expected<GenericKernelTy &>
2818
- constructKernel (const __tgt_offload_entry &KernelEntry,
2819
- OMPTgtExecModeFlags ExecMode) override {
2817
+ Expected<GenericKernelTy &> constructKernel (const char *Name) override {
2820
2818
// Allocate and construct the AMDGPU kernel.
2821
2819
AMDGPUKernelTy *AMDGPUKernel = Plugin::get ().allocate <AMDGPUKernelTy>();
2822
2820
if (!AMDGPUKernel)
2823
2821
return Plugin::error (" Failed to allocate memory for AMDGPU kernel" );
2824
2822
2825
- new (AMDGPUKernel) AMDGPUKernelTy (KernelEntry. name , ExecMode );
2823
+ new (AMDGPUKernel) AMDGPUKernelTy (Name );
2826
2824
2827
2825
return *AMDGPUKernel;
2828
2826
}
@@ -2870,7 +2868,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2870
2868
// Allocate and initialize the image object.
2871
2869
AMDGPUDeviceImageTy *AMDImage =
2872
2870
Plugin::get ().allocate <AMDGPUDeviceImageTy>();
2873
- new (AMDImage) AMDGPUDeviceImageTy (ImageId, TgtImage);
2871
+ new (AMDImage) AMDGPUDeviceImageTy (ImageId, * this , TgtImage);
2874
2872
2875
2873
// Load the HSA executable.
2876
2874
if (Error Err = AMDImage->loadExecutable (*this ))
@@ -3602,13 +3600,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
3602
3600
if (IsCtor && !Handler.isSymbolInImage (*this , Image, KernelName))
3603
3601
return Plugin::success ();
3604
3602
3605
- // Retrieve the execution mode.
3606
- auto ExecModeOrErr = getExecutionModeForKernel (KernelName, Image);
3607
- if (!ExecModeOrErr)
3608
- return ExecModeOrErr.takeError ();
3609
-
3610
3603
// Allocate and construct the AMDGPU kernel.
3611
- AMDGPUKernelTy AMDGPUKernel (KernelName, *ExecModeOrErr );
3604
+ AMDGPUKernelTy AMDGPUKernel (KernelName);
3612
3605
if (auto Err = AMDGPUKernel.init (*this , Image))
3613
3606
return Err;
3614
3607
@@ -4196,9 +4189,9 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
4196
4189
}
4197
4190
4198
4191
bool canUseHostGlobals () override final {
4199
- // Check if the HSA_XNACK and OMPX_APU_MAPS are enabled. If unified memory is
4200
- // not enabled but both HSA_XNACK and OMPX_APU_MAPS are enabled then we can
4201
- // also use globals directly from the host.
4192
+ // Check if the HSA_XNACK and OMPX_APU_MAPS are enabled. If unified memory
4193
+ // is not enabled but both HSA_XNACK and OMPX_APU_MAPS are enabled then we
4194
+ // can also use globals directly from the host.
4202
4195
bool EnableHostGlobals = false ;
4203
4196
bool IsZeroCopyOnAPU = AreAllocationsForMapsOnApusDisabled ();
4204
4197
BoolEnvar HSAXnack = BoolEnvar (" HSA_XNACK" , false );
@@ -4208,8 +4201,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
4208
4201
4209
4202
// Check if we are on a system that has an APU or on a non-APU system
4210
4203
// where unified shared memory can be enabled:
4211
- bool IsUsmSystem =
4212
- hasAPUDevice () || hasDGpuWithUsmSupport ();
4204
+ bool IsUsmSystem = hasAPUDevice () || hasDGpuWithUsmSupport ();
4213
4205
4214
4206
// Warn user if there is a mismatch between the request and the system
4215
4207
// architecture:
0 commit comments