Skip to content

Commit a316776

Browse files
committed
[Libomptarget] Move target table handling out of the plugins llvm#77150
premerge landing of patch from joseph Change-Id: I1eea45458f96bbb01970d36c58f1d700613b6702
1 parent fe6e75c commit a316776

File tree

15 files changed

+296
-320
lines changed

15 files changed

+296
-320
lines changed

openmp/libomptarget/include/Shared/APITypes.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,11 @@ struct __tgt_target_table {
6262
*EntriesEnd; // End of the table with all the entries (non inclusive)
6363
};
6464

65+
/// This struct contains a handle to a loaded binary in the plugin device.
66+
struct __tgt_device_binary {
67+
uintptr_t handle;
68+
};
69+
6570
// clang-format on
6671

6772
/// This struct contains information exchanged between different asynchronous

openmp/libomptarget/include/Shared/PluginAPI.h

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,18 @@ int32_t __tgt_rtl_init_device(int32_t ID);
7676
// return NULL. Otherwise, return a pointer to the built address table.
7777
// Individual entries in the table may also be NULL, when the corresponding
7878
// offload region is not supported on the target device.
79-
__tgt_target_table *__tgt_rtl_load_binary(int32_t ID,
80-
__tgt_device_image *Image);
79+
int32_t __tgt_rtl_load_binary(int32_t ID, __tgt_device_image *Image,
80+
__tgt_device_binary *Binary);
81+
82+
// Look up the device address of the named symbol in the given binary. Returns
83+
// non-zero on failure.
84+
int32_t __tgt_rtl_get_global(__tgt_device_binary Binary, uint64_t Size,
85+
const char *Name, void **DevicePtr);
86+
87+
// Look up the device address of the named kernel in the given binary. Returns
88+
// non-zero on failure.
89+
int32_t __tgt_rtl_get_function(__tgt_device_binary Binary, const char *Name,
90+
void **DevicePtr);
8191

8292
// Allocate data on the particular target device, of the specified size.
8393
// HostPtr is an address of the host data the allocated target data

openmp/libomptarget/include/Shared/PluginAPI.inc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ PLUGIN_API_HANDLE(is_data_exchangable, false);
1919
PLUGIN_API_HANDLE(number_of_devices, true);
2020
PLUGIN_API_HANDLE(init_device, true);
2121
PLUGIN_API_HANDLE(load_binary, true);
22+
PLUGIN_API_HANDLE(get_global, true);
23+
PLUGIN_API_HANDLE(get_function, true);
2224
PLUGIN_API_HANDLE(data_alloc, true);
2325
PLUGIN_API_HANDLE(data_submit, true);
2426
PLUGIN_API_HANDLE(data_submit_async, false);

openmp/libomptarget/include/device.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ struct DeviceTy {
7777
/// Provide access to the mapping handler.
7878
MappingInfoTy &getMappingInfo() { return MappingInfo; }
7979

80-
__tgt_target_table *loadBinary(__tgt_device_image *Img);
80+
llvm::Expected<__tgt_device_binary> loadBinary(__tgt_device_image *Img);
8181

8282
// device memory allocation/deallocation routines
8383
/// Allocates \p Size bytes on the device, host or shared memory space

openmp/libomptarget/include/rtl.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,16 @@
2626
/// are trying to (re)register an existing lib or really have a new one.
2727
struct TranslationTable {
2828
__tgt_target_table HostTable;
29+
llvm::SmallVector<__tgt_target_table> DeviceTables;
2930

3031
// Image assigned to a given device.
3132
llvm::SmallVector<__tgt_device_image *>
3233
TargetsImages; // One image per device ID.
3334

35+
// Arrays of entries active on the device.
36+
llvm::SmallVector<llvm::SmallVector<__tgt_offload_entry>>
37+
TargetsEntries; // One table per device ID.
38+
3439
// Table of entry points or NULL if it was not already computed.
3540
llvm::SmallVector<__tgt_target_table *>
3641
TargetsTable; // One table per device ID.

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 36 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -185,10 +185,9 @@ hsa_status_t hostrpc_terminate();
185185
__attribute__((weak)) hsa_status_t hostrpc_terminate() {
186186
return HSA_STATUS_SUCCESS;
187187
}
188-
__attribute__((weak)) uint64_t
189-
hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *, uint32_t DeviceId,
190-
hsa_amd_memory_pool_t HostMemoryPool,
191-
hsa_amd_memory_pool_t DevMemoryPool) {
188+
__attribute__((weak)) uint64_t hostrpc_assign_buffer(
189+
hsa_agent_t, hsa_queue_t *, uint32_t DeviceId,
190+
hsa_amd_memory_pool_t HostMemoryPool, hsa_amd_memory_pool_t DevMemoryPool) {
192191
// FIXME:THIS SHOULD BE HARD FAIL
193192
DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library "
194193
"missing\n",
@@ -553,8 +552,9 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
553552
/// Class implementing the AMDGPU device images' properties.
554553
struct AMDGPUDeviceImageTy : public DeviceImageTy {
555554
/// Create the AMDGPU image with the id and the target image pointer.
556-
AMDGPUDeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage)
557-
: DeviceImageTy(ImageId, TgtImage) {}
555+
AMDGPUDeviceImageTy(int32_t ImageId, GenericDeviceTy &Device,
556+
const __tgt_device_image *TgtImage)
557+
: DeviceImageTy(ImageId, Device, TgtImage) {}
558558

559559
/// Prepare and load the executable corresponding to the image.
560560
Error loadExecutable(const AMDGPUDeviceTy &Device);
@@ -608,8 +608,8 @@ struct AMDGPUDeviceImageTy : public DeviceImageTy {
608608
/// generic kernel class.
609609
struct AMDGPUKernelTy : public GenericKernelTy {
610610
/// Create an AMDGPU kernel with a name and an execution mode.
611-
AMDGPUKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode)
612-
: GenericKernelTy(Name, ExecutionMode),
611+
AMDGPUKernelTy(const char *Name)
612+
: GenericKernelTy(Name),
613613
ServiceThreadDeviceBufferGlobal("service_thread_buf", sizeof(uint64_t)),
614614
HostServiceBufferHandler(Plugin::createGlobalHandler()) {}
615615

@@ -855,9 +855,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
855855
CurrentMaxNumThreads = std::min(
856856
static_cast<uint32_t>(TeamsThreadLimitEnvVar), CurrentMaxNumThreads);
857857

858-
return std::min(CurrentMaxNumThreads,
859-
(ThreadLimitClause[0] > 0) ? ThreadLimitClause[0] :
860-
PreferredNumThreads);
858+
return std::min(CurrentMaxNumThreads, (ThreadLimitClause[0] > 0)
859+
? ThreadLimitClause[0]
860+
: PreferredNumThreads);
861861
}
862862
uint64_t getNumBlocks(GenericDeviceTy &GenericDevice,
863863
uint32_t NumTeamsClause[3], uint64_t LoopTripCount,
@@ -1637,7 +1637,6 @@ struct AMDGPUStreamTy {
16371637
assert(Slot && "Invalid slot");
16381638
assert(Slot->Signal && "Invalid signal");
16391639

1640-
16411640
// Perform the operation.
16421641
if (auto Err = Slot->performAction())
16431642
FATAL_MESSAGE(1, "Error peforming post action: %s",
@@ -1826,21 +1825,21 @@ struct AMDGPUStreamTy {
18261825
// Consume stream slot and compute dependencies.
18271826
auto [Curr, InputSignal] = consume(OutputSignals[0]);
18281827

1829-
//
1830-
// For some reason, the kernel completion signal value gets turned to 0
1831-
// when it should be 1. The code we are commenting out causes this signal
1832-
// to be ignored below and the D2H copy process starts too soon.
1833-
// In this fix, we are not resetting the signal value to 1.
1834-
// We are just not ignoring the signal in the asyncMemCopy below.
1835-
//
1836-
// This fix does not solve the random SDMA problem.
1837-
// We need to understand how this InputSignal value which was a kernel
1838-
// completion signal became 0. More testing is needed.
1839-
//
1828+
//
1829+
// For some reason, the kernel completion signal value gets turned to 0
1830+
// when it should be 1. The code we are commenting out causes this signal
1831+
// to be ignored below and the D2H copy process starts too soon.
1832+
// In this fix, we are not resetting the signal value to 1.
1833+
// We are just not ignoring the signal in the asyncMemCopy below.
1834+
//
1835+
// This fix does not solve the random SDMA problem.
1836+
// We need to understand how this InputSignal value which was a kernel
1837+
// completion signal became 0. More testing is needed.
1838+
//
18401839
// Avoid defining the input dependency if already satisfied.
1841-
// if (InputSignal && !InputSignal->load())
1842-
// fprintf(stderr , " Inputsignal value %ld for signal %p\n",InputSignal->load(),InputSignal);
1843-
// InputSignal = nullptr;
1840+
// if (InputSignal && !InputSignal->load())
1841+
// fprintf(stderr , " Inputsignal value %ld for signal
1842+
// %p\n",InputSignal->load(),InputSignal); InputSignal = nullptr;
18441843

18451844
// Setup the post action for releasing the intermediate buffer.
18461845
if (auto Err = Slots[Curr].schedReleaseBuffer(Inter, MemoryManager))
@@ -1849,7 +1848,8 @@ struct AMDGPUStreamTy {
18491848
// Issue the first step: device to host transfer. Avoid defining the input
18501849
// dependency if already satisfied.
18511850
if (InputSignal) {
1852-
// fprintf(stderr,"calling utils::asyncMemCopy with InputSignal %p val%ld\n",InputSignal,InputSignal->load());
1851+
// fprintf(stderr,"calling utils::asyncMemCopy with InputSignal %p
1852+
// val%ld\n",InputSignal,InputSignal->load());
18531853
hsa_signal_t InputSignalRaw = InputSignal->get();
18541854
if (auto Err = utils::asyncMemCopy(
18551855
UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1,
@@ -2448,7 +2448,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
24482448
OMPX_ForceSyncRegions("OMPX_FORCE_SYNC_REGIONS", 0),
24492449
OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
24502450
OMPX_UseMultipleSdmaEngines(
2451-
// setting default to true here appears to solve random sdma problem
2451+
// setting default to true here appears to solve random sdma problem
24522452
"LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", true),
24532453
AMDGPUStreamManager(*this, Agent), AMDGPUEventManager(*this),
24542454
AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice) {}
@@ -2814,15 +2814,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
28142814
uint64_t getClockFrequency() const override { return ClockFrequency; }
28152815

28162816
/// Allocate and construct an AMDGPU kernel.
2817-
Expected<GenericKernelTy &>
2818-
constructKernel(const __tgt_offload_entry &KernelEntry,
2819-
OMPTgtExecModeFlags ExecMode) override {
2817+
Expected<GenericKernelTy &> constructKernel(const char *Name) override {
28202818
// Allocate and construct the AMDGPU kernel.
28212819
AMDGPUKernelTy *AMDGPUKernel = Plugin::get().allocate<AMDGPUKernelTy>();
28222820
if (!AMDGPUKernel)
28232821
return Plugin::error("Failed to allocate memory for AMDGPU kernel");
28242822

2825-
new (AMDGPUKernel) AMDGPUKernelTy(KernelEntry.name, ExecMode);
2823+
new (AMDGPUKernel) AMDGPUKernelTy(Name);
28262824

28272825
return *AMDGPUKernel;
28282826
}
@@ -2870,7 +2868,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
28702868
// Allocate and initialize the image object.
28712869
AMDGPUDeviceImageTy *AMDImage =
28722870
Plugin::get().allocate<AMDGPUDeviceImageTy>();
2873-
new (AMDImage) AMDGPUDeviceImageTy(ImageId, TgtImage);
2871+
new (AMDImage) AMDGPUDeviceImageTy(ImageId, *this, TgtImage);
28742872

28752873
// Load the HSA executable.
28762874
if (Error Err = AMDImage->loadExecutable(*this))
@@ -3602,13 +3600,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
36023600
if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName))
36033601
return Plugin::success();
36043602

3605-
// Retrieve the execution mode.
3606-
auto ExecModeOrErr = getExecutionModeForKernel(KernelName, Image);
3607-
if (!ExecModeOrErr)
3608-
return ExecModeOrErr.takeError();
3609-
36103603
// Allocate and construct the AMDGPU kernel.
3611-
AMDGPUKernelTy AMDGPUKernel(KernelName, *ExecModeOrErr);
3604+
AMDGPUKernelTy AMDGPUKernel(KernelName);
36123605
if (auto Err = AMDGPUKernel.init(*this, Image))
36133606
return Err;
36143607

@@ -4196,9 +4189,9 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
41964189
}
41974190

41984191
bool canUseHostGlobals() override final {
4199-
// Check if the HSA_XNACK and OMPX_APU_MAPS are enabled. If unified memory is
4200-
// not enabled but both HSA_XNACK and OMPX_APU_MAPS are enabled then we can
4201-
// also use globals directly from the host.
4192+
// Check if the HSA_XNACK and OMPX_APU_MAPS are enabled. If unified memory
4193+
// is not enabled but both HSA_XNACK and OMPX_APU_MAPS are enabled then we
4194+
// can also use globals directly from the host.
42024195
bool EnableHostGlobals = false;
42034196
bool IsZeroCopyOnAPU = AreAllocationsForMapsOnApusDisabled();
42044197
BoolEnvar HSAXnack = BoolEnvar("HSA_XNACK", false);
@@ -4208,8 +4201,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
42084201

42094202
// Check if we are on a system that has an APU or on a non-APU system
42104203
// where unified shared memory can be enabled:
4211-
bool IsUsmSystem =
4212-
hasAPUDevice() || hasDGpuWithUsmSupport();
4204+
bool IsUsmSystem = hasAPUDevice() || hasDGpuWithUsmSupport();
42134205

42144206
// Warn user if there is a mismatch between the request and the system
42154207
// architecture:

openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,6 @@ struct GlobalTy {
4040
GlobalTy(const std::string &Name, uint32_t Size, void *Ptr = nullptr)
4141
: Name(Name), Size(Size), Ptr(Ptr) {}
4242

43-
GlobalTy(const __tgt_offload_entry &Entry)
44-
: Name(Entry.name), Size(Entry.size), Ptr(Entry.addr) {}
45-
4643
const std::string &getName() const { return Name; }
4744
uint32_t getSize() const { return Size; }
4845
void *getPtr() const { return Ptr; }

0 commit comments

Comments
 (0)