Skip to content

Commit 5438fe3

Browse files
carlobertolliThorBl
andcommitted
[OpenMP] Enable automatic unified shared memory on MI300A.
This patch enables applications that did not request OpenMP unified_shared_memory to run with the same zero-copy behavior, where mapped memory does not result in extra memory allocations and memory copies, and CPU-allocated memory is instead accessed from the device. The name for this behavior is "automatic zero-copy" and it relies on detecting that the runtime is running on an MI300A, that the user did not select unified_shared_memory in their program, and that XNACK (unified memory support) is enabled in the current GPU configuration. If all these conditions are met, then automatic zero-copy is triggered. This patch also introduces an environment variable OMPX_APU_MAPS that, if set, also triggers automatic zero-copy on non-APU GPUs (e.g., on discrete GPUs). This patch is still missing support for global variables, which will be provided in a subsequent patch. Co-authored-by: Thorsten Blass <[email protected]> Co-authored-by: Carlo Bertolli <[email protected]>
1 parent 96c4f10 commit 5438fe3

File tree

12 files changed

+220
-27
lines changed

12 files changed

+220
-27
lines changed

openmp/libomptarget/include/Shared/PluginAPI.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,9 @@ int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
219219
void *VAddr, bool isRecord,
220220
bool SaveOutput,
221221
uint64_t &ReqPtrArgOffset);
222+
223+
// Returns true if the device \p DeviceId suggests to use auto zero-copy.
224+
int32_t __tgt_rtl_use_auto_zero_copy(int32_t DeviceId);
222225
}
223226

224227
#endif // OMPTARGET_SHARED_PLUGIN_API_H

openmp/libomptarget/include/Shared/PluginAPI.inc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,4 @@ PLUGIN_API_HANDLE(data_notify_mapped, false);
4747
PLUGIN_API_HANDLE(data_notify_unmapped, false);
4848
PLUGIN_API_HANDLE(set_device_offset, false);
4949
PLUGIN_API_HANDLE(initialize_record_replay, false);
50+
PLUGIN_API_HANDLE(use_auto_zero_copy, false);

openmp/libomptarget/include/Shared/Requirements.h

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,12 @@ enum OpenMPOffloadingRequiresDirFlags : int64_t {
3333
/// unified_shared_memory clause.
3434
OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
3535
/// dynamic_allocators clause.
36-
OMP_REQ_DYNAMIC_ALLOCATORS = 0x010
36+
OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
37+
/// Auto zero-copy extension:
38+
/// when running on an APU, the GPU plugin may decide to
39+
/// run in zero-copy even though the user did not program
40+
/// their application with unified_shared_memory requirement.
41+
OMPX_REQ_AUTO_ZERO_COPY = 0x020
3742
};
3843

3944
class RequirementCollection {
@@ -65,6 +70,14 @@ class RequirementCollection {
6570
return;
6671
}
6772

73+
// Auto zero-copy is only valid when no other requirement has been set
74+
// and it is computed at device initialization time, after the requirement
75+
// flag has already been set to OMP_REQ_NONE.
76+
if (SetFlags == OMP_REQ_NONE && NewFlags == OMPX_REQ_AUTO_ZERO_COPY) {
77+
SetFlags = NewFlags;
78+
return;
79+
}
80+
6881
// If multiple compilation units are present enforce
6982
// consistency across all of them for require clauses:
7083
// - reverse_offload

openmp/libomptarget/include/device.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,9 @@ struct DeviceTy {
164164
/// Print all offload entries to stderr.
165165
void dumpOffloadEntries();
166166

167+
/// Ask the device whether the runtime should use auto zero-copy.
168+
bool useAutoZeroCopy();
169+
167170
private:
168171
/// Deinitialize the device (and plugin).
169172
void deinit();

openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ typedef enum {
6363
} hsa_amd_memory_pool_access_t;
6464

6565
typedef enum hsa_amd_agent_info_s {
66+
HSA_AMD_AGENT_INFO_CHIP_ID = 0xA000,
6667
HSA_AMD_AGENT_INFO_CACHELINE_SIZE = 0xA001,
6768
HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT = 0xA002,
6869
HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY = 0xA003,

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 99 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,29 @@ Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
183183
#endif
184184
}
185185

186+
/// Query the ISA name(s) reported for \p Agent and return the portion that
/// follows the "amdgcn-amd-amdhsa" prefix, with leading '-' characters and
/// trailing NUL characters removed.
///
/// \returns the trimmed string (empty if no ISA name started with the prefix),
/// or the error produced by utils::iterateAgentISAs if the iteration failed.
Expected<std::string> getTargetTripleAndFeatures(hsa_agent_t Agent) {
187+
std::string Target;
188+
auto Err = utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
189+
uint32_t Length;
190+
hsa_status_t Status;
191+
// Ask for the name length first so the name buffer can be sized.
Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length);
192+
if (Status != HSA_STATUS_SUCCESS)
193+
return Status;
194+
195+
llvm::SmallVector<char> ISAName(Length);
196+
// Fetch the ISA name itself into the buffer.
Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, ISAName.begin());
197+
if (Status != HSA_STATUS_SUCCESS)
198+
return Status;
199+
200+
llvm::StringRef TripleTarget(ISAName.begin(), Length);
201+
// Only names that start with the expected triple update Target; each match
// overwrites the previous value.
// NOTE(review): presumably the callback runs once per ISA of the agent, so
// the last matching ISA wins -- confirm against utils::iterateAgentISAs.
if (TripleTarget.consume_front("amdgcn-amd-amdhsa"))
202+
Target = TripleTarget.ltrim('-').rtrim('\0').str();
203+
return HSA_STATUS_SUCCESS;
204+
});
205+
// Propagate an iteration failure to the caller instead of a partial result.
if (Err)
206+
return Err;
207+
return Target;
208+
}
186209
} // namespace utils
187210

188211
/// Utility class representing generic resource references to AMDGPU resources.
@@ -1848,8 +1871,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
18481871
OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
18491872
OMPX_UseMultipleSdmaEngines(
18501873
"LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false),
1851-
AMDGPUStreamManager(*this, Agent), AMDGPUEventManager(*this),
1852-
AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice) {}
1874+
OMPX_ApuMaps("OMPX_APU_MAPS", false), AMDGPUStreamManager(*this, Agent),
1875+
AMDGPUEventManager(*this), AMDGPUSignalManager(*this), Agent(Agent),
1876+
HostDevice(HostDevice) {}
18531877

18541878
~AMDGPUDeviceTy() {}
18551879

@@ -1940,6 +1964,19 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
19401964
if (auto Err = AMDGPUSignalManager.init(OMPX_InitialNumSignals))
19411965
return Err;
19421966

1967+
// Detect if XNACK is enabled
1968+
auto TargeTripleAndFeaturesOrError =
1969+
utils::getTargetTripleAndFeatures(Agent);
1970+
if (!TargeTripleAndFeaturesOrError)
1971+
return TargeTripleAndFeaturesOrError.takeError();
1972+
StringRef TargeTripleAndFeatures(*TargeTripleAndFeaturesOrError);
1973+
if (TargeTripleAndFeatures.contains("xnack+"))
1974+
IsXnackEnabled = true;
1975+
1976+
// detect if device is an APU.
1977+
if (auto Err = checkIfAPU())
1978+
return Err;
1979+
19431980
return Plugin::success();
19441981
}
19451982

@@ -2631,6 +2668,21 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
26312668
return Plugin::success();
26322669
}
26332670

2671+
/// Returns true if auto zero-copy is the best configuration for the current
2672+
/// arch.
2673+
/// On AMDGPUs, automatic zero-copy is turned on
2674+
/// when running on an APU with XNACK (unified memory) support
2675+
/// enabled. On discrete GPUs, automatic zero-copy is triggered
2676+
/// if the user sets the environment variable OMPX_APU_MAPS=1
2677+
/// and if XNACK is enabled. The rationale is that zero-copy
2678+
/// is the best configuration (performance, memory footprint) on APUs,
2679+
/// while it is often not the best on discrete GPUs.
2680+
/// XNACK can be enabled with a kernel boot parameter or with
2681+
/// the HSA_XNACK environment variable.
2682+
bool useAutoZeroCopyImpl() override {
2683+
return ((IsAPU || OMPX_ApuMaps) && IsXnackEnabled);
2684+
}
2685+
26342686
/// Getters and setters for stack and heap sizes.
26352687
Error getDeviceStackSize(uint64_t &Value) override {
26362688
Value = StackSize;
@@ -2728,6 +2780,34 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
27282780
return Err;
27292781
}
27302782

2783+
/// Detect if current architecture is an APU and record the result in IsAPU.
/// The decision is based on the ComputeUnitKind string and, for gfx942, on
/// the chip ID reported by the device.
///
/// \returns an error only if querying the chip ID fails; otherwise success,
/// regardless of whether an APU was detected.
2784+
Error checkIfAPU() {
2785+
llvm::StringRef StrGfxName(ComputeUnitKind);
2786+
// gfx940 is unconditionally treated as an APU.
IsAPU = llvm::StringSwitch<bool>(StrGfxName)
2787+
.Case("gfx940", true)
2788+
.Default(false);
2789+
if (IsAPU)
2790+
return Plugin::success();
2791+
2792+
// gfx942 needs a further check; any other kind leaves IsAPU false.
bool MayBeAPU = llvm::StringSwitch<bool>(StrGfxName)
2793+
.Case("gfx942", true)
2794+
.Default(false);
2795+
if (!MayBeAPU)
2796+
return Plugin::success();
2797+
else {
2798+
// can be MI300A or MI300X
2799+
uint32_t ChipID = 0;
2800+
if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_CHIP_ID, ChipID))
2801+
return Err;
2802+
2803+
// A chip ID with the lowest bit clear is classified as an APU.
// NOTE(review): presumably this bit distinguishes MI300A from MI300X --
// confirm against the ROCr/HSA documentation.
if (!(ChipID & 0x1)) {
2804+
IsAPU = true;
2805+
return Plugin::success();
2806+
}
2807+
}
2808+
return Plugin::success();
2809+
}
2810+
27312811
/// Envar for controlling the number of HSA queues per device. High number of
27322812
/// queues may degrade performance.
27332813
UInt32Envar OMPX_NumQueues;
@@ -2764,6 +2844,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
27642844
/// Use ROCm 5.7 interface for multiple SDMA engines
27652845
BoolEnvar OMPX_UseMultipleSdmaEngines;
27662846

2847+
/// Value of OMPX_APU_MAPS env var used to force
2848+
/// automatic zero-copy behavior on non-APU GPUs.
2849+
BoolEnvar OMPX_ApuMaps;
2850+
27672851
/// Stream manager for AMDGPU streams.
27682852
AMDGPUStreamManagerTy AMDGPUStreamManager;
27692853

@@ -2794,6 +2878,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
27942878
/// The current size of the stack that will be used in cases where it could
27952879
/// not be statically determined.
27962880
uint64_t StackSize = 16 * 1024 /* 16 KB */;
2881+
2882+
/// Is the plugin associated with an APU?
2883+
bool IsAPU = false;
2884+
2885+
/// True if the system is configured with XNACK enabled.
2886+
/// False otherwise.
2887+
bool IsXnackEnabled = false;
27972888
};
27982889

27992890
Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
@@ -3039,30 +3130,15 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
30393130
std::optional<StringRef> Processor = ElfOrErr->tryGetCPUName();
30403131

30413132
for (hsa_agent_t Agent : KernelAgents) {
3042-
std::string Target;
3043-
auto Err = utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
3044-
uint32_t Length;
3045-
hsa_status_t Status;
3046-
Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length);
3047-
if (Status != HSA_STATUS_SUCCESS)
3048-
return Status;
3049-
3050-
llvm::SmallVector<char> ISAName(Length);
3051-
Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, ISAName.begin());
3052-
if (Status != HSA_STATUS_SUCCESS)
3053-
return Status;
3054-
3055-
llvm::StringRef TripleTarget(ISAName.begin(), Length);
3056-
if (TripleTarget.consume_front("amdgcn-amd-amdhsa"))
3057-
Target = TripleTarget.ltrim('-').rtrim('\0').str();
3058-
return HSA_STATUS_SUCCESS;
3059-
});
3060-
if (Err)
3061-
return std::move(Err);
3133+
auto TargeTripleAndFeaturesOrError =
3134+
utils::getTargetTripleAndFeatures(Agent);
3135+
if (!TargeTripleAndFeaturesOrError)
3136+
return TargeTripleAndFeaturesOrError.takeError();
3137+
StringRef TargeTripleAndFeatures(*TargeTripleAndFeaturesOrError);
30623138

30633139
if (!utils::isImageCompatibleWithEnv(Processor ? *Processor : "",
30643140
ElfOrErr->getPlatformFlags(),
3065-
Target))
3141+
TargeTripleAndFeatures))
30663142
return false;
30673143
}
30683144
return true;

openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -872,6 +872,11 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
872872

873873
virtual Error getDeviceStackSize(uint64_t &V) = 0;
874874

875+
/// Returns true if current plugin architecture is an APU
876+
/// and unified_shared_memory was not requested by the program.
877+
bool useAutoZeroCopy();
878+
virtual bool useAutoZeroCopyImpl() { return false; }
879+
875880
private:
876881
/// Register offload entry for global variable.
877882
Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage,

openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1561,6 +1561,8 @@ Error GenericDeviceTy::syncEvent(void *EventPtr) {
15611561
return syncEventImpl(EventPtr);
15621562
}
15631563

1564+
/// Public entry point for the auto zero-copy query; it simply forwards to the
/// device-specific useAutoZeroCopyImpl() hook and returns its result.
bool GenericDeviceTy::useAutoZeroCopy() { return useAutoZeroCopyImpl(); }
1565+
15641566
Error GenericPluginTy::init() {
15651567
auto NumDevicesOrErr = initImpl();
15661568
if (!NumDevicesOrErr)
@@ -2073,6 +2075,14 @@ int32_t __tgt_rtl_set_device_offset(int32_t DeviceIdOffset) {
20732075
return OFFLOAD_SUCCESS;
20742076
}
20752077

2078+
/// Plugin API entry point: reports whether device \p DeviceId suggests using
/// auto zero-copy. The boolean result is implicitly converted to int32_t, so
/// callers see a nonzero value for "yes" and zero for "no".
int32_t __tgt_rtl_use_auto_zero_copy(int32_t DeviceId) {
2079+
// Automatic zero-copy only applies to programs that did
2080+
// not request unified_shared_memory and are deployed on an
2081+
// APU with XNACK enabled.
2082+
if (Plugin::get().getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY)
2083+
return false;
2084+
// Otherwise the decision is delegated to the device itself.
return Plugin::get().getDevice(DeviceId).useAutoZeroCopy();
2085+
}
20762086
#ifdef __cplusplus
20772087
}
20782088
#endif

openmp/libomptarget/src/OpenMP/Mapping.cpp

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -252,15 +252,21 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer(
252252
MESSAGE("device mapping required by 'present' map type modifier does not "
253253
"exist for host address " DPxMOD " (%" PRId64 " bytes)",
254254
DPxPTR(HstPtrBegin), Size);
255-
} else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY &&
256-
!HasCloseModifier) {
255+
} else if ((PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY &&
256+
!HasCloseModifier) ||
257+
(PM->getRequirements() & OMPX_REQ_AUTO_ZERO_COPY)) {
258+
257259
// If unified shared memory is active, implicitly mapped variables that are
258260
// not privatized use host address. Any explicitly mapped variables also use
259261
// host address where correctness is not impeded. In all other cases maps
260262
// are respected.
261263
// In addition to the mapping rules above, the close map modifier forces the
262264
// mapping of the variable to the device.
263265
if (Size) {
266+
INFO(OMP_INFOTYPE_MAPPING_CHANGED, Device.DeviceID,
267+
"Return HstPtrBegin " DPxMOD " Size=%" PRId64 " for unified shared "
268+
"memory\n",
269+
DPxPTR((uintptr_t)HstPtrBegin), Size);
264270
DP("Return HstPtrBegin " DPxMOD " Size=%" PRId64 " for unified shared "
265271
"memory\n",
266272
DPxPTR((uintptr_t)HstPtrBegin), Size);
@@ -415,7 +421,8 @@ TargetPointerResultTy MappingInfoTy::getTgtPtrBegin(
415421
LR.TPR.getEntry()->dynRefCountToStr().c_str(), DynRefCountAction,
416422
LR.TPR.getEntry()->holdRefCountToStr().c_str(), HoldRefCountAction);
417423
LR.TPR.TargetPointer = (void *)TP;
418-
} else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY) {
424+
} else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY ||
425+
PM->getRequirements() & OMPX_REQ_AUTO_ZERO_COPY) {
419426
// If the value isn't found in the mapping and unified shared memory
420427
// is on then it means we have stumbled upon a value which we need to
421428
// use directly from the host.

openmp/libomptarget/src/PluginManager.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,19 +144,32 @@ void PluginAdaptorTy::initDevices(PluginManager &PM) {
144144

145145
int32_t NumPD = getNumberOfPluginDevices();
146146
ExclusiveDevicesAccessor->reserve(DeviceOffset + NumPD);
147+
// Auto zero-copy is a per-device property. We need to ensure
148+
// that all devices are suggesting to use it.
149+
bool UseAutoZeroCopy = true;
150+
if (NumPD == 0)
151+
UseAutoZeroCopy = false;
147152
for (int32_t PDevI = 0, UserDevId = DeviceOffset; PDevI < NumPD; PDevI++) {
148153
auto Device = std::make_unique<DeviceTy>(this, UserDevId, PDevI);
149154
if (auto Err = Device->init()) {
150155
DP("Skip plugin known device %d: %s\n", PDevI,
151156
toString(std::move(Err)).c_str());
152157
continue;
153158
}
159+
UseAutoZeroCopy = UseAutoZeroCopy && Device->useAutoZeroCopy();
154160

155161
ExclusiveDevicesAccessor->push_back(std::move(Device));
156162
++NumberOfUserDevices;
157163
++UserDevId;
158164
}
159165

166+
// Auto Zero-Copy can only be currently triggered when the system is a
167+
// homogeneous APU architecture without attached discrete GPUs.
168+
// If all devices suggest to use it, change requirement flags to trigger
169+
// zero-copy behavior when mapping memory.
170+
if (UseAutoZeroCopy)
171+
PM.addRequirements(OMPX_REQ_AUTO_ZERO_COPY);
172+
160173
DP("Plugin adaptor " DPxMOD " has index %d, exposes %d out of %d devices!\n",
161174
DPxPTR(LibraryHandler.get()), DeviceOffset, NumberOfUserDevices,
162175
NumberOfPluginDevices);

openmp/libomptarget/src/device.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,3 +339,9 @@ void DeviceTy::dumpOffloadEntries() {
339339
fprintf(stderr, " %11s: %s\n", Kind, It.second.getNameAsCStr());
340340
}
341341
}
342+
343+
/// Ask the plugin whether this device suggests using auto zero-copy.
/// Returns false when the plugin does not provide the use_auto_zero_copy
/// entry point (the function pointer is null).
bool DeviceTy::useAutoZeroCopy() {
344+
// The entry point may be absent, so check it before calling.
// NOTE(review): presumably it is optional because it is registered with
// PLUGIN_API_HANDLE(use_auto_zero_copy, false) -- confirm the flag's meaning.
if (RTL->use_auto_zero_copy)
345+
return RTL->use_auto_zero_copy(RTLDeviceID);
346+
return false;
347+
}

0 commit comments

Comments
 (0)