Skip to content

Commit dad5ee7

Browse files
carlobertollironlieb
authored andcommitted
[OpenMP] Move logic that needs requirements flags out of plugin.
This patch moves to libomptarget all OpenMP logic that depends on the requirement flags. This is due to upcoming trunk patch llvm#80345. The consequence of this change is that we have far fewer calls to the plugin during non-initialization phases. In addition, it - introduces diagnostic prints for zero-copy modes, and - makes unified_shared_memory programs fail on XNACK-Disabled environments when env variable OMPX_STRICT_SANITY_CHECKS=true. Note that unified_shared_memory can run on an XNACK-Disabled system by setting OMPX_EAGER_ZERO_COPY_MAPS, because it supports accesses to CPU-allocated memory using GPU page table prefaulting, instead of XNACK. Change-Id: I252e6557b3db1a13dd0fb1fe55f2f508361d9957
1 parent 3297d8e commit dad5ee7

File tree

15 files changed

+334
-149
lines changed

15 files changed

+334
-149
lines changed

openmp/libomptarget/include/Shared/Debug.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ enum OpenMPInfoType : uint32_t {
6565
OMP_INFOTYPE_AMD_API_TRACE = 0x200,
6666
// Print whenever data does not have a viable device counterpart.
6767
OMP_INFOTYPE_EMPTY_MAPPING = 0x0040,
68+
// Print diagnostic information for users.
69+
OMP_INFOTYPE_USER_DIAGNOSTIC = 0x0080,
6870
// Enable every flag.
6971
OMP_INFOTYPE_ALL = 0xffffffff,
7072
};

openmp/libomptarget/include/Shared/PluginAPI.h

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -234,17 +234,16 @@ int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
234234
// Return if the system is equipped with an APU
235235
bool __tgt_rtl_has_apu_device(int32_t DeviceId);
236236

237-
// Returns true, if the system is equipped with a dGPU which supports USM.
237+
// Returns true if the system is equipped with a dGPU which supports USM.
238238
bool __tgt_rtl_has_USM_capable_dGPU(int32_t DeviceId);
239239

240+
// Returns true if the system supports unified memory.
241+
bool __tgt_rtl_supports_unified_memory(int32_t DeviceId);
242+
240243
// Returns true if coarse graining of mapped memory is disabled
241244
// (it only applies to MI200 GPUs).
242245
bool __tgt_rtl_is_fine_grained_memory_enabled(int32_t DeviceId);
243246

244-
// Returns true if GPU page table prefaulting is enabled. False
245-
// otherwise.
246-
bool __tgt_rtl_requested_prepopulate_gpu_page_table(int32_t DeviceId);
247-
248247
// Check if image is incompatible due to XNACK mismatch.
249248
void __tgt_rtl_check_invalid_image(__tgt_device_image *Image);
250249

@@ -273,6 +272,12 @@ int32_t __tgt_rtl_activate_record_replay(int32_t, uint64_t, void *, bool,
273272
// Returns true if the device \p DeviceId suggests to use auto zero-copy.
274273
int32_t __tgt_rtl_use_auto_zero_copy(int32_t DeviceId);
275274

275+
// Performs sanity checks on zero-copy options and prints diagnostic info.
276+
int32_t __tgt_rtl_zero_copy_sanity_checks_and_diag(int32_t DeviceId,
277+
bool isUnifiedSharedMemory,
278+
bool isAutoZeroCopy,
279+
bool isEagerMaps);
280+
276281
#ifdef __cplusplus
277282
}
278283
#endif

openmp/libomptarget/include/Shared/PluginAPI.inc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ PLUGIN_API_HANDLE(initialize_record_replay, false);
5151
PLUGIN_API_HANDLE(check_invalid_image, true);
5252
PLUGIN_API_HANDLE(has_apu_device, true);
5353
PLUGIN_API_HANDLE(has_USM_capable_dGPU, true);
54-
PLUGIN_API_HANDLE(requested_prepopulate_gpu_page_table, true);
54+
PLUGIN_API_HANDLE(supports_unified_memory, true);
5555
PLUGIN_API_HANDLE(is_fine_grained_memory_enabled, true);
5656
PLUGIN_API_HANDLE(is_system_supporting_managed_memory, true);
5757
PLUGIN_API_HANDLE(number_of_team_procs, true);
@@ -63,3 +63,4 @@ PLUGIN_API_HANDLE(enable_access_to_all_agents, false);
6363
PLUGIN_API_HANDLE(release_async_info, false);
6464
PLUGIN_API_HANDLE(activate_record_replay, false);
6565
PLUGIN_API_HANDLE(use_auto_zero_copy, false);
66+
PLUGIN_API_HANDLE(zero_copy_sanity_checks_and_diag, false);

openmp/libomptarget/include/Shared/Requirements.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,14 @@ enum OpenMPOffloadingRequiresDirFlags : int64_t {
3838
/// when running on an APU, the GPU plugin may decide to
3939
/// run in zero-copy even though the user did not program
4040
/// their application with unified_shared_memory requirement.
41-
OMPX_REQ_AUTO_ZERO_COPY = 0x020
41+
OMPX_REQ_AUTO_ZERO_COPY = 0x020,
42+
/// Eager Maps is an extension of auto zero-copy and
43+
/// unified shared memory. Selected using an environment
44+
/// variable OMPX_EAGER_ZERO_COPY_MAPS, it makes memory mapping
45+
/// issue a GPU TLB prefaulting action. This allows applications
46+
/// using unified memory to run with unified memory support disabled
47+
/// (if possible on the target device).
48+
OMPX_REQ_EAGER_ZERO_COPY_MAPS = 0x040
4249
};
4350

4451
class RequirementCollection {

openmp/libomptarget/include/device.h

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,13 @@ struct DeviceTy {
4949

5050
/// Flag to force synchronous data transfers
5151
/// Controlled via environment flag OMPX_FORCE_SYNC_REGIONS
52-
bool ForceSynchronousTargetRegions;
52+
bool ForceSynchronousTargetRegions = false;
53+
54+
/// Flag that indicates whether the user requested eager zero-copy maps
55+
/// to execute their application. Even if true, this is only valid on certain
56+
/// architectures and configurations, which is checked upon device
57+
/// initialization.
58+
bool EagerZeroCopyMaps = false;
5359

5460
DeviceTy(PluginAdaptorTy *RTL, int32_t DeviceID, int32_t RTLDeviceID);
5561
// DeviceTy is not copyable
@@ -161,6 +167,16 @@ struct DeviceTy {
161167
/// Ask the device whether the runtime should use auto zero-copy.
162168
bool useAutoZeroCopy();
163169

170+
/// Ask the device whether it is an APU.
171+
bool checkIfAPU();
172+
173+
/// Ask the device whether it supports unified memory.
174+
bool supportsUnifiedMemory();
175+
176+
/// Ask the device to perform sanity checks for zero-copy configurations.
177+
void zeroCopySanityChecksAndDiag(bool isUnifiedSharedMemory,
178+
bool isAutoZeroCopy, bool isEagerMaps);
179+
164180
private:
165181
/// Deinitialize the device (and plugin).
166182
void deinit();

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 51 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -2504,7 +2504,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
25042504
OMPX_ApuMaps("OMPX_APU_MAPS", false),
25052505
OMPX_DisableUsmMaps("OMPX_DISABLE_USM_MAPS", false),
25062506
OMPX_NoMapChecks("OMPX_DISABLE_MAPS", true),
2507-
OMPX_EagerApuMaps("OMPX_EAGER_ZERO_COPY_MAPS", false),
2507+
OMPX_StrictSanityChecks("OMPX_STRICT_SANITY_CHECKS", false),
25082508
AMDGPUStreamManager(*this, Agent), AMDGPUEventManager(*this),
25092509
AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice) {}
25102510

@@ -2942,75 +2942,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
29422942
return Plugin::success();
29432943
}
29442944

2945-
// TODO: clean up the following three functions after removing auto_zero_copy
2946-
// support and document appropriately.
2947-
void checkAndAdjustUsmModeForTargetImage(const __tgt_device_image *TgtImage) {
2948-
assert((TgtImage != nullptr) && "TgtImage is nullptr");
2949-
assert(!(Plugin::get().getRequiresFlags() & OMP_REQ_UNDEFINED) &&
2950-
"Requires flags are not set.");
2951-
2952-
if (!(IsAPU || hasDGpuWithUsmSupportImpl()))
2953-
return;
2954-
2955-
bool IsXnackRequired =
2956-
Plugin::get().getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY;
2957-
utils::XnackBuildMode BinaryXnackMode =
2958-
utils::extractXnackModeFromBinary(TgtImage);
2959-
2960-
if (IsXnackRequired) {
2961-
handleImageRequiresUsmMode(BinaryXnackMode);
2962-
} else {
2963-
handleDefaultMode(BinaryXnackMode);
2964-
}
2965-
}
2966-
2967-
void handleImageRequiresUsmMode(utils::XnackBuildMode XnackImageMode) {
2968-
bool IsXnackActiveOnSystem = IsXnackEnabled;
2969-
2970-
if ((XnackImageMode == ELF::EF_AMDGPU_FEATURE_XNACK_ANY_V4) ||
2971-
(XnackImageMode == ELF::EF_AMDGPU_FEATURE_XNACK_ON_V4 &&
2972-
IsXnackActiveOnSystem) ||
2973-
(XnackImageMode == ELF::EF_AMDGPU_FEATURE_XNACK_OFF_V4 &&
2974-
!IsXnackActiveOnSystem)) {
2975-
if (OMPX_EagerApuMaps.get() && IsAPU)
2976-
PrepopulateGPUPageTable = true; // Pre-faulting
2977-
}
2978-
2979-
if (!IsXnackActiveOnSystem &&
2980-
(XnackImageMode != ELF::EF_AMDGPU_FEATURE_XNACK_ON_V4)) {
2981-
FAILURE_MESSAGE(
2982-
"Running a program that requires XNACK on a system where XNACK is "
2983-
"disabled. This may cause problems when using a OS-allocated pointer "
2984-
"inside a target region. "
2985-
"Re-run with HSA_XNACK=1 to remove this warning.\n");
2986-
}
2987-
}
2988-
2989-
void handleDefaultMode(utils::XnackBuildMode XnackImageMode) {
2990-
// assuming that copying is required
2991-
// handled in userAutoZeroCopyImpl
2992-
// DisableAllocationsForMapsOnApus = false;
2993-
bool IsXnackActiveOnSystem = IsXnackEnabled;
2994-
2995-
if (IsXnackActiveOnSystem && (IsAPU || OMPX_ApuMaps.get()) &&
2996-
((XnackImageMode == ELF::EF_AMDGPU_FEATURE_XNACK_ANY_V4) ||
2997-
(XnackImageMode == ELF::EF_AMDGPU_FEATURE_XNACK_ON_V4))) {
2998-
if (IsAPU && OMPX_EagerApuMaps.get()) {
2999-
PrepopulateGPUPageTable = true; // Pre-faulting
3000-
}
3001-
return;
3002-
}
3003-
3004-
if (!IsXnackActiveOnSystem && IsAPU && OMPX_EagerApuMaps.get() &&
3005-
((XnackImageMode == ELF::EF_AMDGPU_FEATURE_XNACK_ANY_V4) ||
3006-
(XnackImageMode == ELF::EF_AMDGPU_FEATURE_XNACK_OFF_V4))) {
3007-
PrepopulateGPUPageTable = true; // Pre-faulting
3008-
return;
3009-
}
3010-
3011-
return;
3012-
}
3013-
30142945
/// Load the binary image into the device and allocate an image object.
30152946
Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
30162947
int32_t ImageId) override {
@@ -3022,9 +2953,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
30222953
// Load the HSA executable.
30232954
if (Error Err = AMDImage->loadExecutable(*this))
30242955
return std::move(Err);
3025-
3026-
checkAndAdjustUsmModeForTargetImage(TgtImage);
3027-
30282956
return AMDImage;
30292957
}
30302958

@@ -3624,14 +3552,51 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
36243552
/// while it is often not the best on discrete GPUs.
36253553
/// XNACK can be enabled with a kernel boot parameter or with
36263554
/// the HSA_XNACK environment variable.
3627-
/// ROCm-only behavior: default (non USM, with xnack- or xnack-any)
3628-
/// and OMPX_EAGER_APU_MAPS is automatic zero-copy with pre-fault.
36293555
bool useAutoZeroCopyImpl() override {
3630-
return (
3631-
((IsAPU || OMPX_ApuMaps) && IsXnackEnabled) ||
3632-
(IsAPU &&
3633-
!(Plugin::get().getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY) &&
3634-
!IsXnackEnabled && OMPX_EagerApuMaps.get()));
3556+
return ((IsAPU || OMPX_ApuMaps) && IsXnackEnabled);
3557+
}
3558+
3559+
/// Performs sanity checks on the selected zero-copy configuration and prints
3560+
/// diagnostic information.
3561+
Error zeroCopySanityChecksAndDiagImpl(bool isUnifiedSharedMemory,
3562+
bool isAutoZeroCopy,
3563+
bool isEagerMaps) override {
3564+
// Implementation sanity checks: either unified_shared_memory or auto
3565+
// zero-copy, not both
3566+
if (isUnifiedSharedMemory && isAutoZeroCopy)
3567+
return Plugin::error("Internal runtime error: cannot be both "
3568+
"unified_shared_memory and auto zero-copy.");
3569+
3570+
if (IsXnackEnabled)
3571+
INFO(OMP_INFOTYPE_USER_DIAGNOSTIC, getDeviceId(), "XNACK is enabled.\n");
3572+
else
3573+
INFO(OMP_INFOTYPE_USER_DIAGNOSTIC, getDeviceId(), "XNACK is disabled.\n");
3574+
if (isUnifiedSharedMemory)
3575+
INFO(OMP_INFOTYPE_USER_DIAGNOSTIC, getDeviceId(),
3576+
"Application configured to run in zero-copy using "
3577+
"unified_shared_memory.\n");
3578+
else if (isAutoZeroCopy)
3579+
INFO(
3580+
OMP_INFOTYPE_USER_DIAGNOSTIC, getDeviceId(),
3581+
"Application configured to run in zero-copy using auto zero-copy.\n");
3582+
if (isEagerMaps)
3583+
INFO(OMP_INFOTYPE_USER_DIAGNOSTIC, getDeviceId(),
3584+
"Requested pre-faulting of GPU page tables.\n");
3585+
3586+
// Sanity checks: selecting unified_shared_memory with XNACK-Disabled
3587+
// triggers a warning that can be turned into a fatal error using an
3588+
// environment variable.
3589+
if (isUnifiedSharedMemory && !IsXnackEnabled) {
3590+
MESSAGE0(
3591+
"Running a program that requires XNACK on a system where XNACK is "
3592+
"disabled. This may cause problems when using an OS-allocated "
3593+
"pointer "
3594+
"inside a target region. "
3595+
"Re-run with HSA_XNACK=1 to remove this warning.");
3596+
if (OMPX_StrictSanityChecks)
3597+
llvm_unreachable("User-requested hard stop on sanity check errors.");
3598+
}
3599+
return Plugin::success();
36353600
}
36363601

36373602
/// Getters and setters for stack and heap sizes.
@@ -3845,19 +3810,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
38453810
if (OMPX_DisableUsmMaps.get() == true) {
38463811
EnableFineGrainedMemory = true;
38473812
}
3848-
3849-
if (IsAPU) {
3850-
// OMPX_EAGER_ZERO_COPY_MAPS=1 && HSA_XNACK=0 (XNACK-disabled)
3851-
// && default (non-USM) program
3852-
if ((OMPX_EagerApuMaps.get() == true) && !IsXnackEnabled &&
3853-
!(Plugin::get().getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY)) {
3854-
PrepopulateGPUPageTable = true;
3855-
}
3856-
}
3857-
}
3858-
3859-
bool requestedPrepopulateGPUPageTableImpl() override final {
3860-
return PrepopulateGPUPageTable;
38613813
}
38623814

38633815
bool IsFineGrainedMemoryEnabledImpl() override final {
@@ -3875,6 +3827,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
38753827
return hasGfx90aDevice() || hasMI300xDevice();
38763828
}
38773829

3830+
/// Returns whether AMD GPU supports unified memory in
3831+
/// the current configuration.
3832+
bool supportsUnifiedMemoryImpl() override final { return IsXnackEnabled; }
3833+
38783834
/// Envar for controlling the number of HSA queues per device. High number of
38793835
/// queues may degrade performance.
38803836
UInt32Envar OMPX_NumQueues;
@@ -3963,11 +3919,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
39633919
/// currently always without map checks.
39643920
BoolEnvar OMPX_NoMapChecks;
39653921

3966-
/// Value of OMPX_EAGER_ZERO_COPY_MAPS. When true, it
3967-
/// makes the plugin prefault the GPU page table upon
3968-
/// map. This allows running with XNACK-Disabled and
3969-
/// use zero-copy.
3970-
BoolEnvar OMPX_EagerApuMaps;
3922+
// Makes warnings turn into fatal errors
3923+
BoolEnvar OMPX_StrictSanityChecks;
39713924

39723925
/// Stream manager for AMDGPU streams.
39733926
AMDGPUStreamManagerTy AMDGPUStreamManager;
@@ -4030,11 +3983,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
40303983
/// False otherwise.
40313984
bool IsXnackEnabled = false;
40323985

4033-
/// Set by OMPX_EAGER_ZERO_COPY_MAPS environment variable.
4034-
/// If set, map clauses provoke prefaulting of the GPU
4035-
/// page table (applies to limited cases).
4036-
bool PrepopulateGPUPageTable = false;
4037-
40383986
// Set by OMPX_DISABLE_USM_MAPS environment variable.
40393987
// If set, fine-grained memory is used for maps instead of coarse-grained.
40403988
bool EnableFineGrainedMemory = false;

openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -826,19 +826,16 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
826826

827827
// Returns true if the system is equipped with an APU.
828828
// moved in from plugin
829-
// virtual bool hasAPUDevice() { return false; }
830829
bool hasAPUDevice();
831830
virtual bool hasAPUDeviceImpl() { return false; }
832831

833-
// Returns true if the system is equipped with a dGPU that supports USM
834-
// virtual bool hasDGpuWithUsmSupport() { return false; }
832+
// Returns true if the system is equipped with a dGPU that supports USM.
835833
bool hasDGpuWithUsmSupport();
836834
virtual bool hasDGpuWithUsmSupportImpl() { return false; }
837835

838-
// Returns true if user requested Eager Maps implementation.
839-
// virtual bool requestedPrepopulateGPUPageTable() { return false; }
840-
bool requestedPrepopulateGPUPageTable();
841-
virtual bool requestedPrepopulateGPUPageTableImpl() { return false; }
836+
// Returns true if the system supports unified memory.
837+
bool supportsUnifiedMemory();
838+
virtual bool supportsUnifiedMemoryImpl() { return false; }
842839

843840
// Returns true if coarse graining of mapped variables is
844841
// disabled on MI200 GPUs.
@@ -952,6 +949,15 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
952949

953950
bool isFastReductionEnabled() const { return IsFastReductionEnabled; }
954951

952+
/// Performs sanity checks on zero-copy options and prints diagnostic info.
953+
Error zeroCopySanityChecksAndDiag(bool isUnifiedSharedMemory,
954+
bool isAutoZeroCopy, bool isEagerMaps);
955+
virtual Error zeroCopySanityChecksAndDiagImpl(bool isUnifiedSharedMemory,
956+
bool isAutoZeroCopy,
957+
bool isEagerMaps) {
958+
return Error::success();
959+
}
960+
955961
private:
956962
/// Get and set the stack size and heap size for the device. If not used, the
957963
/// plugin can implement the setters as no-op and setting the output

0 commit comments

Comments
 (0)