Skip to content

Commit 4f61320

Browse files
carlobertolli authored and ronlieb committed
[OpenMP][MI300A] APU: prefault host memory upon memory copy.
On APUs, when copying memory, CPU-allocated memory (e.g., obtained via malloc, mmap, etc.) is faulted at memory copy time, potentially page by page. This patch prefaults host memory when its size is 2MB or larger, which has been shown to be the threshold beyond which not prefaulting incurs large overheads (see https://ontrack-internal.amd.com/browse/SWDEV-443343). Change-Id: Ib95a336b0352a579faf1af7e40cc32d3600a4cdd
1 parent f95009a commit 4f61320

File tree

1 file changed

+65
-2
lines changed
  • openmp/libomptarget/plugins-nextgen/amdgpu/src

1 file changed

+65
-2
lines changed

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 65 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2478,6 +2478,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
24782478
// setting default to true here appears to solve random sdma problem
24792479
"LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", true),
24802480
OMPX_SyncCopyBack("LIBOMPTARGET_SYNC_COPY_BACK", true),
2481+
OMPX_APUPrefaultMemcopy("LIBOMPTARGET_APU_PREFAULT_MEMCOPY", "true"),
2482+
OMPX_APUPrefaultMemcopySize("LIBOMPTARGET_APU_PREFAULT_MEMCOPY_SIZE",
2483+
2 * 1024 * 1024), // 2MB
24812484
AMDGPUStreamManager(*this, Agent), AMDGPUEventManager(*this),
24822485
AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice) {}
24832486

@@ -3044,6 +3047,15 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
30443047
AMDGPUStreamTy *Stream = nullptr;
30453048
void *PinnedPtr = nullptr;
30463049

3050+
// Prefault GPU page table in XNACK-Enabled case, on APUs,
3051+
// under the assumption that explicitly allocated memory
3052+
// will be fully accessed and that on-the-fly individual page faults
3053+
// perform worse than whole memory faulting.
3054+
if (OMPX_APUPrefaultMemcopy && Size >= OMPX_APUPrefaultMemcopySize &&
3055+
IsAPU && IsXnackEnabled)
3056+
if (auto Err = prepopulatePageTableImpl(const_cast<void *>(HstPtr), Size))
3057+
return Err;
3058+
30473059
// Use one-step asynchronous operation when host memory is already pinned.
30483060
if (void *PinnedPtr =
30493061
PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
@@ -3251,8 +3263,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
32513263
}
32523264

32533265
Error prepopulatePageTableImpl(void *ptr, int64_t size) override final {
3254-
// Instruct ROCr that the [ptr, ptr+size-1] pages are
3255-
// coarse grain
3266+
// Instruct runtimes that the [ptr, ptr+size-1] pages will be accessed by
3267+
// devices but should not be migrated (only perform page faults, if needed).
32563268
hsa_amd_svm_attribute_pair_t tt;
32573269
tt.attribute = HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE;
32583270
tt.value = Agent.handle;
@@ -3751,9 +3763,38 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
37513763
/// Use ROCm 5.7 interface for multiple SDMA engines
37523764
BoolEnvar OMPX_UseMultipleSdmaEngines;
37533765

3766+
/// Value of OMPX_APU_MAPS env var used to force
3767+
/// automatic zero-copy behavior on non-APU GPUs.
3768+
BoolEnvar OMPX_ApuMaps;
3769+
3770+
/// Value of OMPX_DISABLE_USM_MAPS. Use on MI200
3771+
/// systems to disable both device memory
3772+
/// allocations and host-device memory copies upon
3773+
/// map, and coarse graining of mapped variables.
3774+
BoolEnvar OMPX_DisableUsmMaps;
3775+
3776+
/// Value of OMPX_DISABLE_MAPS. Turns off map table checks
3777+
/// in libomptarget in unified_shared_memory mode. Legacy:
3778+
/// never turned to false (unified_shared_memory mode is
3779+
/// currently always without map checks).
3780+
BoolEnvar OMPX_NoMapChecks;
3781+
3782+
/// Makes warnings turn into fatal errors
3783+
BoolEnvar OMPX_StrictSanityChecks;
3784+
37543785
/// Variable to hold synchronous copy back
37553786
BoolEnvar OMPX_SyncCopyBack;
37563787

3788+
/// On APUs, this env var indicates whether memory copy
3789+
/// should be preceded by pre-faulting of host memory,
3790+
/// to prevent page faults during the copy.
3791+
BoolEnvar OMPX_APUPrefaultMemcopy;
3792+
3793+
/// On APUs, when prefaulting host memory before a copy,
3794+
/// this env var controls the minimum copy size (in bytes)
3794+
/// at which prefaulting is applied.
3796+
UInt32Envar OMPX_APUPrefaultMemcopySize;
3797+
37573798
/// Stream manager for AMDGPU streams.
37583799
AMDGPUStreamManagerTy AMDGPUStreamManager;
37593800

@@ -3798,6 +3839,28 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
37983839
// The maximum scratch memory size per thread.
37993840
// See COMPUTE_TMPRING_SIZE.WAVESIZE (divided by threads per wave).
38003841
uint32_t MaxThreadScratchSize;
3842+
3843+
/// Is the plugin associated with an APU?
3844+
bool IsAPU = false;
3845+
3846+
// Is the device an MI300X?
3847+
bool IsEquippedWithMI300X = false;
3848+
3849+
// Is the device an MI200?
3850+
bool IsEquippedWithGFX90A = false;
3851+
3852+
/// True if the system is configured with XNACK-Enabled.
3853+
/// False otherwise.
3854+
bool IsXnackEnabled = false;
3855+
3856+
// Set by OMPX_DISABLE_USM_MAPS environment variable.
3857+
// If set, fine-grained memory is used for maps instead of coarse-grained.
3858+
bool EnableFineGrainedMemory = false;
3859+
3860+
/// Set by OMPX_DISABLE_MAPS environment variable.
3861+
// If false, map checks are performed also in unified_shared_memory mode.
3862+
// TODO: this feature is non functional.
3863+
bool NoUSMMapChecks = true;
38013864
};
38023865

38033866
Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {

0 commit comments

Comments
 (0)