@@ -2478,6 +2478,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
2478
2478
// setting default to true here appears to solve random sdma problem
2479
2479
" LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES" , true ),
2480
2480
OMPX_SyncCopyBack (" LIBOMPTARGET_SYNC_COPY_BACK" , true ),
2481
+ // Prefaulting before memory copies on APUs is enabled by default.
+ OMPX_APUPrefaultMemcopy (" LIBOMPTARGET_APU_PREFAULT_MEMCOPY" , true ),
2482
+ OMPX_APUPrefaultMemcopySize (" LIBOMPTARGET_APU_PREFAULT_MEMCOPY_SIZE" ,
2483
+ 2 * 1024 * 1024 ), // 2MB
2481
2484
AMDGPUStreamManager (*this , Agent), AMDGPUEventManager(*this ),
2482
2485
AMDGPUSignalManager (*this ), Agent(Agent), HostDevice(HostDevice) {}
2483
2486
@@ -3044,6 +3047,15 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
3044
3047
AMDGPUStreamTy *Stream = nullptr ;
3045
3048
void *PinnedPtr = nullptr ;
3046
3049
3050
+ // Prefault GPU page table in XNACK-Enabled case, on APUs,
3051
+ // under the assumption that explicitly allocated memory
3052
+ // will be fully accessed and that on-the-fly individual page faults
3053
+ // perform worse than whole memory faulting.
3054
+ if (OMPX_APUPrefaultMemcopy && Size >= OMPX_APUPrefaultMemcopySize &&
3055
+ IsAPU && IsXnackEnabled)
3056
+ if (auto Err = prepopulatePageTableImpl (const_cast <void *>(HstPtr), Size))
3057
+ return Err;
3058
+
3047
3059
// Use one-step asynchronous operation when host memory is already pinned.
3048
3060
if (void *PinnedPtr =
3049
3061
PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer (HstPtr)) {
@@ -3251,8 +3263,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
3251
3263
}
3252
3264
3253
3265
Error prepopulatePageTableImpl (void *ptr, int64_t size) override final {
3254
- // Instruct ROCr that the [ptr, ptr+size-1] pages are
3255
- // coarse grain
3266
+ // Instruct runtimes that the [ptr, ptr+size-1] pages will be accessed by
3267
+ // devices but should not be migrated (only perform page faults, if needed).
3256
3268
hsa_amd_svm_attribute_pair_t tt;
3257
3269
tt.attribute = HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE;
3258
3270
tt.value = Agent.handle ;
@@ -3751,9 +3763,38 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
3751
3763
// / Use ROCm 5.7 interface for multiple SDMA engines
3752
3764
BoolEnvar OMPX_UseMultipleSdmaEngines;
3753
3765
3766
+ // / Value of OMPX_APU_MAPS env var used to force
3767
+ // / automatic zero-copy behavior on non-APU GPUs.
3768
+ BoolEnvar OMPX_ApuMaps;
3769
+
3770
+ // / Value of OMPX_DISABLE_USM_MAPS. Use on MI200
3771
+ // / systems to disable both device memory
3772
+ // / allocations and host-device memory copies upon
3773
+ // / map, and coarse graining of mapped variables.
3774
+ BoolEnvar OMPX_DisableUsmMaps;
3775
+
3776
+ // / Value of OMPX_DISABLE_MAPS. Turns off map table checks
3777
+ // / in libomptarget in unified_shared_memory mode. Legacy:
3778
+ // / never turned to false (unified_shared_memory mode is
3779
+ // / currently always without map checks).
3780
+ BoolEnvar OMPX_NoMapChecks;
3781
+
3782
+ // / Makes warnings turn into fatal errors
3783
+ BoolEnvar OMPX_StrictSanityChecks;
3784
+
3754
3785
// / Variable to hold synchronous copy back
3755
3786
BoolEnvar OMPX_SyncCopyBack;
3756
3787
3788
+ // / On APUs, this env var indicates whether memory copy
3789
+ // / should be preceded by pre-faulting of host memory,
3790
+ // / to prevent page faults during the copy.
3791
+ BoolEnvar OMPX_APUPrefaultMemcopy;
3792
+
3793
+ // / On APUs, when prefaulting host memory before a copy,
3794
+ // / this env var controls the size after which prefaulting
3795
+ // / is applied (copies of at least this many bytes).
3796
+ UInt32Envar OMPX_APUPrefaultMemcopySize;
3797
+
3757
3798
// / Stream manager for AMDGPU streams.
3758
3799
AMDGPUStreamManagerTy AMDGPUStreamManager;
3759
3800
@@ -3798,6 +3839,28 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
3798
3839
// The maximum scratch memory size per thread.
3799
3840
// See COMPUTE_TMPRING_SIZE.WAVESIZE (divided by threads per wave).
3800
3841
uint32_t MaxThreadScratchSize;
3842
+
3843
+ // / Is the plugin associated with an APU?
3844
+ bool IsAPU = false ;
3845
+
3846
+ // Is the device an MI300X?
3847
+ bool IsEquippedWithMI300X = false ;
3848
+
3849
+ // Is the device an MI200?
3850
+ bool IsEquippedWithGFX90A = false ;
3851
+
3852
+ // / True if the system is configured with XNACK-Enabled.
3853
+ // / False otherwise.
3854
+ bool IsXnackEnabled = false ;
3855
+
3856
+ // Set by OMPX_DISABLE_USM_MAPS environment variable.
3857
+ // If set, fine grained memory is used for maps instead of coarse grained.
3858
+ bool EnableFineGrainedMemory = false ;
3859
+
3860
+ // / Set by OMPX_DISABLE_MAPS environment variable.
3861
+ // If false, map checks are performed also in unified_shared_memory mode.
3862
+ // TODO: this feature is non functional.
3863
+ bool NoUSMMapChecks = true ;
3801
3864
};
3802
3865
3803
3866
Error AMDGPUDeviceImageTy::loadExecutable (const AMDGPUDeviceTy &Device) {
0 commit comments