Skip to content

Commit 1c94541

Browse files
carlobertollironlieb
authored andcommitted
[OpenMP] Make coarse graining not dependent on USM info in the plugin.
This patch cleans up coarse graining upon mapping for MI200 and it removes a dependency on the requirement flags in the plugin, to ease downstream merging of llvm#80345. Change-Id: I0bad9113c106c238c2089bf8097789a27f6899ea
1 parent 0e64686 commit 1c94541

File tree

6 files changed

+121
-34
lines changed

6 files changed

+121
-34
lines changed

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 22 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2704,17 +2704,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
27042704
if (auto Err = AMDGPUSignalManager.init(OMPX_InitialNumSignals))
27052705
return Err;
27062706

2707-
// Initialize memspace table to keep track of coarse grain memory regions
2708-
// in USM mode
2709-
if (Plugin::get().getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY) {
2710-
// TODO: add framework for multiple systems supporting
2711-
// unified_shared_memory
2712-
coarse_grain_mem_tab = new AMDGPUMemTypeBitFieldTable(
2713-
AMDGPU_X86_64_SystemConfiguration::max_addressable_byte +
2714-
1, // memory size
2715-
AMDGPU_X86_64_SystemConfiguration::page_size);
2716-
}
2717-
27182707
// Take the second timepoints and compute the required metadata.
27192708
OMPT_IF_ENABLED(completeH2DTimeRate(HostRef1, DeviceRef1););
27202709

@@ -2782,6 +2771,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
27822771
// Invalidate agent reference.
27832772
Agent = {0};
27842773

2774+
delete CoarseGrainMemoryTable;
2775+
27852776
return Plugin::success();
27862777
}
27872778

@@ -3226,11 +3217,21 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
32263217
}
32273218

32283219
Error setCoarseGrainMemoryImpl(void *ptr, int64_t size) override final {
3229-
// track coarse grain memory pages in local table
3230-
coarse_grain_mem_tab->insert((const uintptr_t)ptr, size);
3220+
// If the table has not yet been created, check if the gpu arch is
3221+
// MI200 and create it.
3222+
if (!IsEquippedWithGFX90A)
3223+
return Plugin::success();
3224+
if (!CoarseGrainMemoryTable)
3225+
CoarseGrainMemoryTable = new AMDGPUMemTypeBitFieldTable(
3226+
AMDGPU_X86_64_SystemConfiguration::max_addressable_byte +
3227+
1, // memory size
3228+
AMDGPU_X86_64_SystemConfiguration::page_size);
32313229

3232-
// Instruct ROCr that the [ptr, ptr+size-1] pages are
3233-
// coarse grain
3230+
// track coarse grain memory pages in local table for user queries.
3231+
CoarseGrainMemoryTable->insert((const uintptr_t)ptr, size);
3232+
3233+
// Ask ROCr to turn [ptr, ptr+size-1] pages to
3234+
// coarse grain.
32343235
hsa_amd_svm_attribute_pair_t tt;
32353236
tt.attribute = HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG;
32363237
tt.value = HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED;
@@ -3244,13 +3245,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
32443245

32453246
uint32_t queryCoarseGrainMemoryImpl(const void *ptr,
32463247
int64_t size) override final {
3247-
3248-
// if the table is not yet allocated, it means we have not yet gone through
3249-
// an OpenMP pragma or API that would provoke intialization of the RTL
3250-
if (!coarse_grain_mem_tab)
3248+
// If the table has not yet been created it means that
3249+
// no memory has yet been set to coarse grain.
3250+
if (!CoarseGrainMemoryTable)
32513251
return 0;
32523252

3253-
return coarse_grain_mem_tab->contains((const uintptr_t)ptr, size);
3253+
return CoarseGrainMemoryTable->contains((const uintptr_t)ptr, size);
32543254
}
32553255

32563256
Error prepopulatePageTableImpl(void *ptr, int64_t size) override final {
@@ -3936,7 +3936,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
39363936
AMDHostDeviceTy &HostDevice;
39373937

39383938
// Data structure used to keep track of coarse grain memory regions
3939-
AMDGPUMemTypeBitFieldTable *coarse_grain_mem_tab = nullptr;
3939+
// on MI200 in unified_shared_memory programs only.
3940+
AMDGPUMemTypeBitFieldTable *CoarseGrainMemoryTable = nullptr;
39403941

39413942
/// Pointer to the preallocated device memory pool
39423943
void *PreAllocatedDeviceMemoryPool;

openmp/libomptarget/plugins-nextgen/amdgpu/utils/memtype.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,11 @@ class AMDGPUMemTypeBitFieldTable {
4747
tab = (uint64_t *)calloc(tab_size, sizeof(uint64_t));
4848
}
4949

50+
~AMDGPUMemTypeBitFieldTable() {
51+
if (tab)
52+
free(tab);
53+
}
54+
5055
// Set all pages touched by address in the range [base, base+size-1]
5156
// \arg base : pointer to first byte of the memory area whose
5257
// type should become of the tracked type
@@ -102,7 +107,7 @@ class AMDGPUMemTypeBitFieldTable {
102107
// memory type. For any bit:
103108
// 0 = page is *not* of tracked type
104109
// 1 = page is of tracked type
105-
uint64_t *tab;
110+
uint64_t *tab = nullptr;
106111
};
107112

108113
#endif //__MEMTYPE_H__

openmp/libomptarget/src/OpenMP/API.cpp

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -654,10 +654,17 @@ EXTERN int omp_target_disassociate_ptr(const void *HostPtr, int DeviceNum) {
654654
}
655655

656656
EXTERN int omp_is_coarse_grain_mem_region(void *ptr, size_t size) {
657-
DeviceTy &Device = *PM->getDevice(omp_get_default_device());
658-
if (!Device.RTL->query_coarse_grain_mem_region)
657+
if (!(PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY))
659658
return 0;
660-
return Device.RTL->query_coarse_grain_mem_region(Device.DeviceID, ptr, size);
659+
auto DeviceOrErr = PM->getDevice(omp_get_default_device());
660+
if (!DeviceOrErr)
661+
FATAL_MESSAGE(omp_get_default_device(), "%s",
662+
toString(DeviceOrErr.takeError()).c_str());
663+
664+
if (!DeviceOrErr->RTL->query_coarse_grain_mem_region)
665+
return 0;
666+
return DeviceOrErr->RTL->query_coarse_grain_mem_region(
667+
omp_get_default_device(), ptr, size);
661668
}
662669

663670
EXTERN void *omp_get_mapped_ptr(const void *Ptr, int DeviceNum) {

openmp/libomptarget/src/OpenMP/Mapping.cpp

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -266,16 +266,20 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer(
266266
// memory as coarse-grained. The usage of coarse-grained memory can be
267267
// overriden by setting the env-var OMPX_DISABLE_USM_MAPS=1.
268268
// This is not done for APUs.
269-
if (!(Device.RTL->has_apu_device(Device.DeviceID) ||
270-
Device.RTL->has_USM_capable_dGPU(Device.DeviceID)) &&
271-
Device.RTL->is_fine_grained_memory_enabled(Device.DeviceID) &&
272-
HstPtrBegin && Device.RTL->set_coarse_grain_mem_region) {
269+
if (Device.RTL->has_USM_capable_dGPU(Device.DeviceID) && HstPtrBegin &&
270+
(!Device.RTL->is_fine_grained_memory_enabled(Device.DeviceID)) &&
271+
Device.RTL->set_coarse_grain_mem_region) {
273272
Device.RTL->set_coarse_grain_mem_region(Device.DeviceID, HstPtrBegin,
274273
Size);
274+
INFO(OMP_INFOTYPE_MAPPING_CHANGED, Device.DeviceID,
275+
"Memory pages for HstPtrBegin " DPxMOD " Size=%" PRId64
276+
" switched to coarse grain\n",
277+
DPxPTR((uintptr_t)HstPtrBegin), Size);
275278
}
276279

277280
// If we are here, it means that we are either in auto zero-copy or USM.
278-
// Enable GPU page table prefaulting if selected by the user.
281+
// Enable GPU page table prefaulting if selected by the user. This feature
282+
// is only enabled for APUs.
279283
if (Device.EagerZeroCopyMaps) {
280284
Device.RTL->prepopulate_page_table(Device.DeviceID, HstPtrBegin, Size);
281285
INFO(OMP_INFOTYPE_MAPPING_CHANGED, Device.DeviceID,

openmp/libomptarget/test/lit.cfg

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ if config.libomptarget_has_libc:
117117
# For all other targets, we currently assume it is.
118118
supports_unified_shared_memory = True
119119
supports_apu = False
120+
is_mi200 = False
120121
if config.libomptarget_current_target.startswith('nvptx'):
121122
try:
122123
cuda_arch = int(config.cuda_test_arch[:3])
@@ -137,10 +138,14 @@ elif config.libomptarget_current_target.startswith('amdgcn'):
137138
(config.amdgpu_test_arch.startswith("gfx942") and
138139
evaluate_bool_env(config.environment['IS_APU']))):
139140
supports_apu = True
140-
if supports_unified_shared_memory:
141-
config.available_features.add('unified_shared_memory')
142-
if supports_apu:
143-
config.available_features.add('apu')
141+
if (config.amdgpu_test_arch.startswith("gfx90a")):
142+
is_mi200 = True
143+
if supports_unified_shared_memory:
144+
config.available_features.add('unified_shared_memory')
145+
if supports_apu:
146+
config.available_features.add('apu')
147+
if is_mi200:
148+
config.available_features.add('mi200')
144149

145150
# Setup environment to find dynamic library at runtime
146151
if config.operating_system == 'Windows':
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
// clang-format off
2+
// RUN: %libomptarget-compilexx-generic
3+
// RUN: env HSA_XNACK=1 LIBOMPTARGET_INFO=30 %libomptarget-run-generic 2>&1 \
4+
// RUN: | %fcheck-generic -check-prefix=CHECK
5+
6+
// RUN: %libomptarget-compilexx-generic
7+
// RUN: env OMPX_DISABLE_USM_MAPS=1 HSA_XNACK=1 LIBOMPTARGET_INFO=30 %libomptarget-run-generic 2>&1 \
8+
// RUN: | %fcheck-generic -check-prefix=CHECK_FINE
9+
10+
// UNSUPPORTED: aarch64-unknown-linux-gnu
11+
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
12+
// UNSUPPORTED: nvptx64-nvidia-cuda
13+
// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
14+
// UNSUPPORTED: x86_64-pc-linux-gnu
15+
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
16+
17+
// REQUIRES: unified_shared_memory
18+
// REQUIRES: mi200
19+
20+
// clang-format on
21+
22+
#include <cstdio>
23+
#include <omp.h>
24+
25+
#pragma omp requires unified_shared_memory
26+
27+
int main() {
28+
const size_t n = 1024;
29+
30+
double *a = new double[n];
31+
// clang-format off
32+
// CHECK: Memory pages for HstPtrBegin 0x{{.*}} Size=8192 switched to coarse grain
33+
// CHECK: Before mapping, memory is fine grain.
34+
// CHECK_FINE: Before mapping, memory is fine grain.
35+
// clang-format on
36+
if (omp_is_coarse_grain_mem_region(a, n * sizeof(double)))
37+
printf("Before mapping, memory is coarse grain.\n");
38+
else
39+
printf("Before mapping, memory is fine grain.\n");
40+
41+
#pragma omp target enter data map(to : a[:n])
42+
43+
// CHECK: After mapping, memory is coarse grain.
44+
// CHECK_FINE: After mapping, memory is fine grain.
45+
if (omp_is_coarse_grain_mem_region(a, n * sizeof(double)))
46+
printf("After mapping, memory is coarse grain.\n");
47+
else
48+
printf("After mapping, memory is fine grain.\n");
49+
50+
#pragma omp target exit data map(from : a[:n])
51+
52+
// CHECK: After removing map, memory is still coarse grain.
53+
// CHECK_FINE: After removing map, memory is back to fine grain.
54+
if (omp_is_coarse_grain_mem_region(a, n * sizeof(double)))
55+
printf("After removing map, memory is still coarse grain.\n");
56+
else
57+
printf("After removing map, memory is back to fine grain.\n");
58+
59+
// Plugins must be initialized for unified_shared_memory requirement
60+
// to be added. An empty target region is enough for that initialization.
61+
#pragma omp target
62+
{}
63+
64+
return 0;
65+
}

0 commit comments

Comments
 (0)