Skip to content

Commit 8dcc66f

Browse files
carlobertollidoru1004jplehr
committed
[OpenMP] Support for global variables when in auto zero-copy.
When building without unified_shared_memory, global variables are declared in the device binary and allocated upon loading onto GPU memory. However, when running in zero-copy mode (same as with unified_shared_memory) D2H and H2D copies for mapped local and global variables are turned off. This patch turns back on H2D and D2H copies when they refer to global variables, enabling an application built without unified_shared_memory to work correctly with global variables when run under automatic zero-copy. Co-authored-by: Doru Bercea <[email protected]> Co-authored-by: Jan-Patrick Lehr <[email protected]>
1 parent 93a2a8c commit 8dcc66f

File tree

2 files changed

+80
-1
lines changed

2 files changed

+80
-1
lines changed

openmp/libomptarget/src/omptarget.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,8 @@ static int initLibrary(DeviceTy &Device) {
188188
// If unified memory is active, the corresponding global is a device
189189
// reference to the host global. We need to initialize the pointer on
190190
// the deive to point to the memory on the host.
191-
if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY) {
191+
if ((PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY) ||
192+
(PM->getRequirements() & OMPX_REQ_AUTO_ZERO_COPY)) {
192193
if (Device.RTL->data_submit(DeviceId, DeviceEntry.addr, Entry.addr,
193194
Entry.size) != OFFLOAD_SUCCESS)
194195
REPORT("Failed to write symbol for USM %s\n", Entry.name);
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
// clang-format off
2+
// RUN: %libomptarget-compilexx-generic
3+
// RUN: env OMPX_APU_MAPS=1 HSA_XNACK=1 LIBOMPTARGET_INFO=60 %libomptarget-run-generic 2>&1 \
4+
// RUN: | %fcheck-generic -check-prefix=CHECK
5+
6+
// UNSUPPORTED: aarch64-unknown-linux-gnu
7+
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
8+
// UNSUPPORTED: nvptx64-nvidia-cuda
9+
// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
10+
// UNSUPPORTED: x86_64-pc-linux-gnu
11+
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
12+
13+
// REQUIRES: unified_shared_memory
14+
15+
// clang-format on
16+
17+
#include <cstdio>
18+
#include <cstdint>
19+
20+
/// Test for globals under automatic zero-copy.
21+
/// Because we are building without unified_shared_memory
22+
/// requirement pragma, all globals are allocated in the device
23+
/// memory of all used GPUs. To ensure those globals contain the intended values, we
24+
/// need to execute H2D and D2H memory copies even if we are running in automatic zero-copy.
25+
/// This only applies to globals. Local variables (their host pointers) are passed to the kernels by-value,
26+
/// according to the automatic zero-copy behavior.
27+
28+
#pragma omp begin declare target
29+
int32_t x; // 4 bytes
30+
int32_t z[10]; // 40 bytes
31+
int32_t *k; // 20 bytes
32+
#pragma omp end declare target
33+
34+
int main() {
35+
int32_t *dev_k = nullptr;
36+
x = 3;
37+
int32_t y = -1;
38+
for(size_t t = 0; t < 10; t++)
39+
z[t] = t;
40+
k = new int32_t[5];
41+
42+
printf("Host pointer for k = %p\n", k);
43+
for(size_t t = 0; t < 5; t++)
44+
k[t] = -t;
45+
46+
/// target update to forces a copy between host and device global, which we must execute to
47+
/// keep the two global copies consistent.
48+
/// CHECK: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=40, Name=z
49+
#pragma omp target update to(z[:10])
50+
51+
/// target map with always modifier (for x) forces a copy between host and device global, which we must execute to
52+
/// keep the two global copies consistent.
53+
/// k's content (host address) is passed by-value to the kernel (Size=20 case).
54+
/// y, being a local variable, is also passed by-value to the kernel (Size=4 case)
55+
/// CHECK: Return HstPtrBegin {{.*}} Size=4 for unified shared memory
56+
/// CHECK: Return HstPtrBegin {{.*}} Size=20 for unified shared memory
57+
/// CHECK: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=4, Name=x
58+
#pragma omp target map(to:k[:5]) map(always, tofrom:x) map(tofrom:y) map(from:dev_k)
59+
{
60+
x++;
61+
y++;
62+
for(size_t t = 0; t < 10; t++)
63+
z[t]++;
64+
dev_k = k;
65+
}
66+
/// CHECK-NOT: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=20, Name=k
67+
68+
/// CHECK: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=4, Name=x
69+
70+
/// CHECK: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=40, Name=z
71+
#pragma omp target update from(z[:10])
72+
73+
/// CHECK-NOT: k pointer not correctly passed to kernel
74+
if (dev_k != k) printf("k pointer not correctly passed to kernel\n");
75+
76+
delete [] k;
77+
return 0;
78+
}

0 commit comments

Comments
 (0)