Skip to content

Commit 6918bde

Browse files
carlobertollidoru1004jplehr
committed
[OpenMP] Support for global variables when in auto zero-copy.
When building without unified_shared_memory, global variables are declared in the device binary and allocated upon loading onto GPU memory. However, when running in zero-copy mode (same as with unified_shared_memory) D2H and H2D copies for mapped local and global variables are turned off. This patch turns back on H2D and D2H copies when they refer to global variables, enabling an application built without unified_shared_memory to work correctly with global variables when run under automatic zero-copy. Co-authored-by: Doru Bercea <[email protected]> Co-authored-by: Jan-Patrick Lehr <[email protected]>
1 parent 93a2a8c commit 6918bde

File tree

2 files changed

+87
-1
lines changed

2 files changed

+87
-1
lines changed

openmp/libomptarget/src/omptarget.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,8 @@ static int initLibrary(DeviceTy &Device) {
188188
// If unified memory is active, the corresponding global is a device
189189
// reference to the host global. We need to initialize the pointer on
190190
// the deive to point to the memory on the host.
191-
if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY) {
191+
if ((PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY) ||
192+
(PM->getRequirements() & OMPX_REQ_AUTO_ZERO_COPY)) {
192193
if (Device.RTL->data_submit(DeviceId, DeviceEntry.addr, Entry.addr,
193194
Entry.size) != OFFLOAD_SUCCESS)
194195
REPORT("Failed to write symbol for USM %s\n", Entry.name);
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
// clang-format off
2+
// RUN: %libomptarget-compilexx-generic
3+
// RUN: env OMPX_APU_MAPS=1 HSA_XNACK=1 LIBOMPTARGET_INFO=60 %libomptarget-run-generic 2>&1 \
4+
// RUN: | %fcheck-generic -check-prefix=CHECK
5+
6+
// UNSUPPORTED: aarch64-unknown-linux-gnu
7+
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
8+
// UNSUPPORTED: nvptx64-nvidia-cuda
9+
// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
10+
// UNSUPPORTED: x86_64-pc-linux-gnu
11+
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
12+
13+
// REQUIRES: unified_shared_memory
14+
15+
// clang-format on
16+
17+
#include <cstdint>
18+
#include <cstdio>
19+
20+
/// Test for globals under automatic zero-copy.
21+
/// Because we are building without unified_shared_memory
22+
/// requirement pragma, all globals are allocated in the device
23+
/// memory of all used GPUs. To ensure those globals contain the intended
24+
/// values, we need to execute H2D and D2H memory copies even if we are running
25+
/// in automatic zero-copy. This only applies to globals. Local variables (their
26+
/// host pointers) are passed to the kernels by-value, according to the
27+
/// automatic zero-copy behavior.
28+
29+
#pragma omp begin declare target
30+
int32_t x; // 4 bytes
31+
int32_t z[10]; // 40 bytes
32+
int32_t *k; // 20 bytes
33+
#pragma omp end declare target
34+
35+
int main() {
36+
int32_t *dev_k = nullptr;
37+
x = 3;
38+
int32_t y = -1;
39+
for (size_t t = 0; t < 10; t++)
40+
z[t] = t;
41+
k = new int32_t[5];
42+
43+
printf("Host pointer for k = %p\n", k);
44+
for (size_t t = 0; t < 5; t++)
45+
k[t] = -t;
46+
47+
/// target update to forces a copy between host and device global, which we must
48+
/// execute to keep the two global copies consistent. CHECK: Copying data from
49+
/// host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=40, Name=z
50+
#pragma omp target update to(z[ : 10])
51+
52+
/// target map with always modifier (for x) forces a copy between host and
53+
/// device global, which we must execute to keep the two global copies
54+
/// consistent. k's content (host address) is passed by-value to the kernel
55+
/// (Size=20 case). y, being a local variable, is also passed by-value to the
56+
/// kernel (Size=4 case) CHECK: Return HstPtrBegin {{.*}} Size=4 for unified
57+
/// shared memory CHECK: Return HstPtrBegin {{.*}} Size=20 for unified shared
58+
/// memory CHECK: Copying data from host to device, HstPtr={{.*}},
59+
/// TgtPtr={{.*}}, Size=4, Name=x
60+
#pragma omp target map(to : k[ : 5]) map(always, tofrom : x) map(tofrom : y) \
61+
map(from : dev_k)
62+
{
63+
x++;
64+
y++;
65+
for (size_t t = 0; t < 10; t++)
66+
z[t]++;
67+
dev_k = k;
68+
}
69+
/// CHECK-NOT: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}},
70+
/// Size=20, Name=k
71+
72+
/// CHECK: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}},
73+
/// Size=4, Name=x
74+
75+
/// CHECK: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}},
76+
/// Size=40, Name=z
77+
#pragma omp target update from(z[ : 10])
78+
79+
/// CHECK-NOT: k pointer not correctly passed to kernel
80+
if (dev_k != k)
81+
printf("k pointer not correctly passed to kernel\n");
82+
83+
delete[] k;
84+
return 0;
85+
}

0 commit comments

Comments
 (0)