Skip to content

Commit 5f65fb6

Browse files
committed
[OpenMP] [amdgpu] Added a synchronous version of data exchange.
1 parent 44af53b commit 5f65fb6

File tree

2 files changed

+88
-0
lines changed

2 files changed

+88
-0
lines changed

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2402,6 +2402,27 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
24022402
AsyncInfoWrapperTy &AsyncInfoWrapper) override {
24032403
AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice);
24042404

2405+
// For large transfers use synchronous behavior.
2406+
if (Size >= OMPX_MaxAsyncCopyBytes) {
2407+
if (AsyncInfoWrapper.hasQueue())
2408+
if (auto Err = synchronize(AsyncInfoWrapper))
2409+
return Err;
2410+
2411+
AMDGPUSignalTy Signal;
2412+
if (auto Err = Signal.init())
2413+
return Err;
2414+
2415+
if (auto Err = utils::asyncMemCopy(
2416+
useMultipleSdmaEngines(), DstPtr, DstDevice.getAgent(), SrcPtr,
2417+
getAgent(), (uint64_t)Size, 0, nullptr, Signal.get()))
2418+
return Err;
2419+
2420+
if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
2421+
return Err;
2422+
2423+
return Signal.deinit();
2424+
}
2425+
24052426
AMDGPUStreamTy *Stream = nullptr;
24062427
if (auto Err = getStream(AsyncInfoWrapper, Stream))
24072428
return Err;
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
// RUN: %libomptarget-compile-generic && \
2+
// RUN: env LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES=0 %libomptarget-run-generic | \
3+
// RUN: %fcheck-generic -allow-empty
4+
// REQUIRES: amdgcn-amd-amdhsa
5+
6+
#include <assert.h>
7+
#include <omp.h>
8+
#include <stdio.h>
9+
#include <stdlib.h>
10+
11+
const int magic_num = 7;
12+
13+
int main(int argc, char *argv[]) {
14+
const int N = 128;
15+
const int num_devices = omp_get_num_devices();
16+
17+
// No target device, just return
18+
if (num_devices == 0) {
19+
printf("PASS\n");
20+
return 0;
21+
}
22+
23+
const int src_device = 0;
24+
int dst_device = num_devices - 1;
25+
26+
int length = N * sizeof(int);
27+
int *src_ptr = omp_target_alloc(length, src_device);
28+
int *dst_ptr = omp_target_alloc(length, dst_device);
29+
30+
assert(src_ptr && "src_ptr is NULL");
31+
assert(dst_ptr && "dst_ptr is NULL");
32+
33+
#pragma omp target teams distribute parallel for device(src_device) \
34+
is_device_ptr(src_ptr)
35+
for (int i = 0; i < N; ++i) {
36+
src_ptr[i] = magic_num;
37+
}
38+
39+
int rc =
40+
omp_target_memcpy(dst_ptr, src_ptr, length, 0, 0, dst_device, src_device);
41+
42+
assert(rc == 0 && "error in omp_target_memcpy");
43+
44+
int *buffer = malloc(length);
45+
46+
assert(buffer && "failed to allocate host buffer");
47+
48+
#pragma omp target teams distribute parallel for device(dst_device) \
49+
map(from : buffer[0 : N]) is_device_ptr(dst_ptr)
50+
for (int i = 0; i < N; ++i) {
51+
buffer[i] = dst_ptr[i] + magic_num;
52+
}
53+
54+
for (int i = 0; i < N; ++i)
55+
assert(buffer[i] == 2 * magic_num);
56+
57+
printf("PASS\n");
58+
59+
// Free host and device memory
60+
free(buffer);
61+
omp_target_free(src_ptr, src_device);
62+
omp_target_free(dst_ptr, dst_device);
63+
64+
return 0;
65+
}
66+
67+
// CHECK: PASS

0 commit comments

Comments
 (0)