1
- // REQUIRES: cuda
2
- // RUN: % if any-device-is-cuda %{ %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_61 -o %t.out %}
3
- // RUN: %if cuda %{ %{run} %t.out %}
1
+ // REQUIRES: cuda || hip || level_zero
2
+ // RUN: %{build} %if any-device-is-cuda %{ -Xsycl-target-backend --cuda-gpu-arch=sm_61 %} -o %t.out
3
+ // RUN: %{run} %t.out
4
4
5
5
#include < cassert>
6
6
#include < numeric>
@@ -18,17 +18,8 @@ constexpr size_t N = 512;
18
18
19
19
int main () {
20
20
21
- // Note that this code will largely be removed: it is temporary due to the
22
- // temporary lack of multiple devices per sycl context in the Nvidia backend.
23
- // A portable implementation, using a single gpu platform, should be possible
24
- // once the Nvidia context issues are resolved.
25
- // //////////////////////////////////////////////////////////////////////
26
- std::vector<sycl::device> Devs;
27
- for (const auto &plt : sycl::platform::get_platforms ()) {
21
+ auto Devs = platform (gpu_selector_v).get_devices (info::device_type::gpu);
28
22
29
- if (plt.get_backend () == sycl::backend::ext_oneapi_cuda)
30
- Devs.push_back (plt.get_devices ()[0 ]);
31
- }
32
23
if (Devs.size () < 2 ) {
33
24
std::cout << " Cannot test P2P capabilities, at least two devices are "
34
25
" required, exiting."
@@ -51,26 +42,26 @@ int main() {
51
42
// Enables Devs[1] to access Devs[0] memory.
52
43
Devs[1 ].ext_oneapi_enable_peer_access (Devs[0 ]);
53
44
54
- std::vector<double > input (N);
45
+ std::vector<int > input (N);
55
46
std::iota (input.begin (), input.end (), 0 );
56
47
57
- double h_sum = 0 .;
48
+ int h_sum = 0 .;
58
49
for (const auto &value : input) {
59
50
h_sum += value;
60
51
}
61
52
62
- double *d_sum = malloc_shared<double >(1 , Queues[0 ]);
63
- double *d_in = malloc_device<double >(N, Queues[0 ]);
53
+ int *d_sum = malloc_shared<int >(1 , Queues[0 ]);
54
+ int *d_in = malloc_device<int >(N, Queues[0 ]);
64
55
65
- Queues[0 ].memcpy (d_in, &input[0 ], N * sizeof (double ));
56
+ Queues[0 ].memcpy (d_in, &input[0 ], N * sizeof (int ));
66
57
Queues[0 ].wait ();
67
58
68
59
range global_range{N};
69
60
70
61
*d_sum = 0 .;
71
62
Queues[1 ].submit ([&](handler &h) {
72
63
h.parallel_for <class peer_atomic >(global_range, [=](id<1 > i) {
73
- sycl::atomic_ref<double , sycl::memory_order::relaxed,
64
+ sycl::atomic_ref<int , sycl::memory_order::relaxed,
74
65
sycl::memory_scope::system,
75
66
access::address_space::global_space>(*d_sum) += d_in[i];
76
67
});
0 commit comments