
Commit d4c2a21

uditagarwal97 authored and bb-sycl committed
[SYCL] Fix SYCL/Plugin/level_zero_sub_sub_device test case (intel#1515)
This test case had been failing for a long time. I investigated the failure and made the following changes:
1. Fixed the RUN command by adding the 'ZEX_NUMBER_OF_CCS=0:4' environment variable, which enables splitting a sub-device into four CCS devices.
2. While partitioning a sub-device into CCSes, changed 'partition_by_affinity_domain' to 'partition_by_cslice', in accordance with #7676.
3. Reorganized the test case a bit to make it more "aesthetically appealing".
1 parent 978e8e5 commit d4c2a21
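For context on changes (1) and (2), here is a minimal, hypothetical sketch (not part of the test) of the partitioning scheme the fixed test relies on: the root GPU device is split by affinity domain into tiles, and each tile is then split into cslice sub-sub-devices via the ext_intel_partition_by_cslice property. The driver only exposes multiple cslices per tile when run with ZEX_NUMBER_OF_CCS=0:4 (or a similar setting); this sketch assumes a Level Zero PVC GPU and a DPC++ compiler with the sycl_ext_intel_cslice extension.

// Minimal sketch, assuming a Level Zero PVC GPU and the sycl_ext_intel_cslice
// extension. Run with ZEX_NUMBER_OF_CCS=0:4 so the driver exposes four CCSes
// (cslices) on the first tile.
#include <sycl/sycl.hpp>
#include <iostream>
#include <vector>

int main() {
  sycl::device root{sycl::gpu_selector_v};

  // Split the root device into tiles (sub-devices).
  auto tiles = root.create_sub_devices<
      sycl::info::partition_property::partition_by_affinity_domain>(
      sycl::info::partition_affinity_domain::next_partitionable);

  // Split each tile into cslices (sub-sub-devices) and give each one an
  // in-order queue, mirroring what the test now does.
  std::vector<sycl::queue> queues;
  for (auto &tile : tiles) {
    auto cslices = tile.create_sub_devices<
        sycl::info::partition_property::ext_intel_partition_by_cslice>();
    for (auto &cs : cslices)
      queues.emplace_back(cs,
                          sycl::property_list{sycl::property::queue::in_order()});
  }

  std::cout << "created " << queues.size() << " queues\n";
  return 0;
}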

File tree

1 file changed
+47 -95 lines changed

Lines changed: 47 additions & 95 deletions
@@ -1,8 +1,8 @@
 // REQUIRES: gpu-intel-pvc, level_zero

 // RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %level_zero_options %s -o %t.out
-// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER
-// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env ZE_DEBUG=1 env ZEX_NUMBER_OF_CCS=0:4 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER
+// RUN: env ZEX_NUMBER_OF_CCS=0:4 %GPU_RUN_PLACEHOLDER %t.out

 // Check that queues created on sub-sub-devices are going to specific compute
 // engines:
@@ -25,89 +25,62 @@ using namespace std::chrono;
 #define INTER_NUM (150)
 #define KERNEL_NUM (2000)

-void run(std::vector<queue> &queues) {
+void make_queue_and_run_workload(std::vector<device> &subsubdevices) {
+  std::cout << "[important] create " << subsubdevices.size()
+            << " sycl queues, one for each sub-sub device" << std::endl;
+
   auto N = 1024 * 16;
   size_t global_range = 1024;
   size_t local_range = 16;

-  float *buffer_host0 = malloc_host<float>(N, queues[0]);
-  float *buffer_device0 = malloc_device<float>(N, queues[0]);
-
-  float *buffer_host1 = malloc_host<float>(N, queues[1]);
-  float *buffer_device1 = malloc_device<float>(N, queues[1]);
-
-  float *buffer_host2 = malloc_host<float>(N, queues[2]);
-  float *buffer_device2 = malloc_device<float>(N, queues[2]);
+  std::vector<queue> queues;
+  std::vector<float *> host_mem_ptrs;
+  std::vector<float *> device_mem_ptrs;
+
+  // Create queues for each subdevice.
+  for (auto &ccs : subsubdevices) {
+    queue q(ccs,
+            {property::queue::enable_profiling(), property::queue::in_order()});
+    auto *host_mem_ptr = malloc_host<float>(N, q);
+    auto *device_mem_ptr = malloc_device<float>(N, q);
+
+    for (int i = 0; i < N; ++i) {
+      host_mem_ptr[i] = static_cast<float>(random_float());
+    }

-  float *buffer_host3 = malloc_host<float>(N, queues[3]);
-  float *buffer_device3 = malloc_device<float>(N, queues[3]);
+    q.memcpy(device_mem_ptr, host_mem_ptr, N * sizeof(float)).wait();

-  for (int i = 0; i < N; ++i) {
-    buffer_host0[i] = static_cast<float>(random_float());
-    buffer_host1[i] = static_cast<float>(random_float());
-    buffer_host2[i] = static_cast<float>(random_float());
-    buffer_host3[i] = static_cast<float>(random_float());
+    host_mem_ptrs.push_back(host_mem_ptr);
+    device_mem_ptrs.push_back(device_mem_ptr);
+    queues.push_back(q);
   }

-  queues[0].memcpy(buffer_device0, buffer_host0, N * sizeof(float)).wait();
-  queues[1].memcpy(buffer_device1, buffer_host1, N * sizeof(float)).wait();
-  queues[2].memcpy(buffer_device2, buffer_host2, N * sizeof(float)).wait();
-  queues[3].memcpy(buffer_device3, buffer_host3, N * sizeof(float)).wait();
-
+  // Run workload.
   for (auto m = 0; m < INTER_NUM; ++m) {
     for (int k = 0; k < KERNEL_NUM; ++k) {
-      auto event0 = queues[0].submit([&](handler &h) {
-        h.parallel_for<class kernel0>(
-            nd_range<1>(range<1>{global_range}, range<1>{local_range}),
-            [=](nd_item<1> item) {
-              int i = item.get_global_linear_id();
-              buffer_device0[i] = buffer_device0[i] + float(2.0);
-            });
-      });
-      auto event1 = queues[1].submit([&](handler &h) {
-        h.parallel_for<class kernel1>(
-            nd_range<1>(range<1>{global_range}, range<1>{local_range}),
-            [=](nd_item<1> item) {
-              int i = item.get_global_linear_id();
-              buffer_device1[i] = buffer_device1[i] + float(2.0);
-            });
-      });
-      auto event2 = queues[2].submit([&](handler &h) {
-        h.parallel_for<class kernel2>(
-            nd_range<1>(range<1>{global_range}, range<1>{local_range}),
-            [=](nd_item<1> item) {
-              int i = item.get_global_linear_id();
-              buffer_device2[i] = buffer_device2[i] + float(2.0);
-            });
-      });
-      auto event3 = queues[3].submit([&](handler &h) {
-        h.parallel_for<class kernel3>(
+      for (int j = 0; j < queues.size(); j++) {
+        queue current_queue = queues[j];
+        float *device_mem_ptr = device_mem_ptrs[j];
+
+        auto event0 = current_queue.parallel_for<>(
             nd_range<1>(range<1>{global_range}, range<1>{local_range}),
             [=](nd_item<1> item) {
               int i = item.get_global_linear_id();
-              buffer_device3[i] = buffer_device3[i] + float(2.0);
+              device_mem_ptr[i] = device_mem_ptr[i] + float(2.0);
             });
-      });
+      }
     }
-    queues[0].wait();
-    queues[1].wait();
-    queues[2].wait();
-    queues[3].wait();
-  }
-
-  free(buffer_host0, queues[0]);
-  free(buffer_device0, queues[0]);

-  free(buffer_host1, queues[1]);
-  free(buffer_device1, queues[1]);
-
-  free(buffer_host2, queues[2]);
-  free(buffer_device2, queues[2]);
+    for (auto q : queues)
+      q.wait();
+  }

-  free(buffer_host3, queues[3]);
-  free(buffer_device3, queues[3]);
+  for (int j = 0; j < queues.size(); j++) {
+    sycl::free(device_mem_ptrs[j], queues[j]);
+    sycl::free(host_mem_ptrs[j], queues[j]);
+  }

-  std::cout << "[info] Finish all" << std::endl;
+  std::cout << "[info] Finish running workload" << std::endl;
 }

 int main(void) {
@@ -116,20 +89,17 @@ int main(void) {
             << std::endl;
   std::vector<device> subsub;

-  auto devices = device::get_devices(info::device_type::gpu);
-  std::cout << "[info] device count = " << devices.size() << std::endl;
+  device d;

   // watch out device here
-  auto subdevices =
-      devices[1]
-          .create_sub_devices<
-              info::partition_property::partition_by_affinity_domain>(
-              info::partition_affinity_domain::next_partitionable);
+  auto subdevices = d.create_sub_devices<
+      info::partition_property::partition_by_affinity_domain>(
+      info::partition_affinity_domain::next_partitionable);
   std::cout << "[info] sub device size = " << subdevices.size() << std::endl;
   for (auto &subdev : subdevices) {
     auto subsubdevices = subdev.create_sub_devices<
-        info::partition_property::partition_by_affinity_domain>(
-        info::partition_affinity_domain::next_partitionable);
+        info::partition_property::ext_intel_partition_by_cslice>();
+
     std::cout << "[info] sub-sub device size = " << subsubdevices.size()
               << std::endl;
     for (auto &subsubdev : subsubdevices) {
@@ -139,26 +109,8 @@ int main(void) {

   std::cout << "[info] all sub-sub devices count: " << subsub.size()
             << std::endl;
-  std::cout << "[important] create 4 sycl queues on first 4 sub-sub devices"
-            << std::endl;
-
-  queue q0(subsub[0],
-           {property::queue::enable_profiling(), property::queue::in_order()});
-  queue q1(subsub[1],
-           {property::queue::enable_profiling(), property::queue::in_order()});
-  queue q2(subsub[2],
-           {property::queue::enable_profiling(), property::queue::in_order()});
-  queue q3(subsub[4],
-           {property::queue::enable_profiling(), property::queue::in_order()});
-
-  std::vector<queue> queues;
-
-  queues.push_back(std::move(q0));
-  queues.push_back(std::move(q1));
-  queues.push_back(std::move(q2));
-  queues.push_back(std::move(q3));

-  run(queues);
+  make_queue_and_run_workload(subsub);

   return 0;
 }
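One practical note on the cslice API used in the last hunk (an observation, not something the test does): whether ext_intel_partition_by_cslice is available can be queried from the device's partition_properties before calling create_sub_devices, so a host program can fall back gracefully when the driver was not launched with ZEX_NUMBER_OF_CCS. A hedged sketch; the helper name supports_cslice_partitioning is hypothetical:

// Hypothetical helper: true if `dev` lists ext_intel_partition_by_cslice
// among its supported partition properties.
#include <sycl/sycl.hpp>
#include <algorithm>

bool supports_cslice_partitioning(const sycl::device &dev) {
  auto props = dev.get_info<sycl::info::device::partition_properties>();
  return std::find(
             props.begin(), props.end(),
             sycl::info::partition_property::ext_intel_partition_by_cslice) !=
         props.end();
}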
