This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL] Fix SYCL/Plugin/level_zero_sub_sub_device test case #1515

Merged
merged 4 commits into from Jan 17, 2023
Changes from all commits
142 changes: 47 additions & 95 deletions SYCL/Plugin/level_zero_sub_sub_device.cpp
@@ -1,8 +1,8 @@
 // REQUIRES: gpu-intel-pvc, level_zero

 // RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %level_zero_options %s -o %t.out
-// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER
-// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env ZE_DEBUG=1 env ZEX_NUMBER_OF_CCS=0:4 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER
+// RUN: env ZEX_NUMBER_OF_CCS=0:4 %GPU_RUN_PLACEHOLDER %t.out

 // Check that queues created on sub-sub-devices are going to specific compute
 // engines:
@@ -25,89 +25,62 @@ using namespace std::chrono;
 #define INTER_NUM (150)
 #define KERNEL_NUM (2000)

-void run(std::vector<queue> &queues) {
+void make_queue_and_run_workload(std::vector<device> &subsubdevices) {
+  std::cout << "[important] create " << subsubdevices.size()
+            << " sycl queues, one for each sub-sub device" << std::endl;

   auto N = 1024 * 16;
   size_t global_range = 1024;
   size_t local_range = 16;

-  float *buffer_host0 = malloc_host<float>(N, queues[0]);
-  float *buffer_device0 = malloc_device<float>(N, queues[0]);
-
-  float *buffer_host1 = malloc_host<float>(N, queues[1]);
-  float *buffer_device1 = malloc_device<float>(N, queues[1]);
-
-  float *buffer_host2 = malloc_host<float>(N, queues[2]);
-  float *buffer_device2 = malloc_device<float>(N, queues[2]);
-
-  float *buffer_host3 = malloc_host<float>(N, queues[3]);
-  float *buffer_device3 = malloc_device<float>(N, queues[3]);
-
-  for (int i = 0; i < N; ++i) {
-    buffer_host0[i] = static_cast<float>(random_float());
-    buffer_host1[i] = static_cast<float>(random_float());
-    buffer_host2[i] = static_cast<float>(random_float());
-    buffer_host3[i] = static_cast<float>(random_float());
-  }
-
-  queues[0].memcpy(buffer_device0, buffer_host0, N * sizeof(float)).wait();
-  queues[1].memcpy(buffer_device1, buffer_host1, N * sizeof(float)).wait();
-  queues[2].memcpy(buffer_device2, buffer_host2, N * sizeof(float)).wait();
-  queues[3].memcpy(buffer_device3, buffer_host3, N * sizeof(float)).wait();
-
+  std::vector<queue> queues;
+  std::vector<float *> host_mem_ptrs;
+  std::vector<float *> device_mem_ptrs;
+
+  // Create queues for each subdevice.
+  for (auto &ccs : subsubdevices) {
+    queue q(ccs,
+            {property::queue::enable_profiling(), property::queue::in_order()});
+    auto *host_mem_ptr = malloc_host<float>(N, q);
+    auto *device_mem_ptr = malloc_device<float>(N, q);
+
+    for (int i = 0; i < N; ++i) {
+      host_mem_ptr[i] = static_cast<float>(random_float());
+    }
+
+    q.memcpy(device_mem_ptr, host_mem_ptr, N * sizeof(float)).wait();
+
+    host_mem_ptrs.push_back(host_mem_ptr);
+    device_mem_ptrs.push_back(device_mem_ptr);
+    queues.push_back(q);
+  }
+
+  // Run workload.
   for (auto m = 0; m < INTER_NUM; ++m) {
     for (int k = 0; k < KERNEL_NUM; ++k) {
-      auto event0 = queues[0].submit([&](handler &h) {
-        h.parallel_for<class kernel0>(
-            nd_range<1>(range<1>{global_range}, range<1>{local_range}),
-            [=](nd_item<1> item) {
-              int i = item.get_global_linear_id();
-              buffer_device0[i] = buffer_device0[i] + float(2.0);
-            });
-      });
-      auto event1 = queues[1].submit([&](handler &h) {
-        h.parallel_for<class kernel1>(
-            nd_range<1>(range<1>{global_range}, range<1>{local_range}),
-            [=](nd_item<1> item) {
-              int i = item.get_global_linear_id();
-              buffer_device1[i] = buffer_device1[i] + float(2.0);
-            });
-      });
-      auto event2 = queues[2].submit([&](handler &h) {
-        h.parallel_for<class kernel2>(
-            nd_range<1>(range<1>{global_range}, range<1>{local_range}),
-            [=](nd_item<1> item) {
-              int i = item.get_global_linear_id();
-              buffer_device2[i] = buffer_device2[i] + float(2.0);
-            });
-      });
-      auto event3 = queues[3].submit([&](handler &h) {
-        h.parallel_for<class kernel3>(
-            nd_range<1>(range<1>{global_range}, range<1>{local_range}),
-            [=](nd_item<1> item) {
-              int i = item.get_global_linear_id();
-              buffer_device3[i] = buffer_device3[i] + float(2.0);
-            });
-      });
+      for (int j = 0; j < queues.size(); j++) {
+        queue current_queue = queues[j];
+        float *device_mem_ptr = device_mem_ptrs[j];
+
+        auto event0 = current_queue.parallel_for<>(
+            nd_range<1>(range<1>{global_range}, range<1>{local_range}),
+            [=](nd_item<1> item) {
+              int i = item.get_global_linear_id();
+              device_mem_ptr[i] = device_mem_ptr[i] + float(2.0);
+            });
+      }
     }
   }
-  queues[0].wait();
-  queues[1].wait();
-  queues[2].wait();
-  queues[3].wait();
-
-  free(buffer_host0, queues[0]);
-  free(buffer_device0, queues[0]);
-
-  free(buffer_host1, queues[1]);
-  free(buffer_device1, queues[1]);
-
-  free(buffer_host2, queues[2]);
-  free(buffer_device2, queues[2]);
-
-  free(buffer_host3, queues[3]);
-  free(buffer_device3, queues[3]);
-
-  std::cout << "[info] Finish all" << std::endl;
+
+  for (auto q : queues)
+    q.wait();
+
+  for (int j = 0; j < queues.size(); j++) {
+    sycl::free(device_mem_ptrs[j], queues[j]);
+    sycl::free(host_mem_ptrs[j], queues[j]);
+  }
+
+  std::cout << "[info] Finish running workload" << std::endl;
 }

@@ -116,20 +89,17 @@ int main(void) {
             << std::endl;
   std::vector<device> subsub;

-  auto devices = device::get_devices(info::device_type::gpu);
-  std::cout << "[info] device count = " << devices.size() << std::endl;
+  device d;

-  // watch out device here
-  auto subdevices =
-      devices[1]
-          .create_sub_devices<
-              info::partition_property::partition_by_affinity_domain>(
-              info::partition_affinity_domain::next_partitionable);
+  auto subdevices = d.create_sub_devices<
+      info::partition_property::partition_by_affinity_domain>(
+      info::partition_affinity_domain::next_partitionable);
   std::cout << "[info] sub device size = " << subdevices.size() << std::endl;
   for (auto &subdev : subdevices) {
     auto subsubdevices = subdev.create_sub_devices<
-        info::partition_property::partition_by_affinity_domain>(
-        info::partition_affinity_domain::next_partitionable);
+        info::partition_property::ext_intel_partition_by_cslice>();

     std::cout << "[info] sub-sub device size = " << subsubdevices.size()
               << std::endl;
     for (auto &subsubdev : subsubdevices) {
@@ -139,26 +109,8 @@ int main(void) {

   std::cout << "[info] all sub-sub devices count: " << subsub.size()
             << std::endl;
-  std::cout << "[important] create 4 sycl queues on first 4 sub-sub devices"
-            << std::endl;
-
-  queue q0(subsub[0],
-           {property::queue::enable_profiling(), property::queue::in_order()});
-  queue q1(subsub[1],
-           {property::queue::enable_profiling(), property::queue::in_order()});
-  queue q2(subsub[2],
-           {property::queue::enable_profiling(), property::queue::in_order()});
-  queue q3(subsub[4],
-           {property::queue::enable_profiling(), property::queue::in_order()});
-
-  std::vector<queue> queues;
-
-  queues.push_back(std::move(q0));
-  queues.push_back(std::move(q1));
-  queues.push_back(std::move(q2));
-  queues.push_back(std::move(q3));
-
-  run(queues);
+  make_queue_and_run_workload(subsub);

   return 0;
 }
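
For readers unfamiliar with the partitioning flow the updated test relies on, the sketch below is a minimal, self-contained C++ illustration of the same idea: take a GPU root device, split it into tile sub-devices by affinity domain, split each tile into compute-slice (CCS) sub-sub-devices via the ext_intel_partition_by_cslice extension used in the diff, and create one in-order queue per sub-sub-device. It is not part of the PR; it assumes a DPC++ compiler with the Level Zero backend on a PVC-class GPU with ZEX_NUMBER_OF_CCS=0:4 exported (as in the RUN lines above), and it omits the error handling needed when a device cannot be partitioned.

// partition_sketch.cpp: illustrative only, not from the PR.
#include <sycl/sycl.hpp>
#include <iostream>
#include <vector>

int main() {
  // Root GPU device (the test uses the default-constructed device instead).
  sycl::device root{sycl::gpu_selector_v};
  std::vector<sycl::device> subsub;

  // Tile split: root device -> sub-devices, one per tile.
  auto tiles = root.create_sub_devices<
      sycl::info::partition_property::partition_by_affinity_domain>(
      sycl::info::partition_affinity_domain::next_partitionable);

  // Compute-slice split: each tile -> sub-sub-devices, one per CCS.
  for (auto &tile : tiles) {
    auto cslices = tile.create_sub_devices<
        sycl::info::partition_property::ext_intel_partition_by_cslice>();
    subsub.insert(subsub.end(), cslices.begin(), cslices.end());
  }
  std::cout << "sub-sub devices: " << subsub.size() << std::endl;

  // One in-order queue per sub-sub-device; each submission is expected to
  // land on that device's compute engine.
  std::vector<sycl::queue> queues;
  for (auto &d : subsub)
    queues.emplace_back(d, sycl::property_list{sycl::property::queue::in_order{}});

  // Trivial kernel on every queue, then drain them all.
  for (auto &q : queues)
    q.parallel_for(sycl::range<1>{1024}, [=](sycl::id<1>) { /* no-op */ });
  for (auto &q : queues)
    q.wait();
  return 0;
}

Compared with the old test, which hard-coded four queues (and indexed subsub[4] by mistake), this loop-based shape adapts to however many sub-sub-devices the driver exposes, which is what the ZEX_NUMBER_OF_CCS=0:4 RUN lines control.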