// REQUIRES: gpu-intel-pvc, level_zero
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %level_zero_options %s -o %t.out
- // RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER
- // RUN: %GPU_RUN_PLACEHOLDER %t.out
+ // RUN: env ZE_DEBUG=1 env ZEX_NUMBER_OF_CCS=0:4 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER
+ // RUN: env ZEX_NUMBER_OF_CCS=0:4 %GPU_RUN_PLACEHOLDER %t.out

// Check that queues created on sub-sub-devices are going to specific compute
// engines:
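Note on the new RUN lines: `ZEX_NUMBER_OF_CCS=0:4` asks the Level Zero driver to expose four compute command streamers (CCSs) on root device 0, which is what lets the cslice partitioning below produce multiple sub-sub-devices, and `ZE_DEBUG=1` turns on the plugin debug output that the `%GPU_CHECK_PLACEHOLDER` patterns are matched against. A minimal sketch (not part of the test; the helper name is mine) of how a program could probe for cslice support before partitioning:

```cpp
#include <algorithm>
#include <sycl/sycl.hpp>

// Hypothetical helper: true if `d` can be partitioned into cslices.
// info::device::partition_properties is standard SYCL 2020; the
// ext_intel_partition_by_cslice enumerator comes from the
// sycl_ext_intel_cslice extension in intel/llvm.
bool supports_cslice(const sycl::device &d) {
  auto props = d.get_info<sycl::info::device::partition_properties>();
  return std::find(props.begin(), props.end(),
                   sycl::info::partition_property::
                       ext_intel_partition_by_cslice) != props.end();
}
```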
@@ -25,89 +25,62 @@ using namespace std::chrono;
#define INTER_NUM (150)
#define KERNEL_NUM (2000)

- void run(std::vector<queue> &queues) {
+ void make_queue_and_run_workload(std::vector<device> &subsubdevices) {
+   std::cout << "[important] create " << subsubdevices.size()
+             << " sycl queues, one for each sub-sub device" << std::endl;
+
    auto N = 1024 * 16;
    size_t global_range = 1024;
    size_t local_range = 16;

-   float *buffer_host0 = malloc_host<float>(N, queues[0]);
-   float *buffer_device0 = malloc_device<float>(N, queues[0]);
-
-   float *buffer_host1 = malloc_host<float>(N, queues[1]);
-   float *buffer_device1 = malloc_device<float>(N, queues[1]);
-
-   float *buffer_host2 = malloc_host<float>(N, queues[2]);
-   float *buffer_device2 = malloc_device<float>(N, queues[2]);
+   std::vector<queue> queues;
+   std::vector<float *> host_mem_ptrs;
+   std::vector<float *> device_mem_ptrs;
+
+   // Create one queue per sub-sub-device.
+   for (auto &ccs : subsubdevices) {
+     queue q(ccs,
+             {property::queue::enable_profiling(), property::queue::in_order()});
+     auto *host_mem_ptr = malloc_host<float>(N, q);
+     auto *device_mem_ptr = malloc_device<float>(N, q);
+
+     for (int i = 0; i < N; ++i) {
+       host_mem_ptr[i] = static_cast<float>(random_float());
+     }

-   float *buffer_host3 = malloc_host<float>(N, queues[3]);
-   float *buffer_device3 = malloc_device<float>(N, queues[3]);
+     q.memcpy(device_mem_ptr, host_mem_ptr, N * sizeof(float)).wait();

-   for (int i = 0; i < N; ++i) {
-     buffer_host0[i] = static_cast<float>(random_float());
-     buffer_host1[i] = static_cast<float>(random_float());
-     buffer_host2[i] = static_cast<float>(random_float());
-     buffer_host3[i] = static_cast<float>(random_float());
+     host_mem_ptrs.push_back(host_mem_ptr);
+     device_mem_ptrs.push_back(device_mem_ptr);
+     queues.push_back(q);
    }

-   queues[0].memcpy(buffer_device0, buffer_host0, N * sizeof(float)).wait();
-   queues[1].memcpy(buffer_device1, buffer_host1, N * sizeof(float)).wait();
-   queues[2].memcpy(buffer_device2, buffer_host2, N * sizeof(float)).wait();
-   queues[3].memcpy(buffer_device3, buffer_host3, N * sizeof(float)).wait();
-
+   // Run the workload.
    for (auto m = 0; m < INTER_NUM; ++m) {
      for (int k = 0; k < KERNEL_NUM; ++k) {
-       auto event0 = queues[0].submit([&](handler &h) {
-         h.parallel_for<class kernel0>(
-             nd_range<1>(range<1>{global_range}, range<1>{local_range}),
-             [=](nd_item<1> item) {
-               int i = item.get_global_linear_id();
-               buffer_device0[i] = buffer_device0[i] + float(2.0);
-             });
-       });
-       auto event1 = queues[1].submit([&](handler &h) {
-         h.parallel_for<class kernel1>(
-             nd_range<1>(range<1>{global_range}, range<1>{local_range}),
-             [=](nd_item<1> item) {
-               int i = item.get_global_linear_id();
-               buffer_device1[i] = buffer_device1[i] + float(2.0);
-             });
-       });
-       auto event2 = queues[2].submit([&](handler &h) {
-         h.parallel_for<class kernel2>(
-             nd_range<1>(range<1>{global_range}, range<1>{local_range}),
-             [=](nd_item<1> item) {
-               int i = item.get_global_linear_id();
-               buffer_device2[i] = buffer_device2[i] + float(2.0);
-             });
-       });
-       auto event3 = queues[3].submit([&](handler &h) {
-         h.parallel_for<class kernel3>(
+       for (int j = 0; j < queues.size(); j++) {
+         queue current_queue = queues[j];
+         float *device_mem_ptr = device_mem_ptrs[j];
+
+         auto event0 = current_queue.parallel_for<>(
              nd_range<1>(range<1>{global_range}, range<1>{local_range}),
              [=](nd_item<1> item) {
                int i = item.get_global_linear_id();
-               buffer_device3[i] = buffer_device3[i] + float(2.0);
+               device_mem_ptr[i] = device_mem_ptr[i] + float(2.0);
              });
-       });
+       }
      }
-     queues[0].wait();
-     queues[1].wait();
-     queues[2].wait();
-     queues[3].wait();
-   }
-
-   free(buffer_host0, queues[0]);
-   free(buffer_device0, queues[0]);
-
-   free(buffer_host1, queues[1]);
-   free(buffer_device1, queues[1]);
-
-   free(buffer_host2, queues[2]);
-   free(buffer_device2, queues[2]);
+     for (auto q : queues)
+       q.wait();
+   }

-   free(buffer_host3, queues[3]);
-   free(buffer_device3, queues[3]);
+   for (int j = 0; j < queues.size(); j++) {
+     sycl::free(device_mem_ptrs[j], queues[j]);
+     sycl::free(host_mem_ptrs[j], queues[j]);
+   }

-   std::cout << "[info] Finish all " << std::endl;
+   std::cout << "[info] Finish running workload" << std::endl;
  }

int main(void) {
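For reference, the rewritten helper above boils down to the following per-device pattern; this condensed sketch (my names, single queue, and with a copy-back step the test itself does not perform) shows what each queue does:

```cpp
#include <sycl/sycl.hpp>

// Sketch: one in-order queue on one sub-sub-device, USM host/device
// buffers, a kernel that bumps every element, and a copy back.
void run_one(const sycl::device &dev, size_t n) {
  sycl::queue q{dev, {sycl::property::queue::in_order()}};
  float *host = sycl::malloc_host<float>(n, q);
  float *devm = sycl::malloc_device<float>(n, q);
  for (size_t i = 0; i < n; ++i)
    host[i] = 0.0f;
  q.memcpy(devm, host, n * sizeof(float)).wait();
  // In-order queue: the kernel is ordered after the memcpy.
  q.parallel_for(sycl::range<1>{n},
                 [=](sycl::id<1> i) { devm[i] += 2.0f; });
  q.wait();
  q.memcpy(host, devm, n * sizeof(float)).wait(); // copy back to verify
  sycl::free(devm, q);
  sycl::free(host, q);
}
```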
@@ -116,20 +89,17 @@ int main(void) {
            << std::endl;
  std::vector<device> subsub;

-   auto devices = device::get_devices(info::device_type::gpu);
-   std::cout << "[info] device count = " << devices.size() << std::endl;
+   device d;

  // watch out device here
-   auto subdevices =
-       devices[1]
-           .create_sub_devices<
-               info::partition_property::partition_by_affinity_domain>(
-               info::partition_affinity_domain::next_partitionable);
+   auto subdevices = d.create_sub_devices<
+       info::partition_property::partition_by_affinity_domain>(
+       info::partition_affinity_domain::next_partitionable);
  std::cout << "[info] sub device size = " << subdevices.size() << std::endl;
  for (auto &subdev : subdevices) {
    auto subsubdevices = subdev.create_sub_devices<
-       info::partition_property::partition_by_affinity_domain>(
-       info::partition_affinity_domain::next_partitionable);
+       info::partition_property::ext_intel_partition_by_cslice>();
+
    std::cout << "[info] sub-sub device size = " << subsubdevices.size()
              << std::endl;
    for (auto &subsubdev : subsubdevices) {
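The partitioning is now two-level: `partition_by_affinity_domain` splits the root device into tiles, and `ext_intel_partition_by_cslice` splits each tile into its compute slices (CCSs). If one wanted a sanity check inside this loop, something like the following should hold, assuming the behavior described by the sycl_ext_intel_cslice extension (my addition, not in the test):

```cpp
// Each cslice sub-sub-device should report the cslice partition
// property as its partition type (per the extension's documentation).
assert(subsubdev.get_info<sycl::info::device::partition_type_property>() ==
       sycl::info::partition_property::ext_intel_partition_by_cslice);
```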
@@ -139,26 +109,8 @@ int main(void) {
  std::cout << "[info] all sub-sub devices count: " << subsub.size()
            << std::endl;
-   std::cout << "[important] create 4 sycl queues on first 4 sub-sub devices"
-             << std::endl;
-
-   queue q0(subsub[0],
-            {property::queue::enable_profiling(), property::queue::in_order()});
-   queue q1(subsub[1],
-            {property::queue::enable_profiling(), property::queue::in_order()});
-   queue q2(subsub[2],
-            {property::queue::enable_profiling(), property::queue::in_order()});
-   queue q3(subsub[4],
-            {property::queue::enable_profiling(), property::queue::in_order()});
-
-   std::vector<queue> queues;
-
-   queues.push_back(std::move(q0));
-   queues.push_back(std::move(q1));
-   queues.push_back(std::move(q2));
-   queues.push_back(std::move(q3));

-   run(queues);
+   make_queue_and_run_workload(subsub);

  return 0;
}
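Aside: partitioning down to sub-sub-devices is one way to pin work to a specific CCS. The `sycl_ext_intel_queue_index` extension offers a related mechanism, sketched below under the assumption that the extension is available (it is not used by this test):

```cpp
#include <sycl/sycl.hpp>

// Sketch: create a queue directly on a tile and pin it to compute
// engine index 1 (valid indices depend on the device).
sycl::queue make_pinned_queue(const sycl::device &tile) {
  return sycl::queue{
      tile, sycl::property_list{
                sycl::ext::intel::property::queue::compute_index{1}}};
}
```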