@@ -116,21 +116,29 @@ void copy_numpy_ndarray_into_usm_ndarray(
116
116
117
117
// check for applicability of special cases:
118
118
// (same type && (both C-contiguous || both F-contiguous)
119
- bool both_c_contig =
119
+ const bool both_c_contig =
120
120
((src_flags & py::array::c_style) && dst.is_c_contiguous ());
121
- bool both_f_contig =
121
+ const bool both_f_contig =
122
122
((src_flags & py::array::f_style) && dst.is_f_contiguous ());
123
+
124
+ const bool same_data_types = (src_type_id == dst_type_id);
125
+
123
126
if (both_c_contig || both_f_contig) {
124
- if (src_type_id == dst_type_id ) {
127
+ if (same_data_types ) {
125
128
int src_elem_size = npy_src.itemsize ();
126
129
127
130
sycl::event copy_ev =
128
131
exec_q.memcpy (static_cast <void *>(dst_data),
129
132
static_cast <const void *>(src_data),
130
133
src_nelems * src_elem_size, depends);
131
134
132
- // wait for copy_ev to complete
133
- copy_ev.wait ();
135
+ {
136
+ // wait for copy_ev to complete
137
+ // release GIL to allow other threads (host_tasks)
138
+ // a chance to acquire GIL
139
+ py::gil_scoped_release lock{};
140
+ copy_ev.wait ();
141
+ }
134
142
135
143
return ;
136
144
}
@@ -202,6 +210,30 @@ void copy_numpy_ndarray_into_usm_ndarray(
202
210
simplified_dst_strides.push_back (1 );
203
211
}
204
212
213
+ const bool can_use_memcpy =
214
+ (same_data_types && (nd == 1 ) && (src_offset == 0 ) &&
215
+ (dst_offset == 0 ) && (simplified_src_strides[0 ] == 1 ) &&
216
+ (simplified_dst_strides[0 ] == 1 ));
217
+
218
+ if (can_use_memcpy) {
219
+ int src_elem_size = npy_src.itemsize ();
220
+
221
+ sycl::event copy_ev = exec_q.memcpy (
222
+ static_cast <void *>(dst_data), static_cast <const void *>(src_data),
223
+ src_nelems * src_elem_size, depends);
224
+
225
+ {
226
+ // wait for copy_ev to complete
227
+ // release GIL to allow other threads (host_tasks)
228
+ // a chance to acquire GIL
229
+ py::gil_scoped_release lock{};
230
+
231
+ copy_ev.wait_and_throw ();
232
+ }
233
+
234
+ return ;
235
+ }
236
+
205
237
// Minimum and maximum element offsets for source np.ndarray
206
238
py::ssize_t npy_src_min_nelem_offset (src_offset);
207
239
py::ssize_t npy_src_max_nelem_offset (src_offset);
@@ -230,17 +262,22 @@ void copy_numpy_ndarray_into_usm_ndarray(
230
262
}
231
263
const sycl::event &copy_shape_ev = std::get<2 >(ptr_size_event_tuple);
232
264
233
- // Get implementation function pointer
234
- auto copy_and_cast_from_host_blocking_fn =
235
- copy_and_cast_from_host_blocking_dispatch_table[dst_type_id]
236
- [src_type_id];
265
+ {
266
+ // release GIL for the blocking call
267
+ py::gil_scoped_release lock{};
268
+
269
+ // Get implementation function pointer
270
+ auto copy_and_cast_from_host_blocking_fn =
271
+ copy_and_cast_from_host_blocking_dispatch_table[dst_type_id]
272
+ [src_type_id];
237
273
238
- copy_and_cast_from_host_blocking_fn (
239
- exec_q, src_nelems, nd, shape_strides, src_data, src_offset,
240
- npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data,
241
- dst_offset, depends, {copy_shape_ev});
274
+ copy_and_cast_from_host_blocking_fn (
275
+ exec_q, src_nelems, nd, shape_strides, src_data, src_offset,
276
+ npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data,
277
+ dst_offset, depends, {copy_shape_ev});
242
278
243
- sycl::free (shape_strides, exec_q);
279
+ sycl::free (shape_strides, exec_q);
280
+ }
244
281
245
282
return ;
246
283
}
0 commit comments