Release GIL around blocking operations in libtensor

oleksandr-pavlyk · oleksandr-pavlyk · commit 7d5c06b8491b · 2024-07-29T06:56:00.000-05:00
Copying from a NumPy ndarray to a usm_ndarray is blocking,
so release the GIL.

The mask-positions (cumulative value) computation, which returns
a total, is blocking, so release the GIL.
diff --git a/dpctl/tensor/libtensor/source/accumulators.cpp b/dpctl/tensor/libtensor/source/accumulators.cpp
@@ -160,10 +160,14 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask,
                       ? mask_positions_contig_i32_dispatch_vector[mask_typeid]
                       : mask_positions_contig_i64_dispatch_vector[mask_typeid];
 
-        size_t total_set = fn(exec_q, mask_size, mask_data, cumsum_data,
-                              host_task_events, depends);
+        size_t total_set{};
+
         {
             py::gil_scoped_release release;
+
+            total_set = fn(exec_q, mask_size, mask_data, cumsum_data,
+                           host_task_events, depends);
+
             sycl::event::wait(host_task_events);
         }
         return total_set;
@@ -198,12 +202,13 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask,
     sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple);
 
     if (2 * static_cast<size_t>(nd) != std::get<1>(ptr_size_event_tuple)) {
-        copy_shape_ev.wait();
         {
             py::gil_scoped_release release;
+
+            copy_shape_ev.wait();
             sycl::event::wait(host_task_events);
+            sycl::free(shape_strides, exec_q);
         }
-        sycl::free(shape_strides, exec_q);
         throw std::runtime_error("Unexpected error");
     }
 
@@ -213,15 +218,17 @@ size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask,
     dependent_events.insert(dependent_events.end(), depends.begin(),
                             depends.end());
 
-    size_t total_set =
-        strided_fn(exec_q, mask_size, mask_data, nd, shape_strides, cumsum_data,
-                   host_task_events, dependent_events);
+    size_t total_set;
 
     {
         py::gil_scoped_release release;
+
+        total_set = strided_fn(exec_q, mask_size, mask_data, nd, shape_strides,
+                               cumsum_data, host_task_events, dependent_events);
+
         sycl::event::wait(host_task_events);
+        sycl::free(shape_strides, exec_q);
     }
-    sycl::free(shape_strides, exec_q);
 
     return total_set;
 }
@@ -352,8 +359,12 @@ size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src,
     sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple);
 
     if (2 * static_cast<size_t>(nd) != std::get<1>(ptr_size_event_tuple)) {
-        copy_shape_ev.wait();
-        sycl::event::wait(host_task_events);
+        {
+            py::gil_scoped_release release;
+
+            copy_shape_ev.wait();
+            sycl::event::wait(host_task_events);
+        }
         sycl::free(shape_strides, exec_q);
         throw std::runtime_error("Unexpected error");
     }
diff --git a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
@@ -116,21 +116,29 @@ void copy_numpy_ndarray_into_usm_ndarray(
 
     // check for applicability of special cases:
     //      (same type && (both C-contiguous || both F-contiguous)
-    bool both_c_contig =
+    const bool both_c_contig =
         ((src_flags & py::array::c_style) && dst.is_c_contiguous());
-    bool both_f_contig =
+    const bool both_f_contig =
         ((src_flags & py::array::f_style) && dst.is_f_contiguous());
+
+    const bool same_data_types = (src_type_id == dst_type_id);
+
     if (both_c_contig || both_f_contig) {
-        if (src_type_id == dst_type_id) {
+        if (same_data_types) {
             int src_elem_size = npy_src.itemsize();
 
             sycl::event copy_ev =
                 exec_q.memcpy(static_cast<void *>(dst_data),
                               static_cast<const void *>(src_data),
                               src_nelems * src_elem_size, depends);
 
-            // wait for copy_ev to complete
-            copy_ev.wait();
+            {
+                // wait for copy_ev to complete
+                // release GIL to allow other threads (host_tasks)
+                // a chance to acquire GIL
+                py::gil_scoped_release lock{};
+                copy_ev.wait();
+            }
 
             return;
         }
@@ -202,6 +210,30 @@ void copy_numpy_ndarray_into_usm_ndarray(
         simplified_dst_strides.push_back(1);
     }
 
+    const bool can_use_memcpy =
+        (same_data_types && (nd == 1) && (src_offset == 0) &&
+         (dst_offset == 0) && (simplified_src_strides[0] == 1) &&
+         (simplified_dst_strides[0] == 1));
+
+    if (can_use_memcpy) {
+        int src_elem_size = npy_src.itemsize();
+
+        sycl::event copy_ev = exec_q.memcpy(
+            static_cast<void *>(dst_data), static_cast<const void *>(src_data),
+            src_nelems * src_elem_size, depends);
+
+        {
+            // wait for copy_ev to complete
+            // release GIL to allow other threads (host_tasks)
+            // a chance to acquire GIL
+            py::gil_scoped_release lock{};
+
+            copy_ev.wait();
+        }
+
+        return;
+    }
+
     // Minimum and maximum element offsets for source np.ndarray
     py::ssize_t npy_src_min_nelem_offset(src_offset);
     py::ssize_t npy_src_max_nelem_offset(src_offset);
@@ -230,17 +262,22 @@ void copy_numpy_ndarray_into_usm_ndarray(
     }
     const sycl::event &copy_shape_ev = std::get<2>(ptr_size_event_tuple);
 
-    // Get implementation function pointer
-    auto copy_and_cast_from_host_blocking_fn =
-        copy_and_cast_from_host_blocking_dispatch_table[dst_type_id]
-                                                       [src_type_id];
+    {
+        // release GIL for the blocking call
+        py::gil_scoped_release lock{};
 
-    copy_and_cast_from_host_blocking_fn(
-        exec_q, src_nelems, nd, shape_strides, src_data, src_offset,
-        npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data,
-        dst_offset, depends, {copy_shape_ev});
+        // Get implementation function pointer
+        auto copy_and_cast_from_host_blocking_fn =
+            copy_and_cast_from_host_blocking_dispatch_table[dst_type_id]
+                                                           [src_type_id];
 
-    sycl::free(shape_strides, exec_q);
+        copy_and_cast_from_host_blocking_fn(
+            exec_q, src_nelems, nd, shape_strides, src_data, src_offset,
+            npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data,
+            dst_offset, depends, {copy_shape_ev});
+
+        sycl::free(shape_strides, exec_q);
+    }
 
     return;
 }