Implement heevd_batch via heevd call

vlad-perevezentsev · vlad-perevezentsev · commit e96454ef539a · 2024-06-03T21:21:31.000+02:00
diff --git a/dpnp/backend/extensions/lapack/heevd.cpp b/dpnp/backend/extensions/lapack/heevd.cpp
@@ -228,6 +228,154 @@ std::pair<sycl::event, sycl::event>
     return std::make_pair(args_ev, heevd_ev);
 }
 
+/**
+ * @brief Computes eigenvalues and, optionally, eigenvectors of a batch of
+ * complex Hermitian matrices by calling `heevd` once per matrix.
+ *
+ * @param exec_q      Queue passed to every `heevd` call.
+ * @param jobz        Value converted to `oneapi::mkl::job`.
+ * @param upper_lower Value converted to `oneapi::mkl::uplo`.
+ * @param eig_vecs    C-contiguous 3D array of shape (batch_size, n, n); its
+ *                    data are passed to `heevd` one (n, n) slice at a time.
+ * @param eig_vals    C-contiguous 2D array of shape (batch_size, n); each row
+ *                    is passed to `heevd` along with the matching slice.
+ * @param depends     Events passed as dependencies to every `heevd` call.
+ *
+ * @return Pair of events: the first one is returned by
+ *         `dpctl::utils::keep_args_alive` for `eig_vecs` and `eig_vals`, the
+ *         second one completes once all `heevd` calls of the batch have
+ *         completed. Both are default-constructed events when `eig_vecs` has
+ *         no elements.
+ *
+ * @throws py::value_error if the arrays have an unexpected number of
+ *         dimensions or mismatching shapes, are not compatible with `exec_q`,
+ *         overlap in memory, are not C-contiguous, or have no `heevd`
+ *         implementation for their pair of data types.
+ */
+std::pair<sycl::event, sycl::event>
+    heevd_batch(sycl::queue exec_q,
+                const std::int8_t jobz,
+                const std::int8_t upper_lower,
+                dpctl::tensor::usm_ndarray eig_vecs,
+                dpctl::tensor::usm_ndarray eig_vals,
+                const std::vector<sycl::event> &depends)
+{
+    const int eig_vecs_nd = eig_vecs.get_ndim();
+    const int eig_vals_nd = eig_vals.get_ndim();
+
+    if (eig_vecs_nd != 3) {
+        throw py::value_error("Unexpected ndim=" + std::to_string(eig_vecs_nd) +
+                              " of an output array with eigenvectors");
+    }
+    else if (eig_vals_nd != 2) {
+        throw py::value_error("Unexpected ndim=" + std::to_string(eig_vals_nd) +
+                              " of an output array with eigenvalues");
+    }
+
+    const py::ssize_t *eig_vecs_shape = eig_vecs.get_shape_raw();
+    const py::ssize_t *eig_vals_shape = eig_vals.get_shape_raw();
+
+    if (eig_vecs_shape[1] != eig_vecs_shape[2]) {
+        throw py::value_error(
+            "The last two dimensions of 'eig_vecs' must be the same.");
+    }
+    else if (eig_vecs_shape[0] != eig_vals_shape[0] ||
+             eig_vecs_shape[1] != eig_vals_shape[1])
+    {
+        throw py::value_error(
+            "The shape of 'eig_vals' must be (batch_size, n), "
+            "where batch_size = " +
+            std::to_string(eig_vecs_shape[0]) +
+            " and n = " + std::to_string(eig_vecs_shape[1]));
+    }
+
+    size_t src_nelems(1);
+
+    for (int i = 0; i < eig_vecs_nd; ++i) {
+        src_nelems *= static_cast<size_t>(eig_vecs_shape[i]);
+    }
+
+    if (src_nelems == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(exec_q, {eig_vecs, eig_vals})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(eig_vecs, eig_vals)) {
+        throw py::value_error("Arrays with eigenvectors and eigenvalues are "
+                              "overlapping segments of memory");
+    }
+
+    if (!eig_vecs.is_c_contiguous()) {
+        throw py::value_error(
+            "An array with input matrix / output eigenvectors "
+            "must be C-contiguous");
+    }
+    else if (!eig_vals.is_c_contiguous()) {
+        throw py::value_error(
+            "An array with output eigenvalues must be C-contiguous");
+    }
+
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    const int eig_vecs_type_id =
+        array_types.typenum_to_lookup_id(eig_vecs.get_typenum());
+    const int eig_vals_type_id =
+        array_types.typenum_to_lookup_id(eig_vals.get_typenum());
+
+    heevd_impl_fn_ptr_t heevd_fn =
+        heevd_dispatch_table[eig_vecs_type_id][eig_vals_type_id];
+    if (heevd_fn == nullptr) {
+        throw py::value_error("No heevd implementation defined for a pair of "
+                              "type for eigenvectors and eigenvalues");
+    }
+
+    char *eig_vecs_data = eig_vecs.get_data();
+    char *eig_vals_data = eig_vals.get_data();
+
+    const std::int64_t batch_size = eig_vecs_shape[0];
+    const std::int64_t n = eig_vecs_shape[1];
+
+    // distance in bytes between two consecutive matrices of `eig_vecs` and
+    // between two consecutive rows of `eig_vals` (both are C-contiguous)
+    const std::int64_t vecs_stride = n * n * eig_vecs.get_elemsize();
+    const std::int64_t vals_stride = n * eig_vals.get_elemsize();
+
+    const oneapi::mkl::job jobz_val = static_cast<oneapi::mkl::job>(jobz);
+    const oneapi::mkl::uplo uplo_val =
+        static_cast<oneapi::mkl::uplo>(upper_lower);
+
+    std::vector<sycl::event> host_task_events;
+    std::vector<sycl::event> heevd_task_events;
+    heevd_task_events.reserve(batch_size);
+
+    for (std::int64_t i = 0; i < batch_size; ++i) {
+        char *eig_vecs_batch = eig_vecs_data + i * vecs_stride;
+        char *eig_vals_batch = eig_vals_data + i * vals_stride;
+
+        // keep the event of every call, they all have to complete before
+        // the results for the whole batch are ready
+        heevd_task_events.push_back(
+            heevd_fn(exec_q, jobz_val, uplo_val, n, eig_vecs_batch,
+                     eig_vals_batch, host_task_events, depends));
+    }
+
+    // merge the events of all `heevd` calls into a single one, so the caller
+    // gets an event representing the computation like `heevd` returns
+    sycl::event combine_ev = exec_q.submit(
+        [&](sycl::handler &cgh) { cgh.depends_on(heevd_task_events); });
+
+    sycl::event args_ev = dpctl::utils::keep_args_alive(
+        exec_q, {eig_vecs, eig_vals}, host_task_events);
+
+    return std::make_pair(args_ev, combine_ev);
+}
+
 template <typename fnT, typename T, typename RealT>
 struct HeevdContigFactory
 {
diff --git a/dpnp/backend/extensions/lapack/heevd.hpp b/dpnp/backend/extensions/lapack/heevd.hpp
@@ -46,6 +46,14 @@ extern std::pair<sycl::event, sycl::event>
           dpctl::tensor::usm_ndarray eig_vals,
           const std::vector<sycl::event> &depends = {});
 
+extern std::pair<sycl::event, sycl::event> // heevd for each matrix of a batch
+    heevd_batch(sycl::queue exec_q,
+                const std::int8_t jobz,        // converted to oneapi::mkl::job
+                const std::int8_t upper_lower, // converted to oneapi::mkl::uplo
+                dpctl::tensor::usm_ndarray eig_vecs, // 3D, (batch_size, n, n)
+                dpctl::tensor::usm_ndarray eig_vals, // 2D, (batch_size, n)
+                const std::vector<sycl::event> &depends = {});
+
 extern void init_heevd_dispatch_table(void);
 } // namespace lapack
 } // namespace ext
diff --git a/dpnp/backend/extensions/lapack/lapack_py.cpp b/dpnp/backend/extensions/lapack/lapack_py.cpp
@@ -146,6 +146,14 @@ PYBIND11_MODULE(_lapack_impl, m)
           py::arg("eig_vecs"), py::arg("eig_vals"),
           py::arg("depends") = py::list());
 
+    m.def("_heevd_batch", &lapack_ext::heevd_batch,
+          "Call `heevd` from OneMKL LAPACK library in a loop to return "
+          "the eigenvalues and eigenvectors of a batch of complex Hermitian "
+          "matrices",
+          py::arg("sycl_queue"), py::arg("jobz"), py::arg("upper_lower"),
+          py::arg("eig_vecs"), py::arg("eig_vals"),
+          py::arg("depends") = py::list());
+
     m.def("_orgqr_batch", &lapack_ext::orgqr_batch,
           "Call `_orgqr_batch` from OneMKL LAPACK library to return "
           "the real orthogonal matrix Qi of the QR factorization "
diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
@@ -103,93 +103,26 @@ def _batched_eigh(a, UPLO, eigen_mode, w_type, v_type):
     jobz = _jobz[eigen_mode]
     uplo = _upper_lower[UPLO]
 
-    # Get LAPACK function (_syevd for real or _heevd for complex data types)
+    # Get LAPACK function (_syevd_batch for real or _heevd_batch
+    # for complex data types)
     # to compute all eigenvalues and, optionally, all eigenvectors
     lapack_func = (
-        "_heevd" if dpnp.issubdtype(v_type, dpnp.complexfloating) else "_syevd"
+        "_heevd_batch"
+        if dpnp.issubdtype(v_type, dpnp.complexfloating)
+        else "_syevd_batch"
     )
 
     a_sycl_queue = a.sycl_queue
-
-    new = True
-
-    if not new or lapack_func == "_heevd":
-        is_cpu_device = a.sycl_device.has_aspect_cpu
-        orig_shape = a.shape
-        # get 3d input array by reshape
-        a = a.reshape(-1, orig_shape[-2], orig_shape[-1])
-        a_usm_arr = dpnp.get_usm_ndarray(a)
-
-        # allocate a memory for dpnp array of eigenvalues
-        w = dpnp.empty_like(
-            a,
-            shape=orig_shape[:-1],
-            dtype=w_type,
-        )
-        w_orig_shape = w.shape
-        # get 2d dpnp array with eigenvalues by reshape
-        w = w.reshape(-1, w_orig_shape[-1])
-
-        a_order = "C" if a.flags.c_contiguous else "F"
-
-        # need to loop over the 1st dimension to get eigenvalues and
-        # eigenvectors of 3d matrix A
-        batch_size = a.shape[0]
-        eig_vecs = [None] * batch_size
-        ht_list_ev = [None] * batch_size * 2
-        for i in range(batch_size):
-            # oneMKL LAPACK assumes fortran-like array as input, so
-            # allocate a memory with 'F' order for dpnp array of eigenvectors
-            eig_vecs[i] = dpnp.empty_like(a[i], order="F", dtype=v_type)
-
-            # use DPCTL tensor function to fill the array of eigenvectors with
-            # content of input array
-            ht_list_ev[2 * i], copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=a_usm_arr[i],
-                dst=eig_vecs[i].get_array(),
-                sycl_queue=a_sycl_queue,
-            )
-
-            # TODO: Remove this w/a when MKLD-17201 is solved.
-            # Waiting for a host task executing an OneMKL LAPACK syevd/heevd
-            # call on CPU causes deadlock due to serialization of all host tasks
-            # in the queue.
-            # We need to wait for each host tasks before calling _seyvd and
-            # _heevd to avoid deadlock.
-            if is_cpu_device:
-                ht_list_ev[2 * i].wait()
-
-            # call LAPACK extension function to get eigenvalues and
-            # eigenvectors of a portion of matrix A
-            ht_list_ev[2 * i + 1], _ = getattr(li, lapack_func)(
-                a_sycl_queue,
-                jobz,
-                uplo,
-                eig_vecs[i].get_array(),
-                w[i].get_array(),
-                depends=[copy_ev],
-            )
-
-        dpctl.SyclEvent.wait_for(ht_list_ev)
-
-        w = w.reshape(w_orig_shape)
-
-        if eigen_mode == "V":
-            # combine the list of eigenvectors into a single array
-            v = dpnp.array(eig_vecs, order=a_order).reshape(orig_shape)
-            return w, v
-        return w
-
     a_orig_shape = a.shape
     # get 3d input array by reshape
     a = a.reshape(-1, a_orig_shape[-2], a_orig_shape[-1])
 
-    # oneMKL LAPACK syevd overwrites `a` and
-    # assumes fortran-like array as input.
-    # To use C-contiguous arrays, we transpose the last two dimensions
-    # before passing to syevd.
-    # This transposition is effective because each batch
-    # in the input array `a` is square.
+    # oneMKL LAPACK syevd/heevd overwrites `a` and assumes a Fortran-like
+    # (column-major) array as input.
+    # To use C-contiguous arrays, we transpose the last two dimensions before
+    # passing to syevd/heevd.
+    # This works because each matrix in the batched input array `a` is square,
+    # so the transposed matrix has the same shape as the original one.
     a = a.transpose((0, 2, 1))
     a_usm_arr = dpnp.get_usm_ndarray(a)
 
@@ -212,7 +145,7 @@ def _batched_eigh(a, UPLO, eigen_mode, w_type, v_type):
     # get 2d dpnp array with eigenvalues by reshape
     w = w.reshape(-1, w_orig_shape[-1])
 
-    ht_ev, _ = li._syevd_batch(
+    ht_ev, _ = getattr(li, lapack_func)(
         a_sycl_queue,
         jobz,
         uplo,