Removed extra copy for transpose arrays in dot() (#1477)

antonwolfy · web-flow · commit 326c45192bfa · 2023-07-17T10:19:03.000+02:00
* Removed extra copy for strided arrays in dot()

* Added support of strided arrays

* Added support of strided out array

* Fix handling of 1d and 2d arrays
diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
@@ -22,6 +22,7 @@ env:
       test_special.py
       test_umath.py
       test_usm_type.py
+      third_party/cupy/linalg_tests/test_product.py
       third_party/cupy/math_tests/test_explog.py
       third_party/cupy/math_tests/test_misc.py
       third_party/cupy/math_tests/test_trigonometric.py
diff --git a/dpnp/backend/kernels/dpnp_krnl_common.cpp b/dpnp/backend/kernels/dpnp_krnl_common.cpp
@@ -34,6 +34,7 @@
 #include <dpnp_iface.hpp>
 
 namespace mkl_blas = oneapi::mkl::blas;
+namespace mkl_blas_cm = oneapi::mkl::blas::column_major;
 namespace mkl_blas_rm = oneapi::mkl::blas::row_major;
 namespace mkl_lapack = oneapi::mkl::lapack;
 
@@ -227,12 +228,10 @@ DPCTLSyclEventRef dpnp_dot_c(DPCTLSyclQueueRef q_ref,
     DPCTLSyclEventRef event_ref = nullptr;
     sycl::queue q = *(reinterpret_cast<sycl::queue *>(q_ref));
 
-    DPNPC_ptr_adapter<_DataType_input1> input1_ptr(q_ref, input1_in,
-                                                   input1_size);
-    DPNPC_ptr_adapter<_DataType_input2> input2_ptr(q_ref, input2_in,
-                                                   input2_size);
-    _DataType_input1 *input1 = input1_ptr.get_ptr();
-    _DataType_input2 *input2 = input2_ptr.get_ptr();
+    _DataType_input1 *input1 =
+        static_cast<_DataType_input1 *>(const_cast<void *>(input1_in));
+    _DataType_input2 *input2 =
+        static_cast<_DataType_input2 *>(const_cast<void *>(input2_in));
     _DataType_output *result = reinterpret_cast<_DataType_output *>(result_out);
 
     if (!input1_size || !input2_size) {
@@ -257,10 +256,12 @@ DPCTLSyclEventRef dpnp_dot_c(DPCTLSyclQueueRef q_ref,
     // if both arrays are vectors
     if ((input1_ndim == 1) && (input2_ndim == 1)) {
         assert(input1_size == input2_size);
+
         sycl::event event = dot(q, result, input1, input2, input1_strides[0],
                                 input2_strides[0], input1_size);
-        event.wait();
-        return event_ref;
+
+        event_ref = reinterpret_cast<DPCTLSyclEventRef>(&event);
+        return DPCTLEvent_Copy(event_ref);
     }
 
     // 1D vector
@@ -297,13 +298,17 @@ DPCTLSyclEventRef dpnp_dot_c(DPCTLSyclQueueRef q_ref,
     size_t ext_result_ndim =
         ((input1_ndim == 1) || (input2_ndim == 1)) ? 2 : result_ndim;
     shape_elem_type *ext_result_shape = new shape_elem_type[ext_result_ndim];
+    shape_elem_type *ext_result_strides = new shape_elem_type[ext_result_ndim];
     if ((input1_ndim == 1) || (input2_ndim == 1)) {
         ext_result_shape[0] = ext_input1_shape[0];
         ext_result_shape[1] = ext_input2_shape[1];
+        ext_result_strides[0] = 0;
+        ext_result_strides[1] = result_strides[0];
     }
     else {
         for (size_t i = 0; i < ext_result_ndim; ++i) {
             ext_result_shape[i] = result_shape[i];
+            ext_result_strides[i] = result_strides[i];
         }
     }
 
@@ -316,80 +321,89 @@ DPCTLSyclEventRef dpnp_dot_c(DPCTLSyclQueueRef q_ref,
         // check if GEMM can be executed (strides)
         // TODO: rewrite the condition in general case for ndims > 2
         // (looks like there are such another cases)
-
         if (ext_input1_ndim == 2 && ext_input2_ndim == 2) {
-// there is a difference of behavior with trans and sizes params in previous
-// version of GEMM only new version is supported, in case of old version
-// computation goes in common way
-#if INTEL_MKL_VERSION >= 20210004
-            // is mat1 F-contiguous, C-contiguous
-            bool mat1_f_contig =
-                (((ext_input1_shape[0] == 1) || (ext_input1_strides[0] == 1)) &&
-                 ((ext_input1_shape[1] == 1) ||
-                  (ext_input1_strides[1] == ext_input1_shape[0])));
-            bool mat1_c_contig =
-                (((ext_input1_shape[1] == 1) || (ext_input1_strides[1] == 1)) &&
-                 ((ext_input1_shape[0] == 1) ||
-                  (ext_input1_strides[0] == ext_input1_shape[1])));
-            // is mat2 F-contiguous, C-contiguous
-            bool mat2_f_contig =
-                (((ext_input2_shape[0] == 1) || (ext_input2_strides[0] == 1)) &&
-                 ((ext_input2_shape[1] == 1) ||
-                  (ext_input2_strides[1] == ext_input2_shape[0])));
-            bool mat2_c_contig =
-                (((ext_input2_shape[1] == 1) || (ext_input2_strides[1] == 1)) &&
-                 ((ext_input2_shape[0] == 1) ||
-                  (ext_input2_strides[0] == ext_input2_shape[1])));
-
-            if ((mat1_f_contig || mat1_c_contig) &&
-                (mat2_f_contig || mat2_c_contig)) {
-                oneapi::mkl::transpose trans1 =
-                    (mat1_f_contig && !mat1_c_contig)
-                        ? oneapi::mkl::transpose::trans
-                        : oneapi::mkl::transpose::nontrans;
-                oneapi::mkl::transpose trans2 =
-                    (mat2_f_contig && !mat2_c_contig)
-                        ? oneapi::mkl::transpose::trans
-                        : oneapi::mkl::transpose::nontrans;
+            // OneMKL gemm suports only arrays contiguous on inner dimension,
+            // so stride for at least one dimension should be equal to 1
+            if ((ext_input1_strides[0] == 1 || ext_input1_strides[1] == 1) &&
+                (ext_input2_strides[0] == 1 || ext_input2_strides[1] == 1) &&
+                (ext_result_strides[0] == 1 || ext_result_strides[1] == 1))
+            {
+                const bool isRowmA =
+                    (ext_input1_strides[1] == 1 || ext_input1_strides[0] == 0);
+                const bool isRowmB =
+                    (ext_input2_strides[1] == 1 || ext_input2_strides[1] == 0);
+                const bool isRowmC =
+                    (ext_result_strides[1] == 1 || ext_result_strides[0] == 0);
+
+                oneapi::mkl::transpose transA =
+                    (isRowmA != isRowmC) ? oneapi::mkl::transpose::trans
+                                         : oneapi::mkl::transpose::nontrans;
+                oneapi::mkl::transpose transB =
+                    (isRowmB != isRowmC) ? oneapi::mkl::transpose::trans
+                                         : oneapi::mkl::transpose::nontrans;
 
                 const size_t size_m = ext_input1_shape[0];
                 const size_t size_n = ext_input2_shape[1];
                 const size_t size_k = ext_input1_shape[1];
 
-                const std::int64_t lda =
-                    trans1 == oneapi::mkl::transpose::nontrans
-                        ? ext_input1_strides[0]
-                        : ext_input1_strides[1];
-                const std::int64_t ldb =
-                    trans2 == oneapi::mkl::transpose::nontrans
-                        ? ext_input2_strides[0]
-                        : ext_input2_strides[1];
-
-                // definition of ldc will be another for result with
-                // non-standard (c-contiguous) strides const std::int64_t ldc =
-                // result_strides[0] == 1 ? result_strides[1] :
-                // result_strides[0];
-                const std::int64_t ldc = size_n;
+                auto getLdaLdc = [](const bool isRown, shape_elem_type *strides,
+                                    shape_elem_type *shapes) {
+                    if (isRown) {
+                        return (strides[0] != 0) ? strides[0] : shapes[1];
+                    }
+                    return strides[1];
+                };
+
+                const std::int64_t lda = static_cast<std::int64_t>(
+                    getLdaLdc(isRowmA, ext_input1_strides, ext_input1_shape));
+                const std::int64_t ldb = static_cast<std::int64_t>(
+                    isRowmB ? ext_input2_strides[0] : ext_input2_strides[1]);
+                const std::int64_t ldc = static_cast<std::int64_t>(
+                    getLdaLdc(isRowmC, ext_result_strides, ext_result_shape));
+
+                constexpr _DataType_output alpha = 1;
+                constexpr _DataType_output beta = 0;
+
+                std::stringstream error_msg;
+                std::int64_t info = 0;
 
                 try {
-                    sycl::event event = mkl_blas_rm::gemm(
-                        q, trans1, trans2, size_m, size_n, size_k,
-                        _DataType_output(1), // alpha
-                        input1, lda, input2, ldb,
-                        _DataType_output(0), // beta
-                        result, ldc);
-                    event.wait();
-                    delete[] ext_input1_shape;
-                    delete[] ext_input1_strides;
-                    delete[] ext_input2_shape;
-                    delete[] ext_input2_strides;
-                    delete[] ext_result_shape;
-
-                    return event_ref;
+                    if (isRowmC) {
+                        mkl_blas_rm::gemm(q, transA, transB, size_m, size_n,
+                                          size_k, alpha, input1, lda, input2,
+                                          ldb, beta, result, ldc)
+                            .wait();
+                    }
+                    else {
+                        mkl_blas_cm::gemm(q, transA, transB, size_m, size_n,
+                                          size_k, alpha, input1, lda, input2,
+                                          ldb, beta, result, ldc)
+                            .wait();
+                    }
+                } catch (mkl_lapack::exception const &e) {
+                    error_msg << "Unexpected MKL exception caught during "
+                                 "gemm() call:\nreason: "
+                              << e.what() << "\ninfo: " << e.info();
+                    info = e.info();
                 } catch (const std::exception &e) {
-                    // do nothing, proceed to general case
+                    error_msg << "Unexpected SYCL exception caught during "
+                                 "gemm() call:\n"
+                              << e.what();
+                    info = -1;
                 }
-#endif
+
+                if (info != 0) // an unexected error occurs
+                {
+                    throw std::runtime_error(error_msg.str());
+                }
+
+                delete[] ext_input1_shape;
+                delete[] ext_input1_strides;
+                delete[] ext_input2_shape;
+                delete[] ext_input2_strides;
+                delete[] ext_result_shape;
+                delete[] ext_result_strides;
+                return event_ref;
             }
         }
     }
@@ -437,6 +451,7 @@ DPCTLSyclEventRef dpnp_dot_c(DPCTLSyclQueueRef q_ref,
     delete[] ext_input2_shape;
     delete[] ext_input2_strides;
     delete[] ext_result_shape;
+    delete[] ext_result_strides;
 
     return event_ref;
 }
diff --git a/dpnp/dpnp_algo/dpnp_algo_linearalgebra.pxi b/dpnp/dpnp_algo/dpnp_algo_linearalgebra.pxi
@@ -55,7 +55,7 @@ ctypedef c_dpctl.DPCTLSyclEventRef(*fptr_2in_1out_dot_t)(c_dpctl.DPCTLSyclQueueR
                                                          const shape_elem_type *, const shape_elem_type * ,
                                                          void * , const size_t, const size_t,
                                                          const shape_elem_type *, const shape_elem_type * ,
-                                                         const c_dpctl.DPCTLEventVectorRef)
+                                                         const c_dpctl.DPCTLEventVectorRef) except +
 ctypedef c_dpctl.DPCTLSyclEventRef(*fptr_2in_1out_matmul_t)(c_dpctl.DPCTLSyclQueueRef,
                                                             void * , const size_t, const size_t,
                                                             const shape_elem_type *, const shape_elem_type * ,
diff --git a/dpnp/dpnp_iface_linearalgebra.py b/dpnp/dpnp_iface_linearalgebra.py
@@ -108,17 +108,16 @@ def dot(x1, x2, out=None, **kwargs):
             else (None, None)
         )
 
-        # TODO: copy_when_strides=False (now it's done for faster implementation with transpose arrays)
         x1_desc = dpnp.get_dpnp_descriptor(
             x1,
-            copy_when_strides=True,
+            copy_when_strides=False,
             copy_when_nondefault_queue=False,
             alloc_usm_type=usm_type,
             alloc_queue=queue,
         )
         x2_desc = dpnp.get_dpnp_descriptor(
             x2,
-            copy_when_strides=True,
+            copy_when_strides=False,
             copy_when_nondefault_queue=False,
             alloc_usm_type=usm_type,
             alloc_queue=queue,
@@ -131,7 +130,9 @@ def dot(x1, x2, out=None, **kwargs):
                     )
                 out_desc = (
                     dpnp.get_dpnp_descriptor(
-                        out, copy_when_nondefault_queue=False
+                        out,
+                        copy_when_strides=False,
+                        copy_when_nondefault_queue=False,
                     )
                     or None
                 )

Original file line number	Diff line number	Diff line change
`@@ -108,17 +108,16 @@ def dot(x1, x2, out=None, **kwargs):`
`108`	`108`	`else (None, None)`
`109`	`109`	`)`
`110`	`110`
`111`		`- # TODO: copy_when_strides=False (now it's done for faster implementation with transpose arrays)`
`112`	`111`	`x1_desc = dpnp.get_dpnp_descriptor(`
`113`	`112`	`x1,`
`114`		`- copy_when_strides=True,`
	`113`	`+ copy_when_strides=False,`
`115`	`114`	`copy_when_nondefault_queue=False,`
`116`	`115`	`alloc_usm_type=usm_type,`
`117`	`116`	`alloc_queue=queue,`
`118`	`117`	`)`
`119`	`118`	`x2_desc = dpnp.get_dpnp_descriptor(`
`120`	`119`	`x2,`
`121`		`- copy_when_strides=True,`
	`120`	`+ copy_when_strides=False,`
`122`	`121`	`copy_when_nondefault_queue=False,`
`123`	`122`	`alloc_usm_type=usm_type,`
`124`	`123`	`alloc_queue=queue,`
`@@ -131,7 +130,9 @@ def dot(x1, x2, out=None, **kwargs):`
`131`	`130`	`)`
`132`	`131`	`out_desc = (`
`133`	`132`	`dpnp.get_dpnp_descriptor(`
`134`		`- out, copy_when_nondefault_queue=False`
	`133`	`+ out,`
	`134`	`+ copy_when_strides=False,`
	`135`	`+ copy_when_nondefault_queue=False,`
`135`	`136`	`)`
`136`	`137`	`or None`
`137`	`138`	`)`