IntelPython
diff --git a/.github/workflows/conda-package.yml
Lines changed: 1 addition & 0 deletions b/.github/workflows/conda-package.yml
Lines changed: 1 addition & 0 deletions
diff --git a/dpnp/backend/include/dpnp_iface_fptr.hpp
Lines changed: 1 addition & 1 deletion b/dpnp/backend/include/dpnp_iface_fptr.hpp
Lines changed: 1 addition & 1 deletion
diff --git a/dpnp/backend/kernels/dpnp_krnl_bitwise.cpp
Lines changed: 52 additions & 45 deletions b/dpnp/backend/kernels/dpnp_krnl_bitwise.cpp
Lines changed: 52 additions & 45 deletions
diff --git a/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp
Lines changed: 77 additions & 54 deletions b/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp
Lines changed: 77 additions & 54 deletions
diff --git a/dpnp/backend/kernels/dpnp_krnl_mathematical.cpp
Lines changed: 54 additions & 36 deletions b/dpnp/backend/kernels/dpnp_krnl_mathematical.cpp
Lines changed: 54 additions & 36 deletions
diff --git a/dpnp/backend/src/dpnp_fptr.hpp
Lines changed: 6 additions & 0 deletions b/dpnp/backend/src/dpnp_fptr.hpp
Lines changed: 6 additions & 0 deletions
diff --git a/dpnp/dpnp_algo/dpnp_algo_linearalgebra.pyx
Lines changed: 24 additions & 11 deletions b/dpnp/dpnp_algo/dpnp_algo_linearalgebra.pyx
Lines changed: 24 additions & 11 deletions
diff --git a/dpnp/dpnp_array.py
Lines changed: 2 additions & 1 deletion b/dpnp/dpnp_array.py
Lines changed: 2 additions & 1 deletion
diff --git a/dpnp/dpnp_iface_linearalgebra.py
Lines changed: 40 additions & 27 deletions b/dpnp/dpnp_iface_linearalgebra.py
Lines changed: 40 additions & 27 deletions
@@ -12,6 +12,7 @@ env:
   CHANNELS: '-c dppy/label/dev -c intel -c main --override-channels'
   TEST_SCOPE: >-
       test_arraycreation.py
+      test_dot.py
       test_dparray.py
       test_fft.py
       test_linalg.py
 
@@ -377,7 +377,7 @@ enum class DPNPFuncName : size_t
     DPNP_FN_VANDER_EXT,                   /**< Used in numpy.vander() impl, requires extra parameters */
     DPNP_FN_VAR,                          /**< Used in numpy.var() impl  */
     DPNP_FN_VAR_EXT,                      /**< Used in numpy.var() impl, requires extra parameters */
-    DPNP_FN_WHERE_EXT,                    /**< Used in numpy.var() impl, requires extra parameters */
+    DPNP_FN_WHERE_EXT,                    /**< Used in numpy.where() impl, requires extra parameters */
     DPNP_FN_ZEROS,                        /**< Used in numpy.zeros() impl */
     DPNP_FN_ZEROS_LIKE,                   /**< Used in numpy.zeros_like() impl */
     DPNP_FN_LAST,                         /**< The latest element of the enumeration */
 
@@ -1,5 +1,5 @@
 //*****************************************************************************
-// Copyright (c) 2016-2020, Intel Corporation
+// Copyright (c) 2016-2023, Intel Corporation
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@@ -114,10 +114,10 @@ DPCTLSyclEventRef (*dpnp_around_ext_c)(DPCTLSyclQueueRef,
                                        const int,
                                        const DPCTLEventVectorRef) = dpnp_around_c<_DataType>;
 
-template <typename _KernelNameSpecialization>
+template <typename _KernelNameSpecialization1, typename _KernelNameSpecialization2>
 class dpnp_elemwise_absolute_c_kernel;
 
-template <typename _DataType>
+template <typename _DataType_input, typename _DataType_output>
 DPCTLSyclEventRef dpnp_elemwise_absolute_c(DPCTLSyclQueueRef q_ref,
                                            const void* input1_in,
                                            void* result1,
@@ -137,43 +137,63 @@ DPCTLSyclEventRef dpnp_elemwise_absolute_c(DPCTLSyclQueueRef q_ref,
     sycl::queue q = *(reinterpret_cast<sycl::queue*>(q_ref));
     sycl::event event;
 
-    DPNPC_ptr_adapter<_DataType> input1_ptr(q_ref, input1_in, size);
-    _DataType* array1 = input1_ptr.get_ptr();
-    DPNPC_ptr_adapter<_DataType> result1_ptr(q_ref, result1, size, false, true);
-    _DataType* result = result1_ptr.get_ptr();
+    _DataType_input* array1 = static_cast<_DataType_input*>(const_cast<void*>(input1_in));
+    _DataType_output* result = static_cast<_DataType_output*>(result1);
 
-    if constexpr (std::is_same<_DataType, double>::value || std::is_same<_DataType, float>::value)
+    if constexpr (is_any_v<_DataType_input, float, double, std::complex<float>, std::complex<double>>)
     {
-        // https://docs.oneapi.com/versions/latest/onemkl/abs.html
         event = oneapi::mkl::vm::abs(q, size, array1, result);
     }
     else
     {
-        sycl::range<1> gws(size);
-        auto kernel_parallel_for_func = [=](sycl::id<1> global_id) {
-            const size_t idx = global_id[0];
+        static_assert(is_any_v<_DataType_input, int32_t, int64_t>,
+                      "Integer types are only expected to pass in 'abs' kernel");
+        static_assert(std::is_same_v<_DataType_input, _DataType_output>, "Result type must match a type of input data");
+
+        constexpr size_t lws = 64;
+        constexpr unsigned int vec_sz = 8;
+        constexpr sycl::access::address_space global_space = sycl::access::address_space::global_space;
+
+        auto gws_range = sycl::range<1>(((size + lws * vec_sz - 1) / (lws * vec_sz)) * lws);
+        auto lws_range = sycl::range<1>(lws);
 
-            if (array1[idx] >= 0)
+        auto kernel_parallel_for_func = [=](sycl::nd_item<1> nd_it) {
+            auto sg = nd_it.get_sub_group();
+            const auto max_sg_size = sg.get_max_local_range()[0];
+            const size_t start =
+                vec_sz * (nd_it.get_group(0) * nd_it.get_local_range(0) + sg.get_group_id()[0] * max_sg_size);
+
+            if (start + static_cast<size_t>(vec_sz) * max_sg_size < size)
             {
-                result[idx] = array1[idx];
+                using input_ptrT = sycl::multi_ptr<_DataType_input, global_space>;
+                using result_ptrT = sycl::multi_ptr<_DataType_output, global_space>;
+
+                sycl::vec<_DataType_input, vec_sz> data_vec = sg.load<vec_sz>(input_ptrT(&array1[start]));
+
+                // sycl::abs() returns unsigned integers only, so explicit casting to signed ones is required
+                using result_absT = typename cl::sycl::detail::make_unsigned<_DataType_output>::type;
+                sycl::vec<_DataType_output, vec_sz> res_vec =
+                    dpnp_vec_cast<_DataType_output, result_absT, vec_sz>(sycl::abs(data_vec));
+
+                sg.store<vec_sz>(result_ptrT(&result[start]), res_vec);
             }
             else
             {
-                result[idx] = -1 * array1[idx];
+                for (size_t k = start + sg.get_local_id()[0]; k < size; k += max_sg_size)
+                {
+                    result[k] = std::abs(array1[k]);
+                }
             }
         };
 
         auto kernel_func = [&](sycl::handler& cgh) {
-            cgh.parallel_for<class dpnp_elemwise_absolute_c_kernel<_DataType>>(gws, kernel_parallel_for_func);
+            cgh.parallel_for<class dpnp_elemwise_absolute_c_kernel<_DataType_input, _DataType_output>>(
+                sycl::nd_range<1>(gws_range, lws_range), kernel_parallel_for_func);
         };
-
         event = q.submit(kernel_func);
     }
 
-    input1_ptr.depends_on(event);
-    result1_ptr.depends_on(event);
     event_ref = reinterpret_cast<DPCTLSyclEventRef>(&event);
-
     return DPCTLEvent_Copy(event_ref);
 }
 
@@ -182,28 +202,24 @@ void dpnp_elemwise_absolute_c(const void* input1_in, void* result1, size_t size)
 {
     DPCTLSyclQueueRef q_ref = reinterpret_cast<DPCTLSyclQueueRef>(&DPNP_QUEUE);
     DPCTLEventVectorRef dep_event_vec_ref = nullptr;
-    DPCTLSyclEventRef event_ref = dpnp_elemwise_absolute_c<_DataType>(q_ref,
-                                                                      input1_in,
-                                                                      result1,
-                                                                      size,
-                                                                      dep_event_vec_ref);
+    DPCTLSyclEventRef event_ref = dpnp_elemwise_absolute_c<_DataType, _DataType>(q_ref,
+                                                                                 input1_in,
+                                                                                 result1,
+                                                                                 size,
+                                                                                 dep_event_vec_ref);
     DPCTLEvent_WaitAndThrow(event_ref);
+    DPCTLEvent_Delete(event_ref);
 }
 
 template <typename _DataType>
 void (*dpnp_elemwise_absolute_default_c)(const void*, void*, size_t) = dpnp_elemwise_absolute_c<_DataType>;
 
-template <typename _DataType>
+template <typename _DataType_input, typename _DataType_output = _DataType_input>
 DPCTLSyclEventRef (*dpnp_elemwise_absolute_ext_c)(DPCTLSyclQueueRef,
                                                   const void*,
                                                   void*,
                                                   size_t,
-                                                  const DPCTLEventVectorRef) = dpnp_elemwise_absolute_c<_DataType>;
-
-// template void dpnp_elemwise_absolute_c<double>(void* array1_in, void* result1, size_t size);
-// template void dpnp_elemwise_absolute_c<float>(void* array1_in, void* result1, size_t size);
-// template void dpnp_elemwise_absolute_c<long>(void* array1_in, void* result1, size_t size);
-// template void dpnp_elemwise_absolute_c<int>(void* array1_in, void* result1, size_t size);
+                                                  const DPCTLEventVectorRef) = dpnp_elemwise_absolute_c<_DataType_input, _DataType_output>;
 
 template <typename _DataType_output, typename _DataType_input1, typename _DataType_input2>
 DPCTLSyclEventRef dpnp_cross_c(DPCTLSyclQueueRef q_ref,
@@ -1085,10 +1101,12 @@ void func_map_init_mathematical(func_map_t& fmap)
                                                                   (void*)dpnp_elemwise_absolute_ext_c<int32_t>};
     fmap[DPNPFuncName::DPNP_FN_ABSOLUTE_EXT][eft_LNG][eft_LNG] = {eft_LNG,
                                                                   (void*)dpnp_elemwise_absolute_ext_c<int64_t>};
-    fmap[DPNPFuncName::DPNP_FN_ABSOLUTE_EXT][eft_FLT][eft_FLT] = {eft_FLT,
-                                                                  (void*)dpnp_elemwise_absolute_ext_c<float>};
-    fmap[DPNPFuncName::DPNP_FN_ABSOLUTE_EXT][eft_DBL][eft_DBL] = {eft_DBL,
-                                                                  (void*)dpnp_elemwise_absolute_ext_c<double>};
+    fmap[DPNPFuncName::DPNP_FN_ABSOLUTE_EXT][eft_FLT][eft_FLT] = {eft_FLT, (void*)dpnp_elemwise_absolute_ext_c<float>};
+    fmap[DPNPFuncName::DPNP_FN_ABSOLUTE_EXT][eft_DBL][eft_DBL] = {eft_DBL, (void*)dpnp_elemwise_absolute_ext_c<double>};
+    fmap[DPNPFuncName::DPNP_FN_ABSOLUTE_EXT][eft_C64][eft_C64] = {
+        eft_FLT, (void*)dpnp_elemwise_absolute_ext_c<std::complex<float>, float>};
+    fmap[DPNPFuncName::DPNP_FN_ABSOLUTE_EXT][eft_C128][eft_C128] = {
+        eft_DBL, (void*)dpnp_elemwise_absolute_ext_c<std::complex<double>, double>};
 
     fmap[DPNPFuncName::DPNP_FN_AROUND][eft_INT][eft_INT] = {eft_INT, (void*)dpnp_around_default_c<int32_t>};
     fmap[DPNPFuncName::DPNP_FN_AROUND][eft_LNG][eft_LNG] = {eft_LNG, (void*)dpnp_around_default_c<int64_t>};
 
@@ -163,6 +163,12 @@ struct is_any : std::disjunction<std::is_same<T, Ts>...> {};
 template <typename T, typename... Ts>
 struct are_same : std::conjunction<std::is_same<T, Ts>...> {};
 
+/**
+ * A template constant to check if type T matches any type from Ts.
+ */
+template <typename T, typename... Ts>
+constexpr auto is_any_v = is_any<T, Ts...>::value;
+
 /**
  * A template constant to check if both types T1 and T2 match every type from Ts sequence.
  */
 
@@ -1,7 +1,7 @@
 # cython: language_level=3
 # -*- coding: utf-8 -*-
 # *****************************************************************************
-# Copyright (c) 2016-2020, Intel Corporation
+# Copyright (c) 2016-2023, Intel Corporation
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -65,8 +65,9 @@ ctypedef c_dpctl.DPCTLSyclEventRef(*fptr_2in_1out_matmul_t)(c_dpctl.DPCTLSyclQue
                                                             const shape_elem_type *, const shape_elem_type * ,
                                                             const c_dpctl.DPCTLEventVectorRef)
 
-cpdef utils.dpnp_descriptor dpnp_dot(utils.dpnp_descriptor in_array1, utils.dpnp_descriptor in_array2):
-
+cpdef utils.dpnp_descriptor dpnp_dot(utils.dpnp_descriptor in_array1,
+                                     utils.dpnp_descriptor in_array2,
+                                     utils.dpnp_descriptor out=None):
     cdef shape_type_c shape1, shape2
 
     shape1 = in_array1.shape
@@ -78,6 +79,7 @@ cpdef utils.dpnp_descriptor dpnp_dot(utils.dpnp_descriptor in_array1, utils.dpnp
 
     # get the FPTR data structure
     cdef DPNPFuncData kernel_data = get_dpnp_function_ptr(DPNP_FN_DOT_EXT, param1_type, param2_type)
+    cdef utils.dpnp_descriptor result
 
     ndim1 = in_array1.ndim
     ndim2 = in_array2.ndim
@@ -89,7 +91,7 @@ cpdef utils.dpnp_descriptor dpnp_dot(utils.dpnp_descriptor in_array1, utils.dpnp
     elif ndim1 == 1 and ndim2 == 1:
         result_shape = ()
     elif ndim1 == 1:  # ndim2 > 1
-        result_shape = shape2[:-1]
+        result_shape = shape2[::-2] if ndim2 == 2 else shape2[::2]
     elif ndim2 == 1:  # ndim1 > 1
         result_shape = shape1[:-1]
     else:
@@ -101,13 +103,24 @@ cpdef utils.dpnp_descriptor dpnp_dot(utils.dpnp_descriptor in_array1, utils.dpnp
 
     result_sycl_device, result_usm_type, result_sycl_queue = utils.get_common_usm_allocation(in_array1, in_array2)
 
-    # create result array with type given by FPTR data
-    cdef utils.dpnp_descriptor result = utils.create_output_descriptor(result_shape,
-                                                                       kernel_data.return_type,
-                                                                       None,
-                                                                       device=result_sycl_device,
-                                                                       usm_type=result_usm_type,
-                                                                       sycl_queue=result_sycl_queue)
+    if out is None:
+        # create result array with type given by FPTR data
+        result = utils.create_output_descriptor(result_shape,
+                                                kernel_data.return_type,
+                                                None,
+                                                device=result_sycl_device,
+                                                usm_type=result_usm_type,
+                                                sycl_queue=result_sycl_queue)
+    else:
+        result_type = dpnp_DPNPFuncType_to_dtype(< size_t > kernel_data.return_type)
+        if out.dtype != result_type:
+            utils.checker_throw_value_error('dot', 'out.dtype', out.dtype, result_type)
+        if out.shape != result_shape:
+            utils.checker_throw_value_error('dot', 'out.shape', out.shape, result_shape)
+
+        result = out
+
+        utils.get_common_usm_allocation(in_array1, result)  # check USM allocation is common
 
     cdef shape_type_c result_strides = utils.strides_to_vector(result.strides, result.shape)
     cdef shape_type_c in_array1_shape = in_array1.shape
 
@@ -592,7 +592,8 @@ def diagonal(input, offset=0, axis1=0, axis2=1):
 
         return dpnp.diagonal(input, offset, axis1, axis2)
 
- # 'dot',
+    def dot(self, other, out=None):
+        return dpnp.dot(self, other, out)
 
     @property
     def dtype(self):
 
@@ -44,9 +44,9 @@
 from dpnp.dpnp_algo import *
 from dpnp.dpnp_utils import *
 import dpnp
-import dpnp.config as config
 
 import numpy
+import dpctl.tensor as dpt
 
 
 __all__ = [
@@ -62,18 +62,25 @@
 ]
 
 
-def dot(x1, x2, **kwargs):
+def dot(x1, x2, out=None, **kwargs):
     """
-    Returns the dot product of `x1` and `x2`.
+    Dot product of `x1` and `x2`.
 
     For full documentation refer to :obj:`numpy.dot`.
 
+    Returns
+    -------
+    y : dpnp.ndarray
+        Returns the dot product of `x1` and `x2`.
+        If `out` is given, then it is returned.
+    
     Limitations
     -----------
-        Parameters ``x1`` and ``x2`` are supported as :obj:`dpnp.ndarray` of the same type.
-        Keyword arguments ``kwargs`` are currently unsupported.
-        Otherwise the functions will be executed sequentially on CPU.
-        Input array data types are limited by supported DPNP :ref:`Data types`.
+    Parameters `x1` and `x2` are supported as either scalar, :class:`dpnp.ndarray`
+    or :class:`dpctl.tensor.usm_ndarray`, but both `x1` and `x2` can not be scalars at the same time.
+    Keyword argument ``kwargs`` is currently unsupported.
+    Otherwise the functions will be executed sequentially on CPU.
+    Input array data types are limited by supported DPNP :ref:`Data types`.
 
     See Also
     --------
@@ -82,31 +89,37 @@ def dot(x1, x2, **kwargs):
 
     Examples
     --------
-    >>> import dpnp as np
-    >>> np.dot(3, 4)
-    12
-    >>> a = np.array([1, 2, 3])
-    >>> b = np.array([1, 2, 3])
-    >>> np.dot(a, b)
+    >>> import dpnp as dp
+    >>> a = dp.array([1, 2, 3])
+    >>> b = dp.array([1, 2, 3])
+    >>> dp.dot(a, b)
     14
 
     """
 
-    x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False)
-    x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False)
-    if x1_desc and x2_desc and not kwargs:
-        # TODO: remove fallback with scalars when muliply backend func will support strides
-        if(x1_desc.ndim == 0 and x2_desc.strides is not None
-                or x2_desc.ndim == 0 and x1_desc.strides is not None):
-            pass
-        elif (x1_desc.ndim >= 1 and x2_desc.ndim > 1 and x1_desc.shape[-1] != x2_desc.shape[-2]):
-            pass
-        elif (x1_desc.ndim > 0 and x2_desc.ndim == 1 and x1_desc.shape[-1] != x2_desc.shape[0]):
-            pass
-        else:
-            return dpnp_dot(x1_desc, x2_desc).get_pyobj()
+    if kwargs:
+        pass
+    elif dpnp.isscalar(x1) and dpnp.isscalar(x2):
+        # at least either x1 or x2 has to be an array
+        pass
+    else:
+        # get USM type and queue to copy scalar from the host memory into a USM allocation
+        usm_type, queue = get_usm_allocations([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else (None, None)
+
+        x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False,
+                                           alloc_usm_type=usm_type, alloc_queue=queue)
+        x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False,
+                                           alloc_usm_type=usm_type, alloc_queue=queue)
+        if x1_desc and x2_desc:
+            if out is not None:
+                if not isinstance(out, (dpnp.ndarray, dpt.usm_ndarray)):
+                    raise TypeError("return array must be of supported array type")
+                out_desc = dpnp.get_dpnp_descriptor(out, copy_when_nondefault_queue=False)
+            else:
+                out_desc = None
+            return dpnp_dot(x1_desc, x2_desc, out=out_desc).get_pyobj()
 
-    return call_origin(numpy.dot, x1, x2, **kwargs)
+    return call_origin(numpy.dot, x1, x2, out=out, **kwargs)
 
 
 def einsum(*args, **kwargs):