Skip to content

Commit 16e2bee

Browse files
Merge branch 'master' into histogram2d
2 parents dbc2579 + 13816bd commit 16e2bee

35 files changed

+1461
-101
lines changed

.github/workflows/array-api-skips.txt

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,6 @@ array_api_tests/test_sorting_functions.py::test_sort
5757
array_api_tests/test_signatures.py::test_func_signature[std]
5858
array_api_tests/test_signatures.py::test_func_signature[var]
5959

60-
# missing 'stream' keyword argument
61-
array_api_tests/test_signatures.py::test_array_method_signature[to_device]
62-
6360
# wrong shape is returned
6461
array_api_tests/test_linalg.py::test_vecdot
6562
array_api_tests/test_linalg.py::test_linalg_vecdot

.github/workflows/check-mkl-interfaces.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ jobs:
108108
id: run_tests
109109
uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0
110110
with:
111-
timeout_minutes: 10
111+
timeout_minutes: 12
112112
max_attempts: ${{ env.RUN_TESTS_MAX_ATTEMPTS }}
113113
retry_on: any
114114
command: |
@@ -216,7 +216,7 @@ jobs:
216216
id: run_tests
217217
uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0
218218
with:
219-
timeout_minutes: 10
219+
timeout_minutes: 12
220220
max_attempts: ${{ env.RUN_TESTS_MAX_ATTEMPTS }}
221221
retry_on: any
222222
command: |

.github/workflows/cron-run-tests.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ jobs:
126126
id: run_tests_linux
127127
uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0
128128
with:
129-
timeout_minutes: 10
129+
timeout_minutes: 12
130130
max_attempts: ${{ env.RUN_TESTS_MAX_ATTEMPTS }}
131131
retry_on: any
132132
command: |

dpnp/dpnp_array.py

Lines changed: 64 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -153,13 +153,6 @@ def mT(self):
153153

154154
return dpnp_array._create_from_usm_ndarray(self._array_obj.mT)
155155

156-
def to_device(self, target_device):
157-
"""Transfer array to target device."""
158-
159-
return dpnp_array(
160-
shape=self.shape, buffer=self.get_array().to_device(target_device)
161-
)
162-
163156
@property
164157
def sycl_queue(self):
165158
return self._array_obj.sycl_queue
@@ -199,7 +192,9 @@ def __and__(self, other):
199192
# '__array_prepare__',
200193
# '__array_priority__',
201194
# '__array_struct__',
202-
# '__array_ufunc__',
195+
196+
__array_ufunc__ = None
197+
203198
# '__array_wrap__',
204199

205200
def __array_namespace__(self, /, *, api_version=None):
@@ -612,6 +607,25 @@ def __truediv__(self, other):
612607
"""Return ``self/value``."""
613608
return dpnp.true_divide(self, other)
614609

610+
@property
611+
def __usm_ndarray__(self):
612+
"""
613+
Property to support `__usm_ndarray__` protocol.
614+
615+
It assumes to return :class:`dpctl.tensor.usm_ndarray` instance
616+
corresponding to the content of the object.
617+
618+
This property is intended to speed-up conversion from
619+
:class:`dpnp.ndarray` to :class:`dpctl.tensor.usm_ndarray` passed
620+
into `dpctl.tensor.asarray` function. The input object that implements
621+
`__usm_ndarray__` protocol is recognized as owner of USM allocation
622+
that is managed by a smart pointer, and asynchronous deallocation
623+
will not involve GIL.
624+
625+
"""
626+
627+
return self._array_obj
628+
615629
def __xor__(self, other):
616630
"""Return ``self^value``."""
617631
return dpnp.bitwise_xor(self, other)
@@ -1693,6 +1707,48 @@ def take(self, indices, axis=None, out=None, mode="wrap"):
16931707

16941708
return dpnp.take(self, indices, axis=axis, out=out, mode=mode)
16951709

1710+
def to_device(self, device, /, *, stream=None):
1711+
"""
1712+
Transfers this array to specified target device.
1713+
1714+
Parameters
1715+
----------
1716+
device : {string, SyclDevice, SyclQueue}
1717+
Array API concept of target device. It can be a oneAPI filter
1718+
selector string, an instance of :class:`dpctl.SyclDevice`
1719+
corresponding to a non-partitioned SYCL device, an instance of
1720+
:class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
1721+
returned by :obj:`dpnp.dpnp_array.dpnp_array.device` property.
1722+
stream : {SyclQueue, None}, optional
1723+
Execution queue to synchronize with. If ``None``, synchronization
1724+
is not performed.
1725+
Default: ``None``.
1726+
1727+
Returns
1728+
-------
1729+
out : dpnp.ndarray
1730+
A view if data copy is not required, and a copy otherwise.
1731+
If copying is required, it is done by copying from the original
1732+
allocation device to the host, followed by copying from host
1733+
to the target device.
1734+
1735+
Examples
1736+
--------
1737+
>>> import dpnp as np, dpctl
1738+
>>> x = np.full(100, 2, dtype=np.int64)
1739+
>>> q_prof = dpctl.SyclQueue(x.sycl_device, property="enable_profiling")
1740+
>>> # return a view with profile-enabled queue
1741+
>>> y = x.to_device(q_prof)
1742+
>>> timer = dpctl.SyclTimer()
1743+
>>> with timer(q_prof):
1744+
... z = y * y
1745+
>>> print(timer.dt)
1746+
1747+
"""
1748+
1749+
usm_res = self._array_obj.to_device(device, stream=stream)
1750+
return dpnp_array._create_from_usm_ndarray(usm_res)
1751+
16961752
# 'tobytes',
16971753
# 'tofile',
16981754
# 'tolist',

dpnp/dpnp_iface.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@
6868
"get_result_array",
6969
"get_usm_ndarray",
7070
"get_usm_ndarray_or_scalar",
71+
"is_cuda_backend",
7172
"is_supported_array_or_scalar",
7273
"is_supported_array_type",
7374
"synchronize_array_data",
@@ -681,6 +682,41 @@ def get_usm_ndarray_or_scalar(a):
681682
return a if dpnp.isscalar(a) else get_usm_ndarray(a)
682683

683684

685+
def is_cuda_backend(obj=None):
686+
"""
687+
Checks that object has a CUDA backend.
688+
689+
Parameters
690+
----------
691+
obj : {Device, SyclDevice, SyclQueue, dpnp.ndarray, usm_ndarray, None},
692+
optional
693+
An input object with sycl_device property to check device backend.
694+
If `obj` is ``None``, device backend will be checked for the default
695+
queue.
696+
Default: ``None``.
697+
698+
Returns
699+
-------
700+
out : bool
701+
Return ``True`` if data of the input object resides on a CUDA backend,
702+
otherwise ``False``.
703+
704+
"""
705+
706+
if obj is None:
707+
sycl_device = dpctl.select_default_device()
708+
elif isinstance(obj, dpctl.SyclDevice):
709+
sycl_device = obj
710+
else:
711+
sycl_device = getattr(obj, "sycl_device", None)
712+
if (
713+
sycl_device is not None
714+
and sycl_device.backend == dpctl.backend_type.cuda
715+
):
716+
return True
717+
return False
718+
719+
684720
def is_supported_array_or_scalar(a):
685721
"""
686722
Return ``True`` if `a` is a scalar or an array of either

dpnp/dpnp_iface_indexing.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ def choose(x1, choices, out=None, mode="raise"):
128128
:obj:`dpnp.take_along_axis` : Preferable if choices is an array.
129129
130130
"""
131+
131132
x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_nondefault_queue=False)
132133

133134
choices_list = []
@@ -137,6 +138,11 @@ def choose(x1, choices, out=None, mode="raise"):
137138
)
138139

139140
if x1_desc:
141+
if dpnp.is_cuda_backend(x1_desc.get_array()):
142+
raise NotImplementedError(
143+
"Running on CUDA is currently not supported"
144+
)
145+
140146
if any(not desc for desc in choices_list):
141147
pass
142148
elif out is not None:

dpnp/dpnp_iface_libmath.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,10 @@ def erf(in_array1):
8282
in_array1, copy_when_strides=False, copy_when_nondefault_queue=False
8383
)
8484
if x1_desc:
85+
if dpnp.is_cuda_backend(x1_desc.get_array()):
86+
raise NotImplementedError(
87+
"Running on CUDA is currently not supported"
88+
)
8589
return dpnp_erf(x1_desc).get_pyobj()
8690

8791
result = create_output_descriptor_py(

dpnp/dpnp_iface_mathematical.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2945,8 +2945,16 @@ def modf(x1, **kwargs):
29452945
"""
29462946

29472947
x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_nondefault_queue=False)
2948-
if x1_desc and not kwargs:
2949-
return dpnp_modf(x1_desc)
2948+
if x1_desc:
2949+
if dpnp.is_cuda_backend(x1_desc.get_array()):
2950+
raise NotImplementedError(
2951+
"Running on CUDA is currently not supported"
2952+
)
2953+
2954+
if kwargs:
2955+
pass
2956+
else:
2957+
return dpnp_modf(x1_desc)
29502958

29512959
return call_origin(numpy.modf, x1, **kwargs)
29522960

dpnp/dpnp_iface_sorting.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,11 @@ def partition(x1, kth, axis=-1, kind="introselect", order=None):
192192

193193
x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_nondefault_queue=False)
194194
if x1_desc:
195+
if dpnp.is_cuda_backend(x1_desc.get_array()):
196+
raise NotImplementedError(
197+
"Running on CUDA is currently not supported"
198+
)
199+
195200
if not isinstance(kth, int):
196201
pass
197202
elif x1_desc.ndim == 0:

dpnp/linalg/dpnp_utils_linalg.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -397,7 +397,14 @@ def _batched_qr(a, mode="reduced"):
397397
batch_size,
398398
depends=[copy_ev],
399399
)
400-
_manager.add_event_pair(ht_ev, geqrf_ev)
400+
401+
# w/a to avoid race condition on CUDA during multiple runs
402+
# TODO: Remove it once the OneMath issue is resolved
403+
# https://github.com/uxlfoundation/oneMath/issues/626
404+
if dpnp.is_cuda_backend(a_sycl_queue):
405+
ht_ev.wait()
406+
else:
407+
_manager.add_event_pair(ht_ev, geqrf_ev)
401408

402409
if mode in ["r", "raw"]:
403410
if mode == "r":
@@ -2468,7 +2475,14 @@ def dpnp_qr(a, mode="reduced"):
24682475
ht_ev, geqrf_ev = li._geqrf(
24692476
a_sycl_queue, a_t.get_array(), tau_h.get_array(), depends=[copy_ev]
24702477
)
2471-
_manager.add_event_pair(ht_ev, geqrf_ev)
2478+
2479+
# w/a to avoid race condition on CUDA during multiple runs
2480+
# TODO: Remove it once the OneMath issue is resolved
2481+
# https://github.com/uxlfoundation/oneMath/issues/626
2482+
if dpnp.is_cuda_backend(a_sycl_queue):
2483+
ht_ev.wait()
2484+
else:
2485+
_manager.add_event_pair(ht_ev, geqrf_ev)
24722486

24732487
if mode in ["r", "raw"]:
24742488
if mode == "r":

0 commit comments

Comments
 (0)