
Commit 33c241c

Merge remote-tracking branch 'origin/master' into gold/2021
2 parents 22cc517 + f52182d commit 33c241c

22 files changed (+354, -88 lines)

.github/workflows/generate-coverage.yaml

Lines changed: 1 addition & 1 deletion
@@ -79,7 +79,7 @@ jobs:
       - name: Install dpctl dependencies
         shell: bash -l {0}
         run: |
-          pip install numpy cython setuptools pytest pytest-cov scikit-build cmake coverage[toml]
+          pip install numpy cython"<3" setuptools pytest pytest-cov scikit-build cmake coverage[toml]

       - name: Build dpctl with coverage
         shell: bash -l {0}

.github/workflows/generate-docs.yml

Lines changed: 2 additions & 2 deletions
@@ -26,7 +26,7 @@ jobs:
       - name: Install Intel OneAPI
         if: ${{ !github.event.pull_request || github.event.action != 'closed' }}
         run: |
-          sudo apt-get install intel-oneapi-dpcpp-cpp-compiler
+          sudo apt-get install intel-oneapi-compiler-dpcpp-cpp
       - name: Install Lua
         if: ${{ !github.event.pull_request || github.event.action != 'closed' }}
         run: |
@@ -49,7 +49,7 @@ jobs:
         if: ${{ !github.event.pull_request || github.event.action != 'closed' }}
         shell: bash -l {0}
         run: |
-          pip install numpy cython setuptools scikit-build cmake sphinx sphinx_rtd_theme pydot graphviz sphinxcontrib-programoutput sphinxcontrib-googleanalytics
+          pip install numpy cython"<3" setuptools scikit-build cmake sphinx sphinx_rtd_theme pydot graphviz sphinxcontrib-programoutput sphinxcontrib-googleanalytics
       - name: Checkout repo
         uses: actions/checkout@v3
         with:

.github/workflows/os-llvm-sycl-build.yml

Lines changed: 32 additions & 24 deletions
@@ -11,9 +11,9 @@ jobs:

     env:
       DOWNLOAD_URL_PREFIX: https://github.com/intel/llvm/releases/download
-      DRIVER_PATH: 2023-WW13
-      OCLCPUEXP_FN: oclcpuexp-2023.15.3.0.20_rel.tar.gz
-      FPGAEMU_FN: fpgaemu-2023.15.3.0.20_rel.tar.gz
+      DRIVER_PATH: 2023-WW27
+      OCLCPUEXP_FN: oclcpuexp-2023.16.6.0.28_rel.tar.gz
+      FPGAEMU_FN: fpgaemu-2023.16.6.0.28_rel.tar.gz
       TBB_URL: https://github.com/oneapi-src/oneTBB/releases/download/v2021.9.0/
       TBB_INSTALL_DIR: oneapi-tbb-2021.9.0
       TBB_FN: oneapi-tbb-2021.9.0-lin.tgz
@@ -37,34 +37,42 @@ jobs:
             ${{ runner.os }}-

       - name: Download and install nightly and components
+        env:
+          USE_LATEST_SYCLOS: 0
         shell: bash -l {0}
         run: |
           cd /home/runner/work
           mkdir -p sycl_bundle
           cd sycl_bundle
-          # get list of shas and tags from remote, filter sycl-nightly tags and reverse order
-          export LLVM_TAGS=$(git -c 'versionsort.suffix=-' ls-remote --tags --sort='v:refname' https://github.com/intel/llvm.git | \
-              grep sycl-nightly | awk '{a[i++]=$0} END {for (j=i-1; j>=0;) print a[j--] }')
-          # initialize
-          unset DEPLOY_NIGHTLY_TAG
-          unset DEPLOY_NIGHTLY_TAG_SHA
-
-          # go through tags and find the most recent one where nighly build binary is available
-          while IFS= read -r NEXT_LLVM_TAG; do
-              export NEXT_LLVM_TAG_SHA=$(echo ${NEXT_LLVM_TAG} | awk '{print $1}')
-              export NEXT_NIGHTLY_TAG=$(python3 -c "import sys, urllib.parse as ul; print (ul.quote_plus(sys.argv[1]))" \
-                  $(echo ${NEXT_LLVM_TAG} | awk '{gsub(/^refs\/tags\//, "", $2)} {print $2}'))
-              if [[ `wget -S --spider ${DOWNLOAD_URL_PREFIX}/${NEXT_NIGHTLY_TAG}/dpcpp-compiler.tar.gz 2>&1 | grep 'HTTP/1.1 200 OK'` ]];
-              then
-                  export DEPLOY_NIGHTLY_TAG=${NEXT_NIGHTLY_TAG}
-                  export DEPLOY_LLVM_TAG_SHA=${NEXT_LLVM_TAG_SHA}
-                  break
-              fi
-          done <<< "${LLVM_TAGS}"
+          if [[ "${USE_LATEST_SYCLOS:-0}" -eq "1" ]]; then
+              # get list of shas and tags from remote, filter sycl-nightly tags and reverse order
+              export LLVM_TAGS=$(git -c 'versionsort.suffix=-' ls-remote --tags --sort='v:refname' https://github.com/intel/llvm.git | \
+                  grep sycl-nightly | awk '{a[i++]=$0} END {for (j=i-1; j>=0;) print a[j--] }')
+              # initialize
+              unset DEPLOY_NIGHTLY_TAG
+              unset DEPLOY_NIGHTLY_TAG_SHA
+
+              # go through tags and find the most recent one where nighly build binary is available
+              while IFS= read -r NEXT_LLVM_TAG; do
+                  export NEXT_LLVM_TAG_SHA=$(echo ${NEXT_LLVM_TAG} | awk '{print $1}')
+                  export NEXT_NIGHTLY_TAG=$(python3 -c "import sys, urllib.parse as ul; print (ul.quote_plus(sys.argv[1]))" \
+                      $(echo ${NEXT_LLVM_TAG} | awk '{gsub(/^refs\/tags\//, "", $2)} {print $2}'))
+                  if [[ `wget -S --spider ${DOWNLOAD_URL_PREFIX}/${NEXT_NIGHTLY_TAG}/dpcpp-compiler.tar.gz 2>&1 | grep 'HTTP/1.1 200 OK'` ]];
+                  then
+                      export DEPLOY_NIGHTLY_TAG=${NEXT_NIGHTLY_TAG}
+                      export DEPLOY_LLVM_TAG_SHA=${NEXT_LLVM_TAG_SHA}
+                      break
+                  fi
+              done <<< "${LLVM_TAGS}"
+          else
+              # Use latest known to work tag instead
+              export DEPLOY_NIGHTLY_TAG="sycl-nightly%2F20230606"
+              export DEPLOY_LLVM_TAG_SHA=f44d0133d4b0077298f034697a1f3818ff1d6134
+          fi

           [[ -n "${DEPLOY_NIGHTLY_TAG}" ]] || exit 1
           [[ -n "${DEPLOY_LLVM_TAG_SHA}" ]] || exit 1
-          echo "Using ${m} corresponding to intel/llvm at ${DEPLOY_LLVM_TAG_SHA}"
+          echo "Using ${DEPLOY_NIGHTLY_TAG} corresponding to intel/llvm at ${DEPLOY_LLVM_TAG_SHA}"

           if [[ -f bundle_id.txt && ( "$(cat bundle_id.txt)" == "${DEPLOY_LLVM_TAG_SHA}" ) ]]; then
               echo "Using cached download of ${DEPLOY_LLVM_TAG_SHA}"
@@ -100,7 +108,7 @@ jobs:
       - name: Install dpctl dependencies
         shell: bash -l {0}
         run: |
-          pip install numpy cython setuptools pytest scikit-build cmake
+          pip install numpy cython"<3" setuptools pytest scikit-build cmake

       - name: Checkout repo
         uses: actions/checkout@v3
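
Note on the pinned fallback tag: intel/llvm nightly tags contain a slash ("sycl-nightly/20230606"), and the workflow URL-encodes the tag name with urllib.parse.quote_plus before composing the release download URL, which is why the hard-coded value reads "sycl-nightly%2F20230606". A minimal sketch of that encoding step:

import urllib.parse as ul

tag = "sycl-nightly/20230606"
print(ul.quote_plus(tag))  # -> sycl-nightly%2F20230606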

conda-recipe/meta.yaml

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ requirements:
     - cmake >=3.21
     - ninja
     - git
-    - cython
+    - cython <3
    - python
    - scikit-build
    - numpy

dpctl/tensor/_copy_utils.py

Lines changed: 5 additions & 0 deletions
@@ -213,6 +213,11 @@ def _copy_same_shape(dst, src):
     """Assumes src and dst have the same shape."""
     # check that memory regions do not overlap
     if ti._array_overlap(dst, src):
+        if src._pointer == dst._pointer and (
+            src is dst
+            or (src.strides == dst.strides and src.dtype == dst.dtype)
+        ):
+            return
         _copy_overlapping(src=src, dst=dst)
         return
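
A minimal usage sketch of the early return added above, assuming in-place assignment on a usm_ndarray routes through _copy_same_shape (the array values are illustrative):

import dpctl.tensor as dpt

x = dpt.arange(10, dtype="i4")
x[...] = x        # dst and src are the same logical tensor: returns early
x[...] = x[::1]   # a view with the same pointer, strides and dtype: also a no-op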

dpctl/tensor/_elementwise_common.py

Lines changed: 35 additions & 22 deletions
@@ -52,6 +52,15 @@ def __call__(self, x, out=None, order="K"):
         if not isinstance(x, dpt.usm_ndarray):
             raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")

+        if order not in ["C", "F", "K", "A"]:
+            order = "K"
+        buf_dt, res_dt = _find_buf_dtype(
+            x.dtype, self.result_type_resolver_fn_, x.sycl_device
+        )
+        if res_dt is None:
+            raise RuntimeError
+
+        orig_out = out
         if out is not None:
             if not isinstance(out, dpt.usm_ndarray):
                 raise TypeError(
@@ -64,8 +73,21 @@ def __call__(self, x, out=None, order="K"):
                     f"Expected output shape is {x.shape}, got {out.shape}"
                 )

-            if ti._array_overlap(x, out):
-                raise TypeError("Input and output arrays have memory overlap")
+            if res_dt != out.dtype:
+                raise TypeError(
+                    f"Output array of type {res_dt} is needed,"
+                    f" got {out.dtype}"
+                )
+
+            if (
+                buf_dt is None
+                and ti._array_overlap(x, out)
+                and not ti._same_logical_tensors(x, out)
+            ):
+                # Allocate a temporary buffer to avoid memory overlapping.
+                # Note if `buf_dt` is not None, a temporary copy of `x` will be
+                # created, so the array overlap check isn't needed.
+                out = dpt.empty_like(out)

             if (
                 dpctl.utils.get_execution_queue((x.sycl_queue, out.sycl_queue))
@@ -75,13 +97,6 @@ def __call__(self, x, out=None, order="K"):
                     "Input and output allocation queues are not compatible"
                )

-        if order not in ["C", "F", "K", "A"]:
-            order = "K"
-        buf_dt, res_dt = _find_buf_dtype(
-            x.dtype, self.result_type_resolver_fn_, x.sycl_device
-        )
-        if res_dt is None:
-            raise RuntimeError
         exec_q = x.sycl_queue
         if buf_dt is None:
             if out is None:
@@ -91,17 +106,20 @@ def __call__(self, x, out=None, order="K"):
                     if order == "A":
                         order = "F" if x.flags.f_contiguous else "C"
                     out = dpt.empty_like(x, dtype=res_dt, order=order)
-            else:
-                if res_dt != out.dtype:
-                    raise TypeError(
-                        f"Output array of type {res_dt} is needed,"
-                        f" got {out.dtype}"
-                    )

-            ht, _ = self.unary_fn_(x, out, sycl_queue=exec_q)
-            ht.wait()
+            ht_unary_ev, unary_ev = self.unary_fn_(x, out, sycl_queue=exec_q)
+
+            if not (orig_out is None or orig_out is out):
+                # Copy the out data from temporary buffer to original memory
+                ht_copy_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray(
+                    src=out, dst=orig_out, sycl_queue=exec_q, depends=[unary_ev]
+                )
+                ht_copy_ev.wait()
+                out = orig_out

+            ht_unary_ev.wait()
             return out
+
         if order == "K":
             buf = _empty_like_orderK(x, buf_dt)
         else:
@@ -117,11 +135,6 @@ def __call__(self, x, out=None, order="K"):
                 out = _empty_like_orderK(buf, res_dt)
             else:
                 out = dpt.empty_like(buf, dtype=res_dt, order=order)
-        else:
-            if buf_dt != out.dtype:
-                raise TypeError(
-                    f"Output array of type {buf_dt} is needed, got {out.dtype}"
-                )

         ht, _ = self.unary_fn_(buf, out, sycl_queue=exec_q, depends=[copy_ev])
         ht_copy_ev.wait()
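
A sketch of what the temporary-buffer handling above enables, assuming dpt.exp is one of the unary elementwise functions dispatched through this __call__ (previously an overlapping out raised TypeError):

import dpctl.tensor as dpt

x = dpt.linspace(0.0, 1.0, num=16)
res = dpt.exp(x, out=x)            # out is the same logical tensor: written in place
assert res is x

res = dpt.exp(x[:-1], out=x[1:])   # overlapping but distinct views: the result is
                                   # staged in a temporary and copied back into x[1:]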

dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp

Lines changed: 4 additions & 3 deletions
@@ -198,8 +198,9 @@ struct SequentialBooleanReduction
             // must convert to boolean first to handle nans
             using dpctl::tensor::type_utils::convert_impl;
             outT val = convert_impl<bool, argT>(inp_[inp_offset]);
+            ReductionOp op = reduction_op_;

-            red_val = reduction_op_(red_val, val);
+            red_val = op(red_val, val);
         }

         out_[out_iter_offset] = red_val;
@@ -452,9 +453,9 @@ struct StridedBooleanReduction
                 // must convert to boolean first to handle nans
                 using dpctl::tensor::type_utils::convert_impl;
                 bool val = convert_impl<bool, argT>(inp_[inp_offset]);
+                ReductionOp op = reduction_op_;

-                local_red_val =
-                    reduction_op_(local_red_val, static_cast<outT>(val));
+                local_red_val = op(local_red_val, static_cast<outT>(val));
             }
         }
         // reduction and atomic operations are performed

dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp

Lines changed: 1 addition & 1 deletion
@@ -72,7 +72,7 @@ template <typename argT, typename resT> struct ProjFunctor
         const realT y = std::imag(in);

         if (std::isinf(x) || std::isinf(y)) {
-            const realT res_im = std::copysign(0.0, y);
+            const realT res_im = std::copysign(realT(0), y);
             return resT{std::numeric_limits<realT>::infinity(), res_im};
         }
         return in;

dpctl/tensor/libtensor/include/utils/memory_overlap.hpp

Lines changed: 47 additions & 0 deletions
@@ -100,6 +100,53 @@ struct MemoryOverlap
     }
 };

+struct SameLogicalTensors
+{
+    bool operator()(dpctl::tensor::usm_ndarray ar1,
+                    dpctl::tensor::usm_ndarray ar2) const
+    {
+        // Same ndim
+        int nd1 = ar1.get_ndim();
+        if (nd1 != ar2.get_ndim())
+            return false;
+
+        // Same dtype
+        int tn1 = ar1.get_typenum();
+        if (tn1 != ar2.get_typenum())
+            return false;
+
+        // Same pointer
+        const char *ar1_data = ar1.get_data();
+        const char *ar2_data = ar2.get_data();
+
+        if (ar1_data != ar2_data)
+            return false;
+
+        // Same shape and strides
+        const py::ssize_t *ar1_shape = ar1.get_shape_raw();
+        const py::ssize_t *ar2_shape = ar2.get_shape_raw();
+
+        if (!std::equal(ar1_shape, ar1_shape + nd1, ar2_shape))
+            return false;
+
+        // Same shape and strides
+        auto const &ar1_strides = ar1.get_strides_vector();
+        auto const &ar2_strides = ar2.get_strides_vector();
+
+        auto ar1_beg_it = std::begin(ar1_strides);
+        auto ar1_end_it = std::end(ar1_strides);
+
+        auto ar2_beg_it = std::begin(ar2_strides);
+
+        if (!std::equal(ar1_beg_it, ar1_end_it, ar2_beg_it))
+            return false;
+
+        // all checks passed: arrays are logical views
+        // into the same memory
+        return true;
+    }
+};
+
 } // namespace overlap
 } // namespace tensor
 } // namespace dpctl

dpctl/tensor/libtensor/source/elementwise_functions.hpp

Lines changed: 3 additions & 1 deletion
@@ -128,7 +128,9 @@ py_unary_ufunc(dpctl::tensor::usm_ndarray src,

     // check memory overlap
     auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(src, dst)) {
+    auto const &same_logical_tensors =
+        dpctl::tensor::overlap::SameLogicalTensors();
+    if (overlap(src, dst) && !same_logical_tensors(src, dst)) {
         throw py::value_error("Arrays index overlapping segments of memory");
     }

dpctl/tensor/libtensor/source/tensor_py.cpp

Lines changed: 10 additions & 0 deletions
@@ -60,6 +60,7 @@ using dpctl::tensor::c_contiguous_strides;
 using dpctl::tensor::f_contiguous_strides;

 using dpctl::tensor::overlap::MemoryOverlap;
+using dpctl::tensor::overlap::SameLogicalTensors;

 using dpctl::tensor::py_internal::copy_usm_ndarray_into_usm_ndarray;

@@ -338,6 +339,15 @@ PYBIND11_MODULE(_tensor_impl, m)
           "Determines if the memory regions indexed by each array overlap",
           py::arg("array1"), py::arg("array2"));

+    auto same_logical_tensors = [](dpctl::tensor::usm_ndarray x1,
+                                   dpctl::tensor::usm_ndarray x2) -> bool {
+        auto const &same_logical_tensors = SameLogicalTensors();
+        return same_logical_tensors(x1, x2);
+    };
+    m.def("_same_logical_tensors", same_logical_tensors,
+          "Determines if the memory regions indexed by each array are the same",
+          py::arg("array1"), py::arg("array2"));
+
     m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"),
           py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"),
           py::arg("sycl_queue"), py::arg("depends") = py::list());
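
A quick sketch of the new private binding from Python; the behavior is inferred from the SameLogicalTensors checks above (same ndim, type, data pointer, shape and strides):

import dpctl.tensor as dpt
import dpctl.tensor._tensor_impl as ti

x = dpt.ones((4, 4), dtype="f4")
print(ti._same_logical_tensors(x, x))          # True: identical arrays
print(ti._same_logical_tensors(x, x[:, ::1]))  # True: same pointer, shape and strides
print(ti._same_logical_tensors(x, x.T))        # False: the transpose has different strides
print(ti._array_overlap(x, x.T))               # True: yet the memory regions still overlap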

dpctl/tests/_numpy_warnings.py

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+# Data Parallel Control (dpctl)
+#
+# Copyright 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy
+import pytest
+
+
+@pytest.fixture
+def suppress_invalid_numpy_warnings():
+    # invalid: treatment for invalid floating-point operation
+    # (result is not an expressible number, typically indicates
+    # that a NaN was produced)
+    old_settings = numpy.seterr(invalid="ignore")
+    yield
+    numpy.seterr(**old_settings)  # reset to default
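
A hypothetical test showing how the new fixture would be requested; the test body is illustrative and not part of this commit:

import numpy as np


def test_nan_producing_subtraction(suppress_invalid_numpy_warnings):
    # inf - inf yields NaN; with the fixture active, NumPy's
    # "invalid value encountered" RuntimeWarning is suppressed
    a = np.array([np.inf])
    assert np.isnan((a - a)[0])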

dpctl/tests/conftest.py

Lines changed: 8 additions & 1 deletion
@@ -26,8 +26,15 @@
     invalid_filter,
     valid_filter,
 )
+from _numpy_warnings import suppress_invalid_numpy_warnings

 sys.path.append(os.path.join(os.path.dirname(__file__), "helper"))

 # common fixtures
-__all__ = ["check", "device_selector", "invalid_filter", "valid_filter"]
+__all__ = [
+    "check",
+    "device_selector",
+    "invalid_filter",
+    "suppress_invalid_numpy_warnings",
+    "valid_filter",
+]
