From d4fd805dda7c60a2c09983a9cd5aa1b04d9477d1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Feb 2026 09:57:04 -0800 Subject: [PATCH 001/145] Rename folder dpctl to dpctl_ext --- .../tensor/libtensor/include/kernels/alignment.hpp | 0 .../tensor/libtensor/include/kernels/dpctl_tensor_types.hpp | 0 .../libtensor/include/kernels/elementwise_functions/common.hpp | 0 .../include/kernels/elementwise_functions/common_detail.hpp | 0 .../include/kernels/elementwise_functions/logaddexp.hpp | 0 .../libtensor/include/kernels/elementwise_functions/maximum.hpp | 0 .../libtensor/include/kernels/elementwise_functions/minimum.hpp | 0 .../include/kernels/elementwise_functions/sycl_complex.hpp | 0 .../include/kernels/elementwise_functions/vec_size_util.hpp | 0 .../tensor/libtensor/include/utils/indexing_utils.hpp | 0 .../tensor/libtensor/include/utils/math_utils.hpp | 0 .../tensor/libtensor/include/utils/memory_overlap.hpp | 0 .../tensor/libtensor/include/utils/offset_utils.hpp | 0 .../tensor/libtensor/include/utils/output_validation.hpp | 0 .../tensor/libtensor/include/utils/strided_iters.hpp | 0 .../tensor/libtensor/include/utils/sycl_alloc_utils.hpp | 0 .../tensor/libtensor/include/utils/sycl_utils.hpp | 0 .../tensor/libtensor/include/utils/type_dispatch.hpp | 0 .../tensor/libtensor/include/utils/type_dispatch_building.hpp | 0 .../tensor/libtensor/include/utils/type_utils.hpp | 0 dpnp/backend/extensions/blas/CMakeLists.txt | 2 +- dpnp/backend/extensions/fft/CMakeLists.txt | 2 +- dpnp/backend/extensions/indexing/CMakeLists.txt | 2 +- dpnp/backend/extensions/lapack/CMakeLists.txt | 2 +- dpnp/backend/extensions/statistics/CMakeLists.txt | 2 +- dpnp/backend/extensions/ufunc/CMakeLists.txt | 2 +- dpnp/backend/extensions/vm/CMakeLists.txt | 2 +- dpnp/backend/extensions/window/CMakeLists.txt | 2 +- 28 files changed, 8 insertions(+), 8 deletions(-) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/alignment.hpp (100%) rename {dpctl => 
dpctl_ext}/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/common.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/indexing_utils.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/math_utils.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/memory_overlap.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/offset_utils.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/output_validation.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/strided_iters.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/sycl_alloc_utils.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/sycl_utils.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/type_dispatch.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/type_dispatch_building.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/type_utils.hpp (100%) diff --git a/dpctl/tensor/libtensor/include/kernels/alignment.hpp b/dpctl_ext/tensor/libtensor/include/kernels/alignment.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/alignment.hpp rename to 
dpctl_ext/tensor/libtensor/include/kernels/alignment.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp b/dpctl_ext/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp 
b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp diff --git a/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/indexing_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/indexing_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/indexing_utils.hpp diff --git a/dpctl/tensor/libtensor/include/utils/math_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/math_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/math_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/math_utils.hpp diff --git a/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp b/dpctl_ext/tensor/libtensor/include/utils/memory_overlap.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/memory_overlap.hpp rename to dpctl_ext/tensor/libtensor/include/utils/memory_overlap.hpp diff --git a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp 
b/dpctl_ext/tensor/libtensor/include/utils/offset_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/offset_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/offset_utils.hpp diff --git a/dpctl/tensor/libtensor/include/utils/output_validation.hpp b/dpctl_ext/tensor/libtensor/include/utils/output_validation.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/output_validation.hpp rename to dpctl_ext/tensor/libtensor/include/utils/output_validation.hpp diff --git a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp b/dpctl_ext/tensor/libtensor/include/utils/strided_iters.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/strided_iters.hpp rename to dpctl_ext/tensor/libtensor/include/utils/strided_iters.hpp diff --git a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/sycl_alloc_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/sycl_alloc_utils.hpp diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/sycl_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/sycl_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/sycl_utils.hpp diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp b/dpctl_ext/tensor/libtensor/include/utils/type_dispatch.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/type_dispatch.hpp rename to dpctl_ext/tensor/libtensor/include/utils/type_dispatch.hpp diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp b/dpctl_ext/tensor/libtensor/include/utils/type_dispatch_building.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp rename to dpctl_ext/tensor/libtensor/include/utils/type_dispatch_building.hpp diff 
--git a/dpctl/tensor/libtensor/include/utils/type_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/type_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/type_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/type_utils.hpp diff --git a/dpnp/backend/extensions/blas/CMakeLists.txt b/dpnp/backend/extensions/blas/CMakeLists.txt index 0015eda84843..cbc3e31d923b 100644 --- a/dpnp/backend/extensions/blas/CMakeLists.txt +++ b/dpnp/backend/extensions/blas/CMakeLists.txt @@ -68,7 +68,7 @@ target_include_directories( PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/fft/CMakeLists.txt b/dpnp/backend/extensions/fft/CMakeLists.txt index 0569ecc8bca4..edc7bff7dce4 100644 --- a/dpnp/backend/extensions/fft/CMakeLists.txt +++ b/dpnp/backend/extensions/fft/CMakeLists.txt @@ -61,7 +61,7 @@ target_include_directories( ${python_module_name} PRIVATE ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/indexing/CMakeLists.txt b/dpnp/backend/extensions/indexing/CMakeLists.txt index c0de75ae3146..39f68ffba846 100644 --- a/dpnp/backend/extensions/indexing/CMakeLists.txt +++ b/dpnp/backend/extensions/indexing/CMakeLists.txt @@ -65,7 +65,7 @@ target_include_directories( PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git 
a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt index 76b25c3a6d10..59499a3b28f8 100644 --- a/dpnp/backend/extensions/lapack/CMakeLists.txt +++ b/dpnp/backend/extensions/lapack/CMakeLists.txt @@ -86,7 +86,7 @@ target_include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/statistics/CMakeLists.txt b/dpnp/backend/extensions/statistics/CMakeLists.txt index e04279b75e49..8544e816e8d6 100644 --- a/dpnp/backend/extensions/statistics/CMakeLists.txt +++ b/dpnp/backend/extensions/statistics/CMakeLists.txt @@ -70,7 +70,7 @@ target_include_directories( PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/ufunc/CMakeLists.txt b/dpnp/backend/extensions/ufunc/CMakeLists.txt index 55a750f8423f..293cef0ab326 100644 --- a/dpnp/backend/extensions/ufunc/CMakeLists.txt +++ b/dpnp/backend/extensions/ufunc/CMakeLists.txt @@ -88,7 +88,7 @@ target_include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/vm/CMakeLists.txt b/dpnp/backend/extensions/vm/CMakeLists.txt index 32d6a6765a00..551c43842af2 100644 --- a/dpnp/backend/extensions/vm/CMakeLists.txt +++ 
b/dpnp/backend/extensions/vm/CMakeLists.txt @@ -110,7 +110,7 @@ target_include_directories( PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/window/CMakeLists.txt b/dpnp/backend/extensions/window/CMakeLists.txt index 6fe04e334f42..01274317782d 100644 --- a/dpnp/backend/extensions/window/CMakeLists.txt +++ b/dpnp/backend/extensions/window/CMakeLists.txt @@ -66,7 +66,7 @@ target_include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) From c040713d50cd10c628990b628cb74b0a5029f99b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:04:36 -0800 Subject: [PATCH 002/145] Add simplify_iteration_space implementation to libtensor --- .../source/simplify_iteration_space.cpp | 544 ++++++++++++++++++ .../source/simplify_iteration_space.hpp | 130 +++++ 2 files changed, 674 insertions(+) create mode 100644 dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp diff --git a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp new file mode 100644 index 000000000000..2526f022e0ac --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp @@ -0,0 +1,544 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "simplify_iteration_space.hpp" +#include "utils/strided_iters.hpp" +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace py = pybind11; + +void simplify_iteration_space_1(int &nd, + const py::ssize_t *const &shape, + std::vector const &strides, + // output + std::vector &simplified_shape, + std::vector &simplified_strides, + py::ssize_t &offset) +{ + using dpctl::tensor::strides::simplify_iteration_stride; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::end(simplified_shape), shape, shape + nd); + + simplified_strides.reserve(nd); + simplified_strides.insert(std::end(simplified_strides), + std::begin(strides), std::end(strides)); + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_strides.size() == static_cast(nd)); + int contracted_nd = simplify_iteration_stride( + nd, simplified_shape.data(), simplified_strides.data(), + offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + offset = 0; + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + + simplified_strides.reserve(nd); + simplified_strides.push_back((strides[0] >= 0) ? 
strides[0] + : -strides[0]); + if ((strides[0] < 0) && (shape[0] > 1)) { + offset += (shape[0] - 1) * strides[0]; + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_strides.size() == static_cast(nd)); + } +} + +void simplify_iteration_space(int &nd, + const py::ssize_t *const &shape, + std::vector const &src_strides, + std::vector const &dst_strides, + // output + std::vector &simplified_shape, + std::vector &simplified_src_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src_offset, + py::ssize_t &dst_offset) +{ + using dpctl::tensor::strides::simplify_iteration_two_strides; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::begin(simplified_shape), shape, + shape + nd); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src_strides.reserve(nd); + simplified_src_strides.insert(std::end(simplified_src_strides), + std::begin(src_strides), + std::end(src_strides)); + assert(simplified_src_strides.size() == static_cast(nd)); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.insert(std::end(simplified_dst_strides), + std::begin(dst_strides), + std::end(dst_strides)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + int contracted_nd = simplify_iteration_two_strides( + nd, simplified_shape.data(), simplified_src_strides.data(), + simplified_dst_strides.data(), + src_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + src_offset = 0; + dst_offset = 0; + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src_strides.reserve(nd); + 
simplified_dst_strides.reserve(nd); + + if (src_strides[0] < 0 && dst_strides[0] < 0) { + simplified_src_strides.push_back(-src_strides[0]); + simplified_dst_strides.push_back(-dst_strides[0]); + if (shape[0] > 1) { + src_offset += (shape[0] - 1) * src_strides[0]; + dst_offset += (shape[0] - 1) * dst_strides[0]; + } + } + else { + simplified_src_strides.push_back(src_strides[0]); + simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_src_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } +} + +void simplify_iteration_space_3( + int &nd, + const py::ssize_t *const &shape, + // src1 + std::vector const &src1_strides, + // src2 + std::vector const &src2_strides, + // dst + std::vector const &dst_strides, + // output + std::vector &simplified_shape, + std::vector &simplified_src1_strides, + std::vector &simplified_src2_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src1_offset, + py::ssize_t &src2_offset, + py::ssize_t &dst_offset) +{ + using dpctl::tensor::strides::simplify_iteration_three_strides; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::end(simplified_shape), shape, shape + nd); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src1_strides.insert(std::end(simplified_src1_strides), + std::begin(src1_strides), + std::end(src1_strides)); + assert(simplified_src1_strides.size() == static_cast(nd)); + + simplified_src2_strides.reserve(nd); + simplified_src2_strides.insert(std::end(simplified_src2_strides), + std::begin(src2_strides), + std::end(src2_strides)); + assert(simplified_src2_strides.size() == static_cast(nd)); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.insert(std::end(simplified_dst_strides), + std::begin(dst_strides), + std::end(dst_strides)); + 
assert(simplified_dst_strides.size() == static_cast(nd)); + + int contracted_nd = simplify_iteration_three_strides( + nd, simplified_shape.data(), simplified_src1_strides.data(), + simplified_src2_strides.data(), simplified_dst_strides.data(), + src1_offset, // modified by reference + src2_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src1_strides.resize(contracted_nd); + simplified_src2_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + src1_offset = 0; + src2_offset = 0; + dst_offset = 0; + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src2_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + + if ((src1_strides[0] < 0) && (src2_strides[0] < 0) && + (dst_strides[0] < 0)) { + simplified_src1_strides.push_back(-src1_strides[0]); + simplified_src2_strides.push_back(-src2_strides[0]); + simplified_dst_strides.push_back(-dst_strides[0]); + if (shape[0] > 1) { + src1_offset += src1_strides[0] * (shape[0] - 1); + src2_offset += src2_strides[0] * (shape[0] - 1); + dst_offset += dst_strides[0] * (shape[0] - 1); + } + } + else { + simplified_src1_strides.push_back(src1_strides[0]); + simplified_src2_strides.push_back(src2_strides[0]); + simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } +} + +void simplify_iteration_space_4( + int &nd, + const py::ssize_t *const &shape, + // src1 + std::vector const &src1_strides, + // src2 + std::vector const &src2_strides, + // src3 + std::vector const &src3_strides, + // dst + std::vector const &dst_strides, + // output + 
std::vector &simplified_shape, + std::vector &simplified_src1_strides, + std::vector &simplified_src2_strides, + std::vector &simplified_src3_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src1_offset, + py::ssize_t &src2_offset, + py::ssize_t &src3_offset, + py::ssize_t &dst_offset) +{ + using dpctl::tensor::strides::simplify_iteration_four_strides; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::end(simplified_shape), shape, shape + nd); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src1_strides.insert(std::end(simplified_src1_strides), + std::begin(src1_strides), + std::end(src1_strides)); + assert(simplified_src1_strides.size() == static_cast(nd)); + + simplified_src2_strides.reserve(nd); + simplified_src2_strides.insert(std::end(simplified_src2_strides), + std::begin(src2_strides), + std::end(src2_strides)); + assert(simplified_src2_strides.size() == static_cast(nd)); + + simplified_src3_strides.reserve(nd); + simplified_src3_strides.insert(std::end(simplified_src3_strides), + std::begin(src3_strides), + std::end(src3_strides)); + assert(simplified_src3_strides.size() == static_cast(nd)); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.insert(std::end(simplified_dst_strides), + std::begin(dst_strides), + std::end(dst_strides)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + int contracted_nd = simplify_iteration_four_strides( + nd, simplified_shape.data(), simplified_src1_strides.data(), + simplified_src2_strides.data(), simplified_src3_strides.data(), + simplified_dst_strides.data(), + src1_offset, // modified by reference + src2_offset, // modified by reference + src3_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src1_strides.resize(contracted_nd); 
+ simplified_src2_strides.resize(contracted_nd); + simplified_src3_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + src1_offset = 0; + src2_offset = 0; + src3_offset = 0; + dst_offset = 0; + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src2_strides.reserve(nd); + simplified_src3_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + + if ((src1_strides[0] < 0) && (src2_strides[0] < 0) && + (src3_strides[0] < 0) && (dst_strides[0] < 0)) + { + simplified_src1_strides.push_back(-src1_strides[0]); + simplified_src2_strides.push_back(-src2_strides[0]); + simplified_src3_strides.push_back(-src3_strides[0]); + simplified_dst_strides.push_back(-dst_strides[0]); + if (shape[0] > 1) { + src1_offset += src1_strides[0] * (shape[0] - 1); + src2_offset += src2_strides[0] * (shape[0] - 1); + src3_offset += src3_strides[0] * (shape[0] - 1); + dst_offset += dst_strides[0] * (shape[0] - 1); + } + } + else { + simplified_src1_strides.push_back(src1_strides[0]); + simplified_src2_strides.push_back(src2_strides[0]); + simplified_src3_strides.push_back(src3_strides[0]); + simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_src3_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } +} + +void compact_iteration_space(int &nd, + const py::ssize_t *const &shape, + std::vector const &strides, + // output + std::vector &compact_shape, + std::vector &compact_strides) +{ + using dpctl::tensor::strides::compact_iteration; + if (nd > 1) { + // Compact iteration space to reduce dimensionality + // and improve access pattern + compact_shape.reserve(nd); + 
compact_shape.insert(std::begin(compact_shape), shape, shape + nd); + assert(compact_shape.size() == static_cast(nd)); + + compact_strides.reserve(nd); + compact_strides.insert(std::end(compact_strides), std::begin(strides), + std::end(strides)); + assert(compact_strides.size() == static_cast(nd)); + + int contracted_nd = + compact_iteration(nd, compact_shape.data(), compact_strides.data()); + compact_shape.resize(contracted_nd); + compact_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + // Populate vectors + compact_shape.reserve(nd); + compact_shape.push_back(shape[0]); + assert(compact_shape.size() == static_cast(nd)); + + compact_strides.reserve(nd); + compact_strides.push_back(strides[0]); + assert(compact_strides.size() == static_cast(nd)); + } +} + +/* @brief Split shape/strides into dir1 (complementary to axis_start <= i < + * axis_end) and dir2 (along given set of axes) + */ +void split_iteration_space(const std::vector &shape_vec, + const std::vector &strides_vec, + int axis_start, + int axis_end, + std::vector &dir1_shape_vec, + std::vector &dir2_shape_vec, + std::vector &dir1_strides_vec, + std::vector &dir2_strides_vec) +{ + int nd = static_cast(shape_vec.size()); + int dir2_sz = axis_end - axis_start; + int dir1_sz = nd - dir2_sz; + + assert(dir1_sz > 0); + assert(dir2_sz > 0); + + dir1_shape_vec.resize(dir1_sz); + dir2_shape_vec.resize(dir2_sz); + + std::copy(shape_vec.begin(), shape_vec.begin() + axis_start, + dir1_shape_vec.begin()); + std::copy(shape_vec.begin() + axis_end, shape_vec.end(), + dir1_shape_vec.begin() + axis_start); + + std::copy(shape_vec.begin() + axis_start, shape_vec.begin() + axis_end, + dir2_shape_vec.begin()); + + dir1_strides_vec.resize(dir1_sz); + dir2_strides_vec.resize(dir2_sz); + + std::copy(strides_vec.begin(), strides_vec.begin() + axis_start, + dir1_strides_vec.begin()); + std::copy(strides_vec.begin() + axis_end, strides_vec.end(), + dir1_strides_vec.begin() + axis_start); + + 
std::copy(strides_vec.begin() + axis_start, strides_vec.begin() + axis_end, + dir2_strides_vec.begin()); + + return; +} + +py::ssize_t _ravel_multi_index_c(std::vector const &mi, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + if (nd != mi.size()) { + throw py::value_error( + "Multi-index and shape vectors must have the same length."); + } + + py::ssize_t flat_index = 0; + py::ssize_t s = 1; + for (std::size_t i = 0; i < nd; ++i) { + flat_index += mi.at(nd - 1 - i) * s; + s *= shape.at(nd - 1 - i); + } + + return flat_index; +} + +py::ssize_t _ravel_multi_index_f(std::vector const &mi, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + if (nd != mi.size()) { + throw py::value_error( + "Multi-index and shape vectors must have the same length."); + } + + py::ssize_t flat_index = 0; + py::ssize_t s = 1; + for (std::size_t i = 0; i < nd; ++i) { + flat_index += mi.at(i) * s; + s *= shape.at(i); + } + + return flat_index; +} + +std::vector _unravel_index_c(py::ssize_t flat_index, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + std::vector mi; + mi.resize(nd); + + py::ssize_t i_ = flat_index; + for (std::size_t dim = 0; dim + 1 < nd; ++dim) { + const py::ssize_t si = shape[nd - 1 - dim]; + const py::ssize_t q = i_ / si; + const py::ssize_t r = (i_ - q * si); + mi[nd - 1 - dim] = r; + i_ = q; + } + if (nd) { + mi[0] = i_; + } + return mi; +} + +std::vector _unravel_index_f(py::ssize_t flat_index, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + std::vector mi; + mi.resize(nd); + + py::ssize_t i_ = flat_index; + for (std::size_t dim = 0; dim + 1 < nd; ++dim) { + const py::ssize_t si = shape[dim]; + const py::ssize_t q = i_ / si; + const py::ssize_t r = (i_ - q * si); + mi[dim] = r; + i_ = q; + } + if (nd) { + mi[nd - 1] = i_; + } + return mi; +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp 
b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp new file mode 100644 index 000000000000..d3448ee1f5fd --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp @@ -0,0 +1,130 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace py = pybind11; + +void simplify_iteration_space_1(int &, + const py::ssize_t *const &, + std::vector const &, + std::vector &, + std::vector &, + py::ssize_t &); + +void simplify_iteration_space(int &, + const py::ssize_t *const &, + std::vector const &, + std::vector const &, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &); + +void simplify_iteration_space_3(int &, + const py::ssize_t *const &, + // src1 + std::vector const &, + // src2 + std::vector const &, + // dst + std::vector const &, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &, + py::ssize_t &); + +void simplify_iteration_space_4(int &, + const py::ssize_t *const &, + // src1 + std::vector const &, + // src2 + std::vector const &, + // src3 + std::vector const &, + // dst + std::vector const &, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &, + py::ssize_t &, + py::ssize_t &); + +void compact_iteration_space(int &, + const py::ssize_t *const &, + std::vector const &, + // output + std::vector &, + std::vector &); + +void split_iteration_space(const std::vector &, + const std::vector &, + int, + int, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &); + +py::ssize_t _ravel_multi_index_c(std::vector const &, + std::vector const &); +py::ssize_t _ravel_multi_index_f(std::vector const &, + std::vector const &); +std::vector _unravel_index_c(py::ssize_t, + 
std::vector const &); +std::vector _unravel_index_f(py::ssize_t, + std::vector const &); +} // namespace py_internal +} // namespace tensor +} // namespace dpctl From 14b466facfe6b23f92113ccc2dbb224e2727bf3c Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:14:43 -0800 Subject: [PATCH 003/145] Extend codespell ignore list for libtensor --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index cdf592535d11..67fb75cb5f54 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -108,7 +108,7 @@ target-version = ['py310', 'py311', 'py312', 'py313', 'py314'] [tool.codespell] builtin = "clear,rare,informal,names" check-filenames = true -ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT" +ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT,AccessorT,IndexT" quiet-level = 3 [tool.coverage.report] From dcc421bc61c36549d3e6865927f495abab15d078 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:15:09 -0800 Subject: [PATCH 004/145] Add copy_and_cast kernels to libtensor --- .../include/kernels/copy_and_cast.hpp | 1288 +++++++++++++++++ .../include/kernels/copy_as_contiguous.hpp | 655 +++++++++ .../libtensor/source/copy_as_contig.cpp | 758 ++++++++++ .../libtensor/source/copy_as_contig.hpp | 61 + 4 files changed, 2762 insertions(+) create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp new file mode 100644 index 000000000000..a07d311a7fcb --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp 
@@ -0,0 +1,1288 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor copying and value casting. 
+//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "dpctl_tensor_types.hpp" +#include "kernels/alignment.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace copy_and_cast +{ + +using dpctl::tensor::ssize_t; +using namespace dpctl::tensor::offset_utils; + +using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + +using dpctl::tensor::sycl_utils::sub_group_load; +using dpctl::tensor::sycl_utils::sub_group_store; + +template +class copy_cast_generic_kernel; + +template +class copy_cast_contig_kernel; + +template +class copy_cast_from_host_kernel; + +template +class copy_cast_from_host_contig_kernel; + +template +class Caster +{ +public: + Caster() = default; + dstTy operator()(const srcTy &src) const + { + using dpctl::tensor::type_utils::convert_impl; + return convert_impl(src); + } +}; + +template +class GenericCopyFunctor +{ +private: + const srcT *src_ = nullptr; + dstT *dst_ = nullptr; + IndexerT indexer_; + +public: + GenericCopyFunctor(const srcT *src_p, dstT *dst_p, const IndexerT &indexer) + : src_(src_p), dst_(dst_p), indexer_(indexer) + { + } + + void operator()(sycl::id<1> wiid) const + { + const auto &offsets = indexer_(static_cast(wiid.get(0))); + const ssize_t &src_offset = offsets.get_first_offset(); + const ssize_t &dst_offset = offsets.get_second_offset(); + + static constexpr CastFnT fn{}; + dst_[dst_offset] = fn(src_[src_offset]); + } +}; + +/*! + @defgroup CopyAndCastKernels + */ + +/*! + * @brief Function pointer type for generic array cast and copying function. 
+ */ +typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &, + const std::vector &); + +/*! + * @brief Generic function to copy `nelems` elements from `src` usm_ndarray to + `dst` usm_ndarray while casting from `srcTy` to `dstTy`. + + Both arrays have array dimensionality specified via argument `nd`. The + `shape_and_strides` is kernel accessible USM array of length `3*nd`, where the + first `nd` elements encode common shape, second `nd` elements contain strides + of `src` array, and the trailing `nd` elements contain strides of `dst` array. + `src_p` and `dst_p` represent pointers into respective arrays, but the start of + iteration begins at offset of `src_offset` elements for `src` array and at + offset `dst_offset` elements for `dst` array. Kernel is submitted to sycl queue + `q` with events `depends` and `additional_depends` as dependencies. + + @param q Sycl queue to which the kernel is submitted. + @param nelems Number of elements to cast and copy. + @param nd Array dimensionality, i.e. number of indices needed to + identify an element of each array. + @param shape_and_strides Kernel accessible USM pointer to packed shape and + strides. + @param src_p Kernel accessible USM pointer for the source array + @param src_offset Offset to the beginning of iteration in number of + elements of source array from `src_p`. + @param dst_p Kernel accessible USM pointer for the destination array + @param dst_offset Offset to the beginning of iteration in number of + elements of destination array from `dst_p`. + @param depends List of events to wait for before starting computations, if + any. + @param additional_depends Additional list of events to wait for before + starting computations, if any. + + @return Event to wait on to ensure that computation completes. 
+ @ingroup CopyAndCastKernels + */ +template +sycl::event copy_and_cast_generic_impl( + sycl::queue &q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + const TwoOffsets_StridedIndexer indexer{nd, src_offset, dst_offset, + shape_and_strides}; + const srcTy *src_tp = reinterpret_cast(src_p); + dstTy *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for>( + sycl::range<1>(nelems), + GenericCopyFunctor, + TwoOffsets_StridedIndexer>(src_tp, dst_tp, + indexer)); + }); + + return copy_and_cast_ev; +} + +/*! + * @brief Factory to get generic function pointer of type `fnT` for given source + * data type `S` and destination data type `D`. 
+ * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCastGenericFactory +{ + fnT get() + { + fnT f = copy_and_cast_generic_impl; + return f; + } +}; + +// Specialization of copy_and_cast for contiguous arrays + +template +class ContigCopyFunctor +{ +private: + std::size_t nelems; + const srcT *src_p = nullptr; + dstT *dst_p = nullptr; + +public: + ContigCopyFunctor(const std::size_t nelems_, + const srcT *src_p_, + dstT *dst_p_) + : nelems(nelems_), src_p(src_p_), dst_p(dst_p_) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static constexpr CastFnT fn{}; + + static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + + using dpctl::tensor::type_utils::is_complex_v; + if constexpr (!enable_sg_loadstore || is_complex_v || + is_complex_v) { + std::uint16_t sgSize = ndit.get_sub_group().get_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + + // start = (gid / sgSize) * elems_per_sg + (gid % sgSize) + const std::uint16_t elems_per_sg = sgSize * elems_per_wi; + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems, start + elems_per_sg); + for (std::size_t offset = start; offset < end; offset += sgSize) { + dst_p[offset] = fn(src_p[offset]); + } + } + else { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + elems_per_wi * sgSize < nelems) { + sycl::vec dst_vec; + +#pragma unroll + for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto src_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&src_p[offset]); + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&dst_p[offset]); + 
+ const sycl::vec src_vec = + sub_group_load(sg, src_multi_ptr); +#pragma unroll + for (std::uint8_t k = 0; k < vec_sz; k++) { + dst_vec[k] = fn(src_vec[k]); + } + sub_group_store(sg, dst_vec, dst_multi_ptr); + } + } + else { + const std::size_t start = base + sg.get_local_id()[0]; + for (std::size_t k = start; k < nelems; k += sgSize) { + dst_p[k] = fn(src_p[k]); + } + } + } + } +}; + +/*! + * @brief Function pointer type for contiguous array cast and copy function. + */ +typedef sycl::event (*copy_and_cast_contig_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + const std::vector &); + +/*! + * @brief Function to copy `nelems` elements from contiguous `src` usm_ndarray + to contiguous `dst` usm_ndarray while casting from `srcTy` to `dstTy`. + + Both arrays have the same number of elements `nelems`. + `src_cp` and `dst_cp` represent char pointers to the start of respective + arrays. Kernel is submitted to sycl queue `q` with events `depends` as + dependencies. + + @param q Sycl queue to which the kernel is submitted. + @param nelems Number of elements to cast and copy. + @param src_p Kernel accessible USM pointer for the source array + @param dst_p Kernel accessible USM pointer for the destination array + @param depends List of events to wait for before starting computations, if + any. + + @return Event to wait on to ensure that computation completes. 
+ @ingroup CopyAndCastKernels + */ +template +sycl::event copy_and_cast_contig_impl(sycl::queue &q, + std::size_t nelems, + const char *src_cp, + char *dst_cp, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const srcTy *src_tp = reinterpret_cast(src_cp); + dstTy *dst_tp = reinterpret_cast(dst_cp); + + std::size_t lws = 64; + static constexpr std::uint32_t vec_sz = 4; + static constexpr std::uint32_t n_vecs = 2; + const std::size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + if (is_aligned(src_cp) && + is_aligned(dst_cp)) + { + static constexpr bool enable_sg_loadstore = true; + using KernelName = + copy_cast_contig_kernel; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + ContigCopyFunctor, vec_sz, + n_vecs, enable_sg_loadstore>(nelems, src_tp, + dst_tp)); + } + else { + static constexpr bool disable_sg_loadstore = false; + using InnerKernelName = + copy_cast_contig_kernel; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + ContigCopyFunctor, vec_sz, + n_vecs, disable_sg_loadstore>(nelems, src_tp, + dst_tp)); + } + }); + + return copy_and_cast_ev; +} + +/*! + * @brief Factory to get specialized function pointer for casting and copying + * contiguous arrays. + * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCastContigFactory +{ + fnT get() + { + fnT f = copy_and_cast_contig_impl; + return f; + } +}; + +// Specialization of copy_and_cast for 1D arrays + +/*! + * @brief Factory to get function pointer for casting and copying 1D arrays. 
+ * @ingroup CopyAndCastKernels + */ +typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)( + sycl::queue &, + std::size_t, + const std::array &, + const std::array &, + const std::array &, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +/*! + * @brief Factory to get function pointer for casting and copying 2D arrays. + * @ingroup CopyAndCastKernels + */ +typedef sycl::event (*copy_and_cast_2d_fn_ptr_t)( + sycl::queue &, + std::size_t, + const std::array &, + const std::array &, + const std::array &, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +/*! + * @brief Specialized for given array dimension function to copy `nelems` + elements from `src` usm_ndarray to `dst` usm_ndarray while casting from `srcTy` + to `dstTy`. + + Both arrays have array dimensionality known at compile time and specified in + template parameters `nd`. Arrays' shape and strides are provided as + `std::array`. `src_p` and `dst_p` represent pointers into respective arrays, + but the start of iteration begins at offset of `src_offset` elements for `src` + array and at offset `dst_offset` elements for `dst` array. Kernel is submitted + to sycl queue `q` with events `depends` as dependencies. + + @param q The queue where the routine should be executed. + @param nelems Number of elements to cast and copy. + @param shape Common shape of the arrays. + @param src_strides Strides of the source array. + @param dst_strides Strides of the destination array. + @param src_p Kernel accessible USM pointer for the source array + @param src_offset Offset to the beginning of iteration in number of elements + of the source array from `src_p`. + @param dst_p Kernel accessible USM pointer for the destination array + @param dst_offset Offset to the beginning of iteration in number of elements + of the destination array from `src_p`. + @param depends List of events to wait for before starting computations, if + any. 
+ + @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ +template +sycl::event copy_and_cast_nd_specialized_impl( + sycl::queue &q, + std::size_t nelems, + const std::array &shape, + const std::array &src_strides, + const std::array &dst_strides, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { + using IndexerT = TwoOffsets_FixedDimStridedIndexer; + const IndexerT indexer{shape, src_strides, dst_strides, src_offset, + dst_offset}; + const srcTy *src_tp = reinterpret_cast(src_p); + dstTy *dst_tp = reinterpret_cast(dst_p); + + cgh.depends_on(depends); + cgh.parallel_for< + class copy_cast_generic_kernel>( + sycl::range<1>(nelems), + GenericCopyFunctor, IndexerT>( + src_tp, dst_tp, indexer)); + }); + + return copy_and_cast_ev; +} + +/*! + * @brief Factory to get 1D-specialized function pointer of type `fnT` for given + * source data type `S` and destination data type `D`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCast1DFactory +{ + fnT get() + { + fnT f = copy_and_cast_nd_specialized_impl; + return f; + } +}; + +/*! + * @brief Factory to get 2D-specialized function pointer of type `fnT` for given + * source data type `S` and destination data type `D`. 
+ * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCast2DFactory +{ + fnT get() + { + fnT f = copy_and_cast_nd_specialized_impl; + return f; + } +}; + +// ====================== Copying from host to USM + +template +class GenericCopyFromHostFunctor +{ +private: + AccessorT src_acc_; + dstTy *dst_ = nullptr; + IndexerT indexer_; + +public: + GenericCopyFromHostFunctor(const AccessorT &src_acc, + dstTy *dst_p, + const IndexerT &indexer) + : src_acc_(src_acc), dst_(dst_p), indexer_(indexer) + { + } + + void operator()(sycl::id<1> wiid) const + { + const auto &offsets = indexer_(static_cast(wiid.get(0))); + const ssize_t &src_offset = offsets.get_first_offset(); + const ssize_t &dst_offset = offsets.get_second_offset(); + + CastFnT fn{}; + dst_[dst_offset] = fn(src_acc_[src_offset]); + } +}; + +typedef void (*copy_and_cast_from_host_blocking_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + ssize_t, + ssize_t, + ssize_t, + char *, + ssize_t, + const std::vector &, + const std::vector &); + +/*! + * @brief Function to copy from NumPy's ndarray with elements of type `srcTy` + * into usm_ndarray with elements of type `srcTy`. + * + * Function to cast and copy elements from numpy.ndarray specified by typeless + * `host_src_p` and the `src_offset` given in the number of array elements. + * Arrays' metadata are given in packed USM vector of length `3*nd` whose first + * `nd` elements contain arrays' shape, next `nd` elements specify source + * strides in elements (not bytes), and trailing `nd` elements specify + * destination array strides. Kernel dependencies are given by two vectors of + * events: `depends` and `additional_depends`. The function execution is + * complete at the return. + * + * @param q The queue where the routine should be executed. + * @param nelems Number of elements to cast and copy. 
+ * @param nd The dimensionality of arrays + * @param shape_and_strides Kernel accessible USM pointer to packed shape and + * strides. + * @param host_src_p Host (not USM allocated) pointer associated with the + * source array. + * @param src_offset Offset to the beginning of iteration in number of elements + * of the source array from `host_src_p`. + * @param src_min_nelem_offset Smallest value of offset relative to + * `host_src_p` in number of elements attained while iterating over elements of + * the source array. + * @param src_max_nelem_offset Largest value of offset relative to `host_src_p` + * in number of elements attained while iterating over elements of the source + * array. + * @param dst_p USM pointer associated with the destination array. + * @param dst_offset Offset to the beginning of iteration in number of elements + * of the destination array from `dst_p`. + * @param depends List of events to wait for before starting computations, if + * any. + * @param additional_depends List of additional events to wait for before + * starting computations, if any. 
+ * + * @ingroup CopyAndCastKernels + */ +template +void copy_and_cast_from_host_impl( + sycl::queue &q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *host_src_p, + ssize_t src_offset, + ssize_t src_min_nelem_offset, + ssize_t src_max_nelem_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + ssize_t nelems_range = src_max_nelem_offset - src_min_nelem_offset + 1; + + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::buffer npy_buf( + reinterpret_cast(host_src_p) + src_min_nelem_offset, + sycl::range<1>(nelems_range), {sycl::property::buffer::use_host_ptr{}}); + + sycl::event copy_and_cast_from_host_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only); + + const TwoOffsets_StridedIndexer indexer{ + nd, src_offset - src_min_nelem_offset, dst_offset, + const_cast(shape_and_strides)}; + + dstTy *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for>( + sycl::range<1>(nelems), + GenericCopyFromHostFunctor, + TwoOffsets_StridedIndexer>( + npy_acc, dst_tp, indexer)); + }); + + // perform explicit synchronization. Implicit synchronization would be + // performed by sycl::buffer destructor. + copy_and_cast_from_host_ev.wait(); + + return; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given NumPy array + * source data type `S` and destination data type `D`. 
+ * @defgroup CopyAndCastKernels + */ +template +struct CopyAndCastFromHostFactory +{ + fnT get() + { + fnT f = copy_and_cast_from_host_impl; + return f; + } +}; + +typedef void (*copy_and_cast_from_host_contig_blocking_fn_ptr_t)( + sycl::queue &, + std::size_t, /* nelems */ + const char *, /* src_pointer */ + ssize_t, /* src_offset */ + char *, /* dst_pointer */ + ssize_t, /* dst_offset */ + const std::vector &); + +/*! + * @brief Function to copy from NumPy's ndarray with elements of type `srcTy` + * into usm_ndarray with elements of type `dstTy` for contiguous arrays. + * + * Function to cast and copy elements from numpy.ndarray specified by typeless + * `host_src_p` and the `src_offset` given in the number of array elements. + * Kernel dependencies are given by two vectors of + * events: `depends` and `additional_depends`. The function execution is + * complete at the return. + * + * @param q The queue where the routine should be executed. + * @param nelems Number of elements to cast and copy. + * @param src_stride The stride of source array in elements + * @param dst_stride The stride of destination array in elements + * @param host_src_p Host (not USM allocated) pointer associated with the + * source array. + * @param src_offset Offset to the beginning of iteration in number of elements + * of the source array from `host_src_p`. + * @param dst_p USM pointer associated with the destination array. + * @param dst_offset Offset to the beginning of iteration in number of elements + * of the destination array from `dst_p`. + * @param depends List of events to wait for before starting computations, if + * any.
+ * + * @ingroup CopyAndCastKernels + */ +template +void copy_and_cast_from_host_contig_impl( + sycl::queue &q, + std::size_t nelems, + const char *host_src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::buffer npy_buf( + reinterpret_cast(host_src_p) + src_offset, + sycl::range<1>(nelems), {sycl::property::buffer::use_host_ptr{}}); + + sycl::event copy_and_cast_from_host_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only); + + using IndexerT = TwoOffsets_CombinedIndexer; + static constexpr NoOpIndexer src_indexer{}; + static constexpr NoOpIndexer dst_indexer{}; + static constexpr TwoOffsets_CombinedIndexer indexer{src_indexer, + dst_indexer}; + + dstTy *dst_tp = reinterpret_cast(dst_p) + dst_offset; + + cgh.parallel_for< + copy_cast_from_host_contig_kernel>( + sycl::range<1>(nelems), + GenericCopyFromHostFunctor, IndexerT>( + npy_acc, dst_tp, indexer)); + }); + + // perform explicit synchronization. Implicit synchronization would be + // performed by sycl::buffer destructor. + copy_and_cast_from_host_ev.wait(); + + return; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given NumPy array + * source data type `S` and destination data type `D`. 
+ * @defgroup CopyAndCastKernels + */ +template +struct CopyAndCastFromHostContigFactory +{ + fnT get() + { + fnT f = copy_and_cast_from_host_contig_impl; + return f; + } +}; + +// =============== Copying for reshape ================== // + +template +class copy_for_reshape_generic_kernel; + +template +class GenericCopyForReshapeFunctor +{ +private: + const Ty *src_p = nullptr; + Ty *dst_p = nullptr; + SrcIndexerT src_indexer_; + DstIndexerT dst_indexer_; + +public: + GenericCopyForReshapeFunctor(const char *src_ptr, + char *dst_ptr, + const SrcIndexerT &src_indexer, + const DstIndexerT &dst_indexer) + : src_p(reinterpret_cast(src_ptr)), + dst_p(reinterpret_cast(dst_ptr)), src_indexer_(src_indexer), + dst_indexer_(dst_indexer) + { + } + + void operator()(sycl::id<1> wiid) const + { + const ssize_t src_offset = src_indexer_(wiid.get(0)); + const ssize_t dst_offset = dst_indexer_(wiid.get(0)); + + dst_p[dst_offset] = src_p[src_offset]; + } +}; + +// define function type +typedef sycl::event (*copy_for_reshape_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + int, // src_nd + int, // dst_nd + const ssize_t *, // packed shapes and strides + const char *, // src_data_ptr + char *, // dst_data_ptr + const std::vector &); + +/*! + * @brief Function to copy content of array while reshaping. + * + * Submits a kernel to perform a copy `dst[unravel_index(i, + * dst.shape)] = src[unravel_index(i, src.shape)]`. + * + * @param q The execution queue where kernel is submitted. + * @param nelems The number of elements to copy + * @param src_nd Array dimension of the source array + * @param dst_nd Array dimension of the destination array + * @param packed_shapes_and_strides Kernel accessible USM array of size + * `2*src_nd + 2*dst_nd` with content `[src_shape, src_strides, dst_shape, + * dst_strides]`.
+ * @param src_p Typeless USM pointer to the buffer of the source array + * @param dst_p Typeless USM pointer to the buffer of the destination array + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ +template +sycl::event + copy_for_reshape_generic_impl(sycl::queue &q, + std::size_t nelems, + int src_nd, + int dst_nd, + const ssize_t *packed_shapes_and_strides, + const char *src_p, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_reshape_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + // packed_shapes_and_strides: + // USM array of size 2*(src_nd + dst_nd) + // [ src_shape; src_strides; dst_shape; dst_strides ] + + const ssize_t *src_shape_and_strides = + const_cast(packed_shapes_and_strides); + + const ssize_t *dst_shape_and_strides = const_cast( + packed_shapes_and_strides + (2 * src_nd)); + + const StridedIndexer src_indexer{src_nd, 0, src_shape_and_strides}; + const StridedIndexer dst_indexer{dst_nd, 0, dst_shape_and_strides}; + + using KernelName = + copy_for_reshape_generic_kernel; + + cgh.parallel_for( + sycl::range<1>(nelems), + GenericCopyForReshapeFunctor( + src_p, dst_p, src_indexer, dst_indexer)); + }); + + return copy_for_reshape_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyForReshapeGenericFactory +{ + fnT get() + { + fnT f = copy_for_reshape_generic_impl; + return f; + } +}; + +// ================== Copying for roll ================== // + +/*! 
@brief Functor to cyclically roll global_id to the left */ +struct LeftRolled1DTransformer +{ + LeftRolled1DTransformer(std::size_t offset, std::size_t size) + : offset_(offset), size_(size) + { + } + + std::size_t operator()(std::size_t gid) const + { + const std::size_t shifted_gid = + ((gid < offset_) ? gid + size_ - offset_ : gid - offset_); + return shifted_gid; + } + +private: + std::size_t offset_ = 0; + std::size_t size_ = 1; +}; + +/*! @brief Indexer functor to compose indexer and transformer */ +template +struct CompositionIndexer +{ + CompositionIndexer(IndexerT f, TransformerT t) : f_(f), t_(t) {} + + auto operator()(std::size_t gid) const + { + return f_(t_(gid)); + } + +private: + IndexerT f_; + TransformerT t_; +}; + +/*! @brief Indexer functor to find offset for nd-shifted indices lifted from + * iteration id */ +struct RolledNDIndexer +{ + RolledNDIndexer(int nd, + const ssize_t *shape, + const ssize_t *strides, + const ssize_t *ndshifts, + ssize_t starting_offset) + : nd_(nd), shape_(shape), strides_(strides), ndshifts_(ndshifts), + starting_offset_(starting_offset) + { + } + + ssize_t operator()(std::size_t gid) const + { + return compute_offset(gid); + } + +private: + int nd_ = -1; + const ssize_t *shape_ = nullptr; + const ssize_t *strides_ = nullptr; + const ssize_t *ndshifts_ = nullptr; + ssize_t starting_offset_ = 0; + + ssize_t compute_offset(ssize_t gid) const + { + using dpctl::tensor::strides::CIndexer_vector; + + CIndexer_vector _ind(nd_); + ssize_t relative_offset_(0); + _ind.get_left_rolled_displacement( + gid, + shape_, // shape ptr + strides_, // strides ptr + ndshifts_, // shifts ptr + relative_offset_); + return starting_offset_ + relative_offset_; + } +}; + +template +class copy_for_roll_strided_kernel; + +template +class StridedCopyForRollFunctor +{ +private: + const Ty *src_p = nullptr; + Ty *dst_p = nullptr; + SrcIndexerT src_indexer_; + DstIndexerT dst_indexer_; + +public: + StridedCopyForRollFunctor(const Ty *src_ptr, + Ty 
*dst_ptr, + const SrcIndexerT &src_indexer, + const DstIndexerT &dst_indexer) + : src_p(src_ptr), dst_p(dst_ptr), src_indexer_(src_indexer), + dst_indexer_(dst_indexer) + { + } + + void operator()(sycl::id<1> wiid) const + { + const std::size_t gid = wiid.get(0); + + const ssize_t src_offset = src_indexer_(gid); + const ssize_t dst_offset = dst_indexer_(gid); + + dst_p[dst_offset] = src_p[src_offset]; + } +}; + +// define function type +typedef sycl::event (*copy_for_roll_strided_fn_ptr_t)( + sycl::queue &, + std::size_t, // shift + std::size_t, // num_elements + int, // common_nd + const ssize_t *, // packed shapes and strides + const char *, // src_data_ptr + ssize_t, // src_offset + char *, // dst_data_ptr + ssize_t, // dst_offset + const std::vector &); + +/*! + * @brief Function to copy content of array with a shift. + * + * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems , + * dst.shape)] = src[unravel_undex(i, src.shape)]`. + * + * @param q The execution queue where kernel is submitted. + * @param shift The shift in flat indexing, must be non-negative. + * @param nelems The number of elements to copy + * @param nd Array dimensionality of the destination and source arrays + * @param packed_shapes_and_strides Kernel accessible USM array + * of size `3*nd` with content `[common_shape, src_strides, dst_strides]`. + * @param src_p Typeless USM pointer to the buffer of the source array + * @param src_offset Displacement of first element of src relative src_p in + * elements + * @param dst_p Typeless USM pointer to the buffer of the destination array + * @param dst_offset Displacement of first element of dst relative dst_p in + * elements + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @ingroup CopyAndCastKernels + */ +template +sycl::event copy_for_roll_strided_impl(sycl::queue &q, + std::size_t shift, + std::size_t nelems, + int nd, + const ssize_t *packed_shapes_and_strides, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + // packed_shapes_and_strides: + // USM array of size 3 * nd + // [ common_shape; src_strides; dst_strides ] + + const StridedIndexer src_indexer{nd, src_offset, + packed_shapes_and_strides}; + const LeftRolled1DTransformer left_roll_transformer{shift, nelems}; + + using CompositeIndexerT = + CompositionIndexer; + + const CompositeIndexerT rolled_src_indexer(src_indexer, + left_roll_transformer); + + UnpackedStridedIndexer dst_indexer{nd, dst_offset, + packed_shapes_and_strides, + packed_shapes_and_strides + 2 * nd}; + + using KernelName = copy_for_roll_strided_kernel; + + const Ty *src_tp = reinterpret_cast(src_p); + Ty *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor( + src_tp, dst_tp, rolled_src_indexer, dst_indexer)); + }); + + return copy_for_roll_ev; +} + +// define function type +typedef sycl::event (*copy_for_roll_contig_fn_ptr_t)( + sycl::queue &, + std::size_t, // shift + std::size_t, // num_elements + const char *, // src_data_ptr + ssize_t, // src_offset + char *, // dst_data_ptr + ssize_t, // dst_offset + const std::vector &); + +template +class copy_for_roll_contig_kernel; + +/*! + * @brief Function to copy content of array with a shift. + * + * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems , + * dst.shape)] = src[unravel_undex(i, src.shape)]`. + * + * @param q The execution queue where kernel is submitted. + * @param shift The shift in flat indexing, must be non-negative. 
+ * @param nelems The number of elements to copy + * @param src_p Typeless USM pointer to the buffer of the source array + * @param src_offset Displacement of the start of array src relative src_p in + * elements + * @param dst_p Typeless USM pointer to the buffer of the destination array + * @param dst_offset Displacement of the start of array dst relative dst_p in + * elements + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ +template +sycl::event copy_for_roll_contig_impl(sycl::queue &q, + std::size_t shift, + std::size_t nelems, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + static constexpr NoOpIndexer src_indexer{}; + const LeftRolled1DTransformer roller{shift, nelems}; + + const CompositionIndexer + left_rolled_src_indexer{src_indexer, roller}; + static constexpr NoOpIndexer dst_indexer{}; + + using KernelName = copy_for_roll_contig_kernel; + + const Ty *src_tp = reinterpret_cast(src_p) + src_offset; + Ty *dst_tp = reinterpret_cast(dst_p) + dst_offset; + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor< + Ty, CompositionIndexer, + NoOpIndexer>(src_tp, dst_tp, left_rolled_src_indexer, + dst_indexer)); + }); + + return copy_for_roll_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyForRollStridedFactory +{ + fnT get() + { + fnT f = copy_for_roll_strided_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. 
+ * @ingroup CopyAndCastKernels + */ +template +struct CopyForRollContigFactory +{ + fnT get() + { + fnT f = copy_for_roll_contig_impl; + return f; + } +}; + +template +class copy_for_roll_ndshift_strided_kernel; + +// define function type +typedef sycl::event (*copy_for_roll_ndshift_strided_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + int, // common_nd + const ssize_t *, // packed shape, strides, shifts + const char *, // src_data_ptr + ssize_t, // src_offset + char *, // dst_data_ptr + ssize_t, // dst_offset + const std::vector &); + +template +sycl::event copy_for_roll_ndshift_strided_impl( + sycl::queue &q, + std::size_t nelems, + int nd, + const ssize_t *packed_shapes_and_strides_and_shifts, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + // packed_shapes_and_strides_and_shifts: + // USM array of size 4 * nd + // [ common_shape; src_strides; dst_strides; shifts ] + + const ssize_t *shape_ptr = packed_shapes_and_strides_and_shifts; + const ssize_t *src_strides_ptr = + packed_shapes_and_strides_and_shifts + nd; + const ssize_t *dst_strides_ptr = + packed_shapes_and_strides_and_shifts + 2 * nd; + const ssize_t *shifts_ptr = + packed_shapes_and_strides_and_shifts + 3 * nd; + + const RolledNDIndexer src_indexer{nd, shape_ptr, src_strides_ptr, + shifts_ptr, src_offset}; + + const UnpackedStridedIndexer dst_indexer{nd, dst_offset, shape_ptr, + dst_strides_ptr}; + + using KernelName = copy_for_roll_strided_kernel; + + const Ty *src_tp = reinterpret_cast(src_p); + Ty *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor( + src_tp, dst_tp, src_indexer, dst_indexer)); + }); + + return copy_for_roll_ev; +} + +/*! 
+ * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyForRollNDShiftFactory +{ + fnT get() + { + fnT f = copy_for_roll_ndshift_strided_impl; + return f; + } +}; + +} // namespace copy_and_cast +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp new file mode 100644 index 000000000000..b4f367448758 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp @@ -0,0 +1,655 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor copying and value casting. +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "dpctl_tensor_types.hpp" +#include "kernels/alignment.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace copy_as_contig +{ + +using dpctl::tensor::ssize_t; +using dpctl::tensor::sycl_utils::sub_group_store; + +template +class CopyAsCContigFunctor +{ +private: + std::size_t nelems; + const T *src_p = nullptr; + T *dst_p = nullptr; + IndexerT src_indexer; + +public: + CopyAsCContigFunctor(std::size_t n, + const T *src_, + T *dst_, + const IndexerT &src_indexer_) + : nelems(n), src_p(src_), dst_p(dst_), src_indexer(src_indexer_) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static_assert(vec_sz > 0); + static_assert(n_vecs > 0); + + static constexpr std::uint8_t elems_per_wi = vec_sz * n_vecs; + + using dpctl::tensor::type_utils::is_complex; + if constexpr (!enable_sg_loadstore || is_complex::value) { + const std::uint16_t sgSize = + ndit.get_sub_group().get_max_local_range()[0]; + 
const std::size_t gid = ndit.get_global_linear_id(); + + // start = (gid / sgSize) * sgSize * elems_per_wi + (gid % sgSize) + // gid % sgSize == gid - (gid / sgSize) * sgSize + const std::uint16_t elems_per_sg = sgSize * elems_per_wi; + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems, start + elems_per_sg); + + for (std::size_t offset = start; offset < end; offset += sgSize) { + auto src_offset = src_indexer(offset); + dst_p[offset] = src_p[src_offset]; + } + } + else { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + const std::uint16_t elems_per_sg = elems_per_wi * sgSize; + + if (base + elems_per_sg < nelems) { +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + // it == vec_id * vec_sz, for 0 <= vec_id < n_vecs + const std::size_t block_start_id = base + it * sgSize; + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&dst_p[block_start_id]); + + const std::size_t elem_id0 = + block_start_id + sg.get_local_id(); + sycl::vec dst_vec; +#pragma unroll + for (std::uint8_t k = 0; k < vec_sz; ++k) { + const std::size_t elem_id = elem_id0 + k * sgSize; + const ssize_t src_offset = src_indexer(elem_id); + dst_vec[k] = src_p[src_offset]; + } + sub_group_store(sg, dst_vec, dst_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + const std::size_t k0 = base + lane_id; + for (std::size_t k = k0; k < nelems; k += sgSize) { + const ssize_t src_offset = src_indexer(k); + dst_p[k] = src_p[src_offset]; + } + } + } + } +}; + +template +sycl::event submit_c_contiguous_copy(sycl::queue &exec_q, + std::size_t nelems, + const T *src, + T *dst, + const IndexerT &src_indexer, + const std::vector &depends) +{ + 
static_assert(vec_sz > 0); + static_assert(n_vecs > 0); + + static constexpr std::size_t preferred_lws = 256; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t max_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + const std::size_t lws = + ((preferred_lws + max_sg_size - 1) / max_sg_size) * max_sg_size; + + static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz; + + const std::size_t nelems_per_group = nelems_per_wi * lws; + const std::size_t n_groups = + (nelems + nelems_per_group - 1) / (nelems_per_group); + + sycl::event copy_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.use_kernel_bundle(kb); + + const sycl::range<1> gRange{n_groups * lws}; + const sycl::range<1> lRange{lws}; + + cgh.parallel_for( + sycl::nd_range<1>(gRange, lRange), + CopyAsCContigFunctor( + nelems, src, dst, src_indexer)); + }); + return copy_ev; +} + +template +class as_contig_krn; + +template +sycl::event + as_c_contiguous_array_generic_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *src_p, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const T *src_tp = reinterpret_cast(src_p); + T *dst_tp = reinterpret_cast(dst_p); + + using IndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const IndexerT src_indexer(nd, ssize_t(0), shape_and_strides); + + static constexpr std::uint8_t vec_sz = 4u; + static constexpr std::uint8_t n_vecs = 2u; + + using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; + using dpctl::tensor::kernels::alignment_utils::is_aligned; + using dpctl::tensor::kernels::alignment_utils::required_alignment; + + 
sycl::event copy_ev; + if (is_aligned(dst_p)) { + static constexpr bool enable_sg_load = true; + using KernelName = + as_contig_krn; + copy_ev = submit_c_contiguous_copy( + exec_q, nelems, src_tp, dst_tp, src_indexer, depends); + } + else { + static constexpr bool disable_sg_load = false; + using InnerKernelName = + as_contig_krn; + using KernelName = disabled_sg_loadstore_wrapper_krn; + copy_ev = submit_c_contiguous_copy( + exec_q, nelems, src_tp, dst_tp, src_indexer, depends); + } + + return copy_ev; +} + +typedef sycl::event (*as_c_contiguous_array_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + char *, + const std::vector &); + +template +struct AsCContigFactory +{ + fnT get() + { + return as_c_contiguous_array_generic_impl; + } +}; + +template +class as_contig_batch_of_square_matrices_krn; + +namespace detail +{ +/*! @brief batch of matrices (n, n), source strides (1, src_ld), destination + strides (dst_ld, 1) src and destination arrays must be disjoint memory blocks + to avoid race condition + */ +template +sycl::event as_c_contiguous_batch_of_square_matrices_impl( + sycl::queue &exec_q, + std::size_t batch_nelems, + const BatchIndexerT &batch_two_offsets_indexer, + std::size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const T *src_tp = reinterpret_cast(src_p); + T *dst_tp = reinterpret_cast(dst_p); + + static constexpr std::uint16_t private_tile_size = 4; + static constexpr std::uint16_t n_lines = 2; + static constexpr std::uint16_t block_size = + n_lines * private_tile_size * private_tile_size; + + static constexpr std::uint16_t lws0 = block_size; + static constexpr std::uint16_t lws1 = n_lines; + static constexpr std::uint16_t nelems_per_wi = (block_size / lws1); + + static_assert(nelems_per_wi * lws1 == block_size); + static_assert(nelems_per_wi == private_tile_size * 
private_tile_size); + + static constexpr std::uint32_t lws = lws0 * lws1; + + const std::size_t n_tiles = (n + block_size - 1) / block_size; + + const ssize_t src_stride = src_ld; + const ssize_t dst_stride = dst_ld; + + sycl::range<1> lRange{lws}; + sycl::range<1> gRange{batch_nelems * n_tiles * n_tiles * lws}; + + sycl::nd_range<1> ndRange{gRange, lRange}; + + using KernelName = + as_contig_batch_of_square_matrices_krn; + + sycl::event e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::local_accessor local_block(block_size * block_size, cgh); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> nd_it) { + // 1. Read block from source array into SLM + const std::uint32_t lid_lin = nd_it.get_local_linear_id(); + const std::size_t gr_id_lin = nd_it.get_group_linear_id(); + + const std::size_t batch_id = gr_id_lin / (n_tiles * n_tiles); + const std::size_t rem = gr_id_lin - batch_id * (n_tiles * n_tiles); + + const auto &batch_two_offsets = batch_two_offsets_indexer(batch_id); + const auto &src_batch_offset = batch_two_offsets.get_first_offset(); + const auto &dst_batch_offset = + batch_two_offsets.get_second_offset(); + + // Block id + /* 0 <= src_gr_i1 < n_groups_n1 */ + const std::size_t src_tile_i1 = rem / n_tiles; + /* 0 <= src_gr_i0 < n_groups_n0 */ + const std::size_t src_tile_i0 = rem - src_tile_i1 * n_tiles; + + // ID of element within the block + /* 0 <= src_i1 < lws1 */ + const std::uint32_t src_i1 = lid_lin / lws0; + /* 0 <= src_i0 < lws0 */ + const std::uint32_t src_i0 = lid_lin - src_i1 * lws0; + + // Matrix element ID + const std::size_t src_tile_start0 = src_tile_i0 * block_size; + const std::size_t src_tile_start1 = src_tile_i1 * block_size; + const std::size_t src_gid0 = (src_tile_start0 + src_i0); + const std::size_t src_gid1 = (src_tile_start1 + src_i1); + + // src_offset = src_gid0 * 1 + (src_gid1 + pr_id * lws1) * + // src_stride + const std::size_t src_offset0 = + src_batch_offset + src_gid0 * 1 + src_gid1 * 
src_stride; + const std::size_t pr_step_src = lws1 * src_stride; + + const std::uint32_t local_offset0 = src_i0 + src_i1 * block_size; + const std::uint32_t pr_step_local = lws1 * block_size; + + for (std::uint32_t pr_id = 0; pr_id < nelems_per_wi; ++pr_id) { + local_block[local_offset0 + pr_step_local * pr_id] = + (src_gid0 < n && src_gid1 + pr_id * lws1 < n) + ? src_tp[src_offset0 + pr_step_src * pr_id] + : T(0); + } + + const std::uint32_t local_dim0 = static_cast( + std::min(src_tile_start0 + block_size, n) - + src_tile_start0); + const std::uint32_t local_dim1 = static_cast( + std::min(src_tile_start1 + block_size, n) - + src_tile_start1); + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + // 2. Permute the block matrix in SLM using two private arrays + std::array private_block_01 = {T(0)}; + std::array private_block_10 = {T(0)}; + + // 0 <= lid_lin < lws0 * lws1 == + // (block_size * block_size / nelems_per_wi) == + // (block_size/private_tile_size)**2 + static constexpr std::uint16_t n_private_tiles_per_axis = + block_size / private_tile_size; + const std::uint16_t local_tile_id0 = + lid_lin / n_private_tiles_per_axis; + const std::uint16_t local_tile_id1 = + lid_lin - local_tile_id0 * n_private_tiles_per_axis; + + if (local_tile_id0 <= local_tile_id1) { + for (std::uint16_t pr_i0 = 0; pr_i0 < private_tile_size; + ++pr_i0) { + for (std::uint16_t pr_i1 = 0; pr_i1 < private_tile_size; + ++pr_i1) { + const std::uint16_t t0_offset = + local_tile_id0 * private_tile_size; + const std::uint16_t t1_offset = + local_tile_id1 * private_tile_size; + + const std::uint16_t pr_offset = + pr_i1 * private_tile_size + pr_i0; + const std::uint16_t rel_offset = + pr_i0 + pr_i1 * block_size; + + // read (local_tile_id0, local_tile_id1) + const std::uint16_t local_01_offset = + (t0_offset + t1_offset * block_size) + rel_offset; + private_block_01[pr_offset] = + local_block[local_01_offset]; + + // read (local_tile_id1, local_tile_id0) + const 
std::uint16_t local_10_offset = + (t1_offset + t0_offset * block_size) + rel_offset; + private_block_10[pr_offset] = + local_block[local_10_offset]; + } + } + } + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + if (local_tile_id0 <= local_tile_id1) { + for (std::uint16_t pr_i0 = 0; pr_i0 < private_tile_size; + ++pr_i0) { + for (std::uint16_t pr_i1 = 0; pr_i1 < private_tile_size; + ++pr_i1) { + const std::uint16_t t0_offset = + local_tile_id0 * private_tile_size; + const std::uint16_t t1_offset = + local_tile_id1 * private_tile_size; + const std::uint16_t pr_offset = + pr_i0 * private_tile_size + pr_i1; + + const std::uint16_t rel_offset = + pr_i0 + pr_i1 * block_size; + + // write back permuted private blocks + const std::uint32_t local_01_offset = + (t0_offset + t1_offset * block_size) + rel_offset; + local_block[local_01_offset] = + private_block_10[pr_offset]; + + const std::uint16_t local_10_offset = + (t1_offset + t0_offset * block_size) + rel_offset; + local_block[local_10_offset] = + private_block_01[pr_offset]; + } + } + } + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + // 3. 
Write out permuted SLM to destination array + + const std::size_t dst_tile_start0 = src_tile_start0; + const std::size_t dst_tile_start1 = src_tile_start1; + + if (local_dim0 == block_size && local_dim1 == block_size) { + const std::uint16_t dst_i0 = src_i1; + const std::uint16_t dst_i1 = src_i0; + + const std::size_t dst_gid0 = (dst_tile_start0 + dst_i0); + const std::size_t dst_gid1 = (dst_tile_start1 + dst_i1); + + const std::size_t dst_offset0 = + dst_batch_offset + dst_gid0 * dst_stride + dst_gid1 * 1; + const std::size_t pr_step_dst = lws1 * dst_stride; + + const std::uint16_t _local_offset0 = + dst_i0 * block_size + dst_i1; + const std::uint16_t _pr_step_local = lws1 * block_size; + + for (std::uint16_t pr_id = 0; pr_id < nelems_per_wi; ++pr_id) { + if ((dst_gid1 < n) && ((dst_gid0 + pr_id * lws1) < n)) { + dst_tp[dst_offset0 + pr_step_dst * pr_id] = + local_block[_local_offset0 + + _pr_step_local * pr_id]; + } + } + } + else { + // map local_linear_id into (local_dim0, local_dim1) + for (std::uint16_t el_id = lid_lin; + el_id < local_dim0 * local_dim1; el_id += lws0 * lws1) + { + + // 0 <= local_i0 < local_dim0 + const std::uint16_t loc_i0 = el_id / local_dim1; + // 0 <= local_i1 < local_dim1 + const std::uint16_t loc_i1 = el_id - loc_i0 * local_dim1; + + const std::uint16_t dst_i0 = loc_i0; + const std::uint16_t dst_i1 = loc_i1; + + const std::size_t dst_gid0 = (dst_tile_start0 + dst_i0); + const std::size_t dst_gid1 = (dst_tile_start1 + dst_i1); + + const std::size_t dst_offset = + dst_batch_offset + dst_gid0 * dst_stride + dst_gid1 * 1; + const std::uint16_t local_offset = + loc_i0 * block_size + loc_i1; + + if ((dst_gid1 < n) && (dst_gid0 < n)) { + dst_tp[dst_offset] = local_block[local_offset]; + } + } + } + }); + }); + + return e; +} + +} // end of namespace detail + +template +sycl::event as_c_contiguous_1d_batch_of_square_matrices_impl( + sycl::queue &exec_q, + std::size_t batch_nelems, + ssize_t src_batch_step, + ssize_t dst_batch_step, + 
std::size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer; + using BatchIndexerT = + TwoOffsets_CombinedIndexer; + + const auto &src_batch_indexer = + Strided1DIndexer(batch_nelems, src_batch_step); + const auto &dst_batch_indexer = + Strided1DIndexer(batch_nelems, dst_batch_step); + + const BatchIndexerT batch_two_indexer{src_batch_indexer, dst_batch_indexer}; + + return detail::as_c_contiguous_batch_of_square_matrices_impl( + exec_q, batch_nelems, batch_two_indexer, n, src_p, src_ld, dst_p, + dst_ld, depends); +} + +typedef sycl::event ( + *as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t)( + sycl::queue &, /* execution queue */ + std::size_t, /* number of batch elements */ + ssize_t, /* distance between batches in source array */ + ssize_t, /* distance between batches in destination array */ + std::size_t, /* size of square matrices in the batch */ + const char *, + ssize_t, /* untyped pointer to F-contig source array, and matrix leading + dimension */ + char *, + ssize_t, /* untyped pointer to C-contig destination array, and matrix + leading dimension */ + const std::vector &); + +template +struct AsCContig1DBatchOfSquareMatricesFactory +{ + fnT get() + { + return as_c_contiguous_1d_batch_of_square_matrices_impl; + } +}; + +template +sycl::event as_c_contiguous_nd_batch_of_square_matrices_impl( + sycl::queue &exec_q, + std::size_t batch_nelems, + int batch_nd, + const ssize_t *src_batch_shape_strides, + const ssize_t dst_batch_step, + std::size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + using SrcIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using DstIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer; + using BatchIndexerT = 
TwoOffsets_CombinedIndexer; + + static constexpr ssize_t zero_offset{0}; + + const SrcIndexerT src_batch_indexer{batch_nd, zero_offset, + src_batch_shape_strides}; + const DstIndexerT dst_batch_indexer{/* size */ batch_nelems, + /* step */ dst_batch_step}; + + const BatchIndexerT batch_two_offsets_indexer{src_batch_indexer, + dst_batch_indexer}; + + return detail::as_c_contiguous_batch_of_square_matrices_impl( + exec_q, batch_nelems, batch_two_offsets_indexer, n, src_p, src_ld, + dst_p, dst_ld, depends); +} + +typedef sycl::event ( + *as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t)( + sycl::queue &, /* execution queue */ + std::size_t, /* number of matrices in the batch */ + int, + const ssize_t *, /* dimensionality, and packed [shape, src_strides] + describing iteration over batch in source array */ + ssize_t, /* distance between batches in destination array */ + std::size_t, /* matrix size */ + const char *, + ssize_t, /* untyped pointer to source array of F-contig matrices, and + leading dimension of the matrix */ + char *, + ssize_t, /* untyped pointer to destination array of F-contig matrices, and + leading dimension of the matrix */ + const std::vector &); + +template +struct AsCContigNDBatchOfSquareMatricesFactory +{ + fnT get() + { + return as_c_contiguous_nd_batch_of_square_matrices_impl; + } +}; + +} // namespace copy_as_contig +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp new file mode 100644 index 000000000000..53b39ff5874c --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp @@ -0,0 +1,758 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "kernels/copy_as_contiguous.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "copy_as_contig.hpp" +#include "simplify_iteration_space.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t; +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_array_impl_fn_ptr_t; +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +static as_c_contiguous_array_impl_fn_ptr_t + as_c_contig_array_dispatch_vector[td_ns::num_types]; + +static as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t + as_c_contig_1d_batch_of_square_matrices_dispatch_vector[td_ns::num_types]; + +static as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[td_ns::num_types]; + +void init_copy_as_contig_dispatch_vectors(void) +{ + + using dpctl::tensor::kernels::copy_as_contig:: + AsCContig1DBatchOfSquareMatricesFactory; + using dpctl::tensor::kernels::copy_as_contig::AsCContigFactory; + using dpctl::tensor::kernels::copy_as_contig:: + AsCContigNDBatchOfSquareMatricesFactory; + using td_ns::DispatchVectorBuilder; + + // Generic to c-contig + 
DispatchVectorBuilder + dtv_as_c_contig_array; + + dtv_as_c_contig_array.populate_dispatch_vector( + as_c_contig_array_dispatch_vector); + + // 1D batch of square views into F-contig matrices to c-contig array + DispatchVectorBuilder< + as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t, + AsCContig1DBatchOfSquareMatricesFactory, td_ns::num_types> + dtv_as_c_contig_1d_batch_of_square_matrices; + + dtv_as_c_contig_1d_batch_of_square_matrices.populate_dispatch_vector( + as_c_contig_1d_batch_of_square_matrices_dispatch_vector); + + // ND batch of square views into F-contig matrices to c-contig array + DispatchVectorBuilder< + as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t, + AsCContigNDBatchOfSquareMatricesFactory, td_ns::num_types> + dtv_as_c_contig_nd_batch_of_square_matrices; + + dtv_as_c_contig_nd_batch_of_square_matrices.populate_dispatch_vector( + as_c_contig_nd_batch_of_square_matrices_dispatch_vector); +} + +namespace +{ + +template +std::size_t get_nelems(const std::vector &shape) +{ + auto mult_fn = [](std::size_t prod, const dimT &term) -> std::size_t { + return prod * static_cast(term); + }; + + static constexpr std::size_t unit{1}; + + const std::size_t nelems = + std::accumulate(std::begin(shape), std::end(shape), unit, mult_fn); + return nelems; +} + +} // end of anonymous namespace + +std::pair + py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends); + +std::pair + py_as_c_contig(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is C-contiguous. 
+ */ + const int src_nd = src.get_ndim(); + const int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + if (src_shape_vec != dst_shape_vec) { + throw py::value_error("Shapes must be equal"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_c_contiguous()) { + throw py::value_error("Destination array must be C-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_nd >= 2) { + auto n = dst_shape_vec.back(); + if (n == dst_shape_vec[src_nd - 2]) { + static constexpr auto unit_stride = py::ssize_t(1); + if (src_strides_vec[src_nd - 2] == unit_stride) { + return py_as_c_contig_f2c(src, dst, exec_q, depends); + } + } + } + + const std::size_t nelems = get_nelems(src_shape_vec); + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // simplify iteration space + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd 
= src_nd; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, src_shape_vec.data(), src_strides_vec, dst.get_strides_vector(), + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + std::vector host_task_events{}; + auto ptr_size_event_tuple = + dpctl::tensor::offset_utils::device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto shape_stride_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_stride = shape_stride_owner.get(); + + auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id]; + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(), + dst.get_data(), all_depends); + + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {ascontig_ev}, + shape_stride_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +std::pair + py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends); + +std::pair + py_as_f_contig(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is F-contiguous. 
+ */ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + if (src_shape_vec != dst_shape_vec) { + throw py::value_error("Shapes must be equal"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_f_contiguous()) { + throw py::value_error("Destination array must be F-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_nd >= 2) { + auto n = dst_shape_vec.front(); + if (n == dst_shape_vec[1]) { + static constexpr auto unit_stride = py::ssize_t(1); + if (src_strides_vec[1] == unit_stride) { + return py_as_f_contig_c2f(src, dst, exec_q, depends); + } + } + } + + const std::size_t nelems = get_nelems(src_shape_vec); + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // simplify batch iteration space + // NB: simplification reverses dst strides to C contig, + // it also reverses simplified_shape and simplified_src_strides + + using shT = std::vector; + shT simplified_shape; + shT 
simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, src_shape_vec.data(), src_strides_vec, dst.get_strides_vector(), + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + std::vector host_task_events{}; + auto ptr_size_event_tuple = + dpctl::tensor::offset_utils::device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto shape_stride_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_stride = shape_stride_owner.get(); + + auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id]; + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(), + dst.get_data(), all_depends); + + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {ascontig_ev}, + shape_stride_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +std::pair + py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is C-contiguous. 
+ */ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same."); + } + if (src_nd < 2) { + throw py::value_error("Arrays must have 2 or more axes"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + std::size_t nelems{1}; + bool equal_shapes = true; + + for (int i = 0; equal_shapes && (i < src_nd); ++i) { + auto sh_i = src_shape_vec[i]; + equal_shapes = equal_shapes && (sh_i == dst_shape_vec[i]); + nelems *= static_cast(sh_i); + } + + if (!equal_shapes) { + throw py::value_error("Shapes must be equal"); + } + + const auto n = src_shape_vec.back(); + if (src_shape_vec[src_nd - 2] != n) { + throw py::value_error("Matrices must be square"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_strides_vec[src_nd - 2] != py::ssize_t(1)) { + throw py::value_error("Unexpected destination array layout"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_c_contiguous()) { + throw py::value_error("Destination array must be C-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), 
sycl::event()); + } + + const auto &dst_strides_vec = dst.get_strides_vector(); + + const std::size_t batch_nelems = + (src_nd == 2) ? std::size_t(1) : (nelems / (n * n)); + const py::ssize_t dst_batch_step = + (src_nd == 2) ? py::ssize_t(0) : dst_strides_vec[src_nd - 3]; + + std::vector src_batch_strides_vec; + std::vector dst_batch_strides_vec; + std::vector batch_shape_vec; + + if (src_nd == 2) { + batch_shape_vec.push_back(py::ssize_t(1)); + src_batch_strides_vec.push_back(py::ssize_t(0)); + dst_batch_strides_vec.push_back(dst_batch_step); + } + else { + batch_shape_vec.insert(std::end(batch_shape_vec), + std::begin(src_shape_vec), + std::end(src_shape_vec) - 2); + src_batch_strides_vec.insert(std::end(src_batch_strides_vec), + std::begin(src_strides_vec), + std::end(src_strides_vec) - 2); + dst_batch_strides_vec.insert(std::end(dst_batch_strides_vec), + std::begin(dst_strides_vec), + std::end(dst_strides_vec) - 2); + } + + // simplify batch iteration space + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = static_cast(batch_shape_vec.size()); + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, batch_shape_vec.data(), src_batch_strides_vec, + dst_batch_strides_vec, + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + if (1 == nd) { + const auto expected_dim = static_cast(batch_nelems); + if ((simplified_shape.front() != expected_dim) || + (simplified_dst_strides.front() != dst_batch_step)) + { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 2"); + } + + auto impl_fn = as_c_contig_1d_batch_of_square_matrices_dispatch_vector + 
[src_type_id]; + const py::ssize_t src_batch_step = simplified_src_strides.front(); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, src_batch_step, dst_batch_step, n, + src.get_data(), src_strides_vec.back(), dst.get_data(), + dst_strides_vec[src_nd - 2], depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {ascontig_ev}), ascontig_ev); + } + + auto impl_fn = + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[src_type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, nd, packed_shape_strides, dst_batch_step, + n, src.get_data(), src_strides_vec.back(), dst.get_data(), + dst_strides_vec[src_nd - 2], all_depends); + + // async free of shape_strides temporary + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {ascontig_ev}, packed_shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +std::pair + py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is F-contiguous. 
+ */ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same."); + } + if (src_nd < 2) { + throw py::value_error("Arrays must have 2 or more axes"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_f_contiguous()) { + throw py::value_error("Destination array must be C-contiguous"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + std::size_t nelems{1}; + bool equal_shapes = true; + + for (int i = 0; equal_shapes && (i < src_nd); ++i) { + auto sh_i = src_shape_vec[i]; + equal_shapes = equal_shapes && (sh_i == dst_shape_vec[i]); + nelems *= static_cast(sh_i); + } + + if (!equal_shapes) { + throw py::value_error("Shapes must be equal"); + } + + const auto n = dst_shape_vec.front(); + if (dst_shape_vec[1] != n) { + throw py::value_error("Matrices must be square"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_strides_vec[1] != py::ssize_t(1)) { + throw py::value_error("Unexpected destination array layout"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + const auto &dst_strides_vec = dst.get_strides_vector(); + + const std::size_t batch_nelems = + (src_nd == 2) ? std::size_t(1) : (nelems / (n * n)); + const py::ssize_t dst_batch_step = + (src_nd == 2) ? 
py::ssize_t(0) : dst_strides_vec[2]; + + std::vector src_batch_strides_vec; + std::vector dst_batch_strides_vec; + std::vector batch_shape_vec; + + if (src_nd == 2) { + batch_shape_vec.push_back(py::ssize_t(1)); + src_batch_strides_vec.push_back(py::ssize_t(0)); + dst_batch_strides_vec.push_back(dst_batch_step); + } + else { + batch_shape_vec.insert(std::end(batch_shape_vec), + std::begin(src_shape_vec) + 2, + std::end(src_shape_vec)); + src_batch_strides_vec.insert(std::end(src_batch_strides_vec), + std::begin(src_strides_vec) + 2, + std::end(src_strides_vec)); + dst_batch_strides_vec.insert(std::end(dst_batch_strides_vec), + std::begin(dst_strides_vec) + 2, + std::end(dst_strides_vec)); + } + + // simplify batch iteration space + // NB: simplification reverses dst strides to C contig, + // it also reverses simplified_shape and simplified_src_strides + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = static_cast(batch_shape_vec.size()); + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, batch_shape_vec.data(), src_batch_strides_vec, + dst_batch_strides_vec, + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + if (1 == nd) { + const auto expected_dim = static_cast(batch_nelems); + if ((simplified_shape.front() != expected_dim) || + (simplified_dst_strides.front() != dst_batch_step)) + { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 2"); + } + + auto impl_fn = as_c_contig_1d_batch_of_square_matrices_dispatch_vector + [src_type_id]; + const py::ssize_t src_batch_step = simplified_src_strides.front(); + + sycl::event ascontig_ev = + 
impl_fn(exec_q, batch_nelems, src_batch_step, dst_batch_step, n, + src.get_data(), src_strides_vec.front(), dst.get_data(), + dst_strides_vec[1], depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {ascontig_ev}), ascontig_ev); + } + + auto impl_fn = + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[src_type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, nd, packed_shape_strides, dst_batch_step, + n, src.get_data(), src_strides_vec.front(), dst.get_data(), + dst_strides_vec[1], all_depends); + + // async free of shape_strides + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {ascontig_ev}, packed_shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp new file mode 100644 index 000000000000..2de67098b7fa --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp @@ -0,0 +1,61 @@ 
+//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** + +#pragma once + +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +std::pair + py_as_c_contig(const dpctl::tensor::usm_ndarray &, + const dpctl::tensor::usm_ndarray &, + sycl::queue &, + const std::vector &); + +std::pair + py_as_f_contig(const dpctl::tensor::usm_ndarray &, + const dpctl::tensor::usm_ndarray &, + sycl::queue &, + const std::vector &); + +void init_copy_as_contig_dispatch_vectors(void); + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl From 5a9c14cd5ac07cf0a79da70e67b1cd9c28f063c6 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:16:36 -0800 Subject: [PATCH 005/145] Add copy_usm_ndarray_into_usm_ndarray implementation --- .../source/copy_and_cast_usm_to_usm.cpp | 310 ++++++++++++++++++ .../source/copy_and_cast_usm_to_usm.hpp | 60 ++++ 2 files changed, 370 insertions(+) create mode 100644 dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp new file mode 100644 index 000000000000..0458aa75ac32 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp @@ -0,0 +1,310 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include +#include + +#include "kernels/copy_and_cast.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "copy_as_contig.hpp" +#include "simplify_iteration_space.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_1d_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_contig_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_generic_fn_ptr_t; + +static copy_and_cast_generic_fn_ptr_t + copy_and_cast_generic_dispatch_table[td_ns::num_types][td_ns::num_types]; +static copy_and_cast_1d_fn_ptr_t + copy_and_cast_1d_dispatch_table[td_ns::num_types][td_ns::num_types]; +static copy_and_cast_contig_fn_ptr_t + copy_and_cast_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +namespace py = pybind11; + +using dpctl::utils::keep_args_alive; + +std::pair copy_usm_ndarray_into_usm_ndarray( + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}) +{ + // array dimensions must be the same + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Array dimensions are not the same."); + } + + // shapes must be 
the same + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool shapes_equal(true); + std::size_t src_nelems(1); + + for (int i = 0; shapes_equal && (i < src_nd); ++i) { + src_nelems *= static_cast(src_shape[i]); + shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); + } + if (!shapes_equal) { + throw py::value_error("Array shapes are not the same."); + } + + if (src_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + // check that arrays do not overlap, and concurrent copying is safe. 
+ auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + // TODO: could use a temporary, but this is done by the caller + throw py::value_error("Arrays index overlapping segments of memory"); + } + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + // check for applicability of special cases: + // (both C-contiguous || both F-contiguous) + bool both_c_contig = (is_src_c_contig && is_dst_c_contig); + bool both_f_contig = (is_src_f_contig && is_dst_f_contig); + if (both_c_contig || both_f_contig) { + + sycl::event copy_ev; + if (src_type_id == dst_type_id) { + + int src_elem_size = src.get_elemsize(); + + copy_ev = exec_q.memcpy(static_cast(dst_data), + static_cast(src_data), + src_nelems * src_elem_size, depends); + } + else { + auto contig_fn = + copy_and_cast_contig_dispatch_table[dst_type_id][src_type_id]; + copy_ev = + contig_fn(exec_q, src_nelems, src_data, dst_data, depends); + } + // make sure src and dst are not GC-ed before copy_ev is complete + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + if ((src_type_id == dst_type_id) && (src_nd > 1)) { + if (is_dst_c_contig) { + return py_as_c_contig(src, dst, exec_q, depends); + } + else if (is_dst_f_contig) { + return py_as_f_contig(src, dst, exec_q, depends); + } + } + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + const py::ssize_t *shape = src_shape; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, shape, src_strides, dst_strides, + // output + simplified_shape, 
simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (nd < 2) { + if (nd == 1) { + std::array shape_arr = {simplified_shape[0]}; + std::array src_strides_arr = { + simplified_src_strides[0]}; + std::array dst_strides_arr = { + simplified_dst_strides[0]}; + + sycl::event copy_and_cast_1d_event; + if ((src_strides_arr[0] == 1) && (dst_strides_arr[0] == 1) && + (src_offset == 0) && (dst_offset == 0)) + { + auto contig_fn = + copy_and_cast_contig_dispatch_table[dst_type_id] + [src_type_id]; + copy_and_cast_1d_event = + contig_fn(exec_q, src_nelems, src_data, dst_data, depends); + } + else { + auto fn = + copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id]; + copy_and_cast_1d_event = + fn(exec_q, src_nelems, shape_arr, src_strides_arr, + dst_strides_arr, src_data, src_offset, dst_data, + dst_offset, depends); + } + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_and_cast_1d_event}), + copy_and_cast_1d_event); + } + else if (nd == 0) { // case of a scalar + assert(src_nelems == 1); + std::array shape_arr = {1}; + std::array src_strides_arr = {1}; + std::array dst_strides_arr = {1}; + + auto fn = copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id]; + + sycl::event copy_and_cast_0d_event = fn( + exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr, + src_data, src_offset, dst_data, dst_offset, depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_and_cast_0d_event}), + copy_and_cast_0d_event); + } + } + + // Generic implementation + auto copy_and_cast_fn = + copy_and_cast_generic_dispatch_table[dst_type_id][src_type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides, + simplified_dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + 
const sycl::event &copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + const sycl::event &copy_and_cast_generic_ev = copy_and_cast_fn( + exec_q, src_nelems, nd, shape_strides, src_data, src_offset, dst_data, + dst_offset, depends, {copy_shape_ev}); + + // async free of shape_strides temporary + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_and_cast_generic_ev}, shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_and_cast_generic_ev); +} + +void init_copy_and_cast_usm_to_usm_dispatch_tables(void) +{ + using namespace td_ns; + + using dpctl::tensor::kernels::copy_and_cast::CopyAndCastContigFactory; + DispatchTableBuilder + dtb_contig; + dtb_contig.populate_dispatch_table(copy_and_cast_contig_dispatch_table); + + using dpctl::tensor::kernels::copy_and_cast::CopyAndCastGenericFactory; + DispatchTableBuilder + dtb_generic; + dtb_generic.populate_dispatch_table(copy_and_cast_generic_dispatch_table); + + using dpctl::tensor::kernels::copy_and_cast::CopyAndCast1DFactory; + DispatchTableBuilder + dtb_1d; + dtb_1d.populate_dispatch_table(copy_and_cast_1d_dispatch_table); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp new file mode 100644 index 000000000000..d2a2dcaf7b85 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp @@ -0,0 +1,60 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair copy_usm_ndarray_into_usm_ndarray( + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_and_cast_usm_to_usm_dispatch_tables(); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl From 4f6334054fc08df7c2c2f7657bc5f4569ee4363a Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:18:36 -0800 Subject: [PATCH 006/145] Add pybind11 bindings for dpctl_ext.tensor._tensor_impl --- .../tensor/libtensor/source/tensor_ctors.cpp | 502 ++++++++++++++++++ 1 file changed, 502 insertions(+) create mode 100644 dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp new file mode 100644 index 000000000000..b41b5c9ce423 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -0,0 +1,502 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" + +// #include "accumulators.hpp" +// #include "boolean_advanced_indexing.hpp" +// #include "clip.hpp" +#include "copy_and_cast_usm_to_usm.hpp" +#include "copy_as_contig.hpp" +// #include "copy_for_reshape.hpp" +// #include "copy_for_roll.hpp" +// #include "copy_numpy_ndarray_into_usm_ndarray.hpp" +// #include "device_support_queries.hpp" +// #include "eye_ctor.hpp" +// #include "full_ctor.hpp" +// #include "integer_advanced_indexing.hpp" +#include "kernels/dpctl_tensor_types.hpp" +// #include "linear_sequences.hpp" +// #include "repeat.hpp" +#include "simplify_iteration_space.hpp" +// #include "triul_ctor.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/strided_iters.hpp" +// #include "where.hpp" +// #include "zeros_ctor.hpp" + +namespace py = pybind11; + +static_assert(std::is_same_v); + +namespace +{ + +using dpctl::tensor::c_contiguous_strides; +using dpctl::tensor::f_contiguous_strides; + +using dpctl::tensor::overlap::MemoryOverlap; +using dpctl::tensor::overlap::SameLogicalTensors; + +using dpctl::tensor::py_internal::copy_usm_ndarray_into_usm_ndarray; +using dpctl::tensor::py_internal::py_as_c_contig; +using dpctl::tensor::py_internal::py_as_f_contig; + +/* =========================== Copy for reshape ============================= */ + +// using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape; + +/* =========================== Copy for roll ============================= */ + +// using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_1d; +// using 
dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_nd; + +/* ============= Copy from numpy.ndarray to usm_ndarray ==================== */ + +// using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray; + +/* ============= linear-sequence ==================== */ + +// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; +// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; + +/* ================ Full ================== */ + +// using dpctl::tensor::py_internal::usm_ndarray_full; + +/* ================ Zeros ================== */ + +// using dpctl::tensor::py_internal::usm_ndarray_zeros; + +/* ============== Advanced Indexing ============= */ +// using dpctl::tensor::py_internal::usm_ndarray_put; +// using dpctl::tensor::py_internal::usm_ndarray_take; + +// using dpctl::tensor::py_internal::py_extract; +// using dpctl::tensor::py_internal::py_mask_positions; +// using dpctl::tensor::py_internal::py_nonzero; +// using dpctl::tensor::py_internal::py_place; + +/* ================= Repeat ====================*/ +// using dpctl::tensor::py_internal::py_cumsum_1d; +// using dpctl::tensor::py_internal::py_repeat_by_scalar; +// using dpctl::tensor::py_internal::py_repeat_by_sequence; + +/* ================ Eye ================== */ + +// using dpctl::tensor::py_internal::usm_ndarray_eye; + +/* =========================== Tril and triu ============================== */ + +// using dpctl::tensor::py_internal::usm_ndarray_triul; + +/* =========================== Where ============================== */ + +// using dpctl::tensor::py_internal::py_where; + +/* =========================== Clip ============================== */ +// using dpctl::tensor::py_internal::py_clip; + +// populate dispatch tables +void init_dispatch_tables(void) +{ + using namespace dpctl::tensor::py_internal; + + init_copy_and_cast_usm_to_usm_dispatch_tables(); + // init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); + // 
init_advanced_indexing_dispatch_tables(); + // init_where_dispatch_tables(); + return; +} + +// populate dispatch vectors +void init_dispatch_vectors(void) +{ + using namespace dpctl::tensor::py_internal; + + init_copy_as_contig_dispatch_vectors(); + // init_copy_for_reshape_dispatch_vectors(); + // init_copy_for_roll_dispatch_vectors(); + // init_linear_sequences_dispatch_vectors(); + // init_full_ctor_dispatch_vectors(); + // init_zeros_ctor_dispatch_vectors(); + // init_eye_ctor_dispatch_vectors(); + // init_triul_ctor_dispatch_vectors(); + + // populate_masked_extract_dispatch_vectors(); + // populate_masked_place_dispatch_vectors(); + + // populate_mask_positions_dispatch_vectors(); + + // populate_cumsum_1d_dispatch_vectors(); + // init_repeat_dispatch_vectors(); + + // init_clip_dispatch_vectors(); + + return; +} + +} // namespace + +PYBIND11_MODULE(_tensor_impl, m) +{ + init_dispatch_tables(); + init_dispatch_vectors(); + + using dpctl::tensor::strides::contract_iter; + m.def( + "_contract_iter", &contract_iter, + "Simplifies iteration of array of given shape & stride. Returns " + "a triple: shape, stride and offset for the new iterator of possible " + "smaller dimension, which traverses the same elements as the original " + "iterator, possibly in a different order."); + + m.def("_copy_usm_ndarray_into_usm_ndarray", + &copy_usm_ndarray_into_usm_ndarray, + "Copies from usm_ndarray `src` into usm_ndarray `dst` of the same " + "shape. " + "Returns a tuple of events: (host_task_event, compute_task_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_as_c_contig", &py_as_c_contig, + "Copies from usm_ndarray `src` into C-contiguous usm_ndarray " + "`dst` of the same shape and the same data type. 
" + "Returns a tuple of events: (host_task_event, compute_task_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_as_f_contig", &py_as_f_contig, + "Copies from usm_ndarray `src` into F-contiguous usm_ndarray " + "`dst` of the same shape and the same data type. " + "Returns a tuple of events: (host_task_event, compute_task_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + using dpctl::tensor::strides::contract_iter2; + m.def( + "_contract_iter2", &contract_iter2, + "Simplifies iteration over elements of pair of arrays of given shape " + "with strides stride1 and stride2. Returns " + "a 5-tuple: shape, stride and offset for the new iterator of possible " + "smaller dimension for each array, which traverses the same elements " + "as the original " + "iterator, possibly in a different order."); + + using dpctl::tensor::strides::contract_iter3; + m.def( + "_contract_iter3", &contract_iter3, + "Simplifies iteration over elements of 3-tuple of arrays of given " + "shape " + "with strides stride1, stride2, and stride3. Returns " + "a 7-tuple: shape, stride and offset for the new iterator of possible " + "smaller dimension for each array, which traverses the same elements " + "as the original " + "iterator, possibly in a different order."); + + using dpctl::tensor::strides::contract_iter4; + m.def( + "_contract_iter4", &contract_iter4, + "Simplifies iteration over elements of 4-tuple of arrays of given " + "shape " + "with strides stride1, stride2, stride3, and stride4. 
Returns " + "a 9-tuple: shape, stride and offset for the new iterator of possible " + "smaller dimension for each array, which traverses the same elements " + "as the original " + "iterator, possibly in a different order."); + + static constexpr char orderC = 'C'; + m.def( + "_ravel_multi_index", + [](const std::vector &mi, + const std::vector &shape, char order = 'C') { + if (order == orderC) { + return dpctl::tensor::py_internal::_ravel_multi_index_c(mi, + shape); + } + else { + return dpctl::tensor::py_internal::_ravel_multi_index_f(mi, + shape); + } + }, + ""); + + m.def( + "_unravel_index", + [](py::ssize_t flat_index, const std::vector &shape, + char order = 'C') { + if (order == orderC) { + return dpctl::tensor::py_internal::_unravel_index_c(flat_index, + shape); + } + else { + return dpctl::tensor::py_internal::_unravel_index_f(flat_index, + shape); + } + }, + ""); + + // m.def("_copy_usm_ndarray_for_reshape", ©_usm_ndarray_for_reshape, + // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same + // " "number of elements using underlying 'C'-contiguous order for + // flat " "traversal. " "Returns a tuple of events: (ht_event, + // comp_event)", py::arg("src"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_copy_usm_ndarray_for_roll_1d", ©_usm_ndarray_for_roll_1d, + // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same + // " "shapes using underlying 'C'-contiguous order for flat " + // "traversal with shift. " + // "Returns a tuple of events: (ht_event, comp_event)", + // py::arg("src"), py::arg("dst"), py::arg("shift"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_copy_usm_ndarray_for_roll_nd", ©_usm_ndarray_for_roll_nd, + // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same + // " "shapes using underlying 'C'-contiguous order for " "traversal + // with shifts along each axis. 
" "Returns a tuple of events: + // (ht_event, comp_event)", py::arg("src"), py::arg("dst"), + // py::arg("shifts"), py::arg("sycl_queue"), py::arg("depends") = + // py::list()); + + // m.def("_linspace_step", &usm_ndarray_linear_sequence_step, + // "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " + // "specified by " + // "starting point `start` and step `dt`. " + // "Returns a tuple of events: (ht_event, comp_event)", + // py::arg("start"), py::arg("dt"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, + // "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " + // "specified by " + // "starting point `start` and end point `end`. " + // "Returns a tuple of events: (ht_event, comp_event)", + // py::arg("start"), py::arg("end"), py::arg("dst"), + // py::arg("include_endpoint"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_copy_numpy_ndarray_into_usm_ndarray", + // ©_numpy_ndarray_into_usm_ndarray, + // "Copy from numpy array `src` into usm_ndarray `dst` + // synchronously.", py::arg("src"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_zeros_usm_ndarray", &usm_ndarray_zeros, + // "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_full_usm_ndarray", &usm_ndarray_full, + // "Populate usm_ndarray `dst` with given fill_value.", + // py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_take", &usm_ndarray_take, + // "Takes elements at usm_ndarray indices `ind` and axes starting " + // "at axis `axis_start` from array `src` and copies them " + // "into usm_ndarray `dst` synchronously." 
+ // "Returns a tuple of events: (hev, ev)", + // py::arg("src"), py::arg("ind"), py::arg("dst"), + // py::arg("axis_start"), py::arg("mode"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_put", &usm_ndarray_put, + // "Puts elements at usm_ndarray indices `ind` and axes starting " + // "at axis `axis_start` into array `dst` from " + // "usm_ndarray `val` synchronously." + // "Returns a tuple of events: (hev, ev)", + // py::arg("dst"), py::arg("ind"), py::arg("val"), + // py::arg("axis_start"), py::arg("mode"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_eye", &usm_ndarray_eye, + // "Fills input 2D contiguous usm_ndarray `dst` with " + // "zeros outside of the diagonal " + // "specified by " + // "the diagonal index `k` " + // "which is filled with ones." + // "Returns a tuple of events: (ht_event, comp_event)", + // py::arg("k"), py::arg("dst"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("default_device_fp_type", + // dpctl::tensor::py_internal::default_device_fp_type, + // "Gives default floating point type supported by device.", + // py::arg("dev")); + + // m.def("default_device_int_type", + // dpctl::tensor::py_internal::default_device_int_type, + // "Gives default signed integer type supported by device.", + // py::arg("dev")); + + // m.def("default_device_uint_type", + // dpctl::tensor::py_internal::default_device_uint_type, + // "Gives default unsigned integer type supported by device.", + // py::arg("dev")); + + // m.def("default_device_bool_type", + // dpctl::tensor::py_internal::default_device_bool_type, + // "Gives default boolean type supported by device.", py::arg("dev")); + + // m.def("default_device_complex_type", + // dpctl::tensor::py_internal::default_device_complex_type, + // "Gives default complex floating point type supported by device.", + // py::arg("dev")); + + // m.def("default_device_index_type", + // 
dpctl::tensor::py_internal::default_device_index_type, + // "Gives default index type supported by device.", py::arg("dev")); + + // auto tril_fn = [](const dpctl::tensor::usm_ndarray &src, + // const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, + // sycl::queue &exec_q, + // const std::vector depends) + // -> std::pair { + // return usm_ndarray_triul(exec_q, src, dst, 'l', k, depends); + // }; + // m.def("_tril", tril_fn, "Tril helper function.", py::arg("src"), + // py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // auto triu_fn = [](const dpctl::tensor::usm_ndarray &src, + // const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, + // sycl::queue &exec_q, + // const std::vector depends) + // -> std::pair { + // return usm_ndarray_triul(exec_q, src, dst, 'u', k, depends); + // }; + // m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"), + // py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("mask_positions", &py_mask_positions, "", py::arg("mask"), + // py::arg("cumsum"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_cumsum_1d", &py_cumsum_1d, "", py::arg("src"), py::arg("cumsum"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"), + // py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto overlap = [](const dpctl::tensor::usm_ndarray &x1, + const dpctl::tensor::usm_ndarray &x2) -> bool { + auto const &overlap = MemoryOverlap(); + return overlap(x1, x2); + }; + m.def("_array_overlap", overlap, + "Determines if the memory regions indexed by each array overlap", + py::arg("array1"), py::arg("array2")); + + // auto same_logical_tensors = + // [](const dpctl::tensor::usm_ndarray &x1, + // const dpctl::tensor::usm_ndarray &x2) -> bool { + // auto const 
&same_logical_tensors = SameLogicalTensors(); + // return same_logical_tensors(x1, x2); + // }; + // m.def("_same_logical_tensors", same_logical_tensors, + // "Determines if the memory regions indexed by each array are the + // same", py::arg("array1"), py::arg("array2")); + + // m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"), + // py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_nonzero", &py_nonzero, "", py::arg("cumsum"), py::arg("indexes"), + // py::arg("mask_shape"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_where", &py_where, "", py::arg("condition"), py::arg("x1"), + // py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // auto repeat_sequence = [](const dpctl::tensor::usm_ndarray &src, + // const dpctl::tensor::usm_ndarray &dst, + // const dpctl::tensor::usm_ndarray &reps, + // const dpctl::tensor::usm_ndarray &cumsum, + // std::optional axis, sycl::queue &exec_q, + // const std::vector depends) + // -> std::pair { + // if (axis) { + // return py_repeat_by_sequence(src, dst, reps, cumsum, + // axis.value(), + // exec_q, depends); + // } + // else { + // return py_repeat_by_sequence(src, dst, reps, cumsum, exec_q, + // depends); + // } + // }; + // m.def("_repeat_by_sequence", repeat_sequence, py::arg("src"), + // py::arg("dst"), py::arg("reps"), py::arg("cumsum"), + // py::arg("axis"), py::arg("sycl_queue"), py::arg("depends") = + // py::list()); + + // auto repeat_scalar = [](const dpctl::tensor::usm_ndarray &src, + // const dpctl::tensor::usm_ndarray &dst, + // const py::ssize_t reps, std::optional axis, + // sycl::queue &exec_q, + // const std::vector depends) + // -> std::pair { + // if (axis) { + // return py_repeat_by_scalar(src, dst, reps, axis.value(), exec_q, + // depends); + // } + // else { + // return py_repeat_by_scalar(src, dst, reps, exec_q, depends); + // } 
+ // }; + // m.def("_repeat_by_scalar", repeat_scalar, py::arg("src"), py::arg("dst"), + // py::arg("reps"), py::arg("axis"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_clip", &py_clip, + // "Clamps elements of array `x` to the range " + // "[`min`, `max] and writes the result to the " + // "array `dst` for each element of `x`, `min`, and `max`." + // "Returns a tuple of events: (hev, ev)", + // py::arg("src"), py::arg("min"), py::arg("max"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); +} From 634579c5f0d64d44805d0a020cb4ca5ae1d5e774 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:24:11 -0800 Subject: [PATCH 007/145] Add CMake build files for dpctl_ext --- dpctl_ext/CMakeLists.txt | 205 ++++++++++++++++++++++++++++++++ dpctl_ext/tensor/CMakeLists.txt | 175 +++++++++++++++++++++++++++ 2 files changed, 380 insertions(+) create mode 100644 dpctl_ext/CMakeLists.txt create mode 100644 dpctl_ext/tensor/CMakeLists.txt diff --git a/dpctl_ext/CMakeLists.txt b/dpctl_ext/CMakeLists.txt new file mode 100644 index 000000000000..bb33a4f57332 --- /dev/null +++ b/dpctl_ext/CMakeLists.txt @@ -0,0 +1,205 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +find_package(Python REQUIRED COMPONENTS NumPy) + +# -t is to only Cythonize sources with timestamps newer than existing CXX files (if present) +# -w is to set working directory (and correctly set __pyx_f[] array of filenames) +set(CYTHON_FLAGS "-t -w \"${CMAKE_SOURCE_DIR}\"") +find_package(Cython REQUIRED) + +if(WIN32) + string( + CONCAT WARNING_FLAGS + "-Wall " + "-Wextra " + "-Winit-self " + "-Wunused-function " + "-Wuninitialized " + "-Wmissing-declarations " + "-Wstrict-prototypes " + "-Wno-unused-parameter " + ) + string(CONCAT SDL_FLAGS "/GS " "/DynamicBase ") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Ox ${WARNING_FLAGS} ${SDL_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Ox ${WARNING_FLAGS} ${SDL_FLAGS}") + set(CMAKE_C_FLAGS_DEBUG + "${CMAKE_C_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O0 -g1 -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\"" + ) + set(CMAKE_CXX_FLAGS_DEBUG + "${CMAKE_CXX_FLAGS_DEBUG} 
${WARNING_FLAGS} ${SDL_FLAGS} -O0 -g1 -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\"" + ) + set(CMAKE_C_FLAGS_COVERAGE + "${CMAKE_C_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O1 -g1 -DDEBUG" + ) + set(CMAKE_CXX_FLAGS_COVERAGE + "${CMAKE_CXX_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O1 -g1 -DDEBUG" + ) + set(CMAKE_MODULE_LINKER_FLAGS_COVERAGE "${CMAKE_MODULE_LINKER_FLAGS_DEBUG}") + set(DPCTL_LDFLAGS "/NXCompat;/DynamicBase") + mark_as_advanced( + CMAKE_CXX_FLAGS_COVERAGE + CMAKE_C_FLAGS_COVERAGE + CMAKE_MODULE_LINKER_FLAGS_COVERAGE + ) +elseif(UNIX) + string( + CONCAT WARNING_FLAGS + "-Wall " + "-Wextra " + "-Winit-self " + "-Wunused-function " + "-Wuninitialized " + "-Wmissing-declarations " + "-Wstrict-prototypes " + "-Wno-unused-parameter " + "-fdiagnostics-color=auto " + ) + string( + CONCAT SDL_FLAGS + "-fstack-protector " + "-fstack-protector-all " + "-fpic " + "-fPIC " + "-D_FORTIFY_SOURCE=2 " + "-Wformat " + "-Wformat-security " + # "-fno-strict-overflow " # no-strict-overflow is implied by -fwrapv + "-fno-delete-null-pointer-checks " + "-fwrapv " + ) + string(CONCAT CFLAGS "${WARNING_FLAGS}" "${SDL_FLAGS}") + string(CONCAT CXXFLAGS "${WARNING_FLAGS}" "${SDL_FLAGS}") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 ${CFLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 ${CXXFLAGS}") + set(CMAKE_C_FLAGS_DEBUG + "${CMAKE_C_FLAGS_DEBUG} ${CFLAGS} -O0 -g -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\"" + ) + set(CMAKE_CXX_FLAGS_DEBUG + "${CMAKE_CXX_FLAGS_DEBUG} ${CXXFLAGS} -O0 -g -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\"" + ) + set(CMAKE_C_FLAGS_COVERAGE "${CMAKE_C_FLAGS_DEBUG} ${CFLAGS} -O1 -g1 -DDEBUG") + set(CMAKE_CXX_FLAGS_COVERAGE "${CMAKE_CXX_FLAGS_DEBUG} ${CXXFLAGS} -O1 -g1 -DDEBUG") + set(CMAKE_MODULE_LINKER_FLAGS_COVERAGE "${CMAKE_MODULE_LINKER_FLAGS_DEBUG}") + set(DPCTL_LDFLAGS "-z,noexecstack,-z,relro,-z,now") + mark_as_advanced( + CMAKE_CXX_FLAGS_COVERAGE + CMAKE_C_FLAGS_COVERAGE + CMAKE_MODULE_LINKER_FLAGS_COVERAGE + ) +else() + message(FATAL_ERROR 
"Unsupported system.") +endif() + +# at build time create include/ directory and copy header files over +set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) + +set(CMAKE_INSTALL_RPATH "$ORIGIN") + +function(build_dpctl_ext _trgt _src _dest) + set(options SYCL) + cmake_parse_arguments(BUILD_DPCTL_EXT "${options}" "RELATIVE_PATH" "" ${ARGN}) + add_cython_target(${_trgt} ${_src} CXX OUTPUT_VAR _generated_src) + set(_cythonize_trgt "${_trgt}_cythonize_pyx") + python_add_library(${_trgt} MODULE WITH_SOABI ${_generated_src}) + if(BUILD_DPCTL_EXT_SYCL) + add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src}) + target_compile_options(${_trgt} PRIVATE -fno-sycl-id-queries-fit-in-int) + target_link_options(${_trgt} PRIVATE -fsycl-device-code-split=per_kernel) + if(DPCTL_OFFLOAD_COMPRESS) + target_link_options(${_trgt} PRIVATE --offload-compress) + endif() + if(_dpctl_sycl_targets) + # make fat binary + target_compile_options( + ${_trgt} + PRIVATE ${_dpctl_sycl_target_compile_options} + ) + target_link_options(${_trgt} PRIVATE ${_dpctl_sycl_target_link_options}) + endif() + endif() + target_link_libraries(${_trgt} PRIVATE Python::NumPy) + if(DPCTL_GENERATE_COVERAGE) + target_compile_definitions(${_trgt} PRIVATE CYTHON_TRACE=1 CYTHON_TRACE_NOGIL=1) + if(BUILD_DPCTL_EXT_SYCL) + target_compile_options(${_trgt} PRIVATE -fno-sycl-use-footer) + endif() + endif() + target_link_libraries(${_trgt} PRIVATE DPCTLSyclInterface) + set(_linker_options "LINKER:${DPCTL_LDFLAGS}") + target_link_options(${_trgt} PRIVATE ${_linker_options}) + get_filename_component(_name_wle ${_generated_src} NAME_WLE) + get_filename_component(_generated_src_dir ${_generated_src} DIRECTORY) + set(_generated_public_h "${_generated_src_dir}/${_name_wle}.h") + set(_generated_api_h "${_generated_src_dir}/${_name_wle}_api.h") + + # TODO: create separate folder inside build folder that contains only + # headers related to this target and appropriate folder structure to + # eliminate shadow dependencies 
+ get_filename_component(_generated_src_dir_dir ${_generated_src_dir} DIRECTORY) + # TODO: do not set directory if we did not generate header + target_include_directories(${_trgt} INTERFACE ${_generated_src_dir_dir}) + set(_rpath_value "$ORIGIN") + if(BUILD_DPCTL_EXT_RELATIVE_PATH) + set(_rpath_value "${_rpath_value}/${BUILD_DPCTL_EXT_RELATIVE_PATH}") + endif() + if(DPCTL_WITH_REDIST) + set(_rpath_value "${_rpath_value}:${_rpath_value}/../../..") + endif() + set_target_properties(${_trgt} PROPERTIES INSTALL_RPATH ${_rpath_value}) + + install(TARGETS ${_trgt} LIBRARY DESTINATION ${_dest}) + install( + FILES ${_generated_api_h} + # TODO: revert to `${CMAKE_INSTALL_PREFIX}/dpctl/include/${_dest}` + DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest} + OPTIONAL + ) + install( + FILES ${_generated_public_h} + # TODO: revert to `${CMAKE_INSTALL_PREFIX}/dpctl/include/${_dest}` + DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest} + OPTIONAL + ) + if(DPCTL_GENERATE_COVERAGE) + get_filename_component(_original_src_dir ${_src} DIRECTORY) + file(RELATIVE_PATH _rel_dir ${CMAKE_SOURCE_DIR} ${_original_src_dir}) + install(FILES ${_generated_src} DESTINATION ${CMAKE_INSTALL_PREFIX}/${_rel_dir}) + endif() + + # Create target with headers only, because python is managing all the + # library imports at runtime + set(_trgt_headers ${_trgt}_headers) + add_library(${_trgt_headers} INTERFACE) + add_dependencies(${_trgt_headers} ${_trgt}) + get_target_property(_trgt_headers_dir ${_trgt} INTERFACE_INCLUDE_DIRECTORIES) + target_include_directories(${_trgt_headers} INTERFACE ${_trgt_headers_dir}) +endfunction() + +add_subdirectory(tensor) diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt new file mode 100644 index 000000000000..ed8294b76615 --- /dev/null +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -0,0 +1,175 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2026, 
Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +if(WIN32) + if(${CMAKE_VERSION} VERSION_LESS "3.23") + # this is a work-around for target_link_options inserting option after -link option, cause + # linker to ignore it. 
+ set(CMAKE_CXX_LINK_FLAGS + "${CMAKE_CXX_LINK_FLAGS} -fsycl-device-code-split=per_kernel" + ) + endif() +endif() + +set(_static_lib_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp +) +set(_tensor_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_ctors.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp +) + +set(_static_lib_trgt simplify_iteration_space) + +add_library(${_static_lib_trgt} STATIC ${_static_lib_sources}) +target_include_directories( + ${_static_lib_trgt} + PRIVATE + ${Python_INCLUDE_DIRS} + ${DPCTL_INCLUDE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include +) +target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers ${Python_LIBRARIES}) +set_target_properties(${_static_lib_trgt} PROPERTIES POSITION_INDEPENDENT_CODE ON) + +set(_py_trgts) + +set(python_module_name _tensor_impl) 
+pybind11_add_module(${python_module_name} MODULE ${_tensor_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_impl_sources}) +target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) +list(APPEND _py_trgts ${python_module_name}) + +set(_clang_prefix "") +if(WIN32) + set(_clang_prefix "/clang:") +endif() + +set(_no_fast_math_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp +) +list( + APPEND _no_fast_math_sources + # ${_elementwise_sources} + # ${_reduction_sources} + # ${_sorting_sources} + # ${_linalg_sources} + # ${_accumulator_sources} +) + +foreach(_src_fn ${_no_fast_math_sources}) + get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS) + set(_combined_options_prop ${_cmpl_options_prop} "${_clang_prefix}-fno-fast-math") + set_source_files_properties( + ${_src_fn} + PROPERTIES COMPILE_OPTIONS "${_combined_options_prop}" + ) +endforeach() + +set(_compiler_definitions "") + +set(_linker_options "LINKER:${DPCTL_LDFLAGS}") +foreach(python_module_name ${_py_trgts}) + target_compile_options( + ${python_module_name} + PRIVATE -fno-sycl-id-queries-fit-in-int + ) + target_link_options( + ${python_module_name} + PRIVATE -fsycl-device-code-split=per_kernel + ) + if(DPCTL_OFFLOAD_COMPRESS) + target_link_options(${python_module_name} PRIVATE --offload-compress) + endif() + + target_include_directories( + ${python_module_name} + PRIVATE + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${Dpctl_INCLUDE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/ + ) + target_link_options(${python_module_name} PRIVATE ${_linker_options}) + if(DPCTL_GENERATE_COVERAGE) + 
if(DPCTL_GENERATE_COVERAGE_FOR_PYBIND11_EXTENSIONS) + target_compile_options( + ${python_module_name} + PRIVATE -fprofile-instr-generate -fcoverage-mapping + ) + endif() + target_link_options( + ${python_module_name} + PRIVATE -fprofile-instr-generate -fcoverage-mapping + ) + endif() + if(_dpctl_sycl_targets) + # make fat binary + target_compile_options( + ${python_module_name} + PRIVATE ${_dpctl_sycl_target_compile_options} + ) + target_link_options( + ${python_module_name} + PRIVATE ${_dpctl_sycl_target_link_options} + ) + endif() + # TODO: update source so they reference individual libraries instead of + # dpctl4pybind11.hpp. It will allow to simplify dependency tree + # NOTE: dpctl C-API is resolved at runtime via Python + # target_link_libraries(${python_module_name} PRIVATE DpctlCAPI) + if(DPCTL_WITH_REDIST) + set_target_properties( + ${python_module_name} + PROPERTIES INSTALL_RPATH "$ORIGIN/../../../.." + ) + endif() + # TODO: revert to `DESTINATION "dpctl/tensor"` + install(TARGETS ${python_module_name} DESTINATION "dpctl_ext/tensor") +endforeach() From 79d40f235d10d1b9d514d9db07939d0bb447086c Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:31:12 -0800 Subject: [PATCH 008/145] Add empty __init__ to dpctl_ext/ --- dpctl_ext/__init__.py | 27 +++++++++++++++++++++++++++ dpctl_ext/tensor/__init__.py | 27 +++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 dpctl_ext/__init__.py create mode 100644 dpctl_ext/tensor/__init__.py diff --git a/dpctl_ext/__init__.py b/dpctl_ext/__init__.py new file mode 100644 index 000000000000..a71324cb88d8 --- /dev/null +++ b/dpctl_ext/__init__.py @@ -0,0 +1,27 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py new file mode 100644 index 000000000000..a71324cb88d8 --- /dev/null +++ b/dpctl_ext/tensor/__init__.py @@ -0,0 +1,27 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** From 7949c17c3586a4ad0222c6abbf3a616202834c68 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:53:03 -0800 Subject: [PATCH 009/145] Enable _same_logical_tensors in _tensor_impl --- .../tensor/libtensor/source/tensor_ctors.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index b41b5c9ce423..ca3b7bd49116 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -430,15 +430,15 @@ PYBIND11_MODULE(_tensor_impl, m) "Determines if the memory regions indexed by each array overlap", py::arg("array1"), py::arg("array2")); - // auto same_logical_tensors = - // [](const dpctl::tensor::usm_ndarray &x1, - // const dpctl::tensor::usm_ndarray &x2) -> bool { - // auto const &same_logical_tensors = SameLogicalTensors(); - // return same_logical_tensors(x1, x2); - // }; - // m.def("_same_logical_tensors", same_logical_tensors, - // "Determines if the memory regions indexed by each array are the - // same", py::arg("array1"), py::arg("array2")); + auto same_logical_tensors = + [](const dpctl::tensor::usm_ndarray &x1, + const dpctl::tensor::usm_ndarray &x2) -> bool { + auto const &same_logical_tensors = SameLogicalTensors(); + return same_logical_tensors(x1, x2); + }; + m.def("_same_logical_tensors", same_logical_tensors, + "Determines if the memory regions indexed by each array are the same", + py::arg("array1"), py::arg("array2")); // m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"), // py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"), From 29d6c029190714cab8a460c02f32130c7ea59cc6 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 05:14:28 -0800 Subject: [PATCH 010/145] Add device_support_queries to enable default 
device types --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../source/device_support_queries.cpp | 184 ++++++++++++++++++ .../source/device_support_queries.hpp | 58 ++++++ .../tensor/libtensor/source/tensor_ctors.cpp | 56 +++--- 4 files changed, 271 insertions(+), 29 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/device_support_queries.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/device_support_queries.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index ed8294b76615..ee8da2e49506 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -56,7 +56,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ) diff --git a/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp b/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp new file mode 100644 index 000000000000..51eb7dba1b6c --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp @@ -0,0 +1,184 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace +{ + +std::string _default_device_fp_type(const sycl::device &d) +{ + if (d.has(sycl::aspect::fp64)) { + return "f8"; + } + else { + return "f4"; + } +} + +int get_numpy_major_version() +{ + namespace py = pybind11; + + py::module_ numpy = py::module_::import("numpy"); + py::str version_string = numpy.attr("__version__"); + py::module_ numpy_lib = py::module_::import("numpy.lib"); + + py::object numpy_version = numpy_lib.attr("NumpyVersion")(version_string); + int major_version = numpy_version.attr("major").cast(); + + return major_version; +} + +std::string _default_device_int_type(const sycl::device &) +{ + const int np_ver = get_numpy_major_version(); + + if (np_ver >= 2) { + return "i8"; + } + else { + // code for numpy.dtype('long') to be consistent + // with NumPy's default integer type across + // platforms. + return "l"; + } +} + +std::string _default_device_uint_type(const sycl::device &) +{ + const int np_ver = get_numpy_major_version(); + + if (np_ver >= 2) { + return "u8"; + } + else { + // code for numpy.dtype('long') to be consistent + // with NumPy's default integer type across + // platforms. 
+ return "L"; + } +} + +std::string _default_device_complex_type(const sycl::device &d) +{ + if (d.has(sycl::aspect::fp64)) { + return "c16"; + } + else { + return "c8"; + } +} + +std::string _default_device_bool_type(const sycl::device &) +{ + return "b1"; +} + +std::string _default_device_index_type(const sycl::device &) +{ + return "i8"; +} + +sycl::device _extract_device(const py::object &arg) +{ + auto const &api = dpctl::detail::dpctl_capi::get(); + + PyObject *source = arg.ptr(); + if (api.PySyclQueue_Check_(source)) { + const sycl::queue &q = py::cast(arg); + return q.get_device(); + } + else if (api.PySyclDevice_Check_(source)) { + return py::cast(arg); + } + else { + throw py::type_error( + "Expected type `dpctl.SyclQueue` or `dpctl.SyclDevice`."); + } +} + +} // namespace + +std::string default_device_fp_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_fp_type(d); +} + +std::string default_device_int_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_int_type(d); +} + +std::string default_device_uint_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_uint_type(d); +} + +std::string default_device_bool_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_bool_type(d); +} + +std::string default_device_complex_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_complex_type(d); +} + +std::string default_device_index_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_index_type(d); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp b/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp new file mode 100644 index 000000000000..6ea01dcd49d7 
--- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp @@ -0,0 +1,58 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::string default_device_fp_type(const py::object &); +extern std::string default_device_int_type(const py::object &); +extern std::string default_device_uint_type(const py::object &); +extern std::string default_device_bool_type(const py::object &); +extern std::string default_device_complex_type(const py::object &); +extern std::string default_device_index_type(const py::object &); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index ca3b7bd49116..911d75ebd925 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -52,7 +52,7 @@ // #include "copy_for_reshape.hpp" // #include "copy_for_roll.hpp" // #include "copy_numpy_ndarray_into_usm_ndarray.hpp" -// #include "device_support_queries.hpp" +#include "device_support_queries.hpp" // #include "eye_ctor.hpp" // #include "full_ctor.hpp" // #include "integer_advanced_indexing.hpp" @@ -360,33 +360,33 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("k"), py::arg("dst"), py::arg("sycl_queue"), // py::arg("depends") = py::list()); - // m.def("default_device_fp_type", - // dpctl::tensor::py_internal::default_device_fp_type, - // "Gives default floating point type supported by device.", - // py::arg("dev")); - - // m.def("default_device_int_type", - // dpctl::tensor::py_internal::default_device_int_type, - // "Gives default 
signed integer type supported by device.", - // py::arg("dev")); - - // m.def("default_device_uint_type", - // dpctl::tensor::py_internal::default_device_uint_type, - // "Gives default unsigned integer type supported by device.", - // py::arg("dev")); - - // m.def("default_device_bool_type", - // dpctl::tensor::py_internal::default_device_bool_type, - // "Gives default boolean type supported by device.", py::arg("dev")); - - // m.def("default_device_complex_type", - // dpctl::tensor::py_internal::default_device_complex_type, - // "Gives default complex floating point type supported by device.", - // py::arg("dev")); - - // m.def("default_device_index_type", - // dpctl::tensor::py_internal::default_device_index_type, - // "Gives default index type supported by device.", py::arg("dev")); + m.def("default_device_fp_type", + dpctl::tensor::py_internal::default_device_fp_type, + "Gives default floating point type supported by device.", + py::arg("dev")); + + m.def("default_device_int_type", + dpctl::tensor::py_internal::default_device_int_type, + "Gives default signed integer type supported by device.", + py::arg("dev")); + + m.def("default_device_uint_type", + dpctl::tensor::py_internal::default_device_uint_type, + "Gives default unsigned integer type supported by device.", + py::arg("dev")); + + m.def("default_device_bool_type", + dpctl::tensor::py_internal::default_device_bool_type, + "Gives default boolean type supported by device.", py::arg("dev")); + + m.def("default_device_complex_type", + dpctl::tensor::py_internal::default_device_complex_type, + "Gives default complex floating point type supported by device.", + py::arg("dev")); + + m.def("default_device_index_type", + dpctl::tensor::py_internal::default_device_index_type, + "Gives default index type supported by device.", py::arg("dev")); // auto tril_fn = [](const dpctl::tensor::usm_ndarray &src, // const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, From 936e7198e2014330b34c5918a63230ea699e063e Mon Sep 17 
00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 05:52:17 -0800 Subject: [PATCH 011/145] Enable building and packaging of dpctl_ext --- CMakeLists.txt | 1 + setup.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 386b17b44294..d2ee5e84c0c2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -336,3 +336,4 @@ if(DEFINED SKBUILD) endif() add_subdirectory(dpnp) +add_subdirectory(dpctl_ext) diff --git a/setup.py b/setup.py index cc21221299c4..a0c54b066dcf 100644 --- a/setup.py +++ b/setup.py @@ -44,6 +44,9 @@ "dpnp.scipy", "dpnp.scipy.linalg", "dpnp.scipy.special", + # dpctl_ext + "dpctl_ext", + "dpctl_ext.tensor", ], package_data={ "dpnp": [ From cd85f1e333bcad154272946f71c127b9ea9a916b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 06:14:39 -0800 Subject: [PATCH 012/145] Use _tensor_impl from dpctl_ext.tensor in dpnp --- dpnp/dpnp_algo/dpnp_elementwise_common.py | 2 +- dpnp/dpnp_iface.py | 2 +- dpnp/dpnp_iface_searching.py | 2 +- dpnp/dpnp_utils/dpnp_utils_linearalgebra.py | 2 +- dpnp/scipy/linalg/_utils.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index 57bf50422fa0..b63bf61f8dad 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -31,7 +31,6 @@ import dpctl.tensor as dpt import dpctl.tensor._copy_utils as dtc -import dpctl.tensor._tensor_impl as dti import dpctl.tensor._type_utils as dtu import dpctl.utils as dpu import numpy @@ -45,6 +44,7 @@ _validate_dtype, ) +import dpctl_ext.tensor._tensor_impl as dti import dpnp import dpnp.backend.extensions.vm._vm_impl as vmi from dpnp.dpnp_array import dpnp_array diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index fba1a215756a..832446c826ba 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -45,11 +45,11 @@ import dpctl import dpctl.tensor as dpt -import 
dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._device import normalize_queue_device +import dpctl_ext.tensor._tensor_impl as ti import dpnp from .dpnp_array import dpnp_array diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py index 6eefe010b699..fdbd317d31dd 100644 --- a/dpnp/dpnp_iface_searching.py +++ b/dpnp/dpnp_iface_searching.py @@ -40,8 +40,8 @@ """ import dpctl.tensor as dpt -import dpctl.tensor._tensor_impl as dti +import dpctl_ext.tensor._tensor_impl as dti import dpnp from .dpnp_array import dpnp_array diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py index 30be5d1ff5cb..4d8e3cdfbd0d 100644 --- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py +++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py @@ -28,7 +28,6 @@ import dpctl import dpctl.tensor as dpt -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._numpy_helper import ( @@ -38,6 +37,7 @@ ) from dpctl.utils import ExecutionPlacementError +import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.blas._blas_impl as bi from dpnp.dpnp_array import dpnp_array diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py index 282c645d1095..8eb9187236bf 100644 --- a/dpnp/scipy/linalg/_utils.py +++ b/dpnp/scipy/linalg/_utils.py @@ -42,9 +42,9 @@ from warnings import warn -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu +import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li from dpnp.dpnp_utils import get_usm_allocations From 0c6780a8f8b45e87263fbf316bc17aac5ed91dc1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 09:56:50 -0800 Subject: [PATCH 013/145] Move put() and take() to dpctl_ext/tensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- dpctl_ext/tensor/__init__.py | 11 + dpctl_ext/tensor/_indexing_functions.py | 329 
+++++++ dpctl_ext/tensor/_numpy_helper.py | 45 + .../kernels/integer_advanced_indexing.hpp | 427 +++++++++ .../source/integer_advanced_indexing.cpp | 819 ++++++++++++++++++ .../source/integer_advanced_indexing.hpp | 73 ++ .../tensor/libtensor/source/tensor_ctors.cpp | 42 +- 8 files changed, 1726 insertions(+), 22 deletions(-) create mode 100644 dpctl_ext/tensor/_indexing_functions.py create mode 100644 dpctl_ext/tensor/_numpy_helper.py create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index ee8da2e49506..ae8b72d71873 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -49,7 +49,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index a71324cb88d8..35453dbf9a46 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -25,3 +25,14 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE. 
# ***************************************************************************** + + +from dpctl_ext.tensor._indexing_functions import ( + put, + take, +) + +__all__ = [ + "put", + "take", +] diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py new file mode 100644 index 000000000000..106df09cf97e --- /dev/null +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -0,0 +1,329 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import operator + +import dpctl +import dpctl.tensor as dpt +import dpctl.utils + +import dpctl_ext.tensor._tensor_impl as ti + +from ._numpy_helper import normalize_axis_index + + +def _get_indexing_mode(name): + modes = {"wrap": 0, "clip": 1} + try: + return modes[name] + except KeyError: + raise ValueError( + "`mode` must be `wrap` or `clip`." "Got `{}`.".format(name) + ) + + +def put(x, indices, vals, /, *, axis=None, mode="wrap"): + """put(x, indices, vals, axis=None, mode="wrap") + + Puts values into an array along a given axis at given indices. + + Args: + x (usm_ndarray): + The array the values will be put into. + indices (usm_ndarray): + One-dimensional array of indices. + vals (usm_ndarray): + Array of values to be put into ``x``. + Must be broadcastable to the result shape + ``x.shape[:axis] + indices.shape + x.shape[axis+1:]``. + axis (int, optional): + The axis along which the values will be placed. + If ``x`` is one-dimensional, this argument is optional. + Default: ``None``. + mode (str, optional): + How out-of-bounds indices will be handled. Possible values + are: + + - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps + negative indices. + - ``"clip"``: clips indices to (``0 <= i < n``). + + Default: ``"wrap"``. + + .. 
note:: + + If input array ``indices`` contains duplicates, a race condition + occurs, and the value written into corresponding positions in ``x`` + may vary from run to run. Preserving sequential semantics in handing + the duplicates to achieve deterministic behavior requires additional + work, e.g. + + :Example: + + .. code-block:: python + + from dpctl import tensor as dpt + + def put_vec_duplicates(vec, ind, vals): + "Put values into vec, handling possible duplicates in ind" + assert vec.ndim, ind.ndim, vals.ndim == 1, 1, 1 + + # find positions of last occurrences of each + # unique index + ind_flipped = dpt.flip(ind) + ind_uniq = dpt.unique_all(ind_flipped).indices + has_dups = len(ind) != len(ind_uniq) + + if has_dups: + ind_uniq = dpt.subtract(vec.size - 1, ind_uniq) + ind = dpt.take(ind, ind_uniq) + vals = dpt.take(vals, ind_uniq) + + dpt.put(vec, ind, vals) + + n = 512 + ind = dpt.concat((dpt.arange(n), dpt.arange(n, -1, step=-1))) + x = dpt.zeros(ind.size, dtype="int32") + vals = dpt.arange(ind.size, dtype=x.dtype) + + # Values corresponding to last positions of + # duplicate indices are written into the vector x + put_vec_duplicates(x, ind, vals) + + parts = (vals[-1:-n-2:-1], dpt.zeros(n, dtype=x.dtype)) + expected = dpt.concat(parts) + assert dpt.all(x == expected) + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x)) + ) + if not isinstance(indices, dpt.usm_ndarray): + raise TypeError( + "`indices` expected `dpt.usm_ndarray`, got `{}`.".format( + type(indices) + ) + ) + if isinstance(vals, dpt.usm_ndarray): + queues_ = [x.sycl_queue, indices.sycl_queue, vals.sycl_queue] + usm_types_ = [x.usm_type, indices.usm_type, vals.usm_type] + else: + queues_ = [x.sycl_queue, indices.sycl_queue] + usm_types_ = [x.usm_type, indices.usm_type] + if indices.ndim != 1: + raise ValueError( + "`indices` expected a 1D array, got `{}`".format(indices.ndim) + ) + if indices.dtype.kind not in 
"ui": + raise IndexError( + "`indices` expected integer data type, got `{}`".format( + indices.dtype + ) + ) + exec_q = dpctl.utils.get_execution_queue(queues_) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError + vals_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) + + mode = _get_indexing_mode(mode) + + x_ndim = x.ndim + if axis is None: + if x_ndim > 1: + raise ValueError( + "`axis` cannot be `None` for array of dimension `{}`".format( + x_ndim + ) + ) + axis = 0 + + if x_ndim > 0: + axis = normalize_axis_index(operator.index(axis), x_ndim) + x_sh = x.shape + if x_sh[axis] == 0 and indices.size != 0: + raise IndexError("cannot take non-empty indices from an empty axis") + val_shape = x.shape[:axis] + indices.shape + x.shape[axis + 1 :] + else: + if axis != 0: + raise ValueError("`axis` must be 0 for an array of dimension 0.") + val_shape = indices.shape + + if not isinstance(vals, dpt.usm_ndarray): + vals = dpt.asarray( + vals, dtype=x.dtype, usm_type=vals_usm_type, sycl_queue=exec_q + ) + # choose to throw here for consistency with `place` + if vals.size == 0: + raise ValueError( + "cannot put into non-empty indices along an empty axis" + ) + if vals.dtype == x.dtype: + rhs = vals + else: + rhs = dpt.astype(vals, x.dtype) + rhs = dpt.broadcast_to(rhs, val_shape) + + _manager = dpctl.utils.SequentialOrderManager[exec_q] + deps_ev = _manager.submitted_events + hev, put_ev = ti._put( + x, (indices,), rhs, axis, mode, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(hev, put_ev) + + +def take(x, indices, /, *, axis=None, out=None, mode="wrap"): + """take(x, indices, axis=None, out=None, mode="wrap") + + Takes elements from an array along a given axis at given indices. + + Args: + x (usm_ndarray): + The array that elements will be taken from. + indices (usm_ndarray): + One-dimensional array of indices. + axis (int, optional): + The axis along which the values will be selected. 
+ If ``x`` is one-dimensional, this argument is optional. + Default: ``None``. + out (Optional[usm_ndarray]): + Output array to populate. Array must have the correct + shape and the expected data type. + mode (str, optional): + How out-of-bounds indices will be handled. Possible values + are: + + - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps + negative indices. + - ``"clip"``: clips indices to (``0 <= i < n``). + + Default: ``"wrap"``. + + Returns: + usm_ndarray: + Array with shape + ``x.shape[:axis] + indices.shape + x.shape[axis + 1:]`` + filled with elements from ``x``. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x)) + ) + + if not isinstance(indices, dpt.usm_ndarray): + raise TypeError( + "`indices` expected `dpt.usm_ndarray`, got `{}`.".format( + type(indices) + ) + ) + if indices.dtype.kind not in "ui": + raise IndexError( + "`indices` expected integer data type, got `{}`".format( + indices.dtype + ) + ) + if indices.ndim != 1: + raise ValueError( + "`indices` expected a 1D array, got `{}`".format(indices.ndim) + ) + exec_q = dpctl.utils.get_execution_queue([x.sycl_queue, indices.sycl_queue]) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError + res_usm_type = dpctl.utils.get_coerced_usm_type( + [x.usm_type, indices.usm_type] + ) + + mode = _get_indexing_mode(mode) + + x_ndim = x.ndim + if axis is None: + if x_ndim > 1: + raise ValueError( + "`axis` cannot be `None` for array of dimension `{}`".format( + x_ndim + ) + ) + axis = 0 + + if x_ndim > 0: + axis = normalize_axis_index(operator.index(axis), x_ndim) + x_sh = x.shape + if x_sh[axis] == 0 and indices.size != 0: + raise IndexError("cannot take non-empty indices from an empty axis") + res_shape = x.shape[:axis] + indices.shape + x.shape[axis + 1 :] + else: + if axis != 0: + raise ValueError("`axis` must be 0 for an array of dimension 0.") + res_shape = indices.shape + + dt = x.dtype + + 
orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. " + f"Expected output shape is {res_shape}, got {out.shape}" + ) + if dt != out.dtype: + raise ValueError( + f"Output array of type {dt} is needed, got {out.dtype}" + ) + if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise dpctl.utils.ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + if ti._array_overlap(x, out): + out = dpt.empty_like(out) + else: + out = dpt.empty( + res_shape, dtype=dt, usm_type=res_usm_type, sycl_queue=exec_q + ) + + _manager = dpctl.utils.SequentialOrderManager[exec_q] + deps_ev = _manager.submitted_events + hev, take_ev = ti._take( + x, (indices,), out, axis, mode, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(hev, take_ev) + + if not (orig_out is None or out is orig_out): + # Copy the out data from temporary buffer to original memory + ht_e_cpy, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=exec_q, depends=[take_ev] + ) + _manager.add_event_pair(ht_e_cpy, cpy_ev) + out = orig_out + + return out diff --git a/dpctl_ext/tensor/_numpy_helper.py b/dpctl_ext/tensor/_numpy_helper.py new file mode 100644 index 000000000000..4ad735823cb3 --- /dev/null +++ b/dpctl_ext/tensor/_numpy_helper.py @@ -0,0 +1,45 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + + +import numpy as np + +_npver = np.lib.NumpyVersion(np.__version__) + +if _npver < "1.25.0": # pragma: no cover + from numpy import AxisError +else: + from numpy.exceptions import AxisError + +if _npver >= "2.0.0": + from numpy._core.numeric import normalize_axis_index, normalize_axis_tuple +else: # pragma: no cover + from numpy.core.numeric import normalize_axis_index, normalize_axis_tuple + + +__all__ = ["AxisError", "normalize_axis_index", "normalize_axis_tuple"] diff --git a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp new file mode 100644 index 000000000000..1b2c79d2e2a5 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -0,0 +1,427 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for advanced tensor index operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/indexing_utils.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace indexing +{ + +using dpctl::tensor::ssize_t; + +template +class TakeFunctor +{ +private: + const char *src_ = nullptr; + char *dst_ = nullptr; + char **ind_ = nullptr; + int k_ = 0; + std::size_t ind_nelems_ = 0; + const ssize_t *axes_shape_and_strides_ = nullptr; + OrthogIndexer orthog_strider; + IndicesIndexer ind_strider; + AxesIndexer axes_strider; + +public: + TakeFunctor(const char *src_cp, + char *dst_cp, + char **ind_cp, + int k, + std::size_t ind_nelems, + const ssize_t *axes_shape_and_strides, + const OrthogIndexer &orthog_strider_, + const IndicesIndexer &ind_strider_, + const AxesIndexer &axes_strider_) + : src_(src_cp), dst_(dst_cp), ind_(ind_cp), k_(k), + ind_nelems_(ind_nelems), + axes_shape_and_strides_(axes_shape_and_strides), + orthog_strider(orthog_strider_), ind_strider(ind_strider_), + 
axes_strider(axes_strider_) + { + } + + void operator()(sycl::id<1> id) const + { + const T *src = reinterpret_cast(src_); + T *dst = reinterpret_cast(dst_); + + ssize_t i_orthog = id / ind_nelems_; + ssize_t i_along = id - (i_orthog * ind_nelems_); + + auto orthog_offsets = orthog_strider(i_orthog); + + ssize_t src_offset = orthog_offsets.get_first_offset(); + ssize_t dst_offset = orthog_offsets.get_second_offset(); + + static constexpr ProjectorT proj{}; + for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { + indT *ind_data = reinterpret_cast(ind_[axis_idx]); + + ssize_t ind_offset = ind_strider(i_along, axis_idx); + // proj produces an index in the range of the given axis + ssize_t projected_idx = + proj(axes_shape_and_strides_[axis_idx], ind_data[ind_offset]); + src_offset += + projected_idx * axes_shape_and_strides_[k_ + axis_idx]; + } + + dst_offset += axes_strider(i_along); + + dst[dst_offset] = src[src_offset]; + } +}; + +template +class take_kernel; + +typedef sycl::event (*take_fn_ptr_t)(sycl::queue &, + std::size_t, + std::size_t, + int, + int, + int, + const ssize_t *, + const ssize_t *, + const ssize_t *, + const char *, + char *, + char **, + ssize_t, + ssize_t, + const ssize_t *, + const std::vector &); + +template +sycl::event take_impl(sycl::queue &q, + std::size_t orthog_nelems, + std::size_t ind_nelems, + int nd, + int ind_nd, + int k, + const ssize_t *orthog_shape_and_strides, + const ssize_t *axes_shape_and_strides, + const ssize_t *ind_shape_and_strides, + const char *src_p, + char *dst_p, + char **ind_p, + ssize_t src_offset, + ssize_t dst_offset, + const ssize_t *ind_offsets, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event take_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using OrthogIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + const OrthogIndexerT orthog_indexer{nd, src_offset, dst_offset, + orthog_shape_and_strides}; + + using 
NthStrideIndexerT = dpctl::tensor::offset_utils::NthStrideOffset; + const NthStrideIndexerT indices_indexer{ind_nd, ind_offsets, + ind_shape_and_strides}; + + using AxesIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const AxesIndexerT axes_indexer{ind_nd, 0, + axes_shape_and_strides + (2 * k)}; + + using KernelName = + take_kernel; + + const std::size_t gws = orthog_nelems * ind_nelems; + + cgh.parallel_for( + sycl::range<1>(gws), + TakeFunctor( + src_p, dst_p, ind_p, k, ind_nelems, axes_shape_and_strides, + orthog_indexer, indices_indexer, axes_indexer)); + }); + + return take_ev; +} + +template +class PutFunctor +{ +private: + char *dst_ = nullptr; + const char *val_ = nullptr; + char **ind_ = nullptr; + int k_ = 0; + std::size_t ind_nelems_ = 0; + const ssize_t *axes_shape_and_strides_ = nullptr; + OrthogIndexer orthog_strider; + IndicesIndexer ind_strider; + AxesIndexer axes_strider; + +public: + PutFunctor(char *dst_cp, + const char *val_cp, + char **ind_cp, + int k, + std::size_t ind_nelems, + const ssize_t *axes_shape_and_strides, + const OrthogIndexer &orthog_strider_, + const IndicesIndexer &ind_strider_, + const AxesIndexer &axes_strider_) + : dst_(dst_cp), val_(val_cp), ind_(ind_cp), k_(k), + ind_nelems_(ind_nelems), + axes_shape_and_strides_(axes_shape_and_strides), + orthog_strider(orthog_strider_), ind_strider(ind_strider_), + axes_strider(axes_strider_) + { + } + + void operator()(sycl::id<1> id) const + { + T *dst = reinterpret_cast(dst_); + const T *val = reinterpret_cast(val_); + + ssize_t i_orthog = id / ind_nelems_; + ssize_t i_along = id - (i_orthog * ind_nelems_); + + auto orthog_offsets = orthog_strider(i_orthog); + + ssize_t dst_offset = orthog_offsets.get_first_offset(); + ssize_t val_offset = orthog_offsets.get_second_offset(); + + static constexpr ProjectorT proj{}; + for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { + indT *ind_data = reinterpret_cast(ind_[axis_idx]); + + ssize_t ind_offset = ind_strider(i_along, axis_idx); 
+ + // proj produces an index in the range of the given axis + ssize_t projected_idx = + proj(axes_shape_and_strides_[axis_idx], ind_data[ind_offset]); + dst_offset += + projected_idx * axes_shape_and_strides_[k_ + axis_idx]; + } + + val_offset += axes_strider(i_along); + + dst[dst_offset] = val[val_offset]; + } +}; + +template +class put_kernel; + +typedef sycl::event (*put_fn_ptr_t)(sycl::queue &, + std::size_t, + std::size_t, + int, + int, + int, + const ssize_t *, + const ssize_t *, + const ssize_t *, + char *, + const char *, + char **, + ssize_t, + ssize_t, + const ssize_t *, + const std::vector &); + +template +sycl::event put_impl(sycl::queue &q, + std::size_t orthog_nelems, + std::size_t ind_nelems, + int nd, + int ind_nd, + int k, + const ssize_t *orthog_shape_and_strides, + const ssize_t *axes_shape_and_strides, + const ssize_t *ind_shape_and_strides, + char *dst_p, + const char *val_p, + char **ind_p, + ssize_t dst_offset, + ssize_t val_offset, + const ssize_t *ind_offsets, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event put_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using OrthogIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + const OrthogIndexerT orthog_indexer{nd, dst_offset, val_offset, + orthog_shape_and_strides}; + + using NthStrideIndexerT = dpctl::tensor::offset_utils::NthStrideOffset; + const NthStrideIndexerT indices_indexer{ind_nd, ind_offsets, + ind_shape_and_strides}; + + using AxesIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const AxesIndexerT axes_indexer{ind_nd, 0, + axes_shape_and_strides + (2 * k)}; + + using KernelName = + put_kernel; + + const std::size_t gws = orthog_nelems * ind_nelems; + + cgh.parallel_for( + sycl::range<1>(gws), + PutFunctor( + dst_p, val_p, ind_p, k, ind_nelems, axes_shape_and_strides, + orthog_indexer, indices_indexer, axes_indexer)); + }); + + return put_ev; +} + +template +struct 
TakeWrapFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::WrapIndex; + fnT fn = take_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template +struct TakeClipFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::ClipIndex; + fnT fn = take_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template +struct PutWrapFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::WrapIndex; + fnT fn = put_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template +struct PutClipFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::ClipIndex; + fnT fn = put_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +} // namespace indexing +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp new file mode 100644 index 000000000000..244acfe3955f --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -0,0 +1,819 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines implementation functions of dpctl.tensor.take and +/// dpctl.tensor.put +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "kernels/integer_advanced_indexing.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "integer_advanced_indexing.hpp" + +#define INDEXING_MODES 2 +#define WRAP_MODE 0 +#define CLIP_MODE 1 + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::indexing::put_fn_ptr_t; +using dpctl::tensor::kernels::indexing::take_fn_ptr_t; + +static take_fn_ptr_t take_dispatch_table[INDEXING_MODES][td_ns::num_types] + [td_ns::num_types]; + +static put_fn_ptr_t put_dispatch_table[INDEXING_MODES][td_ns::num_types] + [td_ns::num_types]; + +namespace py = pybind11; + +using dpctl::utils::keep_args_alive; + +std::vector + _populate_kernel_params(sycl::queue &exec_q, + std::vector &host_task_events, + char **device_ind_ptrs, + py::ssize_t *device_ind_sh_st, + py::ssize_t *device_ind_offsets, + py::ssize_t *device_orthog_sh_st, + py::ssize_t *device_along_sh_st, + const py::ssize_t *inp_shape, + const py::ssize_t *arr_shape, + std::vector &inp_strides, + std::vector &arr_strides, + std::vector &ind_sh_sts, + std::vector &ind_ptrs, + std::vector &ind_offsets, + int axis_start, + int k, + int ind_nd, + int inp_nd, + int orthog_sh_elems, + int ind_sh_elems) +{ + + using usm_host_allocator_T = + 
dpctl::tensor::alloc_utils::usm_host_allocator; + using ptrT = std::vector; + + usm_host_allocator_T ptr_allocator(exec_q); + std::shared_ptr host_ind_ptrs_shp = + std::make_shared(k, ptr_allocator); + + using usm_host_allocatorT = + dpctl::tensor::alloc_utils::usm_host_allocator; + using shT = std::vector; + + usm_host_allocatorT sz_allocator(exec_q); + std::shared_ptr host_ind_sh_st_shp = + std::make_shared(ind_sh_elems * (k + 1), sz_allocator); + + std::shared_ptr host_ind_offsets_shp = + std::make_shared(k, sz_allocator); + + std::shared_ptr host_orthog_sh_st_shp = + std::make_shared(3 * orthog_sh_elems, sz_allocator); + + std::shared_ptr host_along_sh_st_shp = + std::make_shared(2 * (k + ind_sh_elems), sz_allocator); + + std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), + host_ind_sh_st_shp->begin()); + std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); + std::copy(ind_offsets.begin(), ind_offsets.end(), + host_ind_offsets_shp->begin()); + + const sycl::event &device_ind_ptrs_copy_ev = exec_q.copy( + host_ind_ptrs_shp->data(), device_ind_ptrs, host_ind_ptrs_shp->size()); + + const sycl::event &device_ind_sh_st_copy_ev = + exec_q.copy(host_ind_sh_st_shp->data(), device_ind_sh_st, + host_ind_sh_st_shp->size()); + + const sycl::event &device_ind_offsets_copy_ev = exec_q.copy( + host_ind_offsets_shp->data(), device_ind_offsets, + host_ind_offsets_shp->size()); + + int orthog_nd = inp_nd - k; + + if (orthog_nd > 0) { + if (axis_start > 0) { + std::copy(inp_shape, inp_shape + axis_start, + host_orthog_sh_st_shp->begin()); + std::copy(inp_strides.begin(), inp_strides.begin() + axis_start, + host_orthog_sh_st_shp->begin() + orthog_sh_elems); + std::copy(arr_strides.begin(), arr_strides.begin() + axis_start, + host_orthog_sh_st_shp->begin() + 2 * orthog_sh_elems); + } + if (inp_nd > (axis_start + k)) { + std::copy(inp_shape + axis_start + k, inp_shape + inp_nd, + host_orthog_sh_st_shp->begin() + axis_start); + std::copy(inp_strides.begin() + 
axis_start + k, inp_strides.end(), + host_orthog_sh_st_shp->begin() + orthog_sh_elems + + axis_start); + + std::copy(arr_strides.begin() + axis_start + ind_nd, + arr_strides.end(), + host_orthog_sh_st_shp->begin() + 2 * orthog_sh_elems + + axis_start); + } + } + + if (inp_nd > 0) { + std::copy(inp_shape + axis_start, inp_shape + axis_start + k, + host_along_sh_st_shp->begin()); + + std::copy(inp_strides.begin() + axis_start, + inp_strides.begin() + axis_start + k, + host_along_sh_st_shp->begin() + k); + } + + if (ind_nd > 0) { + std::copy(arr_shape + axis_start, arr_shape + axis_start + ind_nd, + host_along_sh_st_shp->begin() + 2 * k); + std::copy(arr_strides.begin() + axis_start, + arr_strides.begin() + axis_start + ind_nd, + host_along_sh_st_shp->begin() + 2 * k + ind_nd); + } + + const sycl::event &device_orthog_sh_st_copy_ev = exec_q.copy( + host_orthog_sh_st_shp->data(), device_orthog_sh_st, + host_orthog_sh_st_shp->size()); + + const sycl::event &device_along_sh_st_copy_ev = exec_q.copy( + host_along_sh_st_shp->data(), device_along_sh_st, + host_along_sh_st_shp->size()); + + const sycl::event &shared_ptr_cleanup_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on({device_along_sh_st_copy_ev, + device_orthog_sh_st_copy_ev, + device_ind_offsets_copy_ev, + device_ind_sh_st_copy_ev, device_ind_ptrs_copy_ev}); + cgh.host_task( + [host_ind_offsets_shp = std::move(host_ind_offsets_shp), + host_ind_sh_st_shp = std::move(host_ind_sh_st_shp), + host_ind_ptrs_shp = std::move(host_ind_ptrs_shp), + host_orthog_sh_st_shp = std::move(host_orthog_sh_st_shp), + host_along_sh_st_shp = std::move(host_along_sh_st_shp)] {}); + }); + host_task_events.push_back(shared_ptr_cleanup_ev); + + std::vector sh_st_pack_deps{ + device_ind_ptrs_copy_ev, device_ind_sh_st_copy_ev, + device_ind_offsets_copy_ev, device_orthog_sh_st_copy_ev, + device_along_sh_st_copy_ev}; + return sh_st_pack_deps; +} + +/* Utility to parse python object py_ind into vector of `usm_ndarray`s */ 
+std::vector parse_py_ind(const sycl::queue &q, + const py::object &py_ind) +{ + std::size_t ind_count = py::len(py_ind); + std::vector res; + res.reserve(ind_count); + + bool nd_is_known = false; + int nd = -1; + for (std::size_t i = 0; i < ind_count; ++i) { + py::object el_i = py_ind[py::cast(i)]; + dpctl::tensor::usm_ndarray arr_i = + py::cast(el_i); + if (!dpctl::utils::queues_are_compatible(q, {arr_i})) { + throw py::value_error("Index allocation queue is not compatible " + "with execution queue"); + } + if (nd_is_known) { + if (nd != arr_i.get_ndim()) { + throw py::value_error( + "Indices must have the same number of dimensions."); + } + } + else { + nd_is_known = true; + nd = arr_i.get_ndim(); + } + res.push_back(arr_i); + } + + return res; +} + +std::pair + usm_ndarray_take(const dpctl::tensor::usm_ndarray &src, + const py::object &py_ind, + const dpctl::tensor::usm_ndarray &dst, + int axis_start, + std::uint8_t mode, + sycl::queue &exec_q, + const std::vector &depends) +{ + std::vector ind = parse_py_ind(exec_q, py_ind); + + int k = ind.size(); + + if (k == 0) { + throw py::value_error("List of indices is empty."); + } + + if (axis_start < 0) { + throw py::value_error("Axis cannot be negative."); + } + + if (mode != 0 && mode != 1) { + throw py::value_error("Mode must be 0 or 1."); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + const dpctl::tensor::usm_ndarray ind_rep = ind[0]; + + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + int ind_nd = ind_rep.get_ndim(); + + auto sh_elems = std::max(src_nd, 1); + + if (axis_start + k > sh_elems) { + throw py::value_error("Axes are out of range for array of dimension " + + std::to_string(src_nd)); + } + if (src_nd == 0) { + if (dst_nd != ind_nd) { + throw py::value_error( + "Destination is not of appropriate dimension for take kernel."); + } + } + else { + if (dst_nd != (src_nd - k + ind_nd)) { + throw py::value_error( + "Destination is not of appropriate dimension for 
take kernel."); + } + } + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool orthog_shapes_equal(true); + std::size_t orthog_nelems(1); + for (int i = 0; i < (src_nd - k); ++i) { + auto idx1 = (i < axis_start) ? i : i + k; + auto idx2 = (i < axis_start) ? i : i + ind_nd; + + orthog_nelems *= static_cast(src_shape[idx1]); + orthog_shapes_equal = + orthog_shapes_equal && (src_shape[idx1] == dst_shape[idx2]); + } + + if (!orthog_shapes_equal) { + throw py::value_error( + "Axes of basic indices are not of matching shapes."); + } + + if (orthog_nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Array memory overlap."); + } + + py::ssize_t src_offset = py::ssize_t(0); + py::ssize_t dst_offset = py::ssize_t(0); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::type_error("Array data types are not the same."); + } + + const py::ssize_t *ind_shape = ind_rep.get_shape_raw(); + + int ind_typenum = ind_rep.get_typenum(); + int ind_type_id = array_types.typenum_to_lookup_id(ind_typenum); + + std::size_t ind_nelems(1); + for (int i = 0; i < ind_nd; ++i) { + ind_nelems *= static_cast(ind_shape[i]); + + if (!(ind_shape[i] == dst_shape[axis_start + i])) { + throw py::value_error( + "Indices shape does not match shape of axis in destination."); + } + } + + 
dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, orthog_nelems * ind_nelems); + + int ind_sh_elems = std::max(ind_nd, 1); + + std::vector ind_ptrs; + ind_ptrs.reserve(k); + + std::vector ind_offsets; + ind_offsets.reserve(k); + + std::vector ind_sh_sts((k + 1) * ind_sh_elems, 0); + if (ind_nd > 0) { + std::copy(ind_shape, ind_shape + ind_nd, ind_sh_sts.begin()); + } + for (int i = 0; i < k; ++i) { + dpctl::tensor::usm_ndarray ind_ = ind[i]; + + if (!dpctl::utils::queues_are_compatible(exec_q, {ind_})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // ndim, type, and shape are checked against the first array + if (i > 0) { + if (!(ind_.get_ndim() == ind_nd)) { + throw py::value_error("Index dimensions are not the same"); + } + + if (!(ind_type_id == + array_types.typenum_to_lookup_id(ind_.get_typenum()))) { + throw py::type_error( + "Indices array data types are not all the same."); + } + + const py::ssize_t *ind_shape_ = ind_.get_shape_raw(); + for (int dim = 0; dim < ind_nd; ++dim) { + if (!(ind_shape[dim] == ind_shape_[dim])) { + throw py::value_error("Indices shapes are not all equal."); + } + } + } + + // check for overlap with destination + if (overlap(dst, ind_)) { + throw py::value_error( + "Arrays index overlapping segments of memory"); + } + + char *ind_data = ind_.get_data(); + + // strides are initialized to 0 for 0D indices, so skip here + if (ind_nd > 0) { + auto ind_strides = ind_.get_strides_vector(); + std::copy(ind_strides.begin(), ind_strides.end(), + ind_sh_sts.begin() + (i + 1) * ind_nd); + } + + ind_ptrs.push_back(ind_data); + ind_offsets.push_back(py::ssize_t(0)); + } + + if (ind_nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + auto packed_ind_ptrs_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + char **packed_ind_ptrs = packed_ind_ptrs_owner.get(); + + // rearrange to past where indices shapes are checked + // 
packed_ind_shapes_strides = [ind_shape, + // ind[0] strides, + // ..., + // ind[k] strides] + auto packed_ind_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + (k + 1) * ind_sh_elems, exec_q); + py::ssize_t *packed_ind_shapes_strides = + packed_ind_shapes_strides_owner.get(); + + auto packed_ind_offsets_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + py::ssize_t *packed_ind_offsets = packed_ind_offsets_owner.get(); + + int orthog_sh_elems = std::max(src_nd - k, 1); + + // packed_shapes_strides = [src_shape[:axis] + src_shape[axis+k:], + // src_strides[:axis] + src_strides[axis+k:], + // dst_strides[:axis] + + // dst_strides[axis+ind.ndim:]] + auto packed_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 3 * orthog_sh_elems, exec_q); + py::ssize_t *packed_shapes_strides = packed_shapes_strides_owner.get(); + + // packed_axes_shapes_strides = [src_shape[axis:axis+k], + // src_strides[axis:axis+k], + // dst_shape[axis:axis+ind.ndim], + // dst_strides[axis:axis+ind.ndim]] + auto packed_axes_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 2 * (k + ind_sh_elems), exec_q); + py::ssize_t *packed_axes_shapes_strides = + packed_axes_shapes_strides_owner.get(); + + auto src_strides = src.get_strides_vector(); + auto dst_strides = dst.get_strides_vector(); + + std::vector host_task_events; + host_task_events.reserve(2); + + std::vector pack_deps = _populate_kernel_params( + exec_q, host_task_events, packed_ind_ptrs, packed_ind_shapes_strides, + packed_ind_offsets, packed_shapes_strides, packed_axes_shapes_strides, + src_shape, dst_shape, src_strides, dst_strides, ind_sh_sts, ind_ptrs, + ind_offsets, axis_start, k, ind_nd, src_nd, orthog_sh_elems, + ind_sh_elems); + + std::vector all_deps; + all_deps.reserve(depends.size() + pack_deps.size()); + all_deps.insert(std::end(all_deps), std::begin(pack_deps), + std::end(pack_deps)); + all_deps.insert(std::end(all_deps), 
std::begin(depends), std::end(depends)); + + auto fn = take_dispatch_table[mode][src_type_id][ind_type_id]; + + if (fn == nullptr) { + sycl::event::wait(host_task_events); + throw std::runtime_error("Indices must be integer type, got " + + std::to_string(ind_type_id)); + } + + sycl::event take_generic_ev = + fn(exec_q, orthog_nelems, ind_nelems, orthog_sh_elems, ind_sh_elems, k, + packed_shapes_strides, packed_axes_shapes_strides, + packed_ind_shapes_strides, src_data, dst_data, packed_ind_ptrs, + src_offset, dst_offset, packed_ind_offsets, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {take_generic_ev}, packed_shapes_strides_owner, + packed_axes_shapes_strides_owner, packed_ind_shapes_strides_owner, + packed_ind_ptrs_owner, packed_ind_offsets_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {src, py_ind, dst}, host_task_events); + + return std::make_pair(arg_cleanup_ev, take_generic_ev); +} + +std::pair + usm_ndarray_put(const dpctl::tensor::usm_ndarray &dst, + const py::object &py_ind, + const dpctl::tensor::usm_ndarray &val, + int axis_start, + std::uint8_t mode, + sycl::queue &exec_q, + const std::vector &depends) +{ + std::vector ind = parse_py_ind(exec_q, py_ind); + int k = ind.size(); + + if (k == 0) { + // no indices to write to + throw py::value_error("List of indices is empty."); + } + + if (axis_start < 0) { + throw py::value_error("Axis cannot be negative."); + } + + if (mode != 0 && mode != 1) { + throw py::value_error("Mode must be 0 or 1."); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + const dpctl::tensor::usm_ndarray ind_rep = ind[0]; + + int dst_nd = dst.get_ndim(); + int val_nd = val.get_ndim(); + int ind_nd = ind_rep.get_ndim(); + + auto sh_elems = std::max(dst_nd, 1); + + if (axis_start + k > sh_elems) { + throw py::value_error("Axes are out of 
range for array of dimension " + + std::to_string(dst_nd)); + } + if (dst_nd == 0) { + if (val_nd != ind_nd) { + throw py::value_error("Destination is not of appropriate dimension " + "for put function."); + } + } + else { + if (val_nd != (dst_nd - k + ind_nd)) { + throw py::value_error("Destination is not of appropriate dimension " + "for put function."); + } + } + + std::size_t dst_nelems = dst.get_size(); + + const py::ssize_t *dst_shape = dst.get_shape_raw(); + const py::ssize_t *val_shape = val.get_shape_raw(); + + bool orthog_shapes_equal(true); + std::size_t orthog_nelems(1); + for (int i = 0; i < (dst_nd - k); ++i) { + auto idx1 = (i < axis_start) ? i : i + k; + auto idx2 = (i < axis_start) ? i : i + ind_nd; + + orthog_nelems *= static_cast(dst_shape[idx1]); + orthog_shapes_equal = + orthog_shapes_equal && (dst_shape[idx1] == val_shape[idx2]); + } + + if (!orthog_shapes_equal) { + throw py::value_error( + "Axes of basic indices are not of matching shapes."); + } + + if (orthog_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + char *dst_data = dst.get_data(); + char *val_data = val.get_data(); + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst, val})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(val, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + py::ssize_t dst_offset = py::ssize_t(0); + py::ssize_t val_offset = py::ssize_t(0); + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems); + + int dst_typenum = dst.get_typenum(); + int val_typenum = val.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + int val_type_id = array_types.typenum_to_lookup_id(val_typenum); + + if (dst_type_id != val_type_id) { + throw py::type_error("Array data types are not 
the same."); + } + + const py::ssize_t *ind_shape = ind_rep.get_shape_raw(); + + int ind_typenum = ind_rep.get_typenum(); + int ind_type_id = array_types.typenum_to_lookup_id(ind_typenum); + + std::size_t ind_nelems(1); + for (int i = 0; i < ind_nd; ++i) { + ind_nelems *= static_cast(ind_shape[i]); + + if (!(ind_shape[i] == val_shape[axis_start + i])) { + throw py::value_error( + "Indices shapes does not match shape of axis in vals."); + } + } + + auto ind_sh_elems = std::max(ind_nd, 1); + + std::vector ind_ptrs; + ind_ptrs.reserve(k); + std::vector ind_offsets; + ind_offsets.reserve(k); + std::vector ind_sh_sts((k + 1) * ind_sh_elems, py::ssize_t(0)); + if (ind_nd > 0) { + std::copy(ind_shape, ind_shape + ind_sh_elems, ind_sh_sts.begin()); + } + for (int i = 0; i < k; ++i) { + dpctl::tensor::usm_ndarray ind_ = ind[i]; + + if (!dpctl::utils::queues_are_compatible(exec_q, {ind_})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // ndim, type, and shape are checked against the first array + if (i > 0) { + if (!(ind_.get_ndim() == ind_nd)) { + throw py::value_error("Index dimensions are not the same"); + } + + if (!(ind_type_id == + array_types.typenum_to_lookup_id(ind_.get_typenum()))) { + throw py::type_error( + "Indices array data types are not all the same."); + } + + const py::ssize_t *ind_shape_ = ind_.get_shape_raw(); + for (int dim = 0; dim < ind_nd; ++dim) { + if (!(ind_shape[dim] == ind_shape_[dim])) { + throw py::value_error("Indices shapes are not all equal."); + } + } + } + + // check for overlap with destination + if (overlap(ind_, dst)) { + throw py::value_error( + "Arrays index overlapping segments of memory"); + } + + char *ind_data = ind_.get_data(); + + // strides are initialized to 0 for 0D indices, so skip here + if (ind_nd > 0) { + auto ind_strides = ind_.get_strides_vector(); + std::copy(ind_strides.begin(), ind_strides.end(), + ind_sh_sts.begin() + (i + 1) * ind_nd); + } + + 
ind_ptrs.push_back(ind_data); + ind_offsets.push_back(py::ssize_t(0)); + } + + if (ind_nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + auto packed_ind_ptrs_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + char **packed_ind_ptrs = packed_ind_ptrs_owner.get(); + + // packed_ind_shapes_strides = [ind_shape, + // ind[0] strides, + // ..., + // ind[k] strides] + auto packed_ind_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + (k + 1) * ind_sh_elems, exec_q); + py::ssize_t *packed_ind_shapes_strides = + packed_ind_shapes_strides_owner.get(); + + auto packed_ind_offsets_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + py::ssize_t *packed_ind_offsets = packed_ind_offsets_owner.get(); + + int orthog_sh_elems = std::max(dst_nd - k, 1); + + // packed_shapes_strides = [dst_shape[:axis] + dst_shape[axis+k:], + // dst_strides[:axis] + dst_strides[axis+k:], + // val_strides[:axis] + + // val_strides[axis+ind.ndim:]] + auto packed_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 3 * orthog_sh_elems, exec_q); + py::ssize_t *packed_shapes_strides = packed_shapes_strides_owner.get(); + + // packed_axes_shapes_strides = [dst_shape[axis:axis+k], + // dst_strides[axis:axis+k], + // val_shape[axis:axis+ind.ndim], + // val_strides[axis:axis+ind.ndim]] + auto packed_axes_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 2 * (k + ind_sh_elems), exec_q); + py::ssize_t *packed_axes_shapes_strides = + packed_axes_shapes_strides_owner.get(); + + auto dst_strides = dst.get_strides_vector(); + auto val_strides = val.get_strides_vector(); + + std::vector host_task_events; + host_task_events.reserve(2); + + std::vector pack_deps = _populate_kernel_params( + exec_q, host_task_events, packed_ind_ptrs, packed_ind_shapes_strides, + packed_ind_offsets, packed_shapes_strides, packed_axes_shapes_strides, + dst_shape, val_shape, dst_strides, 
val_strides, ind_sh_sts, ind_ptrs, + ind_offsets, axis_start, k, ind_nd, dst_nd, orthog_sh_elems, + ind_sh_elems); + + std::vector all_deps; + all_deps.reserve(depends.size() + pack_deps.size()); + all_deps.insert(std::end(all_deps), std::begin(pack_deps), + std::end(pack_deps)); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + auto fn = put_dispatch_table[mode][dst_type_id][ind_type_id]; + + if (fn == nullptr) { + sycl::event::wait(host_task_events); + throw std::runtime_error("Indices must be integer type, got " + + std::to_string(ind_type_id)); + } + + sycl::event put_generic_ev = + fn(exec_q, orthog_nelems, ind_nelems, orthog_sh_elems, ind_sh_elems, k, + packed_shapes_strides, packed_axes_shapes_strides, + packed_ind_shapes_strides, dst_data, val_data, packed_ind_ptrs, + dst_offset, val_offset, packed_ind_offsets, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {put_generic_ev}, packed_shapes_strides_owner, + packed_axes_shapes_strides_owner, packed_ind_shapes_strides_owner, + packed_ind_ptrs_owner, packed_ind_offsets_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {dst, py_ind, val}, host_task_events); + + return std::make_pair(arg_cleanup_ev, put_generic_ev); +} + +void init_advanced_indexing_dispatch_tables(void) +{ + using namespace td_ns; + + using dpctl::tensor::kernels::indexing::TakeClipFactory; + DispatchTableBuilder + dtb_takeclip; + dtb_takeclip.populate_dispatch_table(take_dispatch_table[CLIP_MODE]); + + using dpctl::tensor::kernels::indexing::TakeWrapFactory; + DispatchTableBuilder + dtb_takewrap; + dtb_takewrap.populate_dispatch_table(take_dispatch_table[WRAP_MODE]); + + using dpctl::tensor::kernels::indexing::PutClipFactory; + DispatchTableBuilder dtb_putclip; + dtb_putclip.populate_dispatch_table(put_dispatch_table[CLIP_MODE]); + + using 
dpctl::tensor::kernels::indexing::PutWrapFactory; + DispatchTableBuilder dtb_putwrap; + dtb_putwrap.populate_dispatch_table(put_dispatch_table[WRAP_MODE]); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp new file mode 100644 index 000000000000..57f0ddda132c --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp @@ -0,0 +1,73 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file declares Python API for implementation functions of +/// dpctl.tensor.take and dpctl.tensor.put +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair + usm_ndarray_take(const dpctl::tensor::usm_ndarray &, + const py::object &, + const dpctl::tensor::usm_ndarray &, + int, + std::uint8_t, + sycl::queue &, + const std::vector & = {}); + +extern std::pair + usm_ndarray_put(const dpctl::tensor::usm_ndarray &, + const py::object &, + const dpctl::tensor::usm_ndarray &, + int, + std::uint8_t, + sycl::queue &, + const std::vector & = {}); + +extern void init_advanced_indexing_dispatch_tables(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 911d75ebd925..c18761031fd0 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -55,7 +55,7 @@ #include "device_support_queries.hpp" // #include "eye_ctor.hpp" // #include 
"full_ctor.hpp" -// #include "integer_advanced_indexing.hpp" +#include "integer_advanced_indexing.hpp" #include "kernels/dpctl_tensor_types.hpp" // #include "linear_sequences.hpp" // #include "repeat.hpp" @@ -110,8 +110,8 @@ using dpctl::tensor::py_internal::py_as_f_contig; // using dpctl::tensor::py_internal::usm_ndarray_zeros; /* ============== Advanced Indexing ============= */ -// using dpctl::tensor::py_internal::usm_ndarray_put; -// using dpctl::tensor::py_internal::usm_ndarray_take; +using dpctl::tensor::py_internal::usm_ndarray_put; +using dpctl::tensor::py_internal::usm_ndarray_take; // using dpctl::tensor::py_internal::py_extract; // using dpctl::tensor::py_internal::py_mask_positions; @@ -145,7 +145,7 @@ void init_dispatch_tables(void) init_copy_and_cast_usm_to_usm_dispatch_tables(); // init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); - // init_advanced_indexing_dispatch_tables(); + init_advanced_indexing_dispatch_tables(); // init_where_dispatch_tables(); return; } @@ -332,23 +332,23 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"), // py::arg("depends") = py::list()); - // m.def("_take", &usm_ndarray_take, - // "Takes elements at usm_ndarray indices `ind` and axes starting " - // "at axis `axis_start` from array `src` and copies them " - // "into usm_ndarray `dst` synchronously." - // "Returns a tuple of events: (hev, ev)", - // py::arg("src"), py::arg("ind"), py::arg("dst"), - // py::arg("axis_start"), py::arg("mode"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); - - // m.def("_put", &usm_ndarray_put, - // "Puts elements at usm_ndarray indices `ind` and axes starting " - // "at axis `axis_start` into array `dst` from " - // "usm_ndarray `val` synchronously." 
- // "Returns a tuple of events: (hev, ev)", - // py::arg("dst"), py::arg("ind"), py::arg("val"), - // py::arg("axis_start"), py::arg("mode"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + m.def("_take", &usm_ndarray_take, + "Takes elements at usm_ndarray indices `ind` and axes starting " + "at axis `axis_start` from array `src` and copies them " + "into usm_ndarray `dst` synchronously." + "Returns a tuple of events: (hev, ev)", + py::arg("src"), py::arg("ind"), py::arg("dst"), py::arg("axis_start"), + py::arg("mode"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_put", &usm_ndarray_put, + "Puts elements at usm_ndarray indices `ind` and axes starting " + "at axis `axis_start` into array `dst` from " + "usm_ndarray `val` synchronously." + "Returns a tuple of events: (hev, ev)", + py::arg("dst"), py::arg("ind"), py::arg("val"), py::arg("axis_start"), + py::arg("mode"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); // m.def("_eye", &usm_ndarray_eye, // "Fills input 2D contiguous usm_ndarray `dst` with " From 87e5482f2faf3bff2549b48c999bbab516fce168 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 09:59:18 -0800 Subject: [PATCH 014/145] Use put/take from dpctl_ext.tensor in dpnp --- dpnp/dpnp_iface_indexing.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 6e7ab778299b..6421f39fd4e4 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -52,6 +52,8 @@ from dpctl.tensor._indexing_functions import _get_indexing_mode from dpctl.tensor._numpy_helper import normalize_axis_index +import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor._tensor_impl as ti_ext import dpnp # pylint: disable=no-name-in-module @@ -295,7 +297,7 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): "Input and output allocation queues are not compatible" ) - if ti._array_overlap(x, out): 
+ if ti_ext._array_overlap(x, out): # Allocate a temporary buffer to avoid memory overlapping. out = dpt.empty_like(out) else: @@ -304,7 +306,7 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): _manager = dpu.SequentialOrderManager[q] dep_evs = _manager.submitted_events - h_ev, take_ev = ti._take( + h_ev, take_ev = ti_ext._take( src=x, ind=(inds,), dst=out, @@ -813,7 +815,7 @@ def extract(condition, a): usm_a = dpt.reshape(usm_a, -1) usm_cond = dpt.reshape(usm_cond, -1) - usm_res = dpt.take(usm_a, dpt.nonzero(usm_cond)[0]) + usm_res = dpt_ext.take(usm_a, dpt.nonzero(usm_cond)[0]) else: if usm_cond.shape != usm_a.shape: usm_a = dpt.reshape(usm_a, -1) @@ -1713,7 +1715,7 @@ def put(a, ind, v, /, *, axis=None, mode="wrap"): if axis is None and usm_a.ndim > 1: usm_a = dpt.reshape(usm_a, -1) - dpt.put(usm_a, usm_ind, usm_v, axis=axis, mode=mode) + dpt_ext.put(usm_a, usm_ind, usm_v, axis=axis, mode=mode) if in_usm_a._pointer != usm_a._pointer: # pylint: disable=protected-access in_usm_a[:] = dpt.reshape(usm_a, in_usm_a.shape, copy=False) From b537f30115be31858782e6a7ace1fc52f54c5f9d Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 10:33:51 -0800 Subject: [PATCH 015/145] Move full() to dpctl_ext/tensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- dpctl_ext/tensor/__init__.py | 4 + dpctl_ext/tensor/_ctors.py | 169 ++++++++++ .../include/kernels/constructors.hpp | 171 ++++++++++ .../tensor/libtensor/source/full_ctor.cpp | 315 ++++++++++++++++++ .../tensor/libtensor/source/full_ctor.hpp | 60 ++++ .../tensor/libtensor/source/tensor_ctors.cpp | 14 +- 7 files changed, 727 insertions(+), 8 deletions(-) create mode 100644 dpctl_ext/tensor/_ctors.py create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/full_ctor.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/full_ctor.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt 
index ae8b72d71873..0c52d766afbf 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -52,7 +52,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 35453dbf9a46..9f4c27608a99 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -27,12 +27,16 @@ # ***************************************************************************** +from dpctl_ext.tensor._ctors import ( + full, +) from dpctl_ext.tensor._indexing_functions import ( put, take, ) __all__ = [ + "full", "put", "take", ] diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py new file mode 100644 index 000000000000..5caa07099c56 --- /dev/null +++ b/dpctl_ext/tensor/_ctors.py @@ -0,0 +1,169 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from numbers import Number + +import dpctl +import dpctl.tensor as dpt +import dpctl.utils +import numpy as np +from dpctl.tensor._data_types import _get_dtype +from dpctl.tensor._device import normalize_queue_device + +import dpctl_ext.tensor._tensor_impl as ti + + +def _cast_fill_val(fill_val, dt): + """ + Casts the Python scalar `fill_val` to another Python type coercible to the + requested data type `dt`, if necessary. + """ + val_type = type(fill_val) + if val_type in [float, complex] and np.issubdtype(dt, np.integer): + return int(fill_val.real) + elif val_type is complex and np.issubdtype(dt, np.floating): + return fill_val.real + elif val_type is int and np.issubdtype(dt, np.integer): + return _to_scalar(fill_val, dt) + else: + return fill_val + + +def _to_scalar(obj, sc_ty): + """A way to convert object to NumPy scalar type. 
+ Raises OverflowError if obj can not be represented + using the requested scalar type. + """ + zd_arr = np.asarray(obj, dtype=sc_ty) + return zd_arr[()] + + +def _validate_fill_value(fill_val): + """Validates that `fill_val` is a numeric or boolean scalar.""" + # TODO: verify if `np.True_` and `np.False_` should be instances of + # Number in NumPy, like other NumPy scalars and like Python bools + # check for `np.bool_` separately as NumPy<2 has no `np.bool` + if not isinstance(fill_val, Number) and not isinstance(fill_val, np.bool_): + raise TypeError( + f"array cannot be filled with scalar of type {type(fill_val)}" + ) + + +def full( + shape, + fill_value, + *, + dtype=None, + order="C", + device=None, + usm_type=None, + sycl_queue=None, +): + """ + Returns a new :class:`dpctl.tensor.usm_ndarray` having a specified + shape and filled with `fill_value`. + + Args: + shape (tuple): + Dimensions of the array to be created. + fill_value (int,float,complex,usm_ndarray): + fill value + dtype (optional): data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, + or a NumPy scalar type. Default: ``None`` + order ("C", or "F"): + memory layout for the array. Default: ``"C"`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. 
+ If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + New array initialized with given value. + """ + if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'F' or 'C'." + ) + order = order[0].upper() + dpctl.utils.validate_usm_type(usm_type, allow_none=True) + + if isinstance(fill_value, (dpt.usm_ndarray, np.ndarray, tuple, list)): + if ( + isinstance(fill_value, dpt.usm_ndarray) + and sycl_queue is None + and device is None + ): + sycl_queue = fill_value.sycl_queue + else: + sycl_queue = normalize_queue_device( + sycl_queue=sycl_queue, device=device + ) + X = dpt.asarray( + fill_value, + dtype=dtype, + order=order, + usm_type=usm_type, + sycl_queue=sycl_queue, + ) + return dpt.copy(dpt.broadcast_to(X, shape), order=order) + else: + _validate_fill_value(fill_value) + + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + usm_type = usm_type if usm_type is not None else "device" + dtype = _get_dtype(dtype, sycl_queue, ref_type=type(fill_value)) + res = dpt.usm_ndarray( + shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + fill_value = _cast_fill_val(fill_value, dtype) + + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue) + _manager.add_event_pair(hev, full_ev) + return res diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp new file mode 100644 index 000000000000..dfd1b889aafe --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -0,0 +1,171 @@ 
+//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor constructors. 
+//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/offset_utils.hpp" +#include "utils/strided_iters.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace constructors +{ + +using dpctl::tensor::ssize_t; + +/*! + @defgroup CtorKernels + */ + +template +class full_strided_kernel; + +using namespace dpctl::tensor::offset_utils; + +/* ================ Full ================== */ + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param fill_v Value to fill the array with + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event full_contig_impl(sycl::queue &q, + std::size_t nelems, + dstTy fill_v, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + sycl::event fill_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + dstTy *p = reinterpret_cast(dst_p); + cgh.fill(p, fill_v, nelems); + }); + + return fill_ev; +} + +template +class FullStridedFunctor +{ +private: + Ty *p = nullptr; + Ty fill_v; + IndexerT indexer; + +public: + FullStridedFunctor(Ty *p_, const Ty &fill_v_, const IndexerT &indexer_) + : p(p_), fill_v(fill_v_), indexer(indexer_) + { + } + + void operator()(sycl::id<1> id) const + { + auto offset = indexer(id.get(0)); + p[offset] = fill_v; + } +}; + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. 
+ * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nd Array dimensionality + * @param nelems Length of the sequence + * @param shape_strides Kernel accessible USM pointer to packed shape and + * strides of array. + * @param fill_v Value to fill the array with + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event full_strided_impl(sycl::queue &q, + int nd, + std::size_t nelems, + const ssize_t *shape_strides, + dstTy fill_v, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + dstTy *dst_tp = reinterpret_cast(dst_p); + + using dpctl::tensor::offset_utils::StridedIndexer; + const StridedIndexer strided_indexer(nd, 0, shape_strides); + + sycl::event fill_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = full_strided_kernel; + using Impl = FullStridedFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(dst_tp, fill_v, strided_indexer)); + }); + + return fill_ev; +} + +} // namespace constructors +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp new file mode 100644 index 000000000000..e1f61be4a12a --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp @@ -0,0 +1,315 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/constructors.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "full_ctor.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::utils::keep_args_alive; + +typedef sycl::event (*full_contig_fn_ptr_t)(sycl::queue &, + std::size_t, + const py::object &, + char *, + const std::vector &); + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param py_value Python object representing the value to fill the array with. + * Must be convertible to `dstTy`. + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event full_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const py::object &py_value, + char *dst_p, + const std::vector &depends) +{ + dstTy fill_v = py::cast(py_value); + + sycl::event fill_ev; + + if constexpr (sizeof(dstTy) == sizeof(char)) { + const auto memset_val = sycl::bit_cast(fill_v); + fill_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.memset(reinterpret_cast(dst_p), memset_val, + nelems * sizeof(dstTy)); + }); + } + else { + bool is_zero = false; + if constexpr (sizeof(dstTy) == 1) { + is_zero = (std::uint8_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 2) { + is_zero = + (std::uint16_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 4) { + is_zero = + (std::uint32_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 8) { + is_zero = + (std::uint64_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 16) { + struct UInt128 + { + + constexpr UInt128() : v1{}, v2{} {} + UInt128(const UInt128 &) = default; + + operator bool() const + { + return bool(!v1) && bool(!v2); + } + + std::uint64_t v1; + std::uint64_t v2; + }; + is_zero = static_cast(sycl::bit_cast(fill_v)); + } + + if (is_zero) { + static constexpr int memset_val = 0; + fill_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.memset(reinterpret_cast(dst_p), memset_val, + nelems * sizeof(dstTy)); + }); + } + else { + using dpctl::tensor::kernels::constructors::full_contig_impl; + + fill_ev = + full_contig_impl(exec_q, nelems, fill_v, dst_p, depends); + } + } + + return fill_ev; +} + +template +struct FullContigFactory +{ + fnT get() + { + fnT f = full_contig_impl; + return f; + } +}; + +typedef sycl::event (*full_strided_fn_ptr_t)(sycl::queue &, + int, + std::size_t, + py::ssize_t *, + const py::object &, + char *, + const std::vector &); + +/*! 
+ * @brief Function to submit kernel to fill given strided memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nd Array dimensionality + * @param nelems Length of the sequence + * @param shape_strides Kernel accessible USM pointer to packed shape and + * strides of array. + * @param py_value Python object representing the value to fill the array with. + * Must be convertible to `dstTy`. + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event full_strided_impl(sycl::queue &exec_q, + int nd, + std::size_t nelems, + py::ssize_t *shape_strides, + const py::object &py_value, + char *dst_p, + const std::vector &depends) +{ + dstTy fill_v = py::cast(py_value); + + using dpctl::tensor::kernels::constructors::full_strided_impl; + sycl::event fill_ev = full_strided_impl( + exec_q, nd, nelems, shape_strides, fill_v, dst_p, depends); + + return fill_ev; +} + +template +struct FullStridedFactory +{ + fnT get() + { + fnT f = full_strided_impl; + return f; + } +}; + +static full_contig_fn_ptr_t full_contig_dispatch_vector[td_ns::num_types]; +static full_strided_fn_ptr_t full_strided_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_full(const py::object &py_value, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + // py_value should be coercible into data type of dst + + py::ssize_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + 
dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + char *dst_data = dst.get_data(); + + if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) { + auto fn = full_contig_dispatch_vector[dst_typeid]; + + sycl::event full_contig_event = + fn(exec_q, static_cast(dst_nelems), py_value, dst_data, + depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, {full_contig_event}), + full_contig_event); + } + else { + int nd = dst.get_ndim(); + auto const &dst_shape = dst.get_shape_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + auto fn = full_strided_dispatch_vector[dst_typeid]; + + std::vector host_task_events; + host_task_events.reserve(2); + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, dst_shape, dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + py::ssize_t *shape_strides = shape_strides_owner.get(); + + const sycl::event &full_strided_ev = + fn(exec_q, nd, dst_nelems, shape_strides, py_value, dst_data, + {copy_shape_ev}); + + // free shape_strides + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {full_strided_ev}, shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {dst}, host_task_events), + full_strided_ev); + } +} + +void init_full_ctor_dispatch_vectors(void) +{ + using namespace td_ns; + + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(full_contig_dispatch_vector); + + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(full_strided_dispatch_vector); +} + +} // namespace py_internal +} // 
namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp new file mode 100644 index 000000000000..d664b2013506 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp @@ -0,0 +1,60 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair + usm_ndarray_full(const py::object &py_value, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_full_ctor_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index c18761031fd0..c72c0b49622a 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -54,7 +54,7 @@ // #include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" // #include "eye_ctor.hpp" -// #include "full_ctor.hpp" +#include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include "kernels/dpctl_tensor_types.hpp" // #include "linear_sequences.hpp" @@ -103,7 +103,7 @@ using dpctl::tensor::py_internal::py_as_f_contig; /* ================ Full ================== */ -// using dpctl::tensor::py_internal::usm_ndarray_full; +using dpctl::tensor::py_internal::usm_ndarray_full; /* ================ Zeros ================== */ @@ -159,7 +159,7 @@ void init_dispatch_vectors(void) // init_copy_for_reshape_dispatch_vectors(); // init_copy_for_roll_dispatch_vectors(); // init_linear_sequences_dispatch_vectors(); - // init_full_ctor_dispatch_vectors(); + init_full_ctor_dispatch_vectors(); // init_zeros_ctor_dispatch_vectors(); // init_eye_ctor_dispatch_vectors(); // 
init_triul_ctor_dispatch_vectors(); @@ -327,10 +327,10 @@ PYBIND11_MODULE(_tensor_impl, m) // "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), // py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_full_usm_ndarray", &usm_ndarray_full, - // "Populate usm_ndarray `dst` with given fill_value.", - // py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + m.def("_full_usm_ndarray", &usm_ndarray_full, + "Populate usm_ndarray `dst` with given fill_value.", + py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); m.def("_take", &usm_ndarray_take, "Takes elements at usm_ndarray indices `ind` and axes starting " From d50f263f089dfd52edb4daa15edd3f86807965e5 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:06:00 -0800 Subject: [PATCH 016/145] Use full and _full_usm_ndarray from dpctl_ext in dpnp --- dpnp/dpnp_algo/dpnp_fill.py | 6 ++++-- dpnp/dpnp_container.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py index 112ea3af0fdb..f7e6f0f608b1 100644 --- a/dpnp/dpnp_algo/dpnp_fill.py +++ b/dpnp/dpnp_algo/dpnp_fill.py @@ -32,12 +32,14 @@ import dpctl.utils as dpu from dpctl.tensor._ctors import _cast_fill_val from dpctl.tensor._tensor_impl import ( - _copy_usm_ndarray_into_usm_ndarray, - _full_usm_ndarray, _zeros_usm_ndarray, ) import dpnp +from dpctl_ext.tensor._tensor_impl import ( + _copy_usm_ndarray_into_usm_ndarray, + _full_usm_ndarray, +) def dpnp_fill(arr, val): diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index 4975db17c717..b13bf96cda28 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -38,6 +38,7 @@ import dpctl.tensor as dpt import dpctl.utils as dpu +import dpctl_ext.tensor as dpt_ext import dpnp from dpnp.dpnp_array import dpnp_array @@ -228,7 +229,7 @@ def full( fill_value = fill_value.get_array() """Creates 
`dpnp_array` having a specified shape, filled with fill_value.""" - array_obj = dpt.full( + array_obj = dpt_ext.full( shape, fill_value, dtype=dtype, From f189dc540477ceadf35dcb127325056c5e0c406b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:22:55 -0800 Subject: [PATCH 017/145] Update .gitignore to ignore .so files in dpctl_ext --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 5d2725d3186f..4ae07ccbbdb9 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,5 @@ dpnp/**/*.cpython*.so dpnp/**/*.pyd *~ core + +dpctl_ext/**/*.cpython*.so From f9a181721784c843907c16e2e1d5569c487cf9e3 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:23:51 -0800 Subject: [PATCH 018/145] Move _zeros_usm_ndarray to dpctl_ext --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../tensor/libtensor/source/tensor_ctors.cpp | 12 +- .../tensor/libtensor/source/zeros_ctor.cpp | 168 ++++++++++++++++++ .../tensor/libtensor/source/zeros_ctor.hpp | 59 ++++++ 4 files changed, 234 insertions(+), 7 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 0c52d766afbf..cb468b9a226d 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -53,7 +53,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp diff --git 
a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index c72c0b49622a..b55439162f90 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -64,7 +64,7 @@ #include "utils/memory_overlap.hpp" #include "utils/strided_iters.hpp" // #include "where.hpp" -// #include "zeros_ctor.hpp" +#include "zeros_ctor.hpp" namespace py = pybind11; @@ -107,7 +107,7 @@ using dpctl::tensor::py_internal::usm_ndarray_full; /* ================ Zeros ================== */ -// using dpctl::tensor::py_internal::usm_ndarray_zeros; +using dpctl::tensor::py_internal::usm_ndarray_zeros; /* ============== Advanced Indexing ============= */ using dpctl::tensor::py_internal::usm_ndarray_put; @@ -160,7 +160,7 @@ void init_dispatch_vectors(void) // init_copy_for_roll_dispatch_vectors(); // init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); - // init_zeros_ctor_dispatch_vectors(); + init_zeros_ctor_dispatch_vectors(); // init_eye_ctor_dispatch_vectors(); // init_triul_ctor_dispatch_vectors(); @@ -323,9 +323,9 @@ PYBIND11_MODULE(_tensor_impl, m) // synchronously.", py::arg("src"), py::arg("dst"), // py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_zeros_usm_ndarray", &usm_ndarray_zeros, - // "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_zeros_usm_ndarray", &usm_ndarray_zeros, + "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); m.def("_full_usm_ndarray", &usm_ndarray_full, "Populate usm_ndarray `dst` with given fill_value.", diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp new file mode 100644 index 000000000000..4558743b3c22 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp @@ -0,0 +1,168 @@ 
+//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/constructors.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "zeros_ctor.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::utils::keep_args_alive; + +typedef sycl::event (*zeros_contig_fn_ptr_t)(sycl::queue &, + std::size_t, + char *, + const std::vector &); + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with zeros. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event zeros_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + char *dst_p, + const std::vector &depends) +{ + + static constexpr int memset_val(0); + sycl::event fill_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.memset(reinterpret_cast(dst_p), memset_val, + nelems * sizeof(dstTy)); + }); + + return fill_ev; +} + +template +struct ZerosContigFactory +{ + fnT get() + { + fnT f = zeros_contig_impl; + return f; + } +}; + +static zeros_contig_fn_ptr_t zeros_contig_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + py::ssize_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + char *dst_data = dst.get_data(); + + if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) { + auto fn = zeros_contig_dispatch_vector[dst_typeid]; + + sycl::event zeros_contig_event = + fn(exec_q, static_cast(dst_nelems), dst_data, depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, {zeros_contig_event}), + zeros_contig_event); + } + else { + throw std::runtime_error( + "Only population of contiguous usm_ndarray objects is supported."); + } +} + +void init_zeros_ctor_dispatch_vectors(void) +{ + using namespace td_ns; + + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(zeros_contig_dispatch_vector); + + return; +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl 
diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp new file mode 100644 index 000000000000..51270a3443cc --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp @@ -0,0 +1,59 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair + usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_zeros_ctor_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl From 4b8505acf111ec2636afa0d2a9a25cf8677e02c7 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:25:05 -0800 Subject: [PATCH 019/145] Use _zeros_usm_ndarray from dpctl_ext in dpnp_fill.py --- dpnp/dpnp_algo/dpnp_fill.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py index f7e6f0f608b1..0d6640c3b8b5 100644 --- a/dpnp/dpnp_algo/dpnp_fill.py +++ b/dpnp/dpnp_algo/dpnp_fill.py @@ -31,14 +31,12 @@ import dpctl.tensor as dpt import dpctl.utils as dpu from dpctl.tensor._ctors import _cast_fill_val -from dpctl.tensor._tensor_impl import ( - _zeros_usm_ndarray, -) import dpnp from dpctl_ext.tensor._tensor_impl import ( _copy_usm_ndarray_into_usm_ndarray, _full_usm_ndarray, + _zeros_usm_ndarray, ) From 61106b2e208d7f331bebc3335a49bc23212510c1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:39:35 -0800 Subject: [PATCH 020/145] Move linear-sequence implementations to dpctl_ext/tensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../include/kernels/constructors.hpp | 178 ++++++++++ .../libtensor/source/linear_sequences.cpp | 312 ++++++++++++++++++ .../libtensor/source/linear_sequences.hpp | 69 ++++ 
.../tensor/libtensor/source/tensor_ctors.cpp | 38 +-- 5 files changed, 579 insertions(+), 20 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index cb468b9a226d..af0e2a7aa49f 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -48,7 +48,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index dfd1b889aafe..20775b071ea8 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -58,11 +58,189 @@ using dpctl::tensor::ssize_t; @defgroup CtorKernels */ +template +class linear_sequence_step_kernel; +template +class linear_sequence_affine_kernel; template class full_strided_kernel; +// template class eye_kernel; using namespace dpctl::tensor::offset_utils; +template +class LinearSequenceStepFunctor +{ +private: + Ty *p = nullptr; + Ty start_v; + Ty step_v; + +public: + LinearSequenceStepFunctor(char *dst_p, Ty v0, Ty dv) + : p(reinterpret_cast(dst_p)), start_v(v0), step_v(dv) + { + } + + void operator()(sycl::id<1> wiid) const + { + auto i = wiid.get(0); + using 
dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + p[i] = Ty{start_v.real() + i * step_v.real(), + start_v.imag() + i * step_v.imag()}; + } + else { + p[i] = start_v + i * step_v; + } + } +}; + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by typed starting value and + * increment. + * + * @param q Sycl queue to which the kernel is submitted + * @param nelems Length of the sequence + * @param start_v Typed starting value of the sequence + * @param step_v Typed increment of the sequence + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event lin_space_step_impl(sycl::queue &exec_q, + std::size_t nelems, + Ty start_v, + Ty step_v, + char *array_data, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + sycl::event lin_space_step_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for>( + sycl::range<1>{nelems}, + LinearSequenceStepFunctor(array_data, start_v, step_v)); + }); + + return lin_space_step_event; +} + +// Constructor to populate tensor with linear sequence defined by +// start and and data + +template +class LinearSequenceAffineFunctor +{ +private: + Ty *p = nullptr; + Ty start_v; + Ty end_v; + std::size_t n; + +public: + LinearSequenceAffineFunctor(char *dst_p, Ty v0, Ty v1, std::size_t den) + : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), + n((den == 0) ? 
1 : den) + { + } + + void operator()(sycl::id<1> wiid) const + { + auto i = wiid.get(0); + wTy wc = wTy(i) / n; + wTy w = wTy(n - i) / n; + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using reT = typename Ty::value_type; + auto _w = static_cast(w); + auto _wc = static_cast(wc); + auto re_comb = sycl::fma(start_v.real(), _w, reT(0)); + re_comb = + sycl::fma(end_v.real(), _wc, + re_comb); // start_v.real() * _w + end_v.real() * _wc; + auto im_comb = + sycl::fma(start_v.imag(), _w, + reT(0)); // start_v.imag() * _w + end_v.imag() * _wc; + im_comb = sycl::fma(end_v.imag(), _wc, im_comb); + Ty affine_comb = Ty{re_comb, im_comb}; + p[i] = affine_comb; + } + else if constexpr (std::is_floating_point::value) { + Ty _w = static_cast(w); + Ty _wc = static_cast(wc); + auto affine_comb = + sycl::fma(start_v, _w, Ty(0)); // start_v * w + end_v * wc; + affine_comb = sycl::fma(end_v, _wc, affine_comb); + p[i] = affine_comb; + } + else { + using dpctl::tensor::type_utils::convert_impl; + auto affine_comb = start_v * w + end_v * wc; + p[i] = convert_impl(affine_comb); + } + } +}; + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by typed starting and end values. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence. + * @param start_v Stating value of the sequence. + * @param end_v End-value of the sequence. + * @param include_endpoint Whether the end-value is included in the sequence. + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event lin_space_affine_impl(sycl::queue &exec_q, + std::size_t nelems, + Ty start_v, + Ty end_v, + bool include_endpoint, + char *array_data, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const bool device_supports_doubles = + exec_q.get_device().has(sycl::aspect::fp64); + const std::size_t den = (include_endpoint) ? nelems - 1 : nelems; + + sycl::event lin_space_affine_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + if (device_supports_doubles) { + using KernelName = linear_sequence_affine_kernel; + using Impl = LinearSequenceAffineFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(array_data, start_v, end_v, den)); + } + else { + using KernelName = linear_sequence_affine_kernel; + using Impl = LinearSequenceAffineFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(array_data, start_v, end_v, den)); + } + }); + + return lin_space_affine_event; +} + /* ================ Full ================== */ /*! diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp new file mode 100644 index 000000000000..02c4a8ad0fa1 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp @@ -0,0 +1,312 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpnp4pybind11.hpp" +#include +#include +#include +#include +#include +#include +#include + +#include "kernels/constructors.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "linear_sequences.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +// Constructor to populate tensor with linear sequence defined by +// start and step data + +typedef sycl::event (*lin_space_step_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + const py::object &start, + const py::object &step, + char *, // dst_data_ptr + const std::vector &); + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by starting value and increment + * given as Python objects. + * + * @param q Sycl queue to which the kernel is submitted + * @param nelems Length of the sequence + * @param start Starting value of the sequence as Python object. Must be + * convertible to array element data type `Ty`. + * @param step Increment of the sequence as Python object. Must be convertible + * to array element data type `Ty`. + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event lin_space_step_impl(sycl::queue &exec_q, + std::size_t nelems, + const py::object &start, + const py::object &step, + char *array_data, + const std::vector &depends) +{ + Ty start_v = py::cast(start); + Ty step_v = py::cast(step); + + using dpctl::tensor::kernels::constructors::lin_space_step_impl; + + auto lin_space_step_event = lin_space_step_impl( + exec_q, nelems, start_v, step_v, array_data, depends); + + return lin_space_step_event; +} + +typedef sycl::event (*lin_space_affine_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + const py::object &start, + const py::object &end, + bool include_endpoint, + char *, // dst_data_ptr + const std::vector &); + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by starting and end values given + * as Python objects. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param start Stating value of the sequence as Python object. Must be + * convertible to array data element type `Ty`. + * @param end End-value of the sequence as Python object. Must be convertible + * to array data element type `Ty`. + * @param include_endpoint Whether the end-value is included in the sequence + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event lin_space_affine_impl(sycl::queue &exec_q, + std::size_t nelems, + const py::object &start, + const py::object &end, + bool include_endpoint, + char *array_data, + const std::vector &depends) +{ + Ty start_v = py::cast(start); + Ty end_v = py::cast(end); + + using dpctl::tensor::kernels::constructors::lin_space_affine_impl; + + auto lin_space_affine_event = lin_space_affine_impl( + exec_q, nelems, start_v, end_v, include_endpoint, array_data, depends); + + return lin_space_affine_event; +} + +using dpctl::utils::keep_args_alive; + +static lin_space_step_fn_ptr_t lin_space_step_dispatch_vector[td_ns::num_types]; + +static lin_space_affine_fn_ptr_t + lin_space_affine_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_linear_sequence_step(const py::object &start, + const py::object &dt, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + // dst must be 1D and C-contiguous + // start, end should be coercible into data type of dst + + if (dst.get_ndim() != 1) { + throw py::value_error( + "usm_ndarray_linspace: Expecting 1D array to populate"); + } + + if (!dst.is_c_contiguous()) { + throw py::value_error( + "usm_ndarray_linspace: Non-contiguous arrays are not supported"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + py::ssize_t len = dst.get_shape(0); + if (len == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *dst_data = dst.get_data(); + sycl::event linspace_step_event; + + auto fn = lin_space_step_dispatch_vector[dst_typeid]; + + linspace_step_event = + fn(exec_q, 
static_cast(len), start, dt, dst_data, depends); + + return std::make_pair(keep_args_alive(exec_q, {dst}, {linspace_step_event}), + linspace_step_event); +} + +std::pair + usm_ndarray_linear_sequence_affine(const py::object &start, + const py::object &end, + const dpctl::tensor::usm_ndarray &dst, + bool include_endpoint, + sycl::queue &exec_q, + const std::vector &depends) +{ + // dst must be 1D and C-contiguous + // start, end should be coercible into data type of dst + + if (dst.get_ndim() != 1) { + throw py::value_error( + "usm_ndarray_linspace: Expecting 1D array to populate"); + } + + if (!dst.is_c_contiguous()) { + throw py::value_error( + "usm_ndarray_linspace: Non-contiguous arrays are not supported"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue context is not the same as allocation context"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + py::ssize_t len = dst.get_shape(0); + if (len == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *dst_data = dst.get_data(); + sycl::event linspace_affine_event; + + auto fn = lin_space_affine_dispatch_vector[dst_typeid]; + + linspace_affine_event = fn(exec_q, static_cast(len), start, + end, include_endpoint, dst_data, depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, {linspace_affine_event}), + linspace_affine_event); +} + +/*! + * @brief Factor to get function pointer of type `fnT` for array with elements + * of type `Ty`. + * @defgroup CtorKernels + */ +template +struct LinSpaceStepFactory +{ + fnT get() + { + fnT f = lin_space_step_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for array data type + * `Ty`. 
+ */ +template +struct LinSpaceAffineFactory +{ + fnT get() + { + fnT f = lin_space_affine_impl; + return f; + } +}; + +void init_linear_sequences_dispatch_vectors(void) +{ + using namespace td_ns; + + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(lin_space_step_dispatch_vector); + + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(lin_space_affine_dispatch_vector); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp new file mode 100644 index 000000000000..321cd2f23efe --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp @@ -0,0 +1,69 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair usm_ndarray_linear_sequence_step( + const py::object &start, + const py::object &dt, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern std::pair usm_ndarray_linear_sequence_affine( + const py::object &start, + const py::object &end, + const dpctl::tensor::usm_ndarray &dst, + bool include_endpoint, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_linear_sequences_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index b55439162f90..dd660c497f9a 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -57,7 +57,7 @@ #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include 
"kernels/dpctl_tensor_types.hpp" -// #include "linear_sequences.hpp" +#include "linear_sequences.hpp" // #include "repeat.hpp" #include "simplify_iteration_space.hpp" // #include "triul_ctor.hpp" @@ -98,8 +98,8 @@ using dpctl::tensor::py_internal::py_as_f_contig; /* ============= linear-sequence ==================== */ -// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; -// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; +using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; +using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; /* ================ Full ================== */ @@ -158,7 +158,7 @@ void init_dispatch_vectors(void) init_copy_as_contig_dispatch_vectors(); // init_copy_for_reshape_dispatch_vectors(); // init_copy_for_roll_dispatch_vectors(); - // init_linear_sequences_dispatch_vectors(); + init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); init_zeros_ctor_dispatch_vectors(); // init_eye_ctor_dispatch_vectors(); @@ -300,22 +300,22 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("shifts"), py::arg("sycl_queue"), py::arg("depends") = // py::list()); - // m.def("_linspace_step", &usm_ndarray_linear_sequence_step, - // "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " - // "specified by " - // "starting point `start` and step `dt`. " - // "Returns a tuple of events: (ht_event, comp_event)", - // py::arg("start"), py::arg("dt"), py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_linspace_step", &usm_ndarray_linear_sequence_step, + "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " + "specified by " + "starting point `start` and step `dt`. 
" + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("start"), py::arg("dt"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, - // "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " - // "specified by " - // "starting point `start` and end point `end`. " - // "Returns a tuple of events: (ht_event, comp_event)", - // py::arg("start"), py::arg("end"), py::arg("dst"), - // py::arg("include_endpoint"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, + "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " + "specified by " + "starting point `start` and end point `end`. " + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("start"), py::arg("end"), py::arg("dst"), + py::arg("include_endpoint"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); // m.def("_copy_numpy_ndarray_into_usm_ndarray", // ©_numpy_ndarray_into_usm_ndarray, From a030579be8525d6f23674d5c9a4a171ab842f500 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:40:33 -0800 Subject: [PATCH 021/145] Use _tensor_impl from dpctl_ext in dpnp_utils_fft.py --- dpnp/fft/dpnp_utils_fft.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py index 4e2b7aaaf842..c692774a424f 100644 --- a/dpnp/fft/dpnp_utils_fft.py +++ b/dpnp/fft/dpnp_utils_fft.py @@ -42,7 +42,6 @@ from collections.abc import Sequence import dpctl -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._numpy_helper import ( @@ -51,6 +50,7 @@ ) from dpctl.utils import ExecutionPlacementError +import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.fft._fft_impl as fi From a1d6fa39ba8607b191177d6acb0ca2f3cf8f49fc Mon Sep 17 00:00:00 2001 From: Vladislav 
Perevezentsev Date: Fri, 6 Feb 2026 03:03:08 -0800 Subject: [PATCH 022/145] Move tril()/triu() to dpctl_ext/tensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- dpctl_ext/tensor/__init__.py | 4 + dpctl_ext/tensor/_ctors.py | 157 +++++++++++ .../include/kernels/constructors.hpp | 138 ++++++++++ .../tensor/libtensor/source/tensor_ctors.cpp | 46 ++-- .../tensor/libtensor/source/triul_ctor.cpp | 253 ++++++++++++++++++ .../tensor/libtensor/source/triul_ctor.hpp | 62 +++++ 7 files changed, 638 insertions(+), 24 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/triul_ctor.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/triul_ctor.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index af0e2a7aa49f..1375c8316754 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -54,7 +54,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 9f4c27608a99..3c6939eff7a0 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -29,6 +29,8 @@ from dpctl_ext.tensor._ctors import ( full, + tril, + triu, ) from dpctl_ext.tensor._indexing_functions import ( put, @@ -39,4 +41,6 @@ "full", "put", "take", + "tril", + "triu", ] diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index 5caa07099c56..a0e7b28e66ff 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -26,6 +26,7 @@ # THE POSSIBILITY OF SUCH DAMAGE. 
# ***************************************************************************** +import operator from numbers import Number import dpctl @@ -167,3 +168,159 @@ def full( hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue) _manager.add_event_pair(hev, full_ev) return res + + +def tril(x, /, *, k=0): + """ + Returns the lower triangular part of a matrix (or a stack of matrices) + ``x``. + + The lower triangular part of the matrix is defined as the elements on and + below the specified diagonal ``k``. + + Args: + x (usm_ndarray): + Input array + k (int, optional): + Specifies the diagonal above which to set + elements to zero. If ``k = 0``, the diagonal is the main diagonal. + If ``k < 0``, the diagonal is below the main diagonal. + If ``k > 0``, the diagonal is above the main diagonal. + Default: ``0`` + + Returns: + usm_ndarray: + A lower-triangular array or a stack of lower-triangular arrays. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected argument of type dpctl.tensor.usm_ndarray, " + f"got {type(x)}." 
+ ) + + k = operator.index(k) + + order = "F" if (x.flags.f_contiguous) else "C" + + shape = x.shape + nd = x.ndim + if nd < 2: + raise ValueError("Array dimensions less than 2.") + + q = x.sycl_queue + if k >= shape[nd - 1] - 1: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x, dst=res, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, cpy_ev) + elif k < -shape[nd - 2]: + res = dpt.zeros( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + else: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, tril_ev = ti._tril( + src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, tril_ev) + + return res + + +def triu(x, /, *, k=0): + """ + Returns the upper triangular part of a matrix (or a stack of matrices) + ``x``. + + The upper triangular part of the matrix is defined as the elements on and + above the specified diagonal ``k``. + + Args: + x (usm_ndarray): + Input array + k (int, optional): + Specifies the diagonal below which to set + elements to zero. If ``k = 0``, the diagonal is the main diagonal. + If ``k < 0``, the diagonal is below the main diagonal. + If ``k > 0``, the diagonal is above the main diagonal. + Default: ``0`` + + Returns: + usm_ndarray: + An upper-triangular array or a stack of upper-triangular arrays. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected argument of type dpctl.tensor.usm_ndarray, " + f"got {type(x)}." 
+ ) + + k = operator.index(k) + + order = "F" if (x.flags.f_contiguous) else "C" + + shape = x.shape + nd = x.ndim + if nd < 2: + raise ValueError("Array dimensions less than 2.") + + q = x.sycl_queue + if k > shape[nd - 1]: + res = dpt.zeros( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + elif k <= -shape[nd - 2] + 1: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x, dst=res, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, cpy_ev) + else: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, triu_ev = ti._triu( + src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, triu_ev) + + return res diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index 20775b071ea8..8d53655b2754 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -343,6 +343,144 @@ sycl::event full_strided_impl(sycl::queue &q, return fill_ev; } +/* =========================== Tril and triu ============================== */ + +// define function type +typedef sycl::event (*tri_fn_ptr_t)(sycl::queue &, + ssize_t, // inner_range //ssize_t + ssize_t, // outer_range + char *, // src_data_ptr + char *, // dst_data_ptr + ssize_t, // nd + ssize_t *, // shape_and_strides + ssize_t, // k + const std::vector &, + const std::vector &); + +/*! + * @brief Function to copy triangular matrices from source stack to destination + * stack. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. 
+ * @param inner_range Number of elements in each matrix. + * @param outer_range Number of matrices to copy. + * @param src_p Kernel accessible USM pointer for the source array. + * @param dst_p Kernel accessible USM pointer for the destination array. + * @param nd The array dimensionality of source and destination arrays. + * @param shape_and_strides Kernel accessible USM pointer to packed shape and + * strides of arrays. + * @param k Position of the diagonal above/below which to copy filling the rest + * with zero elements. + * @param depends List of events to wait for before starting computations, if + * any. + * @param additional_depends List of additional events to wait for before + * starting computations, if any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +class tri_kernel; +template +sycl::event tri_impl(sycl::queue &exec_q, + ssize_t inner_range, + ssize_t outer_range, + char *src_p, + char *dst_p, + ssize_t nd, + ssize_t *shape_and_strides, + ssize_t k, + const std::vector &depends, + const std::vector &additional_depends) +{ + static constexpr int d2 = 2; + ssize_t src_s = nd; + ssize_t dst_s = 2 * nd; + ssize_t nd_1 = nd - 1; + ssize_t nd_2 = nd - 2; + Ty *src = reinterpret_cast(src_p); + Ty *dst = reinterpret_cast(dst_p); + + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + sycl::event tri_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + cgh.parallel_for>( + sycl::range<1>(inner_range * outer_range), [=](sycl::id<1> idx) { + ssize_t outer_gid = idx[0] / inner_range; + ssize_t inner_gid = idx[0] - inner_range * outer_gid; + + ssize_t src_inner_offset = 0, dst_inner_offset = 0; + bool to_copy{false}; + + { + using dpctl::tensor::strides::CIndexer_array; + CIndexer_array indexer_i( + {shape_and_strides[nd_2], shape_and_strides[nd_1]}); + indexer_i.set(inner_gid); + const std::array &inner = indexer_i.get(); 
+ src_inner_offset = + inner[0] * shape_and_strides[src_s + nd_2] + + inner[1] * shape_and_strides[src_s + nd_1]; + dst_inner_offset = + inner[0] * shape_and_strides[dst_s + nd_2] + + inner[1] * shape_and_strides[dst_s + nd_1]; + + if constexpr (upper) + to_copy = (inner[0] + k >= inner[1]); + else + to_copy = (inner[0] + k <= inner[1]); + } + + ssize_t src_offset = 0; + ssize_t dst_offset = 0; + { + using dpctl::tensor::strides::CIndexer_vector; + CIndexer_vector outer(nd - d2); + outer.get_displacement( + outer_gid, shape_and_strides, shape_and_strides + src_s, + shape_and_strides + dst_s, src_offset, dst_offset); + } + + src_offset += src_inner_offset; + dst_offset += dst_inner_offset; + + dst[dst_offset] = (to_copy) ? src[src_offset] : Ty(0); + }); + }); + return tri_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. + * @ingroup CtorKernels + */ +template +struct TrilGenericFactory +{ + fnT get() + { + fnT f = tri_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. 
+ * @ingroup CtorKernels + */ +template +struct TriuGenericFactory +{ + fnT get() + { + fnT f = tri_impl; + return f; + } +}; + } // namespace constructors } // namespace kernels } // namespace tensor diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index dd660c497f9a..f2afce105f7f 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -60,7 +60,7 @@ #include "linear_sequences.hpp" // #include "repeat.hpp" #include "simplify_iteration_space.hpp" -// #include "triul_ctor.hpp" +#include "triul_ctor.hpp" #include "utils/memory_overlap.hpp" #include "utils/strided_iters.hpp" // #include "where.hpp" @@ -129,7 +129,7 @@ using dpctl::tensor::py_internal::usm_ndarray_take; /* =========================== Tril and triu ============================== */ -// using dpctl::tensor::py_internal::usm_ndarray_triul; +using dpctl::tensor::py_internal::usm_ndarray_triul; /* =========================== Where ============================== */ @@ -162,7 +162,7 @@ void init_dispatch_vectors(void) init_full_ctor_dispatch_vectors(); init_zeros_ctor_dispatch_vectors(); // init_eye_ctor_dispatch_vectors(); - // init_triul_ctor_dispatch_vectors(); + init_triul_ctor_dispatch_vectors(); // populate_masked_extract_dispatch_vectors(); // populate_masked_place_dispatch_vectors(); @@ -388,27 +388,27 @@ PYBIND11_MODULE(_tensor_impl, m) dpctl::tensor::py_internal::default_device_index_type, "Gives default index type supported by device.", py::arg("dev")); - // auto tril_fn = [](const dpctl::tensor::usm_ndarray &src, - // const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, - // sycl::queue &exec_q, - // const std::vector depends) - // -> std::pair { - // return usm_ndarray_triul(exec_q, src, dst, 'l', k, depends); - // }; - // m.def("_tril", tril_fn, "Tril helper function.", py::arg("src"), - // py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), - // 
py::arg("depends") = py::list()); + auto tril_fn = [](const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, + sycl::queue &exec_q, + const std::vector depends) + -> std::pair { + return usm_ndarray_triul(exec_q, src, dst, 'l', k, depends); + }; + m.def("_tril", tril_fn, "Tril helper function.", py::arg("src"), + py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), + py::arg("depends") = py::list()); - // auto triu_fn = [](const dpctl::tensor::usm_ndarray &src, - // const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, - // sycl::queue &exec_q, - // const std::vector depends) - // -> std::pair { - // return usm_ndarray_triul(exec_q, src, dst, 'u', k, depends); - // }; - // m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"), - // py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + auto triu_fn = [](const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, + sycl::queue &exec_q, + const std::vector depends) + -> std::pair { + return usm_ndarray_triul(exec_q, src, dst, 'u', k, depends); + }; + m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"), + py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), + py::arg("depends") = py::list()); // m.def("mask_positions", &py_mask_positions, "", py::arg("mask"), // py::arg("cumsum"), py::arg("sycl_queue"), diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp new file mode 100644 index 000000000000..0890dfdb4766 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp @@ -0,0 +1,253 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include // for std::copy +#include // for std::size_t +#include // for std::make_shared +#include // for std::runtime_error +#include // for std::pair, std::move +#include // for std::vector, std::begin, std::end + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "kernels/constructors.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::utils::keep_args_alive; + +using dpctl::tensor::kernels::constructors::tri_fn_ptr_t; + +static tri_fn_ptr_t tril_generic_dispatch_vector[td_ns::num_types]; +static tri_fn_ptr_t triu_generic_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_triul(sycl::queue &exec_q, + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + char part, + py::ssize_t k = 0, + const std::vector &depends = {}) +{ + // array dimensions must be the same + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd) { + throw py::value_error("Array dimensions are not the same."); + } + + if (src_nd < 2) { + throw py::value_error("Array dimensions less than 2."); + } + + // shapes must be the same + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool shapes_equal(true); + std::size_t src_nelems(1); + + for (int i = 0; shapes_equal && i < src_nd; ++i) { + 
src_nelems *= static_cast(src_shape[i]); + shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); + } + if (!shapes_equal) { + throw py::value_error("Array shapes are not the same."); + } + + if (src_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + // check that arrays do not overlap, and concurrent copying is safe. + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + // TODO: could use a temporary, but this is done by the caller + throw py::value_error("Arrays index overlapping segments of memory"); + } + + auto array_types = td_ns::usm_ndarray_types(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (dst_typeid != src_typeid) { + throw py::value_error("Array dtype are not the same."); + } + + // check same queues + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue context is not the same as allocation contexts"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto src_strides = src.get_strides_vector(); + auto dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd - 2; + const py::ssize_t *shape = src_shape; + + const shT iter_src_strides(std::begin(src_strides), + std::begin(src_strides) + nd); + const shT iter_dst_strides(std::begin(dst_strides), + std::begin(dst_strides) + nd); + + simplify_iteration_space(nd, shape, iter_src_strides, iter_dst_strides, + // output + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + 
if (src_offset != 0 || dst_offset != 0) { + throw py::value_error("Reversed slice for dst is not supported"); + } + + nd += 2; + + using usm_host_allocatorT = + dpctl::tensor::alloc_utils::usm_host_allocator; + using usmshT = std::vector; + + usm_host_allocatorT allocator(exec_q); + auto shp_host_shape_and_strides = + std::make_shared(3 * nd, allocator); + + std::copy(simplified_shape.begin(), simplified_shape.end(), + shp_host_shape_and_strides->begin()); + (*shp_host_shape_and_strides)[nd - 2] = src_shape[src_nd - 2]; + (*shp_host_shape_and_strides)[nd - 1] = src_shape[src_nd - 1]; + + std::copy(simplified_src_strides.begin(), simplified_src_strides.end(), + shp_host_shape_and_strides->begin() + nd); + (*shp_host_shape_and_strides)[2 * nd - 2] = src_strides[src_nd - 2]; + (*shp_host_shape_and_strides)[2 * nd - 1] = src_strides[src_nd - 1]; + + std::copy(simplified_dst_strides.begin(), simplified_dst_strides.end(), + shp_host_shape_and_strides->begin() + 2 * nd); + (*shp_host_shape_and_strides)[3 * nd - 2] = dst_strides[src_nd - 2]; + (*shp_host_shape_and_strides)[3 * nd - 1] = dst_strides[src_nd - 1]; + + auto dev_shape_and_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(3 * nd, + exec_q); + py::ssize_t *dev_shape_and_strides = dev_shape_and_strides_owner.get(); + + const sycl::event ©_shape_and_strides = exec_q.copy( + shp_host_shape_and_strides->data(), dev_shape_and_strides, 3 * nd); + + py::ssize_t inner_range = src_shape[src_nd - 1] * src_shape[src_nd - 2]; + py::ssize_t outer_range = src_nelems / inner_range; + + sycl::event tri_ev; + if (part == 'l') { + auto fn = tril_generic_dispatch_vector[src_typeid]; + tri_ev = + fn(exec_q, inner_range, outer_range, src_data, dst_data, nd, + dev_shape_and_strides, k, depends, {copy_shape_and_strides}); + } + else { + auto fn = triu_generic_dispatch_vector[src_typeid]; + tri_ev = + fn(exec_q, inner_range, outer_range, src_data, dst_data, nd, + dev_shape_and_strides, k, depends, 
{copy_shape_and_strides}); + } + + const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(tri_ev); + const auto &ctx = exec_q.get_context(); + using dpctl::tensor::alloc_utils::sycl_free_noexcept; + cgh.host_task( + [shp_host_shape_and_strides = std::move(shp_host_shape_and_strides), + dev_shape_and_strides, ctx]() { + // capture of shp_host_shape_and_strides ensure the underlying + // vector exists for the entire execution of copying kernel + sycl_free_noexcept(dev_shape_and_strides, ctx); + }); + }); + // since host_task now owns USM allocation, release ownership by smart + // pointer + dev_shape_and_strides_owner.release(); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {temporaries_cleanup_ev}), tri_ev); +} + +void init_triul_ctor_dispatch_vectors(void) +{ + + using namespace td_ns; + using dpctl::tensor::kernels::constructors::TrilGenericFactory; + using dpctl::tensor::kernels::constructors::TriuGenericFactory; + + DispatchVectorBuilder dvb1; + dvb1.populate_dispatch_vector(tril_generic_dispatch_vector); + + DispatchVectorBuilder dvb2; + dvb2.populate_dispatch_vector(triu_generic_dispatch_vector); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp new file mode 100644 index 000000000000..08889df6227f --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp @@ -0,0 +1,62 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair + usm_ndarray_triul(sycl::queue &exec_q, + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + char part, + py::ssize_t k = 0, + const std::vector &depends = {}); + +extern void init_triul_ctor_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl From f1d6e5650910eec6f330b2de902a93a1ae95df5f Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 03:05:03 -0800 Subject: [PATCH 023/145] Use tril/triu/_tril from dpctl_ext.tensor in dpnp --- dpnp/dpnp_container.py | 4 ++-- dpnp/linalg/dpnp_utils_linalg.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index b13bf96cda28..c8e28529cd57 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -270,13 +270,13 @@ def ones( def tril(x1, /, *, k=0): """Creates `dpnp_array` as lower triangular part of an input array.""" - array_obj = dpt.tril(dpnp.get_usm_ndarray(x1), k=k) + array_obj = dpt_ext.tril(dpnp.get_usm_ndarray(x1), k=k) return dpnp_array._create_from_usm_ndarray(array_obj) def triu(x1, /, *, k=0): """Creates `dpnp_array` as upper triangular part of an input array.""" - array_obj = dpt.triu(dpnp.get_usm_ndarray(x1), k=k) + array_obj = dpt_ext.triu(dpnp.get_usm_ndarray(x1), k=k) return dpnp_array._create_from_usm_ndarray(array_obj) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 
196cd2ae9da5..5fb1c099dde2 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -42,12 +42,12 @@ from typing import NamedTuple -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._numpy_helper import normalize_axis_index from numpy import prod +import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li from dpnp.dpnp_utils import get_usm_allocations From 668079060d9ece02fbb6887c2313edca9e6ecbef Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 9 Feb 2026 02:47:35 -0800 Subject: [PATCH 024/145] Disable pylint no-name-in-module for dpctl_ext --- dpnp/dpnp_algo/dpnp_elementwise_common.py | 1 + dpnp/dpnp_iface.py | 3 +-- dpnp/dpnp_iface_searching.py | 1 + dpnp/dpnp_utils/dpnp_utils_linearalgebra.py | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index b63bf61f8dad..d8235b84e2d0 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -44,6 +44,7 @@ _validate_dtype, ) +# pylint: disable=no-name-in-module import dpctl_ext.tensor._tensor_impl as dti import dpnp import dpnp.backend.extensions.vm._vm_impl as vmi diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index 832446c826ba..6220c61db6d9 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -40,6 +40,7 @@ """ # pylint: disable=protected-access +# pylint: disable=no-name-in-module import os @@ -53,8 +54,6 @@ import dpnp from .dpnp_array import dpnp_array - -# pylint: disable=no-name-in-module from .dpnp_utils import ( dpnp_descriptor, map_dtype_to_device, diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py index fdbd317d31dd..74fbc9b37d13 100644 --- a/dpnp/dpnp_iface_searching.py +++ b/dpnp/dpnp_iface_searching.py @@ -41,6 +41,7 @@ import dpctl.tensor as dpt +# pylint: 
disable=no-name-in-module import dpctl_ext.tensor._tensor_impl as dti import dpnp diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py index 4d8e3cdfbd0d..2de2bc15372c 100644 --- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py +++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py @@ -37,6 +37,7 @@ ) from dpctl.utils import ExecutionPlacementError +# pylint: disable=no-name-in-module import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.blas._blas_impl as bi From 263b7175f4aab799cd4fa100602011e8e23d046b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 12 Feb 2026 04:31:01 -0800 Subject: [PATCH 025/145] Add TODO comments --- dpnp/dpnp_algo/dpnp_elementwise_common.py | 2 ++ dpnp/dpnp_iface.py | 2 ++ dpnp/dpnp_iface_searching.py | 2 ++ dpnp/dpnp_utils/dpnp_utils_linearalgebra.py | 2 ++ dpnp/scipy/linalg/_utils.py | 2 ++ setup.py | 2 +- 6 files changed, 11 insertions(+), 1 deletion(-) diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index d8235b84e2d0..88abcee5035c 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -45,6 +45,8 @@ ) # pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as dti import dpnp import dpnp.backend.extensions.vm._vm_impl as vmi diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index 6220c61db6d9..50b474014666 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -50,6 +50,8 @@ import numpy from dpctl.tensor._device import normalize_queue_device +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti import dpnp diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py index 74fbc9b37d13..16ab633d506b 100644 --- a/dpnp/dpnp_iface_searching.py +++ 
b/dpnp/dpnp_iface_searching.py @@ -42,6 +42,8 @@ import dpctl.tensor as dpt # pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as dti import dpnp diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py index 2de2bc15372c..3dfd3c23ee7f 100644 --- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py +++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py @@ -38,6 +38,8 @@ from dpctl.utils import ExecutionPlacementError # pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.blas._blas_impl as bi diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py index 8eb9187236bf..ce832d8f4529 100644 --- a/dpnp/scipy/linalg/_utils.py +++ b/dpnp/scipy/linalg/_utils.py @@ -44,6 +44,8 @@ import dpctl.utils as dpu +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li diff --git a/setup.py b/setup.py index a0c54b066dcf..7ffef3bed9d8 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ "dpnp.scipy", "dpnp.scipy.linalg", "dpnp.scipy.special", - # dpctl_ext + # TODO: replace with dpctl; dpctl.tensor "dpctl_ext", "dpctl_ext.tensor", ], From 4130c1b80aa108ca127040a6c4ea15bcaa86173f Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 12 Feb 2026 04:53:39 -0800 Subject: [PATCH 026/145] Use default_device_complex_type from dpctl_ext on test_array_api_info.py --- dpnp/tests/test_array_api_info.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dpnp/tests/test_array_api_info.py b/dpnp/tests/test_array_api_info.py index b310192ffc59..32730c8724dc 100644 --- a/dpnp/tests/test_array_api_info.py +++ 
b/dpnp/tests/test_array_api_info.py @@ -1,9 +1,11 @@ -import numpy import pytest from dpctl import SyclDeviceCreationError, get_devices, select_default_device -from dpctl.tensor._tensor_impl import default_device_complex_type import dpnp + +# TODO: revert to `from dpctl.tensor....` +# when dpnp fully migrates dpctl/tensor +from dpctl_ext.tensor._tensor_impl import default_device_complex_type from dpnp.tests.helper import ( has_support_aspect64, is_win_platform, From 17ca9ab52368f3bbdbfbdf6410b82823c98c53c0 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 12 Feb 2026 06:59:55 -0800 Subject: [PATCH 027/145] Remove unused build_dpctl_ext function --- dpctl_ext/CMakeLists.txt | 80 ---------------------------------------- 1 file changed, 80 deletions(-) diff --git a/dpctl_ext/CMakeLists.txt b/dpctl_ext/CMakeLists.txt index bb33a4f57332..cdb007a2d230 100644 --- a/dpctl_ext/CMakeLists.txt +++ b/dpctl_ext/CMakeLists.txt @@ -122,84 +122,4 @@ set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) set(CMAKE_INSTALL_RPATH "$ORIGIN") -function(build_dpctl_ext _trgt _src _dest) - set(options SYCL) - cmake_parse_arguments(BUILD_DPCTL_EXT "${options}" "RELATIVE_PATH" "" ${ARGN}) - add_cython_target(${_trgt} ${_src} CXX OUTPUT_VAR _generated_src) - set(_cythonize_trgt "${_trgt}_cythonize_pyx") - python_add_library(${_trgt} MODULE WITH_SOABI ${_generated_src}) - if(BUILD_DPCTL_EXT_SYCL) - add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src}) - target_compile_options(${_trgt} PRIVATE -fno-sycl-id-queries-fit-in-int) - target_link_options(${_trgt} PRIVATE -fsycl-device-code-split=per_kernel) - if(DPCTL_OFFLOAD_COMPRESS) - target_link_options(${_trgt} PRIVATE --offload-compress) - endif() - if(_dpctl_sycl_targets) - # make fat binary - target_compile_options( - ${_trgt} - PRIVATE ${_dpctl_sycl_target_compile_options} - ) - target_link_options(${_trgt} PRIVATE ${_dpctl_sycl_target_link_options}) - endif() - endif() - target_link_libraries(${_trgt} PRIVATE 
Python::NumPy) - if(DPCTL_GENERATE_COVERAGE) - target_compile_definitions(${_trgt} PRIVATE CYTHON_TRACE=1 CYTHON_TRACE_NOGIL=1) - if(BUILD_DPCTL_EXT_SYCL) - target_compile_options(${_trgt} PRIVATE -fno-sycl-use-footer) - endif() - endif() - target_link_libraries(${_trgt} PRIVATE DPCTLSyclInterface) - set(_linker_options "LINKER:${DPCTL_LDFLAGS}") - target_link_options(${_trgt} PRIVATE ${_linker_options}) - get_filename_component(_name_wle ${_generated_src} NAME_WLE) - get_filename_component(_generated_src_dir ${_generated_src} DIRECTORY) - set(_generated_public_h "${_generated_src_dir}/${_name_wle}.h") - set(_generated_api_h "${_generated_src_dir}/${_name_wle}_api.h") - - # TODO: create separate folder inside build folder that contains only - # headers related to this target and appropriate folder structure to - # eliminate shadow dependencies - get_filename_component(_generated_src_dir_dir ${_generated_src_dir} DIRECTORY) - # TODO: do not set directory if we did not generate header - target_include_directories(${_trgt} INTERFACE ${_generated_src_dir_dir}) - set(_rpath_value "$ORIGIN") - if(BUILD_DPCTL_EXT_RELATIVE_PATH) - set(_rpath_value "${_rpath_value}/${BUILD_DPCTL_EXT_RELATIVE_PATH}") - endif() - if(DPCTL_WITH_REDIST) - set(_rpath_value "${_rpath_value}:${_rpath_value}/../../..") - endif() - set_target_properties(${_trgt} PROPERTIES INSTALL_RPATH ${_rpath_value}) - - install(TARGETS ${_trgt} LIBRARY DESTINATION ${_dest}) - install( - FILES ${_generated_api_h} - # TODO: revert to `${CMAKE_INSTALL_PREFIX}/dpctl/include/${_dest}` - DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest} - OPTIONAL - ) - install( - FILES ${_generated_public_h} - # TODO: revert to `${CMAKE_INSTALL_PREFIX}/dpctl/include/${_dest}` - DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest} - OPTIONAL - ) - if(DPCTL_GENERATE_COVERAGE) - get_filename_component(_original_src_dir ${_src} DIRECTORY) - file(RELATIVE_PATH _rel_dir ${CMAKE_SOURCE_DIR} ${_original_src_dir}) - 
install(FILES ${_generated_src} DESTINATION ${CMAKE_INSTALL_PREFIX}/${_rel_dir}) - endif() - - # Create target with headers only, because python is managing all the - # library imports at runtime - set(_trgt_headers ${_trgt}_headers) - add_library(${_trgt_headers} INTERFACE) - add_dependencies(${_trgt_headers} ${_trgt}) - get_target_property(_trgt_headers_dir ${_trgt} INTERFACE_INCLUDE_DIRECTORIES) - target_include_directories(${_trgt_headers} INTERFACE ${_trgt_headers_dir}) -endfunction() - add_subdirectory(tensor) From 79cb2a45f28f5099701c0728a6def5c8961c5279 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 12 Feb 2026 07:51:45 -0800 Subject: [PATCH 028/145] Apply remarks for CMake files --- dpctl_ext/CMakeLists.txt | 10 ++------- dpctl_ext/tensor/CMakeLists.txt | 38 ++++++++++++++++++--------------- 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/dpctl_ext/CMakeLists.txt b/dpctl_ext/CMakeLists.txt index cdb007a2d230..e58693091422 100644 --- a/dpctl_ext/CMakeLists.txt +++ b/dpctl_ext/CMakeLists.txt @@ -27,13 +27,7 @@ # THE POSSIBILITY OF SUCH DAMAGE. 
# ***************************************************************************** -find_package(Python REQUIRED COMPONENTS NumPy) - -# -t is to only Cythonize sources with timestamps newer than existing CXX files (if present) -# -w is to set working directory (and correctly set __pyx_f[] array of filenames) -set(CYTHON_FLAGS "-t -w \"${CMAKE_SOURCE_DIR}\"") -find_package(Cython REQUIRED) - +# TODO: rework this logic to remove current duplication if(WIN32) string( CONCAT WARNING_FLAGS @@ -118,7 +112,7 @@ else() endif() # at build time create include/ directory and copy header files over -set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) +# set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) set(CMAKE_INSTALL_RPATH "$ORIGIN") diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index ee8da2e49506..28e7a4cb55f4 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -27,8 +27,10 @@ # THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** +find_package(Python COMPONENTS Development) + if(WIN32) - if(${CMAKE_VERSION} VERSION_LESS "3.23") + if(${CMAKE_VERSION} VERSION_LESS "3.27") # this is a work-around for target_link_options inserting option after -link option, cause # linker to ignore it. 
set(CMAKE_CXX_LINK_FLAGS @@ -37,6 +39,7 @@ if(WIN32) endif() endif() +# TODO: reuse this library for dpnp ufunc extension build set(_static_lib_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp ) @@ -67,11 +70,11 @@ add_library(${_static_lib_trgt} STATIC ${_static_lib_sources}) target_include_directories( ${_static_lib_trgt} PRIVATE - ${Python_INCLUDE_DIRS} - ${DPCTL_INCLUDE_DIR} + # ${Python_INCLUDE_DIRS} + # ${Dpctl_INCLUDE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include ) -target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers ${Python_LIBRARIES}) +target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers Python::Python) set_target_properties(${_static_lib_trgt} PROPERTIES POSITION_INDEPENDENT_CODE ON) set(_py_trgts) @@ -94,14 +97,14 @@ set(_no_fast_math_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ) -list( - APPEND _no_fast_math_sources - # ${_elementwise_sources} - # ${_reduction_sources} - # ${_sorting_sources} - # ${_linalg_sources} - # ${_accumulator_sources} -) +#list( +#APPEND _no_fast_math_sources +# ${_elementwise_sources} +# ${_reduction_sources} +# ${_sorting_sources} +# ${_linalg_sources} +# ${_accumulator_sources} +#) foreach(_src_fn ${_no_fast_math_sources}) get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS) @@ -114,7 +117,7 @@ endforeach() set(_compiler_definitions "") -set(_linker_options "LINKER:${DPCTL_LDFLAGS}") +set(_linker_options "LINKER:${DPNP_LDFLAGS}") foreach(python_module_name ${_py_trgts}) target_compile_options( ${python_module_name} @@ -124,6 +127,7 @@ foreach(python_module_name ${_py_trgts}) ${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel ) + # TODO: expand DPCTL_OFFLOAD_COMPRESS to the whole dpnp level if(DPCTL_OFFLOAD_COMPRESS) target_link_options(${python_module_name} PRIVATE --offload-compress) endif() @@ -149,22 +153,22 @@ foreach(python_module_name 
${_py_trgts}) PRIVATE -fprofile-instr-generate -fcoverage-mapping ) endif() - if(_dpctl_sycl_targets) + if(_dpnp_sycl_targets) # make fat binary target_compile_options( ${python_module_name} - PRIVATE ${_dpctl_sycl_target_compile_options} + PRIVATE ${_dpnp_sycl_target_compile_options} ) target_link_options( ${python_module_name} - PRIVATE ${_dpctl_sycl_target_link_options} + PRIVATE ${_dpnp_sycl_target_link_options} ) endif() # TODO: update source so they reference individual libraries instead of # dpctl4pybind11.hpp. It will allow to simplify dependency tree # NOTE: dpctl C-API is resolved at runtime via Python # target_link_libraries(${python_module_name} PRIVATE DpctlCAPI) - if(DPCTL_WITH_REDIST) + if(DPNP_WITH_REDIST) set_target_properties( ${python_module_name} PROPERTIES INSTALL_RPATH "$ORIGIN/../../../.." From 4bf080edc0e5d277441fe39b31733571fbad0de3 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 12 Feb 2026 08:30:03 -0800 Subject: [PATCH 029/145] Apply remarks for c++ files --- .../include/kernels/copy_and_cast.hpp | 18 ++++----------- .../include/kernels/copy_as_contiguous.hpp | 19 ++++----------- .../source/copy_and_cast_usm_to_usm.cpp | 23 ++++--------------- .../source/copy_and_cast_usm_to_usm.hpp | 11 ++------- .../libtensor/source/copy_as_contig.cpp | 14 ++++------- .../libtensor/source/copy_as_contig.hpp | 11 ++------- .../source/device_support_queries.cpp | 13 ++++------- .../source/device_support_queries.hpp | 12 ++-------- .../source/simplify_iteration_space.cpp | 12 ++++------ .../source/simplify_iteration_space.hpp | 11 +++------ .../tensor/libtensor/source/tensor_ctors.cpp | 10 ++++---- 11 files changed, 43 insertions(+), 111 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp index a07d311a7fcb..d6001a11e471 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp +++ 
b/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ -33,11 +33,12 @@ //===----------------------------------------------------------------------===// #pragma once -#include +#include +#include #include #include #include -#include +#include #include "dpctl_tensor_types.hpp" #include "kernels/alignment.hpp" @@ -45,13 +46,7 @@ #include "utils/sycl_utils.hpp" #include "utils/type_utils.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace kernels -{ -namespace copy_and_cast +namespace dpctl::tensor::kernels::copy_and_cast { using dpctl::tensor::ssize_t; @@ -1282,7 +1277,4 @@ struct CopyForRollNDShiftFactory } }; -} // namespace copy_and_cast -} // namespace kernels -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::kernels::copy_and_cast diff --git a/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp index b4f367448758..37126a22dc64 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp @@ -33,11 +33,12 @@ //===----------------------------------------------------------------------===// #pragma once -#include +#include +#include #include #include #include -#include +#include #include "dpctl_tensor_types.hpp" #include "kernels/alignment.hpp" @@ -45,13 +46,7 @@ #include "utils/sycl_utils.hpp" #include "utils/type_utils.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace kernels -{ -namespace copy_as_contig +namespace dpctl::tensor::kernels::copy_as_contig { using dpctl::tensor::ssize_t; @@ -648,8 +643,4 @@ struct AsCContigNDBatchOfSquareMatricesFactory return as_c_contiguous_nd_batch_of_square_matrices_impl; } }; - -} // namespace copy_as_contig -} // namespace kernels -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::kernels::copy_as_contig diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp 
b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp index 0458aa75ac32..3d20be02f885 100644 --- a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp +++ b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp @@ -32,21 +32,15 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===----------------------------------------------------------------------===// -#include -#include +#include #include -#include -#include #include -#include -#include +#include #include +#include #include "dpnp4pybind11.hpp" -#include -#include #include -#include #include "kernels/copy_and_cast.hpp" #include "utils/memory_overlap.hpp" @@ -54,16 +48,11 @@ #include "utils/output_validation.hpp" #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" -#include "utils/type_utils.hpp" #include "copy_as_contig.hpp" #include "simplify_iteration_space.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { namespace td_ns = dpctl::tensor::type_dispatch; @@ -305,6 +294,4 @@ void init_copy_and_cast_usm_to_usm_dispatch_tables(void) dtb_1d.populate_dispatch_table(copy_and_cast_1d_dispatch_table); } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp index d2a2dcaf7b85..d2e07b08d38f 100644 --- a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp +++ b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp @@ -38,13 +38,8 @@ #include #include "dpnp4pybind11.hpp" -#include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::pair copy_usm_ndarray_into_usm_ndarray( @@ -55,6 +50,4 @@ extern std::pair copy_usm_ndarray_into_usm_ndarray( extern void init_copy_and_cast_usm_to_usm_dispatch_tables(); -} // 
namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp index 53b39ff5874c..7105202fe2ff 100644 --- a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp +++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp @@ -32,10 +32,11 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===----------------------------------------------------------------------===// -#include #include +#include #include #include +#include #include #include @@ -54,13 +55,10 @@ #include "copy_as_contig.hpp" #include "simplify_iteration_space.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { +namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::kernels::copy_as_contig:: @@ -753,6 +751,4 @@ std::pair ascontig_ev); } -} // end of namespace py_internal -} // end of namespace tensor -} // end of namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp index 2de67098b7fa..bfe3159c8813 100644 --- a/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp +++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp @@ -32,14 +32,9 @@ #include #include "dpnp4pybind11.hpp" -#include #include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { std::pair @@ -56,6 +51,4 @@ std::pair void init_copy_as_contig_dispatch_vectors(void); -} // end of namespace py_internal -} // end of namespace tensor -} // end of namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp b/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp index 51eb7dba1b6c..97a8ba83831e 100644 --- 
a/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp +++ b/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp @@ -39,13 +39,11 @@ #include #include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { +namespace py = pybind11; + namespace { @@ -61,7 +59,6 @@ std::string _default_device_fp_type(const sycl::device &d) int get_numpy_major_version() { - namespace py = pybind11; py::module_ numpy = py::module_::import("numpy"); py::str version_string = numpy.attr("__version__"); @@ -179,6 +176,4 @@ std::string default_device_index_type(const py::object &arg) return _default_device_index_type(d); } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp b/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp index 6ea01dcd49d7..adde7aefe3dd 100644 --- a/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp +++ b/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp @@ -36,14 +36,8 @@ #include "dpnp4pybind11.hpp" #include -#include -#include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::string default_device_fp_type(const py::object &); @@ -53,6 +47,4 @@ extern std::string default_device_bool_type(const py::object &); extern std::string default_device_complex_type(const py::object &); extern std::string default_device_index_type(const py::object &); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp index 2526f022e0ac..e3cff701ed50 100644 --- a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp +++ b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp @@ 
-34,15 +34,13 @@ #include "simplify_iteration_space.hpp" #include "utils/strided_iters.hpp" +#include #include +#include #include #include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { namespace py = pybind11; @@ -539,6 +537,4 @@ std::vector _unravel_index_f(py::ssize_t flat_index, return mi; } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp index d3448ee1f5fd..acbc833157d1 100644 --- a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp +++ b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp @@ -36,11 +36,7 @@ #include #include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { namespace py = pybind11; @@ -125,6 +121,5 @@ std::vector _unravel_index_c(py::ssize_t, std::vector const &); std::vector _unravel_index_f(py::ssize_t, std::vector const &); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 911d75ebd925..be69ee1a8c7e 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -32,15 +32,17 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===----------------------------------------------------------------------===// -#include -#include -#include +// #include +// #include +// #include #include #include #include -#include +// #include +#include #include #include +#include #include "dpnp4pybind11.hpp" From cfa6cd69735591e79ca3437cc05c326ce115ffc9 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 16 Feb 2026 11:31:42 -0800 Subject: 
[PATCH 030/145] Remove linear-sequence implementations --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../include/kernels/constructors.hpp | 177 ---------- .../libtensor/source/linear_sequences.cpp | 312 ------------------ .../libtensor/source/linear_sequences.hpp | 69 ---- .../tensor/libtensor/source/tensor_ctors.cpp | 38 +-- 5 files changed, 19 insertions(+), 579 deletions(-) delete mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.cpp delete mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 1375c8316754..baf8ef5ce5f6 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -48,7 +48,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index 8d53655b2754..f43614e13766 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -58,189 +58,12 @@ using dpctl::tensor::ssize_t; @defgroup CtorKernels */ -template -class linear_sequence_step_kernel; -template -class linear_sequence_affine_kernel; template class full_strided_kernel; // template class eye_kernel; using namespace dpctl::tensor::offset_utils; -template -class LinearSequenceStepFunctor -{ -private: - Ty *p = nullptr; - 
Ty start_v; - Ty step_v; - -public: - LinearSequenceStepFunctor(char *dst_p, Ty v0, Ty dv) - : p(reinterpret_cast(dst_p)), start_v(v0), step_v(dv) - { - } - - void operator()(sycl::id<1> wiid) const - { - auto i = wiid.get(0); - using dpctl::tensor::type_utils::is_complex; - if constexpr (is_complex::value) { - p[i] = Ty{start_v.real() + i * step_v.real(), - start_v.imag() + i * step_v.imag()}; - } - else { - p[i] = start_v + i * step_v; - } - } -}; - -/*! - * @brief Function to submit kernel to populate given contiguous memory - * allocation with linear sequence specified by typed starting value and - * increment. - * - * @param q Sycl queue to which the kernel is submitted - * @param nelems Length of the sequence - * @param start_v Typed starting value of the sequence - * @param step_v Typed increment of the sequence - * @param array_data Kernel accessible USM pointer to the start of array to be - * populated. - * @param depends List of events to wait for before starting computations, if - * any. - * - * @return Event to wait on to ensure that computation completes. - * @defgroup CtorKernels - */ -template -sycl::event lin_space_step_impl(sycl::queue &exec_q, - std::size_t nelems, - Ty start_v, - Ty step_v, - char *array_data, - const std::vector &depends) -{ - dpctl::tensor::type_utils::validate_type_for_device(exec_q); - sycl::event lin_space_step_event = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.parallel_for>( - sycl::range<1>{nelems}, - LinearSequenceStepFunctor(array_data, start_v, step_v)); - }); - - return lin_space_step_event; -} - -// Constructor to populate tensor with linear sequence defined by -// start and and data - -template -class LinearSequenceAffineFunctor -{ -private: - Ty *p = nullptr; - Ty start_v; - Ty end_v; - std::size_t n; - -public: - LinearSequenceAffineFunctor(char *dst_p, Ty v0, Ty v1, std::size_t den) - : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), - n((den == 0) ? 
1 : den) - { - } - - void operator()(sycl::id<1> wiid) const - { - auto i = wiid.get(0); - wTy wc = wTy(i) / n; - wTy w = wTy(n - i) / n; - using dpctl::tensor::type_utils::is_complex; - if constexpr (is_complex::value) { - using reT = typename Ty::value_type; - auto _w = static_cast(w); - auto _wc = static_cast(wc); - auto re_comb = sycl::fma(start_v.real(), _w, reT(0)); - re_comb = - sycl::fma(end_v.real(), _wc, - re_comb); // start_v.real() * _w + end_v.real() * _wc; - auto im_comb = - sycl::fma(start_v.imag(), _w, - reT(0)); // start_v.imag() * _w + end_v.imag() * _wc; - im_comb = sycl::fma(end_v.imag(), _wc, im_comb); - Ty affine_comb = Ty{re_comb, im_comb}; - p[i] = affine_comb; - } - else if constexpr (std::is_floating_point::value) { - Ty _w = static_cast(w); - Ty _wc = static_cast(wc); - auto affine_comb = - sycl::fma(start_v, _w, Ty(0)); // start_v * w + end_v * wc; - affine_comb = sycl::fma(end_v, _wc, affine_comb); - p[i] = affine_comb; - } - else { - using dpctl::tensor::type_utils::convert_impl; - auto affine_comb = start_v * w + end_v * wc; - p[i] = convert_impl(affine_comb); - } - } -}; - -/*! - * @brief Function to submit kernel to populate given contiguous memory - * allocation with linear sequence specified by typed starting and end values. - * - * @param exec_q Sycl queue to which kernel is submitted for execution. - * @param nelems Length of the sequence. - * @param start_v Stating value of the sequence. - * @param end_v End-value of the sequence. - * @param include_endpoint Whether the end-value is included in the sequence. - * @param array_data Kernel accessible USM pointer to the start of array to be - * populated. - * @param depends List of events to wait for before starting computations, if - * any. - * - * @return Event to wait on to ensure that computation completes. 
- * @defgroup CtorKernels - */ -template -sycl::event lin_space_affine_impl(sycl::queue &exec_q, - std::size_t nelems, - Ty start_v, - Ty end_v, - bool include_endpoint, - char *array_data, - const std::vector &depends) -{ - dpctl::tensor::type_utils::validate_type_for_device(exec_q); - - const bool device_supports_doubles = - exec_q.get_device().has(sycl::aspect::fp64); - const std::size_t den = (include_endpoint) ? nelems - 1 : nelems; - - sycl::event lin_space_affine_event = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - if (device_supports_doubles) { - using KernelName = linear_sequence_affine_kernel; - using Impl = LinearSequenceAffineFunctor; - - cgh.parallel_for(sycl::range<1>{nelems}, - Impl(array_data, start_v, end_v, den)); - } - else { - using KernelName = linear_sequence_affine_kernel; - using Impl = LinearSequenceAffineFunctor; - - cgh.parallel_for(sycl::range<1>{nelems}, - Impl(array_data, start_v, end_v, den)); - } - }); - - return lin_space_affine_event; -} - /* ================ Full ================== */ /*! diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp deleted file mode 100644 index 02c4a8ad0fa1..000000000000 --- a/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp +++ /dev/null @@ -1,312 +0,0 @@ -//***************************************************************************** -// Copyright (c) 2026, Intel Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. 
-// - Neither the name of the copyright holder nor the names of its contributors -// may be used to endorse or promote products derived from this software -// without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -// THE POSSIBILITY OF SUCH DAMAGE. 
-//***************************************************************************** -// -//===--------------------------------------------------------------------===// -/// -/// \file -/// This file defines functions of dpctl.tensor._tensor_impl extensions -//===--------------------------------------------------------------------===// - -#include "dpnp4pybind11.hpp" -#include -#include -#include -#include -#include -#include -#include - -#include "kernels/constructors.hpp" -#include "utils/output_validation.hpp" -#include "utils/type_dispatch.hpp" -#include "utils/type_utils.hpp" - -#include "linear_sequences.hpp" - -namespace py = pybind11; -namespace td_ns = dpctl::tensor::type_dispatch; - -namespace dpctl -{ -namespace tensor -{ -namespace py_internal -{ - -// Constructor to populate tensor with linear sequence defined by -// start and step data - -typedef sycl::event (*lin_space_step_fn_ptr_t)( - sycl::queue &, - std::size_t, // num_elements - const py::object &start, - const py::object &step, - char *, // dst_data_ptr - const std::vector &); - -/*! - * @brief Function to submit kernel to populate given contiguous memory - * allocation with linear sequence specified by starting value and increment - * given as Python objects. - * - * @param q Sycl queue to which the kernel is submitted - * @param nelems Length of the sequence - * @param start Starting value of the sequence as Python object. Must be - * convertible to array element data type `Ty`. - * @param step Increment of the sequence as Python object. Must be convertible - * to array element data type `Ty`. - * @param array_data Kernel accessible USM pointer to the start of array to be - * populated. - * @param depends List of events to wait for before starting computations, if - * any. - * - * @return Event to wait on to ensure that computation completes. 
- * @defgroup CtorKernels - */ -template -sycl::event lin_space_step_impl(sycl::queue &exec_q, - std::size_t nelems, - const py::object &start, - const py::object &step, - char *array_data, - const std::vector &depends) -{ - Ty start_v = py::cast(start); - Ty step_v = py::cast(step); - - using dpctl::tensor::kernels::constructors::lin_space_step_impl; - - auto lin_space_step_event = lin_space_step_impl( - exec_q, nelems, start_v, step_v, array_data, depends); - - return lin_space_step_event; -} - -typedef sycl::event (*lin_space_affine_fn_ptr_t)( - sycl::queue &, - std::size_t, // num_elements - const py::object &start, - const py::object &end, - bool include_endpoint, - char *, // dst_data_ptr - const std::vector &); - -/*! - * @brief Function to submit kernel to populate given contiguous memory - * allocation with linear sequence specified by starting and end values given - * as Python objects. - * - * @param exec_q Sycl queue to which kernel is submitted for execution. - * @param nelems Length of the sequence - * @param start Stating value of the sequence as Python object. Must be - * convertible to array data element type `Ty`. - * @param end End-value of the sequence as Python object. Must be convertible - * to array data element type `Ty`. - * @param include_endpoint Whether the end-value is included in the sequence - * @param array_data Kernel accessible USM pointer to the start of array to be - * populated. - * @param depends List of events to wait for before starting computations, if - * any. - * - * @return Event to wait on to ensure that computation completes. 
- * @defgroup CtorKernels - */ -template -sycl::event lin_space_affine_impl(sycl::queue &exec_q, - std::size_t nelems, - const py::object &start, - const py::object &end, - bool include_endpoint, - char *array_data, - const std::vector &depends) -{ - Ty start_v = py::cast(start); - Ty end_v = py::cast(end); - - using dpctl::tensor::kernels::constructors::lin_space_affine_impl; - - auto lin_space_affine_event = lin_space_affine_impl( - exec_q, nelems, start_v, end_v, include_endpoint, array_data, depends); - - return lin_space_affine_event; -} - -using dpctl::utils::keep_args_alive; - -static lin_space_step_fn_ptr_t lin_space_step_dispatch_vector[td_ns::num_types]; - -static lin_space_affine_fn_ptr_t - lin_space_affine_dispatch_vector[td_ns::num_types]; - -std::pair - usm_ndarray_linear_sequence_step(const py::object &start, - const py::object &dt, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends) -{ - // dst must be 1D and C-contiguous - // start, end should be coercible into data type of dst - - if (dst.get_ndim() != 1) { - throw py::value_error( - "usm_ndarray_linspace: Expecting 1D array to populate"); - } - - if (!dst.is_c_contiguous()) { - throw py::value_error( - "usm_ndarray_linspace: Non-contiguous arrays are not supported"); - } - - if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { - throw py::value_error( - "Execution queue is not compatible with the allocation queue"); - } - - dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); - - auto array_types = td_ns::usm_ndarray_types(); - int dst_typenum = dst.get_typenum(); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - py::ssize_t len = dst.get_shape(0); - if (len == 0) { - // nothing to do - return std::make_pair(sycl::event{}, sycl::event{}); - } - - char *dst_data = dst.get_data(); - sycl::event linspace_step_event; - - auto fn = lin_space_step_dispatch_vector[dst_typeid]; - - linspace_step_event = - fn(exec_q, 
static_cast(len), start, dt, dst_data, depends); - - return std::make_pair(keep_args_alive(exec_q, {dst}, {linspace_step_event}), - linspace_step_event); -} - -std::pair - usm_ndarray_linear_sequence_affine(const py::object &start, - const py::object &end, - const dpctl::tensor::usm_ndarray &dst, - bool include_endpoint, - sycl::queue &exec_q, - const std::vector &depends) -{ - // dst must be 1D and C-contiguous - // start, end should be coercible into data type of dst - - if (dst.get_ndim() != 1) { - throw py::value_error( - "usm_ndarray_linspace: Expecting 1D array to populate"); - } - - if (!dst.is_c_contiguous()) { - throw py::value_error( - "usm_ndarray_linspace: Non-contiguous arrays are not supported"); - } - - if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { - throw py::value_error( - "Execution queue context is not the same as allocation context"); - } - - dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); - - auto array_types = td_ns::usm_ndarray_types(); - int dst_typenum = dst.get_typenum(); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - py::ssize_t len = dst.get_shape(0); - if (len == 0) { - // nothing to do - return std::make_pair(sycl::event{}, sycl::event{}); - } - - char *dst_data = dst.get_data(); - sycl::event linspace_affine_event; - - auto fn = lin_space_affine_dispatch_vector[dst_typeid]; - - linspace_affine_event = fn(exec_q, static_cast(len), start, - end, include_endpoint, dst_data, depends); - - return std::make_pair( - keep_args_alive(exec_q, {dst}, {linspace_affine_event}), - linspace_affine_event); -} - -/*! - * @brief Factor to get function pointer of type `fnT` for array with elements - * of type `Ty`. - * @defgroup CtorKernels - */ -template -struct LinSpaceStepFactory -{ - fnT get() - { - fnT f = lin_space_step_impl; - return f; - } -}; - -/*! - * @brief Factory to get function pointer of type `fnT` for array data type - * `Ty`. 
- */ -template -struct LinSpaceAffineFactory -{ - fnT get() - { - fnT f = lin_space_affine_impl; - return f; - } -}; - -void init_linear_sequences_dispatch_vectors(void) -{ - using namespace td_ns; - - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(lin_space_step_dispatch_vector); - - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(lin_space_affine_dispatch_vector); -} - -} // namespace py_internal -} // namespace tensor -} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp deleted file mode 100644 index 321cd2f23efe..000000000000 --- a/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp +++ /dev/null @@ -1,69 +0,0 @@ -//***************************************************************************** -// Copyright (c) 2026, Intel Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// - Neither the name of the copyright holder nor the names of its contributors -// may be used to endorse or promote products derived from this software -// without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -// THE POSSIBILITY OF SUCH DAMAGE. -//***************************************************************************** -// -//===--------------------------------------------------------------------===// -/// -/// \file -/// This file defines functions of dpctl.tensor._tensor_impl extensions -//===--------------------------------------------------------------------===// - -#pragma once -#include -#include -#include - -#include "dpnp4pybind11.hpp" -#include - -namespace dpctl -{ -namespace tensor -{ -namespace py_internal -{ - -extern std::pair usm_ndarray_linear_sequence_step( - const py::object &start, - const py::object &dt, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends = {}); - -extern std::pair usm_ndarray_linear_sequence_affine( - const py::object &start, - const py::object &end, - const dpctl::tensor::usm_ndarray &dst, - bool include_endpoint, - sycl::queue &exec_q, - const std::vector &depends = {}); - -extern void init_linear_sequences_dispatch_vectors(void); - -} // namespace py_internal -} // namespace tensor -} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index f2afce105f7f..7e4253c0cbb6 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -57,7 +57,7 @@ #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include 
"kernels/dpctl_tensor_types.hpp" -#include "linear_sequences.hpp" +// #include "linear_sequences.hpp" // #include "repeat.hpp" #include "simplify_iteration_space.hpp" #include "triul_ctor.hpp" @@ -98,8 +98,8 @@ using dpctl::tensor::py_internal::py_as_f_contig; /* ============= linear-sequence ==================== */ -using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; -using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; +// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; +// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; /* ================ Full ================== */ @@ -158,7 +158,7 @@ void init_dispatch_vectors(void) init_copy_as_contig_dispatch_vectors(); // init_copy_for_reshape_dispatch_vectors(); // init_copy_for_roll_dispatch_vectors(); - init_linear_sequences_dispatch_vectors(); + // init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); init_zeros_ctor_dispatch_vectors(); // init_eye_ctor_dispatch_vectors(); @@ -300,22 +300,20 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("shifts"), py::arg("sycl_queue"), py::arg("depends") = // py::list()); - m.def("_linspace_step", &usm_ndarray_linear_sequence_step, - "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " - "specified by " - "starting point `start` and step `dt`. " - "Returns a tuple of events: (ht_event, comp_event)", - py::arg("start"), py::arg("dt"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, - "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " - "specified by " - "starting point `start` and end point `end`. 
" - "Returns a tuple of events: (ht_event, comp_event)", - py::arg("start"), py::arg("end"), py::arg("dst"), - py::arg("include_endpoint"), py::arg("sycl_queue"), - py::arg("depends") = py::list()); + // m.def("_linspace_step", &usm_ndarray_linear_sequence_step, + // "Fills input 1D contiguous usm_ndarray `dst` with linear + // sequence " "specified by " "starting point `start` and step + // `dt`. " "Returns a tuple of events: (ht_event, comp_event)", + // py::arg("start"), py::arg("dt"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, + // "Fills input 1D contiguous usm_ndarray `dst` with linear + // sequence " "specified by " "starting point `start` and end + // point `end`. " "Returns a tuple of events: (ht_event, + // comp_event)", py::arg("start"), py::arg("end"), py::arg("dst"), + // py::arg("include_endpoint"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); // m.def("_copy_numpy_ndarray_into_usm_ndarray", // ©_numpy_ndarray_into_usm_ndarray, From 087a2ecbfff6262224ff115c9948202ecf45e6ba Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 16 Feb 2026 11:58:15 -0800 Subject: [PATCH 031/145] Use _tensor_impl from dpctl_ext in dpnp --- dpnp/dpnp_algo/dpnp_fill.py | 3 +++ dpnp/dpnp_iface.py | 1 + dpnp/dpnp_iface_indexing.py | 11 +++++++---- dpnp/fft/dpnp_utils_fft.py | 14 +++++++++++--- dpnp/linalg/dpnp_utils_linalg.py | 3 +++ dpnp/scipy/linalg/_utils.py | 1 + 6 files changed, 26 insertions(+), 7 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py index 0d6640c3b8b5..4137a2794747 100644 --- a/dpnp/dpnp_algo/dpnp_fill.py +++ b/dpnp/dpnp_algo/dpnp_fill.py @@ -33,6 +33,9 @@ from dpctl.tensor._ctors import _cast_fill_val import dpnp + +# TODO: revert to `from dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor from dpctl_ext.tensor._tensor_impl import ( _copy_usm_ndarray_into_usm_ndarray, 
_full_usm_ndarray, diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index 50b474014666..533bdc36c617 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -50,6 +50,7 @@ import numpy from dpctl.tensor._device import normalize_queue_device +# pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 6421f39fd4e4..a01a036e16cc 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -45,7 +45,6 @@ from collections.abc import Iterable import dpctl.tensor as dpt -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._copy_utils import _nonzero_impl @@ -53,7 +52,11 @@ from dpctl.tensor._numpy_helper import normalize_axis_index import dpctl_ext.tensor as dpt_ext -import dpctl_ext.tensor._tensor_impl as ti_ext + +# pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._tensor_impl as ti import dpnp # pylint: disable=no-name-in-module @@ -297,7 +300,7 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): "Input and output allocation queues are not compatible" ) - if ti_ext._array_overlap(x, out): + if ti._array_overlap(x, out): # Allocate a temporary buffer to avoid memory overlapping. 
out = dpt.empty_like(out) else: @@ -306,7 +309,7 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): _manager = dpu.SequentialOrderManager[q] dep_evs = _manager.submitted_events - h_ev, take_ev = ti_ext._take( + h_ev, take_ev = ti._take( src=x, ind=(inds,), dst=out, diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py index c692774a424f..60f89a933284 100644 --- a/dpnp/fft/dpnp_utils_fft.py +++ b/dpnp/fft/dpnp_utils_fft.py @@ -42,6 +42,11 @@ from collections.abc import Sequence import dpctl + +# pylint: disable=no-name-in-module +# TODO: remove it when ti.__linspace_step +# is migrated to dpctl_ext/tensor +import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._numpy_helper import ( @@ -50,7 +55,10 @@ ) from dpctl.utils import ExecutionPlacementError -import dpctl_ext.tensor._tensor_impl as ti +# pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._tensor_impl as ti_ext import dpnp import dpnp.backend.extensions.fft._fft_impl as fi @@ -196,7 +204,7 @@ def _compute_result(dsc, a, out, forward, c2c, out_strides): if ( out is not None and out.strides == tuple(out_strides) - and not ti._array_overlap(a_usm, dpnp.get_usm_ndarray(out)) + and not ti_ext._array_overlap(a_usm, dpnp.get_usm_ndarray(out)) ): res_usm = dpnp.get_usm_ndarray(out) result = out @@ -524,7 +532,7 @@ def _truncate_or_pad(a, shape, axes): ) _manager = dpu.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events - ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_ev, copy_ev = ti_ext._copy_usm_ndarray_into_usm_ndarray( src=dpnp.get_usm_ndarray(a), dst=z.get_array()[tuple(index)], sycl_queue=exec_q, diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 5fb1c099dde2..171ac38a141c 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -47,6 +47,9 @@ 
from dpctl.tensor._numpy_helper import normalize_axis_index from numpy import prod +# pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py index ce832d8f4529..665a4e1595ad 100644 --- a/dpnp/scipy/linalg/_utils.py +++ b/dpnp/scipy/linalg/_utils.py @@ -44,6 +44,7 @@ import dpctl.utils as dpu +# pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti From f4492fbc8048d2fcc598a089715b85ed6504f02d Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 16 Feb 2026 12:28:16 -0800 Subject: [PATCH 032/145] Add missing include --- .../tensor/libtensor/include/kernels/constructors.hpp | 3 ++- .../include/kernels/integer_advanced_indexing.hpp | 4 +--- dpctl_ext/tensor/libtensor/source/full_ctor.cpp | 8 ++++---- dpctl_ext/tensor/libtensor/source/full_ctor.hpp | 5 ++++- .../libtensor/source/integer_advanced_indexing.cpp | 10 ++++++---- .../libtensor/source/integer_advanced_indexing.hpp | 6 +++++- dpctl_ext/tensor/libtensor/source/triul_ctor.cpp | 3 +-- dpctl_ext/tensor/libtensor/source/triul_ctor.hpp | 2 ++ dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp | 7 ++----- dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp | 3 ++- 10 files changed, 29 insertions(+), 22 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index f43614e13766..3bc4a1d16271 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -33,8 +33,9 @@ //===----------------------------------------------------------------------===// #pragma once -#include +#include #include 
+#include #include diff --git a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp index 1b2c79d2e2a5..d0ec5227731c 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -33,12 +33,10 @@ //===----------------------------------------------------------------------===// #pragma once -#include -#include #include -#include #include #include +#include #include "dpctl_tensor_types.hpp" #include "utils/indexing_utils.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp index e1f61be4a12a..279bb9f470bc 100644 --- a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp @@ -32,15 +32,15 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// -#include #include -#include -#include +#include +#include #include #include +#include + #include "dpnp4pybind11.hpp" -#include #include #include "kernels/constructors.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp index d664b2013506..43b30fc8341c 100644 --- a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp @@ -33,13 +33,16 @@ //===--------------------------------------------------------------------===// #pragma once -#include #include #include +#include + #include "dpnp4pybind11.hpp" #include +namespace py = pybind11; + namespace dpctl { namespace tensor diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp index 244acfe3955f..ed72096bff8f 100644 --- a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp 
+++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -34,21 +34,23 @@ //===----------------------------------------------------------------------===// #include -#include #include #include +#include +#include #include -#include +#include #include +#include + +#include #include "dpnp4pybind11.hpp" -#include #include #include #include "kernels/integer_advanced_indexing.hpp" #include "utils/memory_overlap.hpp" -#include "utils/offset_utils.hpp" #include "utils/output_validation.hpp" #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp index 57f0ddda132c..5dfbd2f04d93 100644 --- a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp @@ -34,13 +34,17 @@ //===----------------------------------------------------------------------===// #pragma once -#include +#include #include #include +#include + #include "dpnp4pybind11.hpp" #include +namespace py = pybind11; + namespace dpctl { namespace tensor diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp index 0890dfdb4766..f0f592c52938 100644 --- a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp @@ -34,8 +34,8 @@ #include // for std::copy #include // for std::size_t +#include // for std::begin, std::end #include // for std::make_shared -#include // for std::runtime_error #include // for std::pair, std::move #include // for std::vector, std::begin, std::end @@ -47,7 +47,6 @@ #include "kernels/constructors.hpp" #include "simplify_iteration_space.hpp" #include "utils/memory_overlap.hpp" -#include "utils/offset_utils.hpp" #include "utils/output_validation.hpp" #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" diff --git 
a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp index 08889df6227f..c61d95eef7ec 100644 --- a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp @@ -40,6 +40,8 @@ #include "dpnp4pybind11.hpp" #include +namespace py = pybind11; + namespace dpctl { namespace tensor diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp index 4558743b3c22..d7370f55e8cb 100644 --- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp @@ -32,21 +32,18 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// -#include #include #include -#include #include #include +#include + #include "dpnp4pybind11.hpp" -#include #include -#include "kernels/constructors.hpp" #include "utils/output_validation.hpp" #include "utils/type_dispatch.hpp" -#include "utils/type_utils.hpp" #include "zeros_ctor.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp index 51270a3443cc..ec3bce994ef6 100644 --- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp @@ -33,10 +33,11 @@ //===--------------------------------------------------------------------===// #pragma once -#include #include #include +#include + #include "dpnp4pybind11.hpp" #include From b367c9fd3b4b538e132afb5838584137a6f8a25c Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 16 Feb 2026 12:36:24 -0800 Subject: [PATCH 033/145] Use nested namespace syntax --- .../libtensor/include/kernels/constructors.hpp | 13 ++----------- .../include/kernels/integer_advanced_indexing.hpp | 13 ++----------- dpctl_ext/tensor/libtensor/source/full_ctor.cpp | 10 ++-------- dpctl_ext/tensor/libtensor/source/full_ctor.hpp | 10 
++-------- .../libtensor/source/integer_advanced_indexing.cpp | 10 ++-------- .../libtensor/source/integer_advanced_indexing.hpp | 10 ++-------- dpctl_ext/tensor/libtensor/source/triul_ctor.cpp | 10 ++-------- dpctl_ext/tensor/libtensor/source/triul_ctor.hpp | 10 ++-------- dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp | 10 ++-------- dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp | 10 ++-------- 10 files changed, 20 insertions(+), 86 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index 3bc4a1d16271..47726319b3e1 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -44,13 +44,7 @@ #include "utils/strided_iters.hpp" #include "utils/type_utils.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace kernels -{ -namespace constructors +namespace dpctl::tensor::kernels::constructors { using dpctl::tensor::ssize_t; @@ -305,7 +299,4 @@ struct TriuGenericFactory } }; -} // namespace constructors -} // namespace kernels -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::kernels::constructors diff --git a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp index d0ec5227731c..7351502dbc11 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -43,13 +43,7 @@ #include "utils/offset_utils.hpp" #include "utils/type_utils.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace kernels -{ -namespace indexing +namespace dpctl::tensor::kernels::indexing { using dpctl::tensor::ssize_t; @@ -419,7 +413,4 @@ struct PutClipFactory } }; -} // namespace indexing -} // namespace kernels -} // namespace tensor -} // namespace dpctl +} // namespace 
dpctl::tensor::kernels::indexing diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp index 279bb9f470bc..ca4a17f28f77 100644 --- a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp @@ -53,11 +53,7 @@ namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { using dpctl::utils::keep_args_alive; @@ -310,6 +306,4 @@ void init_full_ctor_dispatch_vectors(void) dvb2.populate_dispatch_vector(full_strided_dispatch_vector); } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp index 43b30fc8341c..18c15de87a40 100644 --- a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp @@ -43,11 +43,7 @@ namespace py = pybind11; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::pair @@ -58,6 +54,4 @@ extern std::pair extern void init_full_ctor_dispatch_vectors(void); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp index ed72096bff8f..77322381d517 100644 --- a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -62,11 +62,7 @@ #define WRAP_MODE 0 #define CLIP_MODE 1 -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { namespace td_ns = dpctl::tensor::type_dispatch; @@ -816,6 +812,4 @@ void init_advanced_indexing_dispatch_tables(void) 
dtb_putwrap.populate_dispatch_table(put_dispatch_table[WRAP_MODE]); } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp index 5dfbd2f04d93..bc0136288e1c 100644 --- a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp @@ -45,11 +45,7 @@ namespace py = pybind11; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::pair @@ -72,6 +68,4 @@ extern std::pair extern void init_advanced_indexing_dispatch_tables(void); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp index f0f592c52938..13e909196460 100644 --- a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp @@ -54,11 +54,7 @@ namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { using dpctl::utils::keep_args_alive; @@ -247,6 +243,4 @@ void init_triul_ctor_dispatch_vectors(void) dvb2.populate_dispatch_vector(triu_generic_dispatch_vector); } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp index c61d95eef7ec..47cc4ce8892d 100644 --- a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp @@ -42,11 +42,7 @@ namespace py = pybind11; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal 
+namespace dpctl::tensor::py_internal { extern std::pair @@ -59,6 +55,4 @@ extern std::pair extern void init_triul_ctor_dispatch_vectors(void); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp index d7370f55e8cb..b9a2e01bea4a 100644 --- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp @@ -50,11 +50,7 @@ namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { using dpctl::utils::keep_args_alive; @@ -160,6 +156,4 @@ void init_zeros_ctor_dispatch_vectors(void) return; } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp index ec3bce994ef6..51a1903a0f36 100644 --- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp @@ -41,11 +41,7 @@ #include "dpnp4pybind11.hpp" #include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::pair @@ -55,6 +51,4 @@ extern std::pair extern void init_zeros_ctor_dispatch_vectors(void); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal From 3113716a13a131dc44f819140489176be5ff7cba Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 02:50:47 -0800 Subject: [PATCH 034/145] Add missing include complex --- dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp | 1 + .../libtensor/include/kernels/integer_advanced_indexing.hpp | 4 +++- dpctl_ext/tensor/libtensor/source/full_ctor.cpp | 2 ++ 
.../tensor/libtensor/source/integer_advanced_indexing.cpp | 2 ++ dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp | 2 ++ 5 files changed, 10 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index 47726319b3e1..22189ee3129c 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -34,6 +34,7 @@ #pragma once #include +#include #include #include diff --git a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp index 7351502dbc11..7be2b3ea8591 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -33,11 +33,13 @@ //===----------------------------------------------------------------------===// #pragma once +#include #include -#include #include #include +#include + #include "dpctl_tensor_types.hpp" #include "utils/indexing_utils.hpp" #include "utils/offset_utils.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp index ca4a17f28f77..aef57836666e 100644 --- a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp @@ -32,6 +32,7 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// +#include #include #include #include @@ -41,6 +42,7 @@ #include #include "dpnp4pybind11.hpp" +#include #include #include "kernels/constructors.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp index 77322381d517..925cc2e895ed 100644 --- 
a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -34,6 +34,7 @@ //===----------------------------------------------------------------------===// #include +#include #include #include #include @@ -46,6 +47,7 @@ #include #include "dpnp4pybind11.hpp" +#include #include #include diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp index b9a2e01bea4a..2eb05e49f382 100644 --- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp @@ -32,6 +32,7 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// +#include #include #include #include @@ -40,6 +41,7 @@ #include #include "dpnp4pybind11.hpp" +#include #include #include "utils/output_validation.hpp" From 978afee9115d8feaebe72c80ce3e827e13c66770 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 03:13:50 -0800 Subject: [PATCH 035/145] Add missing memory and queue checks --- .../libtensor/source/copy_as_contig.cpp | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp index 7105202fe2ff..bbee24c95d4d 100644 --- a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp +++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp @@ -189,6 +189,12 @@ std::pair "Execution queue is not compatible with allocation queues"); } + // check that arrays do not overlap, and concurrent copying is safe. 
+ auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + const auto &src_strides_vec = src.get_strides_vector(); if (src_nd >= 2) { @@ -314,6 +320,12 @@ std::pair "Execution queue is not compatible with allocation queues"); } + // check that arrays do not overlap, and concurrent copying is safe. + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + const auto &src_strides_vec = src.get_strides_vector(); if (src_nd >= 2) { @@ -459,6 +471,12 @@ std::pair "Execution queue is not compatible with allocation queues"); } + // check that arrays do not overlap, and concurrent copying is safe. + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + if (nelems == 0) { // nothing to do return std::make_pair(sycl::event(), sycl::event()); @@ -624,6 +642,20 @@ std::pair throw py::value_error("Unexpected destination array layout"); } + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // check that arrays do not overlap, and concurrent copying is safe. 
+ auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + int src_typenum = src.get_typenum(); int dst_typenum = dst.get_typenum(); From fec84ec7eafcfbd9a3d0e80f6a1c0e35c5312769 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 04:17:56 -0800 Subject: [PATCH 036/145] Move ti._copy_numpy_ndarray_into_usm_ndarray() --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../copy_numpy_ndarray_into_usm_ndarray.cpp | 368 ++++++++++++++++++ .../copy_numpy_ndarray_into_usm_ndarray.hpp | 57 +++ .../tensor/libtensor/source/tensor_ctors.cpp | 16 +- 4 files changed, 434 insertions(+), 9 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 53a0dfd27ba3..4e3fa580f99e 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -48,7 +48,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp diff --git a/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp new file mode 100644 index 000000000000..e97e8aeb1ca1 --- /dev/null +++ 
b/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp @@ -0,0 +1,368 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/copy_and_cast.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "copy_numpy_ndarray_into_usm_ndarray.hpp" +#include "simplify_iteration_space.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl::tensor::py_internal +{ + +using dpctl::tensor::kernels::copy_and_cast:: + copy_and_cast_from_host_blocking_fn_ptr_t; + +static copy_and_cast_from_host_blocking_fn_ptr_t + copy_and_cast_from_host_blocking_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::copy_and_cast:: + copy_and_cast_from_host_contig_blocking_fn_ptr_t; + +static copy_and_cast_from_host_contig_blocking_fn_ptr_t + copy_and_cast_from_host_contig_blocking_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void copy_numpy_ndarray_into_usm_ndarray( + const py::array &npy_src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_ndim = npy_src.ndim(); + int dst_ndim = dst.get_ndim(); + + if (src_ndim != dst_ndim) { + throw py::value_error("Source ndarray and destination usm_ndarray have " + "different array ranks, " + "i.e. 
different number of indices needed to " + "address array elements."); + } + + const py::ssize_t *src_shape = npy_src.shape(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool shapes_equal(true); + std::size_t src_nelems(1); + for (int i = 0; shapes_equal && (i < src_ndim); ++i) { + shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); + src_nelems *= static_cast(src_shape[i]); + } + + if (!shapes_equal) { + throw py::value_error("Source ndarray and destination usm_ndarray have " + "difference shapes."); + } + + if (src_nelems == 0) { + // nothing to do + return; + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error("Execution queue is not compatible with the " + "allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // here we assume that NumPy's type numbers agree with ours for types + // supported in both + int src_typenum = + py::detail::array_descriptor_proxy(npy_src.dtype().ptr())->type_num; + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + py::buffer_info src_pybuf = npy_src.request(); + const char *const src_data = static_cast(src_pybuf.ptr); + char *dst_data = dst.get_data(); + + int src_flags = npy_src.flags(); + + // check for applicability of special cases: + // (same type && (both C-contiguous || both F-contiguous) + const bool both_c_contig = + ((src_flags & py::array::c_style) && dst.is_c_contiguous()); + const bool both_f_contig = + ((src_flags & py::array::f_style) && dst.is_f_contiguous()); + + const bool same_data_types = (src_type_id == dst_type_id); + + if (both_c_contig || both_f_contig) { + if (same_data_types) { + int src_elem_size = npy_src.itemsize(); + + sycl::event copy_ev 
= + exec_q.memcpy(static_cast(dst_data), + static_cast(src_data), + src_nelems * src_elem_size, depends); + + { + // wait for copy_ev to complete + // release GIL to allow other threads (host_tasks) + // a chance to acquire GIL + py::gil_scoped_release lock{}; + copy_ev.wait(); + } + + return; + } + else { + py::gil_scoped_release lock{}; + + auto copy_and_cast_from_host_contig_blocking_fn = + copy_and_cast_from_host_contig_blocking_dispatch_table + [dst_type_id][src_type_id]; + + static constexpr py::ssize_t zero_offset(0); + + copy_and_cast_from_host_contig_blocking_fn( + exec_q, src_nelems, src_data, zero_offset, dst_data, + zero_offset, depends); + + return; + } + } + + auto const &dst_strides = + dst.get_strides_vector(); // N.B.: strides in elements + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_ndim; + const py::ssize_t *shape = src_shape; + + const py::ssize_t *src_strides_p = + npy_src.strides(); // N.B.: strides in bytes + py::ssize_t src_itemsize = npy_src.itemsize(); // item size in bytes + + bool is_src_c_contig = ((src_flags & py::array::c_style) != 0); + bool is_src_f_contig = ((src_flags & py::array::f_style) != 0); + + shT src_strides_in_elems; + if (src_strides_p) { + src_strides_in_elems.resize(nd); + // copy and convert strides from bytes to elements + std::transform( + src_strides_p, src_strides_p + nd, std::begin(src_strides_in_elems), + [src_itemsize](py::ssize_t el) { + py::ssize_t q = el / src_itemsize; + if (q * src_itemsize != el) { + throw std::runtime_error( + "NumPy array strides are not multiple of itemsize"); + } + return q; + }); + } + else { + if (is_src_c_contig) { + src_strides_in_elems = + dpctl::tensor::c_contiguous_strides(nd, src_shape); + } + else if (is_src_f_contig) { + src_strides_in_elems = + dpctl::tensor::f_contiguous_strides(nd, src_shape); + } + else { + throw 
py::value_error("NumPy source array has null strides but is " + "neither C- nor F-contiguous."); + } + } + + // nd, simplified_* vectors and offsets are modified by reference + simplify_iteration_space(nd, shape, src_strides_in_elems, dst_strides, + // outputs + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_src_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + // handle nd == 0 + if (nd == 0) { + nd = 1; + simplified_shape.reserve(nd); + simplified_shape.push_back(1); + + simplified_src_strides.reserve(nd); + simplified_src_strides.push_back(1); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.push_back(1); + } + + const bool is_contig_vector = + ((nd == 1) && (simplified_src_strides.front() == 1) && + (simplified_dst_strides.front() == 1)); + + const bool can_use_memcpy = (same_data_types && is_contig_vector && + (src_offset == 0) && (dst_offset == 0)); + + if (can_use_memcpy) { + int src_elem_size = npy_src.itemsize(); + + sycl::event copy_ev = exec_q.memcpy( + static_cast(dst_data), static_cast(src_data), + src_nelems * src_elem_size, depends); + + { + // wait for copy_ev to complete + // release GIL to allow other threads (host_tasks) + // a chance to acquire GIL + py::gil_scoped_release lock{}; + + copy_ev.wait(); + } + + return; + } + + // Minimum and maximum element offsets for source np.ndarray + py::ssize_t npy_src_min_nelem_offset(src_offset); + py::ssize_t npy_src_max_nelem_offset(src_offset); + for (int i = 0; i < nd; ++i) { + if (simplified_src_strides[i] < 0) { + npy_src_min_nelem_offset += + simplified_src_strides[i] * (simplified_shape[i] - 1); + } + else { + npy_src_max_nelem_offset += + simplified_src_strides[i] * (simplified_shape[i] - 1); + } + } + + if (is_contig_vector) { + // release GIL for the blocking call + py::gil_scoped_release lock{}; + + auto 
copy_and_cast_from_host_contig_blocking_fn = + copy_and_cast_from_host_contig_blocking_dispatch_table[dst_type_id] + [src_type_id]; + + copy_and_cast_from_host_contig_blocking_fn(exec_q, src_nelems, src_data, + src_offset, dst_data, + dst_offset, depends); + + return; + } + + std::vector host_task_events; + host_task_events.reserve(1); + + // Copy shape strides into device memory + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides, + simplified_dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + { + // release GIL for the blocking call + py::gil_scoped_release lock{}; + + // Get implementation function pointer + auto copy_and_cast_from_host_blocking_fn = + copy_and_cast_from_host_blocking_dispatch_table[dst_type_id] + [src_type_id]; + + copy_and_cast_from_host_blocking_fn( + exec_q, src_nelems, nd, shape_strides, src_data, src_offset, + npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data, + dst_offset, depends, {copy_shape_ev}); + + // invoke USM deleter in smart pointer while GIL is held + shape_strides_owner.reset(nullptr); + } + + return; +} + +void init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::copy_and_cast::CopyAndCastFromHostFactory; + + DispatchTableBuilder + dtb_copy_from_numpy; + + dtb_copy_from_numpy.populate_dispatch_table( + copy_and_cast_from_host_blocking_dispatch_table); + + using dpctl::tensor::kernels::copy_and_cast:: + CopyAndCastFromHostContigFactory; + + DispatchTableBuilder + dtb_copy_from_numpy_contig; + + dtb_copy_from_numpy_contig.populate_dispatch_table( + copy_and_cast_from_host_contig_blocking_dispatch_table); +} + +} // namespace 
dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp b/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp new file mode 100644 index 000000000000..f2de95f97cca --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp @@ -0,0 +1,57 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void copy_numpy_ndarray_into_usm_ndarray( + const py::array &npy_src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(void); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 182124ac4ae5..07c8fd1bf99f 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -53,7 +53,7 @@ #include "copy_as_contig.hpp" // #include "copy_for_reshape.hpp" // #include "copy_for_roll.hpp" -// #include "copy_numpy_ndarray_into_usm_ndarray.hpp" +#include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" // #include "eye_ctor.hpp" #include "full_ctor.hpp" @@ -96,7 +96,7 @@ using dpctl::tensor::py_internal::py_as_f_contig; /* 
============= Copy from numpy.ndarray to usm_ndarray ==================== */ -// using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray; +using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray; /* ============= linear-sequence ==================== */ @@ -146,7 +146,7 @@ void init_dispatch_tables(void) using namespace dpctl::tensor::py_internal; init_copy_and_cast_usm_to_usm_dispatch_tables(); - // init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); + init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); init_advanced_indexing_dispatch_tables(); // init_where_dispatch_tables(); return; @@ -317,11 +317,11 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("include_endpoint"), py::arg("sycl_queue"), // py::arg("depends") = py::list()); - // m.def("_copy_numpy_ndarray_into_usm_ndarray", - // ©_numpy_ndarray_into_usm_ndarray, - // "Copy from numpy array `src` into usm_ndarray `dst` - // synchronously.", py::arg("src"), py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_copy_numpy_ndarray_into_usm_ndarray", + ©_numpy_ndarray_into_usm_ndarray, + "Copy from numpy array `src` into usm_ndarray `dst` synchronously.", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); m.def("_zeros_usm_ndarray", &usm_ndarray_zeros, "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), From 497e81030c9e64b7129191d4347c7516224fe234 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 04:43:05 -0800 Subject: [PATCH 037/145] Move asnumpy(),from_numpy(), to_numpy() to dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 8 ++ dpctl_ext/tensor/_copy_utils.py | 202 ++++++++++++++++++++++++++++++++ 2 files changed, 210 insertions(+) create mode 100644 dpctl_ext/tensor/_copy_utils.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 3c6939eff7a0..2ce61a9ab242 100644 --- a/dpctl_ext/tensor/__init__.py +++ 
b/dpctl_ext/tensor/__init__.py @@ -27,6 +27,11 @@ # ***************************************************************************** +from dpctl_ext.tensor._copy_utils import ( + asnumpy, + from_numpy, + to_numpy, +) from dpctl_ext.tensor._ctors import ( full, tril, @@ -38,9 +43,12 @@ ) __all__ = [ + "asnumpy", + "from_numpy", "full", "put", "take", + "to_numpy", "tril", "triu", ] diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py new file mode 100644 index 000000000000..9041be7686f6 --- /dev/null +++ b/dpctl_ext/tensor/_copy_utils.py @@ -0,0 +1,202 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import dpctl +import dpctl.memory as dpm +import dpctl.tensor as dpt +import dpctl.utils +import numpy as np +from dpctl.tensor._device import normalize_queue_device + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._tensor_impl as ti + +__doc__ = ( + "Implementation module for copy- and cast- operations on " + ":class:`dpctl.tensor.usm_ndarray`." 
+) + +int32_t_max = 1 + np.iinfo(np.int32).max + + +def _copy_to_numpy(ary): + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(ary)}") + if ary.size == 0: + # no data needs to be copied for zero sized array + return np.ndarray(ary.shape, dtype=ary.dtype) + nb = ary.usm_data.nbytes + q = ary.sycl_queue + hh = dpm.MemoryUSMHost(nb, queue=q) + h = np.ndarray(nb, dtype="u1", buffer=hh).view(ary.dtype) + itsz = ary.itemsize + strides_bytes = tuple(si * itsz for si in ary.strides) + offset = ary._element_offset * itsz + # ensure that content of ary.usm_data is final + q.wait() + hh.copy_from_device(ary.usm_data) + return np.ndarray( + ary.shape, + dtype=ary.dtype, + buffer=h, + strides=strides_bytes, + offset=offset, + ) + + +def _copy_from_numpy(np_ary, usm_type="device", sycl_queue=None): + """Copies numpy array `np_ary` into a new usm_ndarray""" + # This may perform a copy to meet stated requirements + Xnp = np.require(np_ary, requirements=["A", "E"]) + alloc_q = normalize_queue_device(sycl_queue=sycl_queue, device=None) + dt = Xnp.dtype + if dt.char in "dD" and alloc_q.sycl_device.has_aspect_fp64 is False: + Xusm_dtype = ( + dpt.dtype("float32") if dt.char == "d" else dpt.dtype("complex64") + ) + else: + Xusm_dtype = dt + Xusm = dpt.empty( + Xnp.shape, dtype=Xusm_dtype, usm_type=usm_type, sycl_queue=sycl_queue + ) + _copy_from_numpy_into(Xusm, Xnp) + return Xusm + + +def _copy_from_numpy_into(dst, np_ary): + """Copies `np_ary` into `dst` of type :class:`dpctl.tensor.usm_ndarray""" + if not isinstance(np_ary, np.ndarray): + raise TypeError(f"Expected numpy.ndarray, got {type(np_ary)}") + if not isinstance(dst, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(dst)}") + if np_ary.flags["OWNDATA"]: + Xnp = np_ary + else: + # Determine base of input array + base = np_ary.base + while isinstance(base, np.ndarray): + base = base.base + if isinstance(base, dpm._memory._Memory): + # we must perform 
a copy, since subsequent + # _copy_numpy_ndarray_into_usm_ndarray is implemented using + # sycl::buffer, and using USM-pointers with sycl::buffer + # results is undefined behavior + Xnp = np_ary.copy() + else: + Xnp = np_ary + src_ary = np.broadcast_to(Xnp, dst.shape) + copy_q = dst.sycl_queue + if copy_q.sycl_device.has_aspect_fp64 is False: + src_ary_dt_c = src_ary.dtype.char + if src_ary_dt_c == "d": + src_ary = src_ary.astype(np.float32) + elif src_ary_dt_c == "D": + src_ary = src_ary.astype(np.complex64) + _manager = dpctl.utils.SequentialOrderManager[copy_q] + dep_ev = _manager.submitted_events + # synchronizing call + ti._copy_numpy_ndarray_into_usm_ndarray( + src=src_ary, dst=dst, sycl_queue=copy_q, depends=dep_ev + ) + + +def from_numpy(np_ary, /, *, device=None, usm_type="device", sycl_queue=None): + """ + from_numpy(arg, device=None, usm_type="device", sycl_queue=None) + + Creates :class:`dpctl.tensor.usm_ndarray` from instance of + :class:`numpy.ndarray`. + + Args: + arg: + Input convertible to :class:`numpy.ndarray` + device (object): array API specification of device where the + output array is created. Device can be specified by + a filter selector string, an instance of + :class:`dpctl.SyclDevice`, an instance of + :class:`dpctl.SyclQueue`, or an instance of + :class:`dpctl.tensor.Device`. If the value is ``None``, + returned array is created on the default-selected device. + Default: ``None`` + usm_type (str): The requested USM allocation type for the + output array. Recognized values are ``"device"``, + ``"shared"``, or ``"host"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + A SYCL queue that determines output array allocation device + as well as execution placement of data movement operations. + The ``device`` and ``sycl_queue`` arguments + are equivalent. Only one of them should be specified. If both + are provided, they must be consistent and result in using the + same execution queue. 
Default: ``None`` + + The returned array has the same shape, and the same data type kind. + If the device does not support the data type of input array, a + closest support data type of the same kind may be returned, e.g. + input array of type ``float16`` may be upcast to ``float32`` if the + target device does not support 16-bit floating point type. + """ + q = normalize_queue_device(sycl_queue=sycl_queue, device=device) + return _copy_from_numpy(np_ary, usm_type=usm_type, sycl_queue=q) + + +def to_numpy(usm_ary, /): + """ + to_numpy(usm_ary) + + Copies content of :class:`dpctl.tensor.usm_ndarray` instance ``usm_ary`` + into :class:`numpy.ndarray` instance of the same shape and same data type. + + Args: + usm_ary (usm_ndarray): + Input array + Returns: + :class:`numpy.ndarray`: + An instance of :class:`numpy.ndarray` populated with content of + ``usm_ary`` + """ + return _copy_to_numpy(usm_ary) + + +def asnumpy(usm_ary): + """ + asnumpy(usm_ary) + + Copies content of :class:`dpctl.tensor.usm_ndarray` instance ``usm_ary`` + into :class:`numpy.ndarray` instance of the same shape and same data + type. 
+ + Args: + usm_ary (usm_ndarray): + Input array + Returns: + :class:`numpy.ndarray`: + An instance of :class:`numpy.ndarray` populated with content + of ``usm_ary`` + """ + return _copy_to_numpy(usm_ary) From 3be4e1498182891b7d59facdfe4c2a9f6ed3dc9f Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 04:45:48 -0800 Subject: [PATCH 038/145] Update dpnp.asnumpy to use dpctl_ext functions --- dpnp/dpnp_array.py | 5 ++++- dpnp/dpnp_iface.py | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/dpnp/dpnp_array.py b/dpnp/dpnp_array.py index 6a2b2fd1977f..d122aff2c13f 100644 --- a/dpnp/dpnp_array.py +++ b/dpnp/dpnp_array.py @@ -41,6 +41,9 @@ import dpctl.tensor._type_utils as dtu from dpctl.tensor._numpy_helper import AxisError +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from . import memory as dpm @@ -764,7 +767,7 @@ def asnumpy(self): """ - return dpt.asnumpy(self._array_obj) + return dpt_ext.asnumpy(self._array_obj) def astype( self, diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index 533bdc36c617..6c050a208981 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -53,6 +53,7 @@ # pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti import dpnp @@ -136,7 +137,7 @@ def asnumpy(a, order="C"): return a.asnumpy() if isinstance(a, dpt.usm_ndarray): - return dpt.asnumpy(a) + return dpt_ext.asnumpy(a) return numpy.asarray(a, order=order) From 1d883652555cac390ba19d6fa294284690688ed1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 05:02:03 -0800 Subject: [PATCH 039/145] Move copy(), astype() to dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 4 + dpctl_ext/tensor/_copy_utils.py | 553 ++++++++++++++++++++++++++++++++ 2 files changed, 557 insertions(+) diff --git 
a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 2ce61a9ab242..a02cf85ed591 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -29,6 +29,8 @@ from dpctl_ext.tensor._copy_utils import ( asnumpy, + astype, + copy, from_numpy, to_numpy, ) @@ -44,6 +46,8 @@ __all__ = [ "asnumpy", + "astype", + "copy", "from_numpy", "full", "put", diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py index 9041be7686f6..c62218893a2c 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -26,12 +26,16 @@ # THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** +import builtins + import dpctl import dpctl.memory as dpm import dpctl.tensor as dpt import dpctl.utils import numpy as np +from dpctl.tensor._data_types import _get_dtype from dpctl.tensor._device import normalize_queue_device +from dpctl.tensor._type_utils import _dtype_supported_by_device_impl # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor @@ -200,3 +204,552 @@ def asnumpy(usm_ary): of ``usm_ary`` """ return _copy_to_numpy(usm_ary) + + +class Dummy: + """Helper class with specified ``__sycl_usm_array_interface__`` attribute""" + + def __init__(self, iface): + self.__sycl_usm_array_interface__ = iface + + +def _copy_overlapping(dst, src): + """Assumes src and dst have the same shape.""" + q = normalize_queue_device(sycl_queue=dst.sycl_queue) + tmp = dpt.usm_ndarray( + src.shape, + dtype=src.dtype, + buffer="device", + order="C", + buffer_ctor_kwargs={"queue": q}, + ) + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hcp1, cp1 = ti._copy_usm_ndarray_into_usm_ndarray( + src=src, dst=tmp, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hcp1, cp1) + hcp2, cp2 = ti._copy_usm_ndarray_into_usm_ndarray( + src=tmp, dst=dst, sycl_queue=q, depends=[cp1] + ) + _manager.add_event_pair(hcp2, cp2) 
+ + +def _copy_same_shape(dst, src): + """Assumes src and dst have the same shape.""" + # check that memory regions do not overlap + if ti._array_overlap(dst, src): + if src._pointer == dst._pointer and ( + src is dst + or (src.strides == dst.strides and src.dtype == dst.dtype) + ): + return + _copy_overlapping(src=src, dst=dst) + return + + copy_q = dst.sycl_queue + _manager = dpctl.utils.SequentialOrderManager[copy_q] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=src, dst=dst, sycl_queue=copy_q, depends=dep_evs + ) + _manager.add_event_pair(hev, cpy_ev) + + +if hasattr(np, "broadcast_shapes"): + + def _broadcast_shapes(sh1, sh2): + return np.broadcast_shapes(sh1, sh2) + +else: + + def _broadcast_shapes(sh1, sh2): + # use arrays with zero strides, whose memory footprint + # is independent of the number of array elements + return np.broadcast( + np.empty(sh1, dtype=[]), + np.empty(sh2, dtype=[]), + ).shape + + +def _broadcast_strides(X_shape, X_strides, res_ndim): + """ + Broadcasts strides to match the given dimensions; + returns tuple type strides. + """ + out_strides = [0] * res_ndim + X_shape_len = len(X_shape) + str_dim = -X_shape_len + for i in range(X_shape_len): + shape_value = X_shape[i] + if not shape_value == 1: + out_strides[str_dim] = X_strides[i] + str_dim += 1 + + return tuple(out_strides) + + +def _copy_from_usm_ndarray_to_usm_ndarray(dst, src): + if any( + not isinstance(arg, dpt.usm_ndarray) + for arg in ( + dst, + src, + ) + ): + raise TypeError( + "Both types are expected to be dpctl.tensor.usm_ndarray, " + f"got {type(dst)} and {type(src)}." 
+ ) + + if dst.ndim == src.ndim and dst.shape == src.shape: + _copy_same_shape(dst, src) + return + + try: + common_shape = _broadcast_shapes(dst.shape, src.shape) + except ValueError as exc: + raise ValueError("Shapes of two arrays are not compatible") from exc + + if dst.size < src.size and dst.size < np.prod(common_shape): + raise ValueError("Destination is smaller ") + + if len(common_shape) > dst.ndim: + ones_count = len(common_shape) - dst.ndim + for k in range(ones_count): + if common_shape[k] != 1: + raise ValueError + common_shape = common_shape[ones_count:] + + if src.ndim < len(common_shape): + new_src_strides = _broadcast_strides( + src.shape, src.strides, len(common_shape) + ) + src_same_shape = dpt.usm_ndarray( + common_shape, + dtype=src.dtype, + buffer=src, + strides=new_src_strides, + offset=src._element_offset, + ) + elif src.ndim == len(common_shape): + new_src_strides = _broadcast_strides( + src.shape, src.strides, len(common_shape) + ) + src_same_shape = dpt.usm_ndarray( + common_shape, + dtype=src.dtype, + buffer=src, + strides=new_src_strides, + offset=src._element_offset, + ) + else: + # since broadcasting succeeded, src.ndim is greater because of + # leading sequence of ones, so we trim it + n = len(common_shape) + new_src_strides = _broadcast_strides( + src.shape[-n:], src.strides[-n:], n + ) + src_same_shape = dpt.usm_ndarray( + common_shape, + dtype=src.dtype, + buffer=src.usm_data, + strides=new_src_strides, + offset=src._element_offset, + ) + + _copy_same_shape(dst, src_same_shape) + + +def _make_empty_like_orderK(x, dt, usm_type, dev): + """ + Returns empty array with shape and strides like `x`, with dtype `dt`, + USM type `usm_type`, on device `dev`. 
+ """ + st = list(x.strides) + perm = sorted( + range(x.ndim), + key=lambda d: builtins.abs(st[d]) if x.shape[d] > 1 else 0, + reverse=True, + ) + inv_perm = sorted(range(x.ndim), key=lambda i: perm[i]) + sh = x.shape + sh_sorted = tuple(sh[i] for i in perm) + R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + if min(st) < 0: + st_sorted = [st[i] for i in perm] + sl = tuple( + ( + slice(None, None, -1) + if st_sorted[i] < 0 + else slice(None, None, None) + ) + for i in range(x.ndim) + ) + R = R[sl] + return dpt.permute_dims(R, inv_perm) + + +def _empty_like_orderK(x, dt, usm_type=None, dev=None): + """ + Returns empty array like `x`, using order='K' + + For an array `x` that was obtained by permutation of a contiguous + array the returned array will have the same shape and the same + strides as `x`. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(x)}") + if usm_type is None: + usm_type = x.usm_type + if dev is None: + dev = x.device + fl = x.flags + if fl["C"] or x.size <= 1: + return dpt.empty_like( + x, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + elif fl["F"]: + return dpt.empty_like( + x, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + return _make_empty_like_orderK(x, dt, usm_type, dev) + + +def _from_numpy_empty_like_orderK(x, dt, usm_type, dev): + """ + Returns empty usm_ndarray like NumPy array `x`, using order='K' + + For an array `x` that was obtained by permutation of a contiguous + array the returned array will have the same shape and the same + strides as `x`. 
+ """ + if not isinstance(x, np.ndarray): + raise TypeError(f"Expected numpy.ndarray, got {type(x)}") + fl = x.flags + if fl["C"] or x.size <= 1: + return dpt.empty( + x.shape, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + elif fl["F"]: + return dpt.empty( + x.shape, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + return _make_empty_like_orderK(x, dt, usm_type, dev) + + +def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev): + if not isinstance(X1, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X1)}") + if not isinstance(X2, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X2)}") + nd1 = X1.ndim + nd2 = X2.ndim + if nd1 > nd2 and X1.shape == res_shape: + return _empty_like_orderK(X1, dt, usm_type, dev) + elif nd1 < nd2 and X2.shape == res_shape: + return _empty_like_orderK(X2, dt, usm_type, dev) + fl1 = X1.flags + fl2 = X2.flags + if fl1["C"] or fl2["C"]: + return dpt.empty( + res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + if fl1["F"] and fl2["F"]: + return dpt.empty( + res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + st1 = list(X1.strides) + st2 = list(X2.strides) + max_ndim = max(nd1, nd2) + st1 += [0] * (max_ndim - len(st1)) + st2 += [0] * (max_ndim - len(st2)) + sh1 = list(X1.shape) + [0] * (max_ndim - nd1) + sh2 = list(X2.shape) + [0] * (max_ndim - nd2) + perm = sorted( + range(max_ndim), + key=lambda d: ( + builtins.abs(st1[d]) if sh1[d] > 1 else 0, + builtins.abs(st2[d]) if sh2[d] > 1 else 0, + ), + reverse=True, + ) + inv_perm = sorted(range(max_ndim), key=lambda i: perm[i]) + st1_sorted = [st1[i] for i in perm] + st2_sorted = [st2[i] for i in perm] + sh = res_shape + sh_sorted = tuple(sh[i] for i in perm) + R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + if max(min(st1_sorted), min(st2_sorted)) < 0: + sl = tuple( + ( + slice(None, None, -1) + if (st1_sorted[i] < 0 and st2_sorted[i] < 0) + else slice(None, None, 
None) + ) + for i in range(nd1) + ) + R = R[sl] + return dpt.permute_dims(R, inv_perm) + + +def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev): + if not isinstance(X1, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X1)}") + if not isinstance(X2, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X2)}") + if not isinstance(X3, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X3)}") + nd1 = X1.ndim + nd2 = X2.ndim + nd3 = X3.ndim + if X1.shape == res_shape and X2.shape == res_shape and len(res_shape) > nd3: + return _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev) + elif ( + X2.shape == res_shape and X3.shape == res_shape and len(res_shape) > nd1 + ): + return _empty_like_pair_orderK(X2, X3, dt, res_shape, usm_type, dev) + elif ( + X1.shape == res_shape and X3.shape == res_shape and len(res_shape) > nd2 + ): + return _empty_like_pair_orderK(X1, X3, dt, res_shape, usm_type, dev) + fl1 = X1.flags + fl2 = X2.flags + fl3 = X3.flags + if fl1["C"] or fl2["C"] or fl3["C"]: + return dpt.empty( + res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + if fl1["F"] and fl2["F"] and fl3["F"]: + return dpt.empty( + res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + st1 = list(X1.strides) + st2 = list(X2.strides) + st3 = list(X3.strides) + max_ndim = max(nd1, nd2, nd3) + st1 += [0] * (max_ndim - len(st1)) + st2 += [0] * (max_ndim - len(st2)) + st3 += [0] * (max_ndim - len(st3)) + sh1 = list(X1.shape) + [0] * (max_ndim - nd1) + sh2 = list(X2.shape) + [0] * (max_ndim - nd2) + sh3 = list(X3.shape) + [0] * (max_ndim - nd3) + perm = sorted( + range(max_ndim), + key=lambda d: ( + builtins.abs(st1[d]) if sh1[d] > 1 else 0, + builtins.abs(st2[d]) if sh2[d] > 1 else 0, + builtins.abs(st3[d]) if sh3[d] > 1 else 0, + ), + reverse=True, + ) + inv_perm = sorted(range(max_ndim), key=lambda i: perm[i]) + st1_sorted = [st1[i] for i in perm] + st2_sorted = [st2[i] for i in 
perm] + st3_sorted = [st3[i] for i in perm] + sh = res_shape + sh_sorted = tuple(sh[i] for i in perm) + R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + if max(min(st1_sorted), min(st2_sorted), min(st3_sorted)) < 0: + sl = tuple( + ( + slice(None, None, -1) + if ( + st1_sorted[i] < 0 + and st2_sorted[i] < 0 + and st3_sorted[i] < 0 + ) + else slice(None, None, None) + ) + for i in range(nd1) + ) + R = R[sl] + return dpt.permute_dims(R, inv_perm) + + +def copy(usm_ary, /, *, order="K"): + """copy(ary, order="K") + + Creates a copy of given instance of :class:`dpctl.tensor.usm_ndarray`. + + Args: + ary (usm_ndarray): + Input array + order (``"C"``, ``"F"``, ``"A"``, ``"K"``, optional): + Controls the memory layout of the output array + Returns: + usm_ndarray: + A copy of the input array. + + Memory layout of the copy is controlled by ``order`` keyword, + following NumPy's conventions. The ``order`` keywords can be + one of the following: + + .. list-table:: + + * - ``"C"`` + - C-contiguous memory layout + * - ``"F"`` + - Fortran-contiguous memory layout + * - ``"A"`` + - Fortran-contiguous if the input array is also Fortran-contiguous, + otherwise C-contiguous + * - ``"K"`` + - match the layout of ``usm_ary`` as closely as possible. + + """ + if len(order) == 0 or order[0] not in "KkAaCcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'." + ) + order = order[0].upper() + if not isinstance(usm_ary, dpt.usm_ndarray): + raise TypeError( + f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}" + ) + copy_order = "C" + if order == "C": + pass + elif order == "F": + copy_order = order + elif order == "A": + if usm_ary.flags.f_contiguous: + copy_order = "F" + elif order == "K": + if usm_ary.flags.f_contiguous: + copy_order = "F" + else: + raise ValueError( + "Unrecognized value of the order keyword. 
" + "Recognized values are 'A', 'C', 'F', or 'K'" + ) + if order == "K": + R = _empty_like_orderK(usm_ary, usm_ary.dtype) + else: + R = dpt.usm_ndarray( + usm_ary.shape, + dtype=usm_ary.dtype, + buffer=usm_ary.usm_type, + order=copy_order, + buffer_ctor_kwargs={"queue": usm_ary.sycl_queue}, + ) + _copy_same_shape(R, usm_ary) + return R + + +def astype( + usm_ary, newdtype, /, *, order="K", casting="unsafe", copy=True, device=None +): + """astype(array, new_dtype, order="K", casting="unsafe", \ + copy=True, device=None) + + Returns a copy of the :class:`dpctl.tensor.usm_ndarray`, cast to a + specified type. + + Args: + array (usm_ndarray): + An input array. + new_dtype (dtype): + The data type of the resulting array. If `None`, gives default + floating point type supported by device where the resulting array + will be located. + order ({"C", "F", "A", "K"}, optional): + Controls memory layout of the resulting array if a copy + is returned. + casting ({'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional): + Controls what kind of data casting may occur. Please see + :meth:`numpy.ndarray.astype` for description of casting modes. + copy (bool, optional): + By default, `astype` always returns a newly allocated array. + If this keyword is set to `False`, a view of the input array + may be returned when possible. + device (object): array API specification of device where the + output array is created. Device can be specified by + a filter selector string, an instance of + :class:`dpctl.SyclDevice`, an instance of + :class:`dpctl.SyclQueue`, or an instance of + :class:`dpctl.tensor.Device`. If the value is `None`, + returned array is created on the same device as `array`. + Default: `None`. + + Returns: + usm_ndarray: + An array with requested data type. + + A view can be returned, if possible, when `copy=False` is used. 
+ """ + if not isinstance(usm_ary, dpt.usm_ndarray): + return TypeError( + f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}" + ) + if len(order) == 0 or order[0] not in "KkAaCcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'." + ) + order = order[0].upper() + ary_dtype = usm_ary.dtype + if device is not None: + if not isinstance(device, dpctl.SyclQueue): + if isinstance(device, dpt.Device): + device = device.sycl_queue + else: + device = dpt.Device.create_device(device).sycl_queue + d = device.sycl_device + target_dtype = _get_dtype(newdtype, device) + if not _dtype_supported_by_device_impl( + target_dtype, d.has_aspect_fp16, d.has_aspect_fp64 + ): + raise ValueError( + f"Requested dtype '{target_dtype}' is not supported by the " + "target device" + ) + usm_ary = usm_ary.to_device(device) + else: + target_dtype = _get_dtype(newdtype, usm_ary.sycl_queue) + + if not dpt.can_cast(ary_dtype, target_dtype, casting=casting): + raise TypeError( + f"Can not cast from {ary_dtype} to {newdtype} " + f"according to rule {casting}." + ) + c_contig = usm_ary.flags.c_contiguous + f_contig = usm_ary.flags.f_contiguous + needs_copy = copy or not ary_dtype == target_dtype + if not needs_copy and (order != "K"): + # ensure that order="F" for C-contig input triggers copy, + # and order="C" for F-contig input triggers copy too. + # 1D arrays which are both C- and F- contig should not + # force copying for neither order="F", nor order="C", see gh-1926 + needs_copy = ( + c_contig and not f_contig and order not in ["A", "C"] + ) or (not c_contig and f_contig and order not in ["A", "F"]) + if not needs_copy: + return usm_ary + copy_order = "C" + if order == "C": + pass + elif order == "F": + copy_order = order + elif order == "A": + if usm_ary.flags.f_contiguous: + copy_order = "F" + elif order == "K": + if usm_ary.flags.f_contiguous: + copy_order = "F" + else: + raise ValueError( + "Unrecognized value of the order keyword. 
" + "Recognized values are 'A', 'C', 'F', or 'K'" + ) + if order == "K": + R = _empty_like_orderK(usm_ary, target_dtype) + else: + R = dpt.usm_ndarray( + usm_ary.shape, + dtype=target_dtype, + buffer=usm_ary.usm_type, + order=copy_order, + buffer_ctor_kwargs={"queue": usm_ary.sycl_queue}, + ) + _copy_from_usm_ndarray_to_usm_ndarray(R, usm_ary) + return R From fd18db07dc59fa927e14d39f730fce9d6c2b42ec Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 05:24:47 -0800 Subject: [PATCH 040/145] reuse astype(), copy() from dpctl_ext --- dpctl_ext/tensor/_ctors.py | 5 ++++- dpctl_ext/tensor/_indexing_functions.py | 5 ++++- dpnp/dpnp_algo/dpnp_arraycreation.py | 5 ++++- dpnp/dpnp_algo/dpnp_elementwise_common.py | 13 +++++++------ dpnp/dpnp_algo/dpnp_fill.py | 6 +++--- dpnp/dpnp_container.py | 4 +++- dpnp/dpnp_iface_arraycreation.py | 7 +++++-- dpnp/dpnp_iface_indexing.py | 15 ++++++++------- dpnp/dpnp_iface_statistics.py | 5 ++++- 9 files changed, 42 insertions(+), 23 deletions(-) diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index a0e7b28e66ff..5a39e9367e9c 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -36,6 +36,9 @@ from dpctl.tensor._data_types import _get_dtype from dpctl.tensor._device import normalize_queue_device +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti @@ -147,7 +150,7 @@ def full( usm_type=usm_type, sycl_queue=sycl_queue, ) - return dpt.copy(dpt.broadcast_to(X, shape), order=order) + return dpt_ext.copy(dpt.broadcast_to(X, shape), order=order) else: _validate_fill_value(fill_value) diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py index 106df09cf97e..df4f3e953042 100644 --- a/dpctl_ext/tensor/_indexing_functions.py +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -32,6 +32,9 @@ import dpctl.tensor as dpt import 
dpctl.utils +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti from ._numpy_helper import normalize_axis_index @@ -185,7 +188,7 @@ def put_vec_duplicates(vec, ind, vals): if vals.dtype == x.dtype: rhs = vals else: - rhs = dpt.astype(vals, x.dtype) + rhs = dpt_ext.astype(vals, x.dtype) rhs = dpt.broadcast_to(rhs, val_shape) _manager = dpctl.utils.SequentialOrderManager[exec_q] diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py index d94a031801f3..27f095158a85 100644 --- a/dpnp/dpnp_algo/dpnp_arraycreation.py +++ b/dpnp/dpnp_algo/dpnp_arraycreation.py @@ -33,6 +33,9 @@ import dpctl.utils as dpu import numpy +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from dpnp.dpnp_array import dpnp_array from dpnp.dpnp_utils import get_usm_allocations, map_dtype_to_device @@ -256,7 +259,7 @@ def dpnp_linspace( if dpnp.issubdtype(dtype, dpnp.integer): dpt.floor(usm_res, out=usm_res) - res = dpt.astype(usm_res, dtype, copy=False) + res = dpt_ext.astype(usm_res, dtype, copy=False) res = dpnp_array._create_from_usm_ndarray(res) if retstep is True: diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index 88abcee5035c..55d74e8c1803 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -47,6 +47,7 @@ # pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as dti import dpnp import dpnp.backend.extensions.vm._vm_impl as vmi @@ -212,7 +213,7 @@ def __call__( x_usm = dpnp.get_usm_ndarray(x) if dtype is not None: - x_usm = dpt.astype(x_usm, dtype, copy=False) + x_usm = dpt_ext.astype(x_usm, dtype, copy=False) out = 
self._unpack_out_kw(out) out_usm = None if out is None else dpnp.get_usm_ndarray(out) @@ -718,9 +719,9 @@ def __call__( sycl_queue=x2.sycl_queue, usm_type=x2.usm_type, ) - x2_usm = dpt.astype(x2_usm, dtype, copy=False) + x2_usm = dpt_ext.astype(x2_usm, dtype, copy=False) elif dpnp.isscalar(x2): - x1_usm = dpt.astype(x1_usm, dtype, copy=False) + x1_usm = dpt_ext.astype(x1_usm, dtype, copy=False) x2_usm = dpt.asarray( x2, dtype=dtype, @@ -728,8 +729,8 @@ def __call__( usm_type=x1.usm_type, ) else: - x1_usm = dpt.astype(x1_usm, dtype, copy=False) - x2_usm = dpt.astype(x2_usm, dtype, copy=False) + x1_usm = dpt_ext.astype(x1_usm, dtype, copy=False) + x2_usm = dpt_ext.astype(x2_usm, dtype, copy=False) res_usm = super().__call__(x1_usm, x2_usm, out=out_usm, order=order) @@ -1325,7 +1326,7 @@ def __call__(self, x, /, decimals=0, out=None, *, dtype=None): res_usm = dpt.divide(x_usm, 10**decimals, out=out_usm) if dtype is not None: - res_usm = dpt.astype(res_usm, dtype, copy=False) + res_usm = dpt_ext.astype(res_usm, dtype, copy=False) if out is not None and isinstance(out, dpnp_array): return out diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py index 4137a2794747..ddba9f634cb1 100644 --- a/dpnp/dpnp_algo/dpnp_fill.py +++ b/dpnp/dpnp_algo/dpnp_fill.py @@ -32,10 +32,10 @@ import dpctl.utils as dpu from dpctl.tensor._ctors import _cast_fill_val -import dpnp - # TODO: revert to `from dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext +import dpnp from dpctl_ext.tensor._tensor_impl import ( _copy_usm_ndarray_into_usm_ndarray, _full_usm_ndarray, @@ -56,7 +56,7 @@ def dpnp_fill(arr, val): raise dpu.ExecutionPlacementError( "Input arrays have incompatible queues." 
) - a_val = dpt.astype(val, arr.dtype) + a_val = dpt_ext.astype(val, arr.dtype) a_val = dpt.broadcast_to(a_val, arr.shape) _manager = dpu.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index c8e28529cd57..acda579a5f5e 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -38,6 +38,8 @@ import dpctl.tensor as dpt import dpctl.utils as dpu +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor as dpt_ext import dpnp from dpnp.dpnp_array import dpnp_array @@ -141,7 +143,7 @@ def copy(x1, /, *, order="K"): if order is None: order = "K" - array_obj = dpt.copy(dpnp.get_usm_ndarray(x1), order=order) + array_obj = dpt_ext.copy(dpnp.get_usm_ndarray(x1), order=order) return dpnp_array._create_from_usm_ndarray(array_obj) diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py index 8d4ebdd1a6c2..404d9e0fe37d 100644 --- a/dpnp/dpnp_iface_arraycreation.py +++ b/dpnp/dpnp_iface_arraycreation.py @@ -46,6 +46,9 @@ import dpctl.tensor as dpt import numpy +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from dpnp import dpnp_container @@ -934,7 +937,7 @@ def astype(x, dtype, /, *, order="K", casting="unsafe", copy=True, device=None): order = "K" usm_x = dpnp.get_usm_ndarray(x) - usm_res = dpt.astype( + usm_res = dpt_ext.astype( usm_x, dtype, order=order, casting=casting, copy=copy, device=device ) @@ -3129,7 +3132,7 @@ def meshgrid(*xi, copy=True, sparse=False, indexing="xy"): output = dpt.broadcast_arrays(*output) if copy: - output = [dpt.copy(x) for x in output] + output = [dpt_ext.copy(x) for x in output] return [dpnp_array._create_from_usm_ndarray(x) for x in output] diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index a01a036e16cc..a77877d8d685 100644 --- a/dpnp/dpnp_iface_indexing.py +++ 
b/dpnp/dpnp_iface_indexing.py @@ -51,11 +51,10 @@ from dpctl.tensor._indexing_functions import _get_indexing_mode from dpctl.tensor._numpy_helper import normalize_axis_index -import dpctl_ext.tensor as dpt_ext - # pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti import dpnp @@ -243,7 +242,7 @@ def choose(a, choices, out=None, mode="wrap"): # NumPy will cast up to int64 in general but # int32 is more than safe for bool if ind_dt == dpnp.bool: - inds = dpt.astype(inds, dpt.int32) + inds = dpt_ext.astype(inds, dpt.int32) else: raise TypeError("input index array must be of integer data type") @@ -256,7 +255,7 @@ def choose(a, choices, out=None, mode="wrap"): choices = tuple( map( lambda chc: ( - chc if chc.dtype == res_dt else dpt.astype(chc, res_dt) + chc if chc.dtype == res_dt else dpt_ext.astype(chc, res_dt) ), choices, ) @@ -1616,7 +1615,9 @@ def place(a, mask, vals): if usm_vals.dtype != usm_a.dtype: # dpt.place casts values to a.dtype with "unsafe" rule, # while numpy.place does that with "safe" casting rule - usm_vals = dpt.astype(usm_vals, usm_a.dtype, casting="safe", copy=False) + usm_vals = dpt_ext.astype( + usm_vals, usm_a.dtype, casting="safe", copy=False + ) dpt.place(usm_a, usm_mask, usm_vals) @@ -1712,7 +1713,7 @@ def put(a, ind, v, /, *, axis=None, mode="wrap"): if not dpnp.issubdtype(usm_ind.dtype, dpnp.integer): # dpt.put supports only integer dtype for array of indices - usm_ind = dpt.astype(usm_ind, dpnp.intp, casting="safe") + usm_ind = dpt_ext.astype(usm_ind, dpnp.intp, casting="safe") in_usm_a = usm_a if axis is None and usm_a.ndim > 1: @@ -2171,7 +2172,7 @@ def take(a, indices, /, *, axis=None, out=None, mode="wrap"): if not dpnp.issubdtype(usm_ind.dtype, dpnp.integer): # dpt.take supports only integer dtype for array of indices - usm_ind = dpt.astype(usm_ind, dpnp.intp, copy=False, casting="safe") + 
usm_ind = dpt_ext.astype(usm_ind, dpnp.intp, copy=False, casting="safe") usm_res = _take_index( usm_a, usm_ind, axis, exec_q, res_usm_type, out=out, mode=mode diff --git a/dpnp/dpnp_iface_statistics.py b/dpnp/dpnp_iface_statistics.py index 7e092184366c..daff981d5cc4 100644 --- a/dpnp/dpnp_iface_statistics.py +++ b/dpnp/dpnp_iface_statistics.py @@ -47,6 +47,9 @@ import numpy from dpctl.tensor._numpy_helper import normalize_axis_index +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp # pylint: disable=no-name-in-module @@ -1204,7 +1207,7 @@ def mean(a, /, axis=None, dtype=None, out=None, keepdims=False, *, where=True): usm_a = dpnp.get_usm_ndarray(a) usm_res = dpt.mean(usm_a, axis=axis, keepdims=keepdims) if dtype is not None: - usm_res = dpt.astype(usm_res, dtype) + usm_res = dpt_ext.astype(usm_res, dtype) return dpnp.get_result_array(usm_res, out, casting="unsafe") From 09761710153cee6060ffe27461af6444d3e4699b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 06:00:04 -0800 Subject: [PATCH 041/145] Move _copy_usm_ndarray_for_reshape --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../libtensor/source/copy_for_reshape.cpp | 184 ++++++++++++++++++ .../libtensor/source/copy_for_reshape.hpp | 54 +++++ .../tensor/libtensor/source/tensor_ctors.cpp | 17 +- 4 files changed, 248 insertions(+), 9 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/copy_for_reshape.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/copy_for_reshape.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 4e3fa580f99e..b30a1ac8b029 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -49,7 +49,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_reshape.cpp b/dpctl_ext/tensor/libtensor/source/copy_for_reshape.cpp new file mode 100644 index 000000000000..524bfcfdb98b --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_for_reshape.cpp @@ -0,0 +1,184 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "copy_for_reshape.hpp" +#include "kernels/copy_and_cast.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_and_cast::copy_for_reshape_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +// define static vector +static copy_for_reshape_fn_ptr_t + copy_for_reshape_generic_dispatch_vector[td_ns::num_types]; + +/* + * Copies src into dst (same data type) of different shapes by using flat + * iterations. 
+ * + * Equivalent to the following loop: + * + * for i for range(src.size): + * dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)] + */ +std::pair + copy_usm_ndarray_for_reshape(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + py::ssize_t src_nelems = src.get_size(); + py::ssize_t dst_nelems = dst.get_size(); + + // Must have the same number of elements + if (src_nelems != dst_nelems) { + throw py::value_error( + "copy_usm_ndarray_for_reshape requires src and dst to " + "have the same number of elements."); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + // typenames must be the same + if (src_typenum != dst_typenum) { + throw py::value_error( + "copy_usm_ndarray_for_reshape requires src and dst to " + "have the same type."); + } + + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check same contexts + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if (src_nelems == 1) { + // handle special case of 1-element array + int src_elemsize = src.get_elemsize(); + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + sycl::event copy_ev = + exec_q.copy(src_data, dst_data, src_elemsize, depends); + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + // dimensions may be different + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + auto array_types = td_ns::usm_ndarray_types(); + int type_id = array_types.typenum_to_lookup_id(src_typenum); + + auto fn = copy_for_reshape_generic_dispatch_vector[type_id]; + + auto src_shape = src.get_shape_vector(); + 
auto src_strides = src.get_strides_vector(); + + auto dst_shape = dst.get_shape_vector(); + auto dst_strides = dst.get_strides_vector(); + + std::vector host_task_events; + host_task_events.reserve(2); + + // shape_strides = [src_shape, src_strides, dst_shape, dst_strides] + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, src_shape, src_strides, dst_shape, + dst_strides); + auto copy_shape_ev = std::get<2>(ptr_size_event_tuple); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + std::vector all_deps(depends.size() + 1); + all_deps.push_back(copy_shape_ev); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + sycl::event copy_for_reshape_event = + fn(exec_q, src_nelems, src_nd, dst_nd, shape_strides, src_data, + dst_data, all_deps); + + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_for_reshape_event}, shape_strides_owner); + + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_for_reshape_event); +} + +void init_copy_for_reshape_dispatch_vectors(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::copy_and_cast::CopyForReshapeGenericFactory; + + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(copy_for_reshape_generic_dispatch_vector); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_reshape.hpp b/dpctl_ext/tensor/libtensor/source/copy_for_reshape.hpp new file mode 100644 index 000000000000..c5af885ad6cd --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_for_reshape.hpp @@ -0,0 +1,54 @@ 
+//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + copy_usm_ndarray_for_reshape(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_for_reshape_dispatch_vectors(); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 07c8fd1bf99f..4903f22bd481 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -51,7 +51,7 @@ // #include "clip.hpp" #include "copy_and_cast_usm_to_usm.hpp" #include "copy_as_contig.hpp" -// #include "copy_for_reshape.hpp" +#include "copy_for_reshape.hpp" // #include "copy_for_roll.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" @@ -87,7 +87,7 @@ using dpctl::tensor::py_internal::py_as_f_contig; /* =========================== Copy for reshape ============================= */ -// using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape; +using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape; /* =========================== Copy for roll ============================= */ @@ -279,12 +279,13 @@ PYBIND11_MODULE(_tensor_impl, m) }, ""); - // m.def("_copy_usm_ndarray_for_reshape", ©_usm_ndarray_for_reshape, - // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same - // " "number of elements using underlying 'C'-contiguous order for - // flat " "traversal. 
" "Returns a tuple of events: (ht_event, - // comp_event)", py::arg("src"), py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_copy_usm_ndarray_for_reshape", ©_usm_ndarray_for_reshape, + "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same " + "number of elements using underlying 'C'-contiguous order for flat " + "traversal. " + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); // m.def("_copy_usm_ndarray_for_roll_1d", ©_usm_ndarray_for_roll_1d, // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same From 318692ef0f33820ddf5850ddf5a9a0a030b139a9 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 06:26:33 -0800 Subject: [PATCH 042/145] Move reshape() to dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_reshape.py | 206 +++++++++++++++++++++++++++++++++++ 2 files changed, 208 insertions(+) create mode 100644 dpctl_ext/tensor/_reshape.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index a02cf85ed591..416216074f8a 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -43,6 +43,7 @@ put, take, ) +from dpctl_ext.tensor._reshape import reshape __all__ = [ "asnumpy", @@ -51,6 +52,7 @@ "from_numpy", "full", "put", + "reshape", "take", "to_numpy", "tril", diff --git a/dpctl_ext/tensor/_reshape.py b/dpctl_ext/tensor/_reshape.py new file mode 100644 index 000000000000..6afa1dc245c3 --- /dev/null +++ b/dpctl_ext/tensor/_reshape.py @@ -0,0 +1,206 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import operator + +import dpctl.tensor as dpt +import dpctl.utils +import numpy as np +from dpctl.tensor._tensor_impl import ( + _copy_usm_ndarray_for_reshape, + _ravel_multi_index, + _unravel_index, +) + +__doc__ = "Implementation module for :func:`dpctl.tensor.reshape`." 
+ + +def _make_unit_indexes(shape): + """ + Construct a diagonal matrix with with one on the diagonal + except if the corresponding element of shape is 1. + """ + nd = len(shape) + mi = np.zeros((nd, nd), dtype="u4") + for i, dim in enumerate(shape): + mi[i, i] = 1 if dim > 1 else 0 + return mi + + +def ti_unravel_index(flat_index, shape, order="C"): + return _unravel_index(flat_index, shape, order) + + +def ti_ravel_multi_index(multi_index, shape, order="C"): + return _ravel_multi_index(multi_index, shape, order) + + +def reshaped_strides(old_sh, old_sts, new_sh, order="C"): + """ + When reshaping array with `old_sh` shape and `old_sts` strides + into the new shape `new_sh`, returns the new stride if the reshape + can be a view, otherwise returns `None`. + """ + eye_new_mi = _make_unit_indexes(new_sh) + new_sts = [ + sum( + st_i * ind_i + for st_i, ind_i in zip( + old_sts, ti_unravel_index(flat_index, old_sh, order=order) + ) + ) + for flat_index in [ + ti_ravel_multi_index(unitvec, new_sh, order=order) + for unitvec in eye_new_mi + ] + ] + eye_old_mi = _make_unit_indexes(old_sh) + check_sts = [ + sum( + st_i * ind_i + for st_i, ind_i in zip( + new_sts, ti_unravel_index(flat_index, new_sh, order=order) + ) + ) + for flat_index in [ + ti_ravel_multi_index(unitvec, old_sh, order=order) + for unitvec in eye_old_mi + ] + ] + valid = all( + check_st == old_st or old_dim == 1 + for check_st, old_st, old_dim in zip(check_sts, old_sts, old_sh) + ) + return new_sts if valid else None + + +def reshape(X, /, shape, *, order="C", copy=None): + """reshape(x, shape, order="C") + + Reshapes array ``x`` into new shape. + + Args: + x (usm_ndarray): + input array + shape (Tuple[int]): + the desired shape of the resulting array. + order ("C", "F", optional): + memory layout of the resulting array + if a copy is found to be necessary. Supported + choices are ``"C"`` for C-contiguous, or row-major layout; + and ``"F"`` for F-contiguous, or column-major layout. 
+ + Returns: + usm_ndarray: + Reshaped array is a view, if possible, + and a copy otherwise with memory layout as indicated + by ``order`` keyword. + """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError + if not isinstance(shape, (list, tuple)): + shape = (shape,) + if order in "cfCF": + order = order.upper() + else: + raise ValueError( + f"Keyword 'order' not recognized. Expecting 'C' or 'F', got {order}" + ) + if copy not in (True, False, None): + raise ValueError( + f"Keyword 'copy' not recognized. Expecting True, False, " + f"or None, got {copy}" + ) + shape = [operator.index(d) for d in shape] + negative_ones_count = 0 + for nshi in shape: + if nshi == -1: + negative_ones_count = negative_ones_count + 1 + if (nshi < -1) or negative_ones_count > 1: + raise ValueError( + "Target shape should have at most 1 negative " + "value which can only be -1" + ) + if negative_ones_count: + sz = -np.prod(shape) + if sz == 0: + raise ValueError( + f"Can not reshape array of size {X.size} into " + f"shape {tuple(i for i in shape if i >= 0)}" + ) + v = X.size // sz + shape = [v if d == -1 else d for d in shape] + if X.size != np.prod(shape): + raise ValueError(f"Can not reshape into {shape}") + if X.size: + newsts = reshaped_strides(X.shape, X.strides, shape, order=order) + else: + newsts = (1,) * len(shape) + copy_required = newsts is None + if copy_required and (copy is False): + raise ValueError( + "Reshaping the array requires a copy, but no copying was " + "requested by using copy=False" + ) + copy_q = X.sycl_queue + if copy_required or (copy is True): + # must perform a copy + copy_q = X.sycl_queue + flat_res = dpt.usm_ndarray( + (X.size,), + dtype=X.dtype, + buffer=X.usm_type, + buffer_ctor_kwargs={"queue": copy_q}, + ) + _manager = dpctl.utils.SequentialOrderManager[copy_q] + dep_evs = _manager.submitted_events + if order == "C": + hev, r_e = _copy_usm_ndarray_for_reshape( + src=X, dst=flat_res, sycl_queue=copy_q, depends=dep_evs + ) + else: + X_t = 
dpt.permute_dims(X, range(X.ndim - 1, -1, -1)) + hev, r_e = _copy_usm_ndarray_for_reshape( + src=X_t, dst=flat_res, sycl_queue=copy_q, depends=dep_evs + ) + _manager.add_event_pair(hev, r_e) + return dpt.usm_ndarray( + tuple(shape), dtype=X.dtype, buffer=flat_res, order=order + ) + # can form a view + if (len(shape) == X.ndim) and all( + s1 == s2 for s1, s2 in zip(shape, X.shape) + ): + return X + return dpt.usm_ndarray( + shape, + dtype=X.dtype, + buffer=X, + strides=tuple(newsts), + offset=X._element_offset, + ) From 3c0c1132df85fc9ae76e6c953392ba674e0dd4f3 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 06:27:16 -0800 Subject: [PATCH 043/145] Reuse reshape from dpctl_ext in dpnp --- dpnp/dpnp_algo/dpnp_arraycreation.py | 4 +++- dpnp/dpnp_iface_arraycreation.py | 8 ++++---- dpnp/dpnp_iface_indexing.py | 24 ++++++++++++------------ dpnp/dpnp_iface_manipulation.py | 9 ++++++--- dpnp/dpnp_iface_sorting.py | 5 ++++- dpnp/tests/test_arraycreation.py | 5 ++++- 6 files changed, 33 insertions(+), 22 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py index 27f095158a85..47edf63a68b4 100644 --- a/dpnp/dpnp_algo/dpnp_arraycreation.py +++ b/dpnp/dpnp_algo/dpnp_arraycreation.py @@ -233,7 +233,9 @@ def dpnp_linspace( usm_type=_usm_type, sycl_queue=sycl_queue_normalized, ) - usm_res = dpt.reshape(usm_res, (-1,) + (1,) * delta.ndim, copy=False) + usm_res = dpt_ext.reshape( + usm_res, (-1,) + (1,) * delta.ndim, copy=False + ) if step_num > 0: step = delta / step_num diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py index 404d9e0fe37d..c2dd5793f827 100644 --- a/dpnp/dpnp_iface_arraycreation.py +++ b/dpnp/dpnp_iface_arraycreation.py @@ -3117,7 +3117,7 @@ def meshgrid(*xi, copy=True, sparse=False, indexing="xy"): s0 = (1,) * ndim output = [ - dpt.reshape(dpnp.get_usm_ndarray(x), s0[:i] + (-1,) + s0[i + 1 :]) + dpt_ext.reshape(dpnp.get_usm_ndarray(x), s0[:i] + (-1,) + s0[i 
+ 1 :]) for i, x in enumerate(xi) ] @@ -3125,8 +3125,8 @@ def meshgrid(*xi, copy=True, sparse=False, indexing="xy"): _, _ = get_usm_allocations(output) if indexing == "xy" and ndim > 1: - output[0] = dpt.reshape(output[0], (1, -1) + s0[2:]) - output[1] = dpt.reshape(output[1], (-1, 1) + s0[2:]) + output[0] = dpt_ext.reshape(output[0], (1, -1) + s0[2:]) + output[1] = dpt_ext.reshape(output[1], (-1, 1) + s0[2:]) if not sparse: output = dpt.broadcast_arrays(*output) @@ -3932,7 +3932,7 @@ def vander( tmp = m[:, ::-1] if not increasing else m dpnp.power( - dpt.reshape(usm_x, (-1, 1)), + dpt_ext.reshape(usm_x, (-1, 1)), dpt.arange( N, dtype=_dtype, usm_type=x_usm_type, sycl_queue=x_sycl_queue ), diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index a77877d8d685..439ec288ebe2 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -814,14 +814,14 @@ def extract(condition, a): ) if usm_cond.size != usm_a.size: - usm_a = dpt.reshape(usm_a, -1) - usm_cond = dpt.reshape(usm_cond, -1) + usm_a = dpt_ext.reshape(usm_a, -1) + usm_cond = dpt_ext.reshape(usm_cond, -1) usm_res = dpt_ext.take(usm_a, dpt.nonzero(usm_cond)[0]) else: if usm_cond.shape != usm_a.shape: - usm_a = dpt.reshape(usm_a, -1) - usm_cond = dpt.reshape(usm_cond, -1) + usm_a = dpt_ext.reshape(usm_a, -1) + usm_cond = dpt_ext.reshape(usm_cond, -1) usm_res = dpt.extract(usm_cond, usm_a) @@ -958,18 +958,18 @@ def fill_diagonal(a, val, wrap=False): # a.flat[:end:step] = val # but need to consider use case when `a` is usm_ndarray also a_sh = a.shape - tmp_a = dpt.reshape(usm_a, -1) + tmp_a = dpt_ext.reshape(usm_a, -1) if dpnp.isscalar(usm_val): tmp_a[:end:step] = usm_val else: - usm_val = dpt.reshape(usm_val, -1) + usm_val = dpt_ext.reshape(usm_val, -1) # Setitem can work only if index size equal val size. # Using loop for general case without dependencies of val size. 
for i in range(0, usm_val.size): tmp_a[step * i : end : step * (i + 1)] = usm_val[i] - tmp_a = dpt.reshape(tmp_a, a_sh) + tmp_a = dpt_ext.reshape(tmp_a, a_sh) usm_a[:] = tmp_a @@ -1610,7 +1610,7 @@ def place(a, mask, vals): if usm_vals.ndim != 1: # dpt.place supports only 1-D array of values - usm_vals = dpt.reshape(usm_vals, -1) + usm_vals = dpt_ext.reshape(usm_vals, -1) if usm_vals.dtype != usm_a.dtype: # dpt.place casts values to a.dtype with "unsafe" rule, @@ -1709,7 +1709,7 @@ def put(a, ind, v, /, *, axis=None, mode="wrap"): if usm_ind.ndim != 1: # dpt.put supports only 1-D array of indices - usm_ind = dpt.reshape(usm_ind, -1, copy=False) + usm_ind = dpt_ext.reshape(usm_ind, -1, copy=False) if not dpnp.issubdtype(usm_ind.dtype, dpnp.integer): # dpt.put supports only integer dtype for array of indices @@ -1717,11 +1717,11 @@ def put(a, ind, v, /, *, axis=None, mode="wrap"): in_usm_a = usm_a if axis is None and usm_a.ndim > 1: - usm_a = dpt.reshape(usm_a, -1) + usm_a = dpt_ext.reshape(usm_a, -1) dpt_ext.put(usm_a, usm_ind, usm_v, axis=axis, mode=mode) if in_usm_a._pointer != usm_a._pointer: # pylint: disable=protected-access - in_usm_a[:] = dpt.reshape(usm_a, in_usm_a.shape, copy=False) + in_usm_a[:] = dpt_ext.reshape(usm_a, in_usm_a.shape, copy=False) def put_along_axis(a, ind, values, axis, mode="wrap"): @@ -2163,7 +2163,7 @@ def take(a, indices, /, *, axis=None, out=None, mode="wrap"): if axis is None: if a_ndim > 1: # flatten input array - usm_a = dpt.reshape(usm_a, -1) + usm_a = dpt_ext.reshape(usm_a, -1) axis = 0 elif a_ndim == 0: axis = normalize_axis_index(operator.index(axis), 1) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 9df5278bd16b..f992cc366e20 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -53,6 +53,9 @@ normalize_axis_tuple, ) +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from 
.dpnp_array import dpnp_array @@ -415,7 +418,7 @@ def _get_first_nan_index(usm_a): dpt.place( usm_res.inverse_indices, usm_res.inverse_indices > first_nan, - dpt.reshape(first_nan, 1), + dpt_ext.reshape(first_nan, 1), ) result += (usm_res.inverse_indices,) @@ -3057,7 +3060,7 @@ def reshape(a, /, shape, order="C", *, copy=None): ) usm_a = dpnp.get_usm_ndarray(a) - usm_res = dpt.reshape(usm_a, shape=shape, order=order, copy=copy) + usm_res = dpt_ext.reshape(usm_a, shape=shape, order=order, copy=copy) return dpnp_array._create_from_usm_ndarray(usm_res) @@ -3259,7 +3262,7 @@ def roll(x, shift, axis=None): shift = dpnp.asnumpy(shift) if axis is None: - return roll(dpt.reshape(usm_x, -1), shift, 0).reshape(x.shape) + return roll(dpt_ext.reshape(usm_x, -1), shift, 0).reshape(x.shape) usm_res = dpt.roll(usm_x, shift=shift, axis=axis) return dpnp_array._create_from_usm_ndarray(usm_res) diff --git a/dpnp/dpnp_iface_sorting.py b/dpnp/dpnp_iface_sorting.py index db33a88c7488..3e5cdd4da6a6 100644 --- a/dpnp/dpnp_iface_sorting.py +++ b/dpnp/dpnp_iface_sorting.py @@ -43,6 +43,9 @@ import numpy from dpctl.tensor._numpy_helper import normalize_axis_index +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp # pylint: disable=no-name-in-module @@ -84,7 +87,7 @@ def _wrap_sort_argsort( usm_a = dpnp.get_usm_ndarray(a) if axis is None: - usm_a = dpt.reshape(usm_a, -1) + usm_a = dpt_ext.reshape(usm_a, -1) axis = -1 axis = normalize_axis_index(axis, ndim=usm_a.ndim) diff --git a/dpnp/tests/test_arraycreation.py b/dpnp/tests/test_arraycreation.py index eb20f9b3ffe5..423004470bad 100644 --- a/dpnp/tests/test_arraycreation.py +++ b/dpnp/tests/test_arraycreation.py @@ -13,6 +13,9 @@ assert_raises_regex, ) +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from .helper import ( @@ -969,7 +972,7 @@ def test_ones_like(array, dtype, 
order): ], ) def test_dpctl_tensor_input(func, args): - x0 = dpt.reshape(dpt.arange(9), (3, 3)) + x0 = dpt_ext.reshape(dpt.arange(9), (3, 3)) new_args = [eval(val, {"x0": x0}) for val in args] X = getattr(dpt, func)(*new_args) Y = getattr(dpnp, func)(*new_args) From 30f2c53ee0d6dcddfbc8d0933a667705b048e8b1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 06:44:48 -0800 Subject: [PATCH 044/145] Move _copy_usm_ndarray_for_roll --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../tensor/libtensor/source/copy_for_roll.cpp | 400 ++++++++++++++++++ .../tensor/libtensor/source/copy_for_roll.hpp | 65 +++ .../tensor/libtensor/source/tensor_ctors.cpp | 38 +- 4 files changed, 485 insertions(+), 20 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/copy_for_roll.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index b30a1ac8b029..a39368c7baa3 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -50,7 +50,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp b/dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp new file mode 100644 index 000000000000..a187b2247677 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp @@ -0,0 +1,400 @@ 
+//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "copy_for_roll.hpp" +#include "kernels/copy_and_cast.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "simplify_iteration_space.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_contig_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast:: + copy_for_roll_ndshift_strided_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_strided_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +// define static vector +static copy_for_roll_strided_fn_ptr_t + copy_for_roll_strided_dispatch_vector[td_ns::num_types]; + +static copy_for_roll_contig_fn_ptr_t + copy_for_roll_contig_dispatch_vector[td_ns::num_types]; + +static copy_for_roll_ndshift_strided_fn_ptr_t + copy_for_roll_ndshift_dispatch_vector[td_ns::num_types]; + +/* + * Copies src into dst (same data type) of different shapes by using flat + * iterations. 
+ * + * Equivalent to the following loop: + * + * for i for range(src.size): + * dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)] + */ +std::pair + copy_usm_ndarray_for_roll_1d(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + py::ssize_t shift, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + // Must have the same number of dimensions + if (src_nd != dst_nd) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same number of dimensions."); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + if (!std::equal(src_shape_ptr, src_shape_ptr + src_nd, dst_shape_ptr)) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same shape."); + } + + py::ssize_t src_nelems = src.get_size(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + // typenames must be the same + if (src_typenum != dst_typenum) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same type."); + } + + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check same contexts + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if (src_nelems == 1) { + // handle special case of 1-element array + int src_elemsize = src.get_elemsize(); + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + sycl::event copy_ev = + exec_q.copy(src_data, dst_data, src_elemsize, depends); + return std::make_pair(keep_args_alive(exec_q, {src, dst}, 
{copy_ev}), + copy_ev); + } + + auto array_types = td_ns::usm_ndarray_types(); + int type_id = array_types.typenum_to_lookup_id(src_typenum); + + const bool is_src_c_contig = src.is_c_contiguous(); + const bool is_src_f_contig = src.is_f_contiguous(); + + const bool is_dst_c_contig = dst.is_c_contiguous(); + const bool is_dst_f_contig = dst.is_f_contiguous(); + + const bool both_c_contig = is_src_c_contig && is_dst_c_contig; + const bool both_f_contig = is_src_f_contig && is_dst_f_contig; + + // normalize shift parameter to be 0 <= offset < src_nelems + std::size_t offset = + (shift > 0) ? (shift % src_nelems) : src_nelems + (shift % src_nelems); + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + if (both_c_contig || both_f_contig) { + auto fn = copy_for_roll_contig_dispatch_vector[type_id]; + + if (fn != nullptr) { + static constexpr py::ssize_t zero_offset = 0; + + sycl::event copy_for_roll_ev = + fn(exec_q, offset, src_nelems, src_data, zero_offset, dst_data, + zero_offset, depends); + + sycl::event ht_ev = + keep_args_alive(exec_q, {src, dst}, {copy_for_roll_ev}); + + return std::make_pair(ht_ev, copy_for_roll_ev); + } + } + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + const py::ssize_t *shape = src_shape_ptr; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, shape, src_strides, dst_strides, + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (nd == 1 && simplified_src_strides[0] == 1 && + simplified_dst_strides[0] == 1) { + auto fn = copy_for_roll_contig_dispatch_vector[type_id]; + + if (fn != nullptr) { + + sycl::event copy_for_roll_ev = + 
fn(exec_q, offset, src_nelems, src_data, src_offset, dst_data, + dst_offset, depends); + + sycl::event ht_ev = + keep_args_alive(exec_q, {src, dst}, {copy_for_roll_ev}); + + return std::make_pair(ht_ev, copy_for_roll_ev); + } + } + + auto fn = copy_for_roll_strided_dispatch_vector[type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + // shape_strides = [src_shape, src_strides, dst_strides] + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides, + simplified_dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + std::vector all_deps(depends.size() + 1); + all_deps.push_back(copy_shape_ev); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + sycl::event copy_for_roll_event = + fn(exec_q, offset, src_nelems, src_nd, shape_strides, src_data, + src_offset, dst_data, dst_offset, all_deps); + + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_for_roll_event}, shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_for_roll_event); +} + +std::pair + copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const std::vector &shifts, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + // Must have the same number of dimensions + if (src_nd != dst_nd) { + throw py::value_error( + "copy_usm_ndarray_for_roll_nd requires src and dst to " + "have the same number of dimensions."); + } + + if (static_cast(src_nd) != shifts.size()) { + throw 
py::value_error( + "copy_usm_ndarray_for_roll_nd requires shifts to " + "contain an integral shift for each array dimension."); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + if (!std::equal(src_shape_ptr, src_shape_ptr + src_nd, dst_shape_ptr)) { + throw py::value_error( + "copy_usm_ndarray_for_roll_nd requires src and dst to " + "have the same shape."); + } + + py::ssize_t src_nelems = src.get_size(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + // typenames must be the same + if (src_typenum != dst_typenum) { + throw py::value_error( + "copy_usm_ndarray_for_roll_nd requires src and dst to " + "have the same type."); + } + + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check for compatible queues + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (src_nelems == 1) { + // handle special case of 1-element array + int src_elemsize = src.get_elemsize(); + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + sycl::event copy_ev = + exec_q.copy(src_data, dst_data, src_elemsize, depends); + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + auto array_types = td_ns::usm_ndarray_types(); + int type_id = array_types.typenum_to_lookup_id(src_typenum); + + std::vector normalized_shifts{}; + normalized_shifts.reserve(src_nd); + + for (int i = 0; i < src_nd; ++i) { + // normalize shift parameter to be 0 <= offset < dim + py::ssize_t dim = src_shape_ptr[i]; + std::size_t offset = + (shifts[i] >= 0) ? 
(shifts[i] % dim) : dim + (shifts[i] % dim); + + normalized_shifts.push_back(offset); + } + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + auto const &common_shape = src.get_shape_vector(); + + static constexpr py::ssize_t src_offset = 0; + static constexpr py::ssize_t dst_offset = 0; + + auto fn = copy_for_roll_ndshift_dispatch_vector[type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + // shape_strides = [src_shape, src_strides, dst_strides] + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, common_shape, src_strides, dst_strides, + normalized_shifts); + auto shape_strides_shifts_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides_shifts = shape_strides_shifts_owner.get(); + + std::vector all_deps(depends.size() + 1); + all_deps.push_back(copy_shape_ev); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + sycl::event copy_for_roll_event = + fn(exec_q, src_nelems, src_nd, shape_strides_shifts, src_data, + src_offset, dst_data, dst_offset, all_deps); + + auto temporaries_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_for_roll_event}, shape_strides_shifts_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_for_roll_event); +} + +void init_copy_for_roll_dispatch_vectors(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::copy_and_cast::CopyForRollStridedFactory; + + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(copy_for_roll_strided_dispatch_vector); + + using dpctl::tensor::kernels::copy_and_cast::CopyForRollContigFactory; 
+ DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(copy_for_roll_contig_dispatch_vector); + + using dpctl::tensor::kernels::copy_and_cast::CopyForRollNDShiftFactory; + DispatchVectorBuilder + dvb3; + dvb3.populate_dispatch_vector(copy_for_roll_ndshift_dispatch_vector); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_roll.hpp b/dpctl_ext/tensor/libtensor/source/copy_for_roll.hpp new file mode 100644 index 000000000000..cffbf9f6f0d6 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_for_roll.hpp @@ -0,0 +1,65 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + copy_usm_ndarray_for_roll_1d(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + py::ssize_t shift, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern std::pair + copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const std::vector &shifts, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_for_roll_dispatch_vectors(); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 4903f22bd481..c1372c1c2406 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -52,7 +52,7 @@ #include "copy_and_cast_usm_to_usm.hpp" #include "copy_as_contig.hpp" #include "copy_for_reshape.hpp" -// #include "copy_for_roll.hpp" 
+#include "copy_for_roll.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" // #include "eye_ctor.hpp" @@ -91,8 +91,8 @@ using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape; /* =========================== Copy for roll ============================= */ -// using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_1d; -// using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_nd; +using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_1d; +using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_nd; /* ============= Copy from numpy.ndarray to usm_ndarray ==================== */ @@ -158,8 +158,8 @@ void init_dispatch_vectors(void) using namespace dpctl::tensor::py_internal; init_copy_as_contig_dispatch_vectors(); - // init_copy_for_reshape_dispatch_vectors(); - // init_copy_for_roll_dispatch_vectors(); + init_copy_for_reshape_dispatch_vectors(); + init_copy_for_roll_dispatch_vectors(); // init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); init_zeros_ctor_dispatch_vectors(); @@ -287,21 +287,21 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_copy_usm_ndarray_for_roll_1d", ©_usm_ndarray_for_roll_1d, - // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same - // " "shapes using underlying 'C'-contiguous order for flat " - // "traversal with shift. " - // "Returns a tuple of events: (ht_event, comp_event)", - // py::arg("src"), py::arg("dst"), py::arg("shift"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_copy_usm_ndarray_for_roll_1d", ©_usm_ndarray_for_roll_1d, + "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same " + "shapes using underlying 'C'-contiguous order for flat " + "traversal with shift. 
" + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("src"), py::arg("dst"), py::arg("shift"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_copy_usm_ndarray_for_roll_nd", ©_usm_ndarray_for_roll_nd, - // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same - // " "shapes using underlying 'C'-contiguous order for " "traversal - // with shifts along each axis. " "Returns a tuple of events: - // (ht_event, comp_event)", py::arg("src"), py::arg("dst"), - // py::arg("shifts"), py::arg("sycl_queue"), py::arg("depends") = - // py::list()); + m.def("_copy_usm_ndarray_for_roll_nd", ©_usm_ndarray_for_roll_nd, + "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same " + "shapes using underlying 'C'-contiguous order for " + "traversal with shifts along each axis. " + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("src"), py::arg("dst"), py::arg("shifts"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); // m.def("_linspace_step", &usm_ndarray_linear_sequence_step, // "Fills input 1D contiguous usm_ndarray `dst` with linear From 85c29daa5b6855db9d3b023e29777950aebaba18 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 06:57:28 -0800 Subject: [PATCH 045/145] Move roll() to dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 4 + dpctl_ext/tensor/_manipulation_functions.py | 120 ++++++++++++++++++++ 2 files changed, 124 insertions(+) create mode 100644 dpctl_ext/tensor/_manipulation_functions.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 416216074f8a..edb2c096bad1 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -43,6 +43,9 @@ put, take, ) +from dpctl_ext.tensor._manipulation_functions import ( + roll, +) from dpctl_ext.tensor._reshape import reshape __all__ = [ @@ -53,6 +56,7 @@ "full", "put", "reshape", + "roll", "take", "to_numpy", "tril", diff --git 
a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py new file mode 100644 index 000000000000..fa8fc27876b3 --- /dev/null +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -0,0 +1,120 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import operator + +import dpctl.tensor as dpt +import dpctl.utils as dputils +import numpy as np + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._tensor_impl as ti + +from ._numpy_helper import normalize_axis_tuple + +__doc__ = ( + "Implementation module for array manipulation " + "functions in :module:`dpctl.tensor`" +) + + +def roll(x, /, shift, *, axis=None): + """ + roll(x, shift, axis) + + Rolls array elements along a specified axis. + Array elements that roll beyond the last position are re-introduced + at the first position. Array elements that roll beyond the first position + are re-introduced at the last position. + + Args: + x (usm_ndarray): input array + shift (Union[int, Tuple[int,...]]): number of places by which the + elements are shifted. If `shift` is a tuple, then `axis` must be a + tuple of the same size, and each of the given axes must be shifted + by the corresponding element in `shift`. If `shift` is an `int` + and `axis` a tuple, then the same `shift` must be used for all + specified axes. If a `shift` is positive, then array elements is + shifted positively (toward larger indices) along the dimension of + `axis`. + If a `shift` is negative, then array elements must be shifted + negatively (toward smaller indices) along the dimension of `axis`. + axis (Optional[Union[int, Tuple[int,...]]]): axis (or axes) along which + elements to shift. If `axis` is `None`, the array is + flattened, shifted, and then restored to its original shape. + Default: `None`. + + Returns: + usm_ndarray: + An array having the same `dtype`, `usm_type` and + `device` attributes as `x` and whose elements are shifted relative + to `x`. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(x)}.") + exec_q = x.sycl_queue + _manager = dputils.SequentialOrderManager[exec_q] + if axis is None: + shift = operator.index(shift) + res = dpt.empty( + x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + sz = operator.index(x.size) + shift = (shift % sz) if sz > 0 else 0 + dep_evs = _manager.submitted_events + hev, roll_ev = ti._copy_usm_ndarray_for_roll_1d( + src=x, + dst=res, + shift=shift, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(hev, roll_ev) + return res + axis = normalize_axis_tuple(axis, x.ndim, allow_duplicate=True) + broadcasted = np.broadcast(shift, axis) + if broadcasted.ndim > 1: + raise ValueError("'shift' and 'axis' should be scalars or 1D sequences") + shifts = [ + 0, + ] * x.ndim + shape = x.shape + for sh, ax in broadcasted: + n_i = operator.index(shape[ax]) + shifted = shifts[ax] + operator.index(sh) + shifts[ax] = (shifted % n_i) if n_i > 0 else 0 + res = dpt.empty( + x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + dep_evs = _manager.submitted_events + ht_e, roll_ev = ti._copy_usm_ndarray_for_roll_nd( + src=x, dst=res, shifts=shifts, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_e, roll_ev) + return res From 6e8d857a5e85894a11c16a728ec437fac629a196 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 06:58:11 -0800 Subject: [PATCH 046/145] Update dpnp.roll to use dpctl_ext --- dpnp/dpnp_iface_manipulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index f992cc366e20..edd98348afe3 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -3264,7 +3264,7 @@ def roll(x, shift, axis=None): if axis is None: return roll(dpt_ext.reshape(usm_x, -1), shift, 0).reshape(x.shape) - usm_res = dpt.roll(usm_x, shift=shift, 
axis=axis) + usm_res = dpt_ext.roll(usm_x, shift=shift, axis=axis) return dpnp_array._create_from_usm_ndarray(usm_res) From 19e93b99c7c2c238f1b697dfefe5b70525370819 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 18 Feb 2026 04:34:09 -0800 Subject: [PATCH 047/145] Update .gitignore to ignore .so files in dpctl_ext --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 5d2725d3186f..0cfebe53f623 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,7 @@ dpnp/**/*.cpython*.so dpnp/**/*.pyd *~ core + +# TODO: revert to `dpctl/` +# when dpnp fully migrates dpctl/tensor +dpctl_ext/**/*.cpython*.so From b111e49b784168180c835569d5dbe97958521f16 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 18 Feb 2026 04:35:23 -0800 Subject: [PATCH 048/145] Remove unused includes in tensor_ctors.cpp --- dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index be69ee1a8c7e..54d6adbc8f6e 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -32,18 +32,15 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===----------------------------------------------------------------------===// -// #include -// #include -// #include -#include -#include -#include -// #include #include #include #include #include +#include +#include +#include + #include "dpnp4pybind11.hpp" // #include "accumulators.hpp" From c082224e07df5e4d4960112ef5ec4e5faef2a452 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 18 Feb 2026 05:40:59 -0800 Subject: [PATCH 049/145] Use Python::Module for dpctl_ext static lib to avoid libpython dependency --- dpctl_ext/tensor/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 28e7a4cb55f4..ed69b4f10cba 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -27,7 +27,7 @@ # THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** -find_package(Python COMPONENTS Development) +find_package(Python COMPONENTS Development.Module) if(WIN32) if(${CMAKE_VERSION} VERSION_LESS "3.27") @@ -74,7 +74,7 @@ target_include_directories( # ${Dpctl_INCLUDE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include ) -target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers Python::Python) +target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers Python::Module) set_target_properties(${_static_lib_trgt} PROPERTIES POSITION_INDEPENDENT_CODE ON) set(_py_trgts) From c3bc8ba76728c212cc9117bac72cf4b68f88da92 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 04:58:04 -0800 Subject: [PATCH 050/145] Move ti.mask_positions() and ti._cumsum_1d() --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../include/kernels/accumulators.hpp | 1448 +++++++++++++++++ .../tensor/libtensor/source/accumulators.cpp | 406 +++++ .../tensor/libtensor/source/accumulators.hpp | 62 + .../tensor/libtensor/source/tensor_ctors.cpp | 20 +- 5 files changed, 1927 insertions(+), 11 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/accumulators.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 93555981deaa..f18481d08189 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -45,7 +45,7 @@ set(_static_lib_sources ) set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_ctors.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl_ext/tensor/libtensor/include/kernels/accumulators.hpp new file mode 100644 index 000000000000..6451bc950006 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/accumulators.hpp @@ -0,0 +1,1448 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for accumulators (cumulative sum, prod, etc.). +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::accumulators +{ + +namespace su_ns = dpctl::tensor::sycl_utils; + +using dpctl::tensor::ssize_t; +using namespace dpctl::tensor::offset_utils; + +template +T ceiling_quotient(T n, T m) +{ + return (n + m - 1) / m; +} + +template +struct NonZeroIndicator +{ + constexpr NonZeroIndicator() {} + + outputT operator()(const inputT &val) const + { + static constexpr outputT out_one(1); + static constexpr outputT out_zero(0); + static constexpr inputT val_zero(0); + + return (val == val_zero) ? 
out_zero : out_one; + } +}; + +template +struct NoOpTransformer +{ + constexpr NoOpTransformer() {} + + T operator()(const T &val) const + { + return val; + } +}; + +template +struct CastTransformer +{ + constexpr CastTransformer() {} + + dstTy operator()(const srcTy &val) const + { + using dpctl::tensor::type_utils::convert_impl; + return convert_impl(val); + } +}; + +template +struct needs_workaround +{ + // workaround needed due to crash in JITing on CPU + // remove when CMPLRLLVM-65813 is resolved + static constexpr bool value = su_ns::IsSyclLogicalAnd::value || + su_ns::IsSyclLogicalOr::value; +}; + +template +struct can_use_inclusive_scan_over_group +{ + static constexpr bool value = sycl::has_known_identity::value && + !needs_workaround::value; +}; + +namespace detail +{ +template +class stack_t +{ + T *src_; + std::size_t size_; + T *local_scans_; + +public: + stack_t() : src_{}, size_{}, local_scans_{} {} + stack_t(T *src, std::size_t sz, T *local_scans) + : src_(src), size_(sz), local_scans_(local_scans) + { + } + ~stack_t(){}; + + T *get_src_ptr() const + { + return src_; + } + + std::size_t get_size() const + { + return size_; + } + + T *get_local_scans_ptr() const + { + return local_scans_; + } +}; + +template +class stack_strided_t +{ + T *src_; + std::size_t size_; + T *local_scans_; + std::size_t local_stride_; + +public: + stack_strided_t() : src_{}, size_{}, local_scans_{}, local_stride_{} {} + stack_strided_t(T *src, + std::size_t sz, + T *local_scans, + std::size_t local_stride) + : src_(src), size_(sz), local_scans_(local_scans), + local_stride_(local_stride) + { + } + ~stack_strided_t(){}; + + T *get_src_ptr() const + { + return src_; + } + + std::size_t get_size() const + { + return size_; + } + + T *get_local_scans_ptr() const + { + return local_scans_; + } + + std::size_t get_local_stride() const + { + return local_stride_; + } +}; + +} // end of namespace detail + +// Iterative cumulative summation + +using nwiT = std::uint32_t; + 
+template +class inclusive_scan_iter_local_scan_blocked_krn; + +template +class inclusive_scan_iter_local_scan_striped_krn; + +template +sycl::event inclusive_scan_base_step_blocked( + sycl::queue &exec_q, + const std::uint32_t wg_size, + const std::size_t iter_nelems, + const std::size_t acc_nelems, + const inputT *input, + outputT *output, + const std::size_t s0, + const std::size_t s1, + const IterIndexerT &iter_indexer, + const InpIndexerT &inp_indexer, + const OutIndexerT &out_indexer, + TransformerT transformer, + const ScanOpT &scan_op, + outputT identity, + std::size_t &acc_groups, + const std::vector &depends = {}) +{ + acc_groups = ceiling_quotient(acc_nelems, n_wi * wg_size); + + sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using slmT = sycl::local_accessor; + + auto gws = sycl::range<1>(iter_nelems * acc_groups * wg_size); + auto lws = sycl::range<1>(wg_size); + + auto ndRange = sycl::nd_range<1>(gws, lws); + + slmT slm_iscan_tmp(lws, cgh); + + using KernelName = inclusive_scan_iter_local_scan_blocked_krn< + inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT, + TransformerT, ScanOpT, include_initial>; + + cgh.parallel_for(ndRange, [=, slm_iscan_tmp = + std::move(slm_iscan_tmp)]( + sycl::nd_item<1> it) { + const std::size_t gid = it.get_global_id(0); + const std::size_t lid = it.get_local_id(0); + + const std::uint32_t wg_size = it.get_local_range(0); + const std::size_t reduce_chunks = acc_groups * wg_size; + const std::size_t iter_gid = gid / reduce_chunks; + const std::size_t chunk_gid = gid - (iter_gid * reduce_chunks); + + const std::size_t i = chunk_gid * n_wi; + const auto &iter_offsets = iter_indexer(iter_gid); + const auto &inp_iter_offset = iter_offsets.get_first_offset(); + const auto &out_iter_offset = iter_offsets.get_second_offset(); + + std::array local_iscan; + +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + const std::size_t i_m_wi = i + m_wi; + if 
constexpr (!include_initial) { + local_iscan[m_wi] = + (i_m_wi < acc_nelems) + ? transformer(input[inp_iter_offset + + inp_indexer(s0 + s1 * i_m_wi)]) + : identity; + } + else { + // shift input to the left by a single element relative to + // output + local_iscan[m_wi] = + (i_m_wi < acc_nelems && i_m_wi > 0) + ? transformer( + input[inp_iter_offset + + inp_indexer((s0 + s1 * i_m_wi) - 1)]) + : identity; + } + } + +#pragma unroll + for (nwiT m_wi = 1; m_wi < n_wi; ++m_wi) { + local_iscan[m_wi] = + scan_op(local_iscan[m_wi], local_iscan[m_wi - 1]); + } + // local_iscan is now result of + // inclusive scan of locally stored inputs + + outputT wg_iscan_val; + if constexpr (can_use_inclusive_scan_over_group::value) { + wg_iscan_val = sycl::inclusive_scan_over_group( + it.get_group(), local_iscan.back(), scan_op, identity); + } + else { + wg_iscan_val = su_ns::custom_inclusive_scan_over_group( + it.get_group(), it.get_sub_group(), slm_iscan_tmp, + local_iscan.back(), identity, scan_op); + // ensure all finished reading from SLM, to avoid race condition + // with subsequent writes into SLM + it.barrier(sycl::access::fence_space::local_space); + } + + slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; + it.barrier(sycl::access::fence_space::local_space); + const outputT modifier = (lid == 0) ? 
identity : slm_iscan_tmp[lid]; + +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + local_iscan[m_wi] = scan_op(local_iscan[m_wi], modifier); + } + + const std::size_t start = std::min(i, acc_nelems); + const std::size_t end = std::min(i + n_wi, acc_nelems); + const nwiT m_max = static_cast(end - start); + for (nwiT m_wi = 0; m_wi < m_max; ++m_wi) { + output[out_iter_offset + out_indexer(i + m_wi)] = + local_iscan[m_wi]; + } + }); + }); + + return inc_scan_phase1_ev; +} + +template +sycl::event inclusive_scan_base_step_striped( + sycl::queue &exec_q, + const std::uint32_t wg_size, + const std::size_t iter_nelems, + const std::size_t acc_nelems, + const inputT *input, + outputT *output, + const std::size_t s0, + const std::size_t s1, + const IterIndexerT &iter_indexer, + const InpIndexerT &inp_indexer, + const OutIndexerT &out_indexer, + TransformerT transformer, + const ScanOpT &scan_op, + outputT identity, + std::size_t &acc_groups, + const std::vector &depends = {}) +{ + const std::uint32_t reduce_nelems_per_wg = n_wi * wg_size; + acc_groups = + ceiling_quotient(acc_nelems, reduce_nelems_per_wg); + + sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using slmT = sycl::local_accessor; + + const auto &gRange = sycl::range<1>{iter_nelems * acc_groups * wg_size}; + const auto &lRange = sycl::range<1>{wg_size}; + + const auto &ndRange = sycl::nd_range<1>{gRange, lRange}; + + slmT slm_iscan_tmp(reduce_nelems_per_wg, cgh); + + using KernelName = inclusive_scan_iter_local_scan_striped_krn< + inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT, + TransformerT, ScanOpT, include_initial>; + + cgh.parallel_for(ndRange, [=, slm_iscan_tmp = + std::move(slm_iscan_tmp)]( + sycl::nd_item<1> it) { + const std::uint32_t lid = it.get_local_linear_id(); + const std::uint32_t wg_size = it.get_local_range(0); + + const auto &sg = it.get_sub_group(); + const std::uint32_t sgSize = sg.get_max_local_range()[0]; + 
const std::size_t sgroup_id = sg.get_group_id()[0]; + const std::uint32_t lane_id = sg.get_local_id()[0]; + + const std::size_t flat_group_id = it.get_group(0); + const std::size_t iter_gid = flat_group_id / acc_groups; + const std::size_t acc_group_id = + flat_group_id - (iter_gid * acc_groups); + + const auto &iter_offsets = iter_indexer(iter_gid); + const auto &inp_iter_offset = iter_offsets.get_first_offset(); + const auto &out_iter_offset = iter_offsets.get_second_offset(); + + std::array local_iscan{}; + + const std::size_t inp_id0 = acc_group_id * n_wi * wg_size + + sgroup_id * n_wi * sgSize + lane_id; + +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + const std::size_t inp_id = inp_id0 + m_wi * sgSize; + if constexpr (!include_initial) { + local_iscan[m_wi] = + (inp_id < acc_nelems) + ? transformer(input[inp_iter_offset + + inp_indexer(s0 + s1 * inp_id)]) + : identity; + } + else { + // shift input to the left by a single element relative to + // output + local_iscan[m_wi] = + (inp_id < acc_nelems && inp_id > 0) + ? 
transformer( + input[inp_iter_offset + + inp_indexer((s0 + s1 * inp_id) - 1)]) + : identity; + } + } + + // change layout from striped to blocked + { + { + const std::uint32_t local_offset0 = lid * n_wi; +#pragma unroll + for (std::uint32_t i = 0; i < n_wi; ++i) { + slm_iscan_tmp[local_offset0 + i] = local_iscan[i]; + } + + it.barrier(sycl::access::fence_space::local_space); + } + + { + const std::uint32_t block_offset = + sgroup_id * sgSize * n_wi; + const std::uint32_t disp0 = lane_id * n_wi; +#pragma unroll + for (nwiT i = 0; i < n_wi; ++i) { + const std::uint32_t disp = disp0 + i; + + // disp == lane_id1 + i1 * sgSize; + const std::uint32_t i1 = disp / sgSize; + const std::uint32_t lane_id1 = disp - i1 * sgSize; + + const std::uint32_t disp_exchanged = + (lane_id1 * n_wi + i1); + + local_iscan[i] = + slm_iscan_tmp[block_offset + disp_exchanged]; + } + + it.barrier(sycl::access::fence_space::local_space); + } + } + +#pragma unroll + for (nwiT m_wi = 1; m_wi < n_wi; ++m_wi) { + local_iscan[m_wi] = + scan_op(local_iscan[m_wi], local_iscan[m_wi - 1]); + } + // local_iscan is now result of + // inclusive scan of locally stored inputs + + outputT wg_iscan_val; + if constexpr (can_use_inclusive_scan_over_group::value) { + wg_iscan_val = sycl::inclusive_scan_over_group( + it.get_group(), local_iscan.back(), scan_op, identity); + } + else { + wg_iscan_val = su_ns::custom_inclusive_scan_over_group( + it.get_group(), sg, slm_iscan_tmp, local_iscan.back(), + identity, scan_op); + // ensure all finished reading from SLM, to avoid race condition + // with subsequent writes into SLM + it.barrier(sycl::access::fence_space::local_space); + } + + slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; + it.barrier(sycl::access::fence_space::local_space); + const outputT modifier = (lid == 0) ? 
identity : slm_iscan_tmp[lid]; + +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + local_iscan[m_wi] = scan_op(local_iscan[m_wi], modifier); + } + + it.barrier(sycl::access::fence_space::local_space); + + // convert back to blocked layout + {{const std::uint32_t local_offset0 = lid * n_wi; +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + slm_iscan_tmp[local_offset0 + m_wi] = local_iscan[m_wi]; + } + + it.barrier(sycl::access::fence_space::local_space); + } + } + + { + const std::uint32_t block_offset = sgroup_id * sgSize * n_wi + lane_id; +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + const std::uint32_t m_wi_scaled = m_wi * sgSize; + const std::size_t out_id = inp_id0 + m_wi_scaled; + if (out_id < acc_nelems) { + output[out_iter_offset + out_indexer(out_id)] = + slm_iscan_tmp[block_offset + m_wi_scaled]; + } + } + } +}); +}); + +return inc_scan_phase1_ev; +} + +template +sycl::event + inclusive_scan_base_step(sycl::queue &exec_q, + const std::uint32_t wg_size, + const std::size_t iter_nelems, + const std::size_t acc_nelems, + const inputT *input, + outputT *output, + const std::size_t s0, + const std::size_t s1, + const IterIndexerT &iter_indexer, + const InpIndexerT &inp_indexer, + const OutIndexerT &out_indexer, + TransformerT transformer, + const ScanOpT &scan_op, + outputT identity, + std::size_t &acc_groups, + const std::vector &depends = {}) +{ + // For small stride use striped load/store. + // Threshold value chosen experimentally. 
+ if (s1 <= 16) { + return inclusive_scan_base_step_striped< + inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT, + TransformerT, ScanOpT, include_initial>( + exec_q, wg_size, iter_nelems, acc_nelems, input, output, s0, s1, + iter_indexer, inp_indexer, out_indexer, transformer, scan_op, + identity, acc_groups, depends); + } + else { + return inclusive_scan_base_step_blocked< + inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT, + TransformerT, ScanOpT, include_initial>( + exec_q, wg_size, iter_nelems, acc_nelems, input, output, s0, s1, + iter_indexer, inp_indexer, out_indexer, transformer, scan_op, + identity, acc_groups, depends); + } +} + +template +class inclusive_scan_1d_iter_chunk_update_krn; + +template +sycl::event update_local_chunks_1d(sycl::queue &exec_q, + outputT *src, + std::size_t src_size, + const outputT *local_scans, + std::size_t chunk_size, + const sycl::event &dependent_event) +{ + const auto &ctx = exec_q.get_context(); + const auto &dev = exec_q.get_device(); + + const auto &kernel_id = sycl::get_kernel_id(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + // output[ chunk_size * (i + 1) + j] += temp[i] + sycl::event update_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_event); + cgh.use_kernel_bundle(kb); + + static constexpr nwiT updates_per_wi = n_wi; + const std::size_t n_items = + ceiling_quotient(src_size, sg_size * n_wi) * sg_size; + + sycl::range<1> gRange{n_items}; + sycl::range<1> lRange{sg_size}; + sycl::nd_range<1> ndRange{gRange, lRange}; + + cgh.parallel_for( + ndRange, + [chunk_size, src, src_size, local_scans](sycl::nd_item<1> ndit) { + static constexpr ScanOpT scan_op{}; + static constexpr outputT identity = + su_ns::Identity::value; + + const std::uint32_t lws = ndit.get_local_range(0); + const 
std::size_t block_offset = ndit.get_group(0) * n_wi * lws; +#pragma unroll + for (std::size_t i = 0; i < updates_per_wi; ++i) { + const std::size_t src_id = + block_offset + ndit.get_local_id(0) + i * lws; + if (src_id < src_size) { + const std::size_t scan_id = (src_id / chunk_size); + const outputT modifier = + (scan_id > 0) ? local_scans[scan_id - 1] : identity; + src[src_id] = scan_op(src[src_id], modifier); + } + } + }); + }); + + return update_event; +} + +/* + * output[j] = sum( input[s0 + i * s1], 0 <= i <= j) + * for 0 <= j < n_elems + */ +template +sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q, + const std::uint32_t wg_size, + const std::size_t n_elems, + const inputT *input, + outputT *output, + const std::size_t s0, + const std::size_t s1, + const IndexerT &indexer, + const TransformerT &transformer, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + static constexpr ScanOpT scan_op{}; + static constexpr outputT identity = + su_ns::Identity::value; + + static constexpr std::size_t _iter_nelems = 1; + + using IterIndexerT = dpctl::tensor::offset_utils::TwoZeroOffsets_Indexer; + static constexpr IterIndexerT _no_op_iter_indexer{}; + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr NoOpIndexerT _no_op_indexer{}; + + std::size_t n_groups; + sycl::event inc_scan_phase1_ev = + inclusive_scan_base_step( + exec_q, wg_size, _iter_nelems, n_elems, input, output, s0, s1, + _no_op_iter_indexer, indexer, _no_op_indexer, transformer, scan_op, + identity, n_groups, depends); + + sycl::event dependent_event = inc_scan_phase1_ev; + if (n_groups > 1) { + const std::size_t chunk_size = wg_size * n_wi; + + // how much of temporary allocation do we need + std::size_t n_groups_ = n_groups; + std::size_t temp_size = 0; + while (n_groups_ > 1) { + const std::size_t this_size = (n_groups_ - 1); + temp_size += this_size; + n_groups_ = ceiling_quotient(this_size, chunk_size); + } + + // allocate + auto temp_owner = + 
dpctl::tensor::alloc_utils::smart_malloc_device(temp_size, + exec_q); + outputT *temp = temp_owner.get(); + + std::vector> stack{}; + + // inclusive scans over blocks + n_groups_ = n_groups; + outputT *src = output; + outputT *local_scans = temp; + + using NoOpTransformerT = NoOpTransformer; + static constexpr NoOpTransformerT _no_op_transformer{}; + std::size_t size_to_update = n_elems; + while (n_groups_ > 1) { + + const std::size_t src_size = n_groups_ - 1; + dependent_event = + inclusive_scan_base_step( + exec_q, wg_size, _iter_nelems, src_size, src, local_scans, + chunk_size - 1, chunk_size, _no_op_iter_indexer, + _no_op_indexer, _no_op_indexer, _no_op_transformer, scan_op, + identity, n_groups_, // n_groups_ is modified in place + {dependent_event}); + stack.push_back({src, size_to_update, local_scans}); + src = local_scans; + local_scans += src_size; + size_to_update = src_size; + } + + for (std::size_t reverse_stack_id = 0; reverse_stack_id < stack.size(); + ++reverse_stack_id) + { + const std::size_t stack_id = stack.size() - 1 - reverse_stack_id; + + const auto &stack_elem = stack[stack_id]; + outputT *src = stack_elem.get_src_ptr(); + const std::size_t src_size = stack_elem.get_size(); + const outputT *local_scans = stack_elem.get_local_scans_ptr(); + + using UpdateKernelName = + class inclusive_scan_1d_iter_chunk_update_krn; + + dependent_event = update_local_chunks_1d( + exec_q, src, src_size, local_scans, chunk_size, + dependent_event); + } + + sycl::event free_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {dependent_event}, temp_owner); + + host_tasks.push_back(free_ev); + } + + return dependent_event; +} + +typedef sycl::event (*accumulate_1d_contig_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + std::vector &, + const std::vector &); + +template +sycl::event + accumulate_1d_contig_impl(sycl::queue &q, + std::size_t n_elems, + const char *src, + char *dst, + std::vector &host_tasks, + const std::vector 
&depends = {}) +{ + const srcT *src_data_ptr = reinterpret_cast(src); + dstT *dst_data_ptr = reinterpret_cast(dst); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr NoOpIndexerT flat_indexer{}; + static constexpr transformerT transformer{}; + + static constexpr std::size_t s0 = 0; + static constexpr std::size_t s1 = 1; + + sycl::event comp_ev; + const sycl::device &dev = q.get_device(); + if (dev.has(sycl::aspect::cpu)) { + static constexpr nwiT n_wi_for_cpu = 8; + const std::uint32_t wg_size = 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, src_data_ptr, dst_data_ptr, s0, s1, + flat_indexer, transformer, host_tasks, depends); + } + else { + static constexpr nwiT n_wi_for_gpu = 4; + // base_scan_striped algorithm does not execute correctly + // on HIP device with wg_size > 64 + const std::uint32_t wg_size = + (q.get_backend() == sycl::backend::ext_oneapi_hip) ? 64 : 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, src_data_ptr, dst_data_ptr, s0, s1, + flat_indexer, transformer, host_tasks, depends); + } + return comp_ev; +} + +template +class inclusive_scan_final_chunk_update_krn; + +template +sycl::event final_update_local_chunks(sycl::queue &exec_q, + std::size_t iter_nelems, + outputT *src, + std::size_t src_size, + const outputT *local_scans, + std::size_t chunk_size, + std::size_t local_stride, + const OutIterIndexerT &out_iter_indexer, + const OutIndexerT &out_indexer, + sycl::event dependent_event) +{ + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + static constexpr nwiT updates_per_wi = n_wi; + const std::size_t updates_per_sg = sg_size * updates_per_wi; + const std::size_t 
update_nelems = + ceiling_quotient(src_size, updates_per_sg) * sg_size; + + sycl::range<2> gRange{iter_nelems, update_nelems}; + sycl::range<2> lRange{1, sg_size}; + + sycl::nd_range<2> ndRange{gRange, lRange}; + + sycl::event update_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_event); + + cgh.parallel_for( + ndRange, [chunk_size, src_size, local_stride, src, local_scans, + out_iter_indexer, out_indexer](sycl::nd_item<2> ndit) { + static constexpr ScanOpT scan_op{}; + static constexpr outputT identity = + su_ns::Identity::value; + + const std::uint32_t lws = ndit.get_local_range(1); + + const std::size_t iter_gid = ndit.get_group(0); + + const std::size_t src_axis_id0 = + ndit.get_group(1) * updates_per_wi * lws + + ndit.get_local_id(1); + const std::size_t src_iter_id = out_iter_indexer(iter_gid); +#pragma unroll + for (nwiT i = 0; i < updates_per_wi; ++i) { + const std::size_t src_axis_id = src_axis_id0 + i * lws; + const std::size_t src_id = + out_indexer(src_axis_id) + src_iter_id; + + if (src_axis_id < src_size) { + const std::size_t scan_axis_id = + src_axis_id / chunk_size; + const std::size_t scan_id = + scan_axis_id + iter_gid * local_stride; + + const outputT modifier = (scan_axis_id > 0) + ? 
local_scans[scan_id - 1] + : identity; + + src[src_id] = scan_op(src[src_id], modifier); + } + } + }); + }); + + return update_event; +} + +template +class inclusive_scan_iter_chunk_update_krn; + +template +sycl::event update_local_chunks(sycl::queue &exec_q, + std::size_t iter_nelems, + outputT *src, + std::size_t src_size, + const outputT *local_scans, + std::size_t chunk_size, + std::size_t local_stride, + sycl::event dependent_event) +{ + static constexpr NoOpIndexer out_indexer{}; + static constexpr NoOpIndexer iter_out_indexer{}; + + return final_update_local_chunks( + exec_q, iter_nelems, src, src_size, local_scans, chunk_size, + local_stride, iter_out_indexer, out_indexer, dependent_event); +} + +template +sycl::event inclusive_scan_iter(sycl::queue &exec_q, + const std::uint32_t wg_size, + const std::size_t iter_nelems, + const std::size_t acc_nelems, + const inputT *input, + outputT *output, + const std::size_t s0, + const std::size_t s1, + const InpIterIndexerT &inp_iter_indexer, + const OutIterIndexerT &out_iter_indexer, + const InpIndexerT &inp_indexer, + const OutIndexerT &out_indexer, + const TransformerT &transformer, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + static constexpr ScanOpT scan_op{}; + static constexpr outputT identity = + su_ns::Identity::value; + + using IterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InpIterIndexerT, OutIterIndexerT>; + const IterIndexerT iter_indexer{inp_iter_indexer, out_iter_indexer}; + + std::size_t acc_groups; + sycl::event inc_scan_phase1_ev = + inclusive_scan_base_step( + exec_q, wg_size, iter_nelems, acc_nelems, input, output, s0, s1, + iter_indexer, inp_indexer, out_indexer, transformer, scan_op, + identity, acc_groups, depends); + + sycl::event dependent_event = inc_scan_phase1_ev; + if (acc_groups > 1) { + const std::size_t chunk_size = wg_size * n_wi; + + // how much of temporary allocation do we need + std::size_t acc_groups_ = acc_groups; + std::size_t 
temp_size = 0; + while (acc_groups_ > 1) { + const std::size_t this_size = (acc_groups_ - 1); + temp_size += this_size; + acc_groups_ = ceiling_quotient(this_size, chunk_size); + } + + // allocate + auto temp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * temp_size, exec_q); + outputT *temp = temp_owner.get(); + + std::vector> stack{}; + + // inclusive scans over blocks + acc_groups_ = acc_groups; + outputT *src = output; + outputT *local_scans = temp; + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr NoOpIndexerT _no_op_indexer{}; + using NoOpTransformerT = NoOpTransformer; + static constexpr NoOpTransformerT _no_op_transformer{}; + std::size_t size_to_update = acc_nelems; + + { + std::size_t src_size = acc_groups - 1; + using LocalScanIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + const LocalScanIndexerT scan_iter_indexer{/* size */ iter_nelems, + /* step */ src_size}; + + using IterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + OutIterIndexerT, LocalScanIndexerT>; + const IterIndexerT iter_indexer_{out_iter_indexer, + scan_iter_indexer}; + + dependent_event = + inclusive_scan_base_step( + exec_q, wg_size, iter_nelems, src_size, src, local_scans, + chunk_size - 1, chunk_size, iter_indexer_, out_indexer, + _no_op_indexer, _no_op_transformer, scan_op, identity, + acc_groups_, // acc_groups_ is modified in place + {dependent_event}); + stack.push_back({src, size_to_update, local_scans, src_size}); + src = local_scans; + local_scans += src_size * iter_nelems; + size_to_update = src_size; + } + + while (acc_groups_ > 1) { + std::size_t src_size = acc_groups_ - 1; + + using LocalScanIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + const LocalScanIndexerT scan1_iter_indexer{ + /* size */ iter_nelems, + /* step */ size_to_update}; + const LocalScanIndexerT scan2_iter_indexer{/* size */ iter_nelems, + /* step */ src_size}; + + using IterIndexerT = + 
dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + LocalScanIndexerT, LocalScanIndexerT>; + const IterIndexerT iter_indexer_{scan1_iter_indexer, + scan2_iter_indexer}; + + dependent_event = + inclusive_scan_base_step( + exec_q, wg_size, iter_nelems, src_size, src, local_scans, + chunk_size - 1, chunk_size, iter_indexer_, _no_op_indexer, + _no_op_indexer, _no_op_transformer, scan_op, identity, + acc_groups_, // acc_groups_ is modified in place + {dependent_event}); + stack.push_back({src, size_to_update, local_scans, src_size}); + src = local_scans; + local_scans += src_size * iter_nelems; + size_to_update = src_size; + } + + for (std::size_t reverse_stack_id = 0; + reverse_stack_id < stack.size() - 1; ++reverse_stack_id) + { + const std::size_t stack_id = stack.size() - 1 - reverse_stack_id; + + const auto &stack_elem = stack[stack_id]; + outputT *src = stack_elem.get_src_ptr(); + std::size_t src_size = stack_elem.get_size(); + outputT *local_scans = stack_elem.get_local_scans_ptr(); + std::size_t local_stride = stack_elem.get_local_stride(); + + using UpdateKernelName = + class inclusive_scan_iter_chunk_update_krn; + + dependent_event = + update_local_chunks( + exec_q, iter_nelems, src, src_size, local_scans, chunk_size, + local_stride, dependent_event); + } + + // last stack element is always directly to output + { + const auto &stack_elem = stack[0]; + outputT *src = stack_elem.get_src_ptr(); + const std::size_t src_size = stack_elem.get_size(); + outputT *local_scans = stack_elem.get_local_scans_ptr(); + const std::size_t local_stride = stack_elem.get_local_stride(); + + using UpdateKernelName = + class inclusive_scan_final_chunk_update_krn< + outputT, n_wi, OutIterIndexerT, OutIndexerT, ScanOpT>; + + dependent_event = + final_update_local_chunks( + exec_q, iter_nelems, src, src_size, local_scans, chunk_size, + local_stride, out_iter_indexer, out_indexer, + dependent_event); + } + + sycl::event free_ev = dpctl::tensor::alloc_utils::async_smart_free( + 
exec_q, {dependent_event}, temp_owner); + host_tasks.push_back(free_ev); + } + + return dependent_event; +} + +typedef sycl::event (*accumulate_strided_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + int, + const ssize_t *, + char *, + std::vector &, + const std::vector &); + +template +sycl::event + accumulate_strided_impl(sycl::queue &q, + std::size_t iter_nelems, + std::size_t acc_nelems, + const char *src, + int iter_nd, + const ssize_t *iter_shape_strides, + ssize_t inp_iter_offset, + ssize_t out_iter_offset, + int acc_nd, + const ssize_t *acc_shape_strides, + char *dst, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + const srcT *src_data_ptr = reinterpret_cast(src); + dstT *dst_data_ptr = reinterpret_cast(dst); + + using InpIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const InpIndexerT inp_axis_indexer{acc_nd, 0, acc_shape_strides}; + const InpIndexerT inp_iter_indexer{iter_nd, inp_iter_offset, + iter_shape_strides}; + + using OutIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer; + const OutIndexerT out_axis_indexer{acc_nd, 0, acc_shape_strides, + acc_shape_strides + 2 * acc_nd}; + const OutIndexerT out_iter_indexer{iter_nd, out_iter_offset, + iter_shape_strides, + iter_shape_strides + 2 * iter_nd}; + + static constexpr transformerT transformer{}; + + static constexpr std::size_t s0 = 0; + static constexpr std::size_t s1 = 1; + + const sycl::device &dev = q.get_device(); + sycl::event comp_ev; + if (dev.has(sycl::aspect::cpu)) { + static constexpr nwiT n_wi_for_cpu = 8; + const std::uint32_t wg_size = 256; + comp_ev = + inclusive_scan_iter( + q, wg_size, iter_nelems, acc_nelems, src_data_ptr, dst_data_ptr, + s0, s1, inp_iter_indexer, out_iter_indexer, inp_axis_indexer, + out_axis_indexer, transformer, host_tasks, depends); + } + else { + static constexpr nwiT n_wi_for_gpu = 4; + // base_scan_striped algorithm does not execute correctly 
+ // on HIP device with wg_size > 64 + const std::uint32_t wg_size = + (q.get_backend() == sycl::backend::ext_oneapi_hip) ? 64 : 256; + comp_ev = + inclusive_scan_iter( + q, wg_size, iter_nelems, acc_nelems, src_data_ptr, dst_data_ptr, + s0, s1, inp_iter_indexer, out_iter_indexer, inp_axis_indexer, + out_axis_indexer, transformer, host_tasks, depends); + } + + return comp_ev; +} + +typedef std::size_t (*cumsum_val_contig_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + std::vector &, + const std::vector &); + +template +std::size_t cumsum_val_contig_impl(sycl::queue &q, + std::size_t n_elems, + const char *mask, + char *cumsum, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + const maskT *mask_data_ptr = reinterpret_cast(mask); + cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr NoOpIndexerT flat_indexer{}; + static constexpr transformerT transformer{}; + + static constexpr std::size_t s0 = 0; + static constexpr std::size_t s1 = 1; + static constexpr bool include_initial = false; + using AccumulateOpT = sycl::plus; + + sycl::event comp_ev; + const sycl::device &dev = q.get_device(); + if (dev.has(sycl::aspect::cpu)) { + static constexpr nwiT n_wi_for_cpu = 8; + const std::uint32_t wg_size = 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1, + flat_indexer, transformer, host_tasks, depends); + } + else { + static constexpr nwiT n_wi_for_gpu = 4; + // base_scan_striped algorithm does not execute correctly + // on HIP device with wg_size > 64 + const std::uint32_t wg_size = + (q.get_backend() == sycl::backend::ext_oneapi_hip) ? 
64 : 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1, + flat_indexer, transformer, host_tasks, depends); + } + cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); + + auto host_usm_owner = + dpctl::tensor::alloc_utils::smart_malloc_host(1, q); + cumsumT *last_elem_host_usm = host_usm_owner.get(); + + sycl::event copy_e = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + cgh.copy(last_elem, last_elem_host_usm, 1); + }); + copy_e.wait(); + std::size_t return_val = static_cast(*last_elem_host_usm); + + // explicitly free USM host allocation, by invoking deleter of + // the unique_ptr + host_usm_owner.reset(nullptr); + + return return_val; +} + +template +struct MaskPositionsContigFactoryForInt32 +{ + fnT get() + { + using cumsumT = std::int32_t; + fnT fn = + cumsum_val_contig_impl>; + return fn; + } +}; + +template +struct MaskPositionsContigFactoryForInt64 +{ + fnT get() + { + using cumsumT = std::int64_t; + fnT fn = + cumsum_val_contig_impl>; + return fn; + } +}; + +template +struct Cumsum1DContigFactory +{ + fnT get() + { + if constexpr (std::is_integral_v) { + using cumsumT = std::int64_t; + fnT fn = + cumsum_val_contig_impl>; + return fn; + } + else { + return nullptr; + } + } +}; + +typedef std::size_t (*cumsum_val_strided_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + int, + const ssize_t *, + char *, + std::vector &, + const std::vector &); + +template +std::size_t + cumsum_val_strided_impl(sycl::queue &q, + std::size_t n_elems, + const char *mask, + int nd, + const ssize_t *shape_strides, + char *cumsum, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + const maskT *mask_data_ptr = reinterpret_cast(mask); + cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); + + using StridedIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const StridedIndexerT strided_indexer{nd, 0, shape_strides}; + static constexpr transformerT transformer{}; + + static 
constexpr std::size_t s0 = 0; + static constexpr std::size_t s1 = 1; + static constexpr bool include_initial = false; + using AccumulateOpT = sycl::plus; + + const sycl::device &dev = q.get_device(); + sycl::event comp_ev; + if (dev.has(sycl::aspect::cpu)) { + static constexpr nwiT n_wi_for_cpu = 8; + const std::uint32_t wg_size = 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1, + strided_indexer, transformer, host_tasks, depends); + } + else { + static constexpr nwiT n_wi_for_gpu = 4; + // base_scan_striped algorithm does not execute correctly + // on HIP device with wg_size > 64 + const std::uint32_t wg_size = + (q.get_backend() == sycl::backend::ext_oneapi_hip) ? 64 : 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1, + strided_indexer, transformer, host_tasks, depends); + } + + cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); + + auto host_usm_owner = + dpctl::tensor::alloc_utils::smart_malloc_host(1, q); + cumsumT *last_elem_host_usm = host_usm_owner.get(); + + sycl::event copy_e = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + cgh.copy(last_elem, last_elem_host_usm, 1); + }); + copy_e.wait(); + std::size_t return_val = static_cast(*last_elem_host_usm); + + // explicitly free USM-host temporary, by invoking deleter of + // the unique_ptr + host_usm_owner.reset(nullptr); + + return return_val; +} + +template +struct MaskPositionsStridedFactoryForInt32 +{ + fnT get() + { + using cumsumT = std::int32_t; + fnT fn = + cumsum_val_strided_impl>; + return fn; + } +}; + +template +struct MaskPositionsStridedFactoryForInt64 +{ + fnT get() + { + using cumsumT = std::int64_t; + fnT fn = + cumsum_val_strided_impl>; + return fn; + } +}; + +template +struct Cumsum1DStridedFactory +{ + fnT get() + { + if constexpr (std::is_integral_v) { + using cumsumT = std::int64_t; + fnT fn = + cumsum_val_strided_impl>; + return fn; + } + else { + return 
nullptr; + } + } +}; + +} // namespace dpctl::tensor::kernels::accumulators diff --git a/dpctl_ext/tensor/libtensor/source/accumulators.cpp b/dpctl_ext/tensor/libtensor/source/accumulators.cpp new file mode 100644 index 000000000000..82913010755a --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/accumulators.cpp @@ -0,0 +1,406 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/accumulators.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl::tensor::py_internal +{ + +// Computation of positions of masked elements + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::accumulators::cumsum_val_contig_impl_fn_ptr_t; +static cumsum_val_contig_impl_fn_ptr_t + mask_positions_contig_i64_dispatch_vector[td_ns::num_types]; +static cumsum_val_contig_impl_fn_ptr_t + mask_positions_contig_i32_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::accumulators::cumsum_val_strided_impl_fn_ptr_t; +static cumsum_val_strided_impl_fn_ptr_t + mask_positions_strided_i64_dispatch_vector[td_ns::num_types]; +static cumsum_val_strided_impl_fn_ptr_t + mask_positions_strided_i32_dispatch_vector[td_ns::num_types]; + +void populate_mask_positions_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::accumulators:: + MaskPositionsContigFactoryForInt64; + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(mask_positions_contig_i64_dispatch_vector); + + using dpctl::tensor::kernels::accumulators:: + MaskPositionsContigFactoryForInt32; + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(mask_positions_contig_i32_dispatch_vector); + + using dpctl::tensor::kernels::accumulators:: + 
MaskPositionsStridedFactoryForInt64; + td_ns::DispatchVectorBuilder + dvb3; + dvb3.populate_dispatch_vector(mask_positions_strided_i64_dispatch_vector); + + using dpctl::tensor::kernels::accumulators:: + MaskPositionsStridedFactoryForInt32; + td_ns::DispatchVectorBuilder + dvb4; + dvb4.populate_dispatch_vector(mask_positions_strided_i32_dispatch_vector); + + return; +} + +std::size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + const std::vector &depends) +{ + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(cumsum); + + // cumsum is 1D + if (cumsum.get_ndim() != 1) { + throw py::value_error("Result array must be one-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array must be C-contiguous."); + } + + // cumsum.shape == (mask.size,) + auto mask_size = mask.get_size(); + auto cumsum_size = cumsum.get_shape(0); + if (cumsum_size != mask_size) { + throw py::value_error("Inconsistent dimensions"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {mask, cumsum})) { + // FIXME: use ExecutionPlacementError + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (mask_size == 0) { + return 0; + } + + int mask_typenum = mask.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + // mask can be any type + const char *mask_data = mask.get_data(); + char *cumsum_data = cumsum.get_data(); + + auto const &array_types = td_ns::usm_ndarray_types(); + + int mask_typeid = array_types.typenum_to_lookup_id(mask_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + // cumsum must be int32_t/int64_t only + static constexpr int int32_typeid = + static_cast(td_ns::typenum_t::INT32); + static constexpr int int64_typeid = + static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) { + throw 
py::value_error( + "Cumulative sum array must have int32 or int64 data-type."); + } + + const bool use_i32 = (cumsum_typeid == int32_typeid); + + std::vector host_task_events; + + if (mask.is_c_contiguous()) { + auto fn = (use_i32) + ? mask_positions_contig_i32_dispatch_vector[mask_typeid] + : mask_positions_contig_i64_dispatch_vector[mask_typeid]; + + std::size_t total_set; + + { + py::gil_scoped_release release; + + total_set = fn(exec_q, mask_size, mask_data, cumsum_data, + host_task_events, depends); + + sycl::event::wait(host_task_events); + } + return total_set; + } + + const py::ssize_t *shape = mask.get_shape_raw(); + auto const &strides_vector = mask.get_strides_vector(); + + using shT = std::vector; + shT compact_shape; + shT compact_strides; + + int mask_nd = mask.get_ndim(); + int nd = mask_nd; + + dpctl::tensor::py_internal::compact_iteration_space( + nd, shape, strides_vector, compact_shape, compact_strides); + + // Strided implementation + auto strided_fn = + (use_i32) ? mask_positions_strided_i32_dispatch_vector[mask_typeid] + : mask_positions_strided_i64_dispatch_vector[mask_typeid]; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, compact_shape, compact_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { + { + py::gil_scoped_release release; + + copy_shape_ev.wait(); + sycl::event::wait(host_task_events); + + // ensure deleter of smart pointer is invoked with GIL released + shape_strides_owner.reset(nullptr); + } + throw std::runtime_error("Unexpected error"); + } + + std::vector dependent_events; + dependent_events.reserve(depends.size() + 1); + dependent_events.insert(dependent_events.end(), copy_shape_ev); + 
dependent_events.insert(dependent_events.end(), depends.begin(), + depends.end()); + + std::size_t total_set; + + { + py::gil_scoped_release release; + + total_set = strided_fn(exec_q, mask_size, mask_data, nd, shape_strides, + cumsum_data, host_task_events, dependent_events); + + sycl::event::wait(host_task_events); + // ensure deleter of smart pointer is invoked with GIL released + shape_strides_owner.reset(nullptr); + } + + return total_set; +} + +using dpctl::tensor::kernels::accumulators::cumsum_val_strided_impl_fn_ptr_t; +static cumsum_val_strided_impl_fn_ptr_t + cumsum_1d_strided_dispatch_vector[td_ns::num_types]; +using dpctl::tensor::kernels::accumulators::cumsum_val_contig_impl_fn_ptr_t; +static cumsum_val_contig_impl_fn_ptr_t + cumsum_1d_contig_dispatch_vector[td_ns::num_types]; + +void populate_cumsum_1d_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::accumulators::Cumsum1DContigFactory; + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(cumsum_1d_contig_dispatch_vector); + + using dpctl::tensor::kernels::accumulators::Cumsum1DStridedFactory; + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(cumsum_1d_strided_dispatch_vector); + + return; +} + +std::size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + std::vector const &depends) +{ + // cumsum is 1D + if (cumsum.get_ndim() != 1) { + throw py::value_error("cumsum array must be one-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array to be C-contiguous."); + } + + // cumsum.shape == (src.size,) + auto src_size = src.get_size(); + auto cumsum_size = cumsum.get_shape(0); + if (cumsum_size != src_size) { + throw py::value_error("Inconsistent dimensions"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, cumsum})) { + // FIXME: use ExecutionPlacementError + throw py::value_error( + "Execution queue is not compatible 
with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(cumsum); + + if (src_size == 0) { + return 0; + } + + int src_typenum = src.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + // src can be any type + const char *src_data = src.get_data(); + char *cumsum_data = cumsum.get_data(); + + auto const &array_types = td_ns::usm_ndarray_types(); + + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + // this cumsum must be int64_t only + static constexpr int int64_typeid = + static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Cumulative sum array must have int64 data-type."); + } + + std::vector host_task_events; + + if (src.is_c_contiguous()) { + auto fn = cumsum_1d_contig_dispatch_vector[src_typeid]; + if (fn == nullptr) { + throw std::runtime_error( + "this cumsum requires integer type, got src_typeid=" + + std::to_string(src_typeid)); + } + std::size_t total = fn(exec_q, src_size, src_data, cumsum_data, + host_task_events, depends); + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + } + return total; + } + + const py::ssize_t *shape = src.get_shape_raw(); + auto const &strides_vector = src.get_strides_vector(); + + using shT = std::vector; + shT compact_shape; + shT compact_strides; + + int src_nd = src.get_ndim(); + int nd = src_nd; + + dpctl::tensor::py_internal::compact_iteration_space( + nd, shape, strides_vector, compact_shape, compact_strides); + + // Strided implementation + auto strided_fn = cumsum_1d_strided_dispatch_vector[src_typeid]; + if (strided_fn == nullptr) { + throw std::runtime_error( + "this cumsum requires integer type, got src_typeid=" + + std::to_string(src_typeid)); + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, 
compact_shape, compact_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { + { + py::gil_scoped_release release; + + copy_shape_ev.wait(); + sycl::event::wait(host_task_events); + + // ensure USM deleter is called with GIL released + shape_strides_owner.reset(nullptr); + } + throw std::runtime_error("Unexpected error"); + } + + std::vector dependent_events; + dependent_events.reserve(depends.size() + 1); + dependent_events.insert(dependent_events.end(), copy_shape_ev); + dependent_events.insert(dependent_events.end(), depends.begin(), + depends.end()); + + std::size_t total = + strided_fn(exec_q, src_size, src_data, nd, shape_strides, cumsum_data, + host_task_events, dependent_events); + + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + + // ensure USM deleter is called with GIL released + shape_strides_owner.reset(nullptr); + } + + return total; +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/accumulators.hpp b/dpctl_ext/tensor/libtensor/source/accumulators.hpp new file mode 100644 index 000000000000..42503093789b --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/accumulators.hpp @@ -0,0 +1,62 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl::tensor::py_internal +{ + +extern void populate_mask_positions_dispatch_vectors(void); + +extern std::size_t + py_mask_positions(const dpctl::tensor::usm_ndarray &mask, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void populate_cumsum_1d_dispatch_vectors(void); + +extern std::size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + std::vector const &depends = {}); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 3e5be4d9e8fe..3c096df8e367 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -43,7 +43,7 @@ #include "dpnp4pybind11.hpp" -// #include "accumulators.hpp" +#include "accumulators.hpp" // #include "boolean_advanced_indexing.hpp" // #include "clip.hpp" #include "copy_and_cast_usm_to_usm.hpp" @@ -113,12 +113,12 @@ using dpctl::tensor::py_internal::usm_ndarray_put; using dpctl::tensor::py_internal::usm_ndarray_take; // using dpctl::tensor::py_internal::py_extract; -// using dpctl::tensor::py_internal::py_mask_positions; +using dpctl::tensor::py_internal::py_mask_positions; // using dpctl::tensor::py_internal::py_nonzero; // using dpctl::tensor::py_internal::py_place; /* ================= Repeat ====================*/ -// using dpctl::tensor::py_internal::py_cumsum_1d; +using 
dpctl::tensor::py_internal::py_cumsum_1d; // using dpctl::tensor::py_internal::py_repeat_by_scalar; // using dpctl::tensor::py_internal::py_repeat_by_sequence; @@ -166,9 +166,9 @@ void init_dispatch_vectors(void) // populate_masked_extract_dispatch_vectors(); // populate_masked_place_dispatch_vectors(); - // populate_mask_positions_dispatch_vectors(); + populate_mask_positions_dispatch_vectors(); - // populate_cumsum_1d_dispatch_vectors(); + populate_cumsum_1d_dispatch_vectors(); // init_repeat_dispatch_vectors(); // init_clip_dispatch_vectors(); @@ -408,12 +408,12 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("mask_positions", &py_mask_positions, "", py::arg("mask"), - // py::arg("cumsum"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + m.def("mask_positions", &py_mask_positions, "", py::arg("mask"), + py::arg("cumsum"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); - // m.def("_cumsum_1d", &py_cumsum_1d, "", py::arg("src"), py::arg("cumsum"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_cumsum_1d", &py_cumsum_1d, "", py::arg("src"), py::arg("cumsum"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); // m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"), // py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"), From 69b36b2a3bef5c46d5ea56e58afc8e4dcd2366a1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 05:50:52 -0800 Subject: [PATCH 051/145] Move ti._extract(), ti._place(), ti._nonzero() --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../kernels/boolean_advanced_indexing.hpp | 853 +++++++++++++++++ .../source/boolean_advanced_indexing.cpp | 859 ++++++++++++++++++ .../source/boolean_advanced_indexing.hpp | 81 ++ .../tensor/libtensor/source/tensor_ctors.cpp | 30 +- 5 files changed, 1809 insertions(+), 16 deletions(-) create mode 100644 
dpctl_ext/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index f18481d08189..88507aa2192f 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -53,7 +53,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp new file mode 100644 index 000000000000..046ad87d7d78 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -0,0 +1,853 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for advanced tensor index operations. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace dpctl::tensor::kernels::indexing +{ + +using dpctl::tensor::ssize_t; +using namespace dpctl::tensor::offset_utils; + +template +struct MaskedExtractStridedFunctor +{ + MaskedExtractStridedFunctor(const dataT *src_data_p, + const indT *cumsum_data_p, + dataT *dst_data_p, + std::size_t masked_iter_size, + const OrthogIndexerT &orthog_src_dst_indexer_, + const MaskedSrcIndexerT &masked_src_indexer_, + const MaskedDstIndexerT &masked_dst_indexer_, + const LocalAccessorT &lacc_) + : src(src_data_p), cumsum(cumsum_data_p), dst(dst_data_p), + masked_nelems(masked_iter_size), + orthog_src_dst_indexer(orthog_src_dst_indexer_), + masked_src_indexer(masked_src_indexer_), + masked_dst_indexer(masked_dst_indexer_), lacc(lacc_) + { + static_assert( + std::is_same_v); + } + + void operator()(sycl::nd_item<2> ndit) const + { + const std::size_t orthog_i = ndit.get_global_id(0); + const std::uint32_t l_i = ndit.get_local_id(1); + const std::uint32_t lws = ndit.get_local_range(1); + + const std::size_t masked_i = ndit.get_global_id(1); + const std::size_t masked_block_start = masked_i - l_i; + + const std::size_t max_offset = masked_nelems + 1; + for (std::uint32_t i = l_i; i < lacc.size(); i += lws) { + const std::size_t offset = masked_block_start + i; + lacc[i] = (offset == 0) ? indT(0) + : (offset < max_offset) ? cumsum[offset - 1] + : cumsum[masked_nelems - 1] + 1; + } + + sycl::group_barrier(ndit.get_group()); + + const indT current_running_count = lacc[l_i + 1]; + const bool mask_set = (masked_i == 0) + ? (current_running_count == 1) + : (current_running_count == lacc[l_i] + 1); + + // dst[cumsum[i] - 1, j] = src[i, j] + // if cumsum[i] == ((i > 0) ? 
cumsum[i-1] + 1 : 1) + if (mask_set && (masked_i < masked_nelems)) { + const auto &orthog_offsets = orthog_src_dst_indexer(orthog_i); + + const std::size_t total_src_offset = + masked_src_indexer(masked_i) + + orthog_offsets.get_first_offset(); + const std::size_t total_dst_offset = + masked_dst_indexer(current_running_count - 1) + + orthog_offsets.get_second_offset(); + + dst[total_dst_offset] = src[total_src_offset]; + } + } + +private: + const dataT *src = nullptr; + const indT *cumsum = nullptr; + dataT *dst = nullptr; + std::size_t masked_nelems = 0; + // has nd, shape, src_strides, dst_strides for + // dimensions that ARE NOT masked + OrthogIndexerT orthog_src_dst_indexer; + // has nd, shape, src_strides for + // dimensions that ARE masked + MaskedSrcIndexerT masked_src_indexer; + // has 1, dst_strides for dimensions that ARE masked + MaskedDstIndexerT masked_dst_indexer; + LocalAccessorT lacc; +}; + +template +struct MaskedPlaceStridedFunctor +{ + MaskedPlaceStridedFunctor(dataT *dst_data_p, + const indT *cumsum_data_p, + const dataT *rhs_data_p, + std::size_t masked_iter_size, + const OrthogIndexerT &orthog_dst_rhs_indexer_, + const MaskedDstIndexerT &masked_dst_indexer_, + const MaskedRhsIndexerT &masked_rhs_indexer_, + const LocalAccessorT &lacc_) + : dst(dst_data_p), cumsum(cumsum_data_p), rhs(rhs_data_p), + masked_nelems(masked_iter_size), + orthog_dst_rhs_indexer(orthog_dst_rhs_indexer_), + masked_dst_indexer(masked_dst_indexer_), + masked_rhs_indexer(masked_rhs_indexer_), lacc(lacc_) + { + static_assert( + std::is_same_v); + } + + void operator()(sycl::nd_item<2> ndit) const + { + const std::size_t orthog_i = ndit.get_global_id(0); + const std::uint32_t l_i = ndit.get_local_id(1); + const std::uint32_t lws = ndit.get_local_range(1); + + const std::size_t masked_i = ndit.get_global_id(1); + const std::size_t masked_block_start = masked_i - l_i; + + const std::size_t max_offset = masked_nelems + 1; + for (std::uint32_t i = l_i; i < lacc.size(); i += 
lws) { + const std::size_t offset = masked_block_start + i; + lacc[i] = (offset == 0) ? indT(0) + : (offset < max_offset) ? cumsum[offset - 1] + : cumsum[masked_nelems - 1] + 1; + } + + sycl::group_barrier(ndit.get_group()); + + const indT current_running_count = lacc[l_i + 1]; + const bool mask_set = (masked_i == 0) + ? (current_running_count == 1) + : (current_running_count == lacc[l_i] + 1); + + // src[i, j] = rhs[cumsum[i] - 1, j] + // if cumsum[i] == ((i > 0) ? cumsum[i-1] + 1 : 1) + if (mask_set && (masked_i < masked_nelems)) { + const auto &orthog_offsets = orthog_dst_rhs_indexer(orthog_i); + + const std::size_t total_dst_offset = + masked_dst_indexer(masked_i) + + orthog_offsets.get_first_offset(); + const std::size_t total_rhs_offset = + masked_rhs_indexer(current_running_count - 1) + + orthog_offsets.get_second_offset(); + + dst[total_dst_offset] = rhs[total_rhs_offset]; + } + } + +private: + dataT *dst = nullptr; + const indT *cumsum = nullptr; + const dataT *rhs = nullptr; + std::size_t masked_nelems = 0; + // has nd, shape, dst_strides, rhs_strides for + // dimensions that ARE NOT masked + OrthogIndexerT orthog_dst_rhs_indexer; + // has nd, shape, dst_strides for + // dimensions that ARE masked + MaskedDstIndexerT masked_dst_indexer; + // has 1, rhs_strides for dimensions that ARE masked + MaskedRhsIndexerT masked_rhs_indexer; + LocalAccessorT lacc; +}; + +// ======= Masked extraction ================================ + +namespace detail +{ + +template +std::size_t _get_lws_impl(std::size_t n) +{ + if constexpr (sizeof...(IR) == 0) { + return I; + } + else { + return (n < I) ? 
_get_lws_impl(n) : I; + } +} + +inline std::size_t get_lws(std::size_t n) +{ + static constexpr std::size_t lws0 = 256u; + static constexpr std::size_t lws1 = 128u; + static constexpr std::size_t lws2 = 64u; + return _get_lws_impl(n); +} + +} // end of namespace detail + +template +class masked_extract_all_slices_contig_impl_krn; + +typedef sycl::event (*masked_extract_all_slices_contig_impl_fn_ptr_t)( + sycl::queue &, + ssize_t, + const char *, + const char *, + char *, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event masked_extract_all_slices_contig_impl( + sycl::queue &exec_q, + ssize_t iteration_size, + const char *src_p, + const char *cumsum_p, + char *dst_p, + ssize_t dst_size, // dst is 1D + ssize_t dst_stride, + const std::vector &depends = {}) +{ + static constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{}; + + static constexpr NoOpIndexer masked_src_indexer{}; + const Strided1DIndexer masked_dst_indexer(/* size */ dst_size, + /* step */ dst_stride); + + using KernelName = + class masked_extract_all_slices_contig_impl_krn; + + using LocalAccessorT = sycl::local_accessor; + using Impl = + struct MaskedExtractStridedFunctor; + + const std::size_t masked_extent = iteration_size; + + const std::size_t lws = detail::get_lws(masked_extent); + + const std::size_t n_groups = (iteration_size + lws - 1) / lws; + + sycl::range<2> gRange{1, n_groups * lws}; + sycl::range<2> lRange{1, lws}; + + sycl::nd_range<2> ndRange(gRange, lRange); + + const dataT *src_tp = reinterpret_cast(src_p); + const indT *cumsum_tp = reinterpret_cast(cumsum_p); + dataT *dst_tp = reinterpret_cast(dst_p); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t lacc_size = std::min(lws, masked_extent) + 1; + LocalAccessorT lacc(lacc_size, cgh); + + cgh.parallel_for( + ndRange, Impl(src_tp, cumsum_tp, dst_tp, masked_extent, + orthog_src_dst_indexer, masked_src_indexer, + masked_dst_indexer, lacc)); + }); + + 
return comp_ev; +} + +template +class masked_extract_all_slices_strided_impl_krn; + +typedef sycl::event (*masked_extract_all_slices_strided_impl_fn_ptr_t)( + sycl::queue &, + ssize_t, + const char *, + const char *, + char *, + int, + ssize_t const *, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event masked_extract_all_slices_strided_impl( + sycl::queue &exec_q, + ssize_t iteration_size, + const char *src_p, + const char *cumsum_p, + char *dst_p, + int nd, + const ssize_t + *packed_src_shape_strides, // [src_shape, src_strides], length 2*nd + ssize_t dst_size, // dst is 1D + ssize_t dst_stride, + const std::vector &depends = {}) +{ + static constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{}; + + /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const + * *_packed_shape_strides) */ + const StridedIndexer masked_src_indexer(nd, 0, packed_src_shape_strides); + const Strided1DIndexer masked_dst_indexer(/* size */ dst_size, + /* step */ dst_stride); + + using KernelName = class masked_extract_all_slices_strided_impl_krn< + StridedIndexer, Strided1DIndexer, dataT, indT>; + + using LocalAccessorT = sycl::local_accessor; + using Impl = + struct MaskedExtractStridedFunctor; + + const std::size_t masked_nelems = iteration_size; + + const std::size_t lws = detail::get_lws(masked_nelems); + + const std::size_t n_groups = (masked_nelems + lws - 1) / lws; + + sycl::range<2> gRange{1, n_groups * lws}; + sycl::range<2> lRange{1, lws}; + + sycl::nd_range<2> ndRange(gRange, lRange); + + const dataT *src_tp = reinterpret_cast(src_p); + const indT *cumsum_tp = reinterpret_cast(cumsum_p); + dataT *dst_tp = reinterpret_cast(dst_p); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t lacc_size = std::min(lws, masked_nelems) + 1; + LocalAccessorT lacc(lacc_size, cgh); + + cgh.parallel_for( + ndRange, Impl(src_tp, cumsum_tp, dst_tp, iteration_size, + orthog_src_dst_indexer, masked_src_indexer, + 
masked_dst_indexer, lacc)); + }); + + return comp_ev; +} + +typedef sycl::event (*masked_extract_some_slices_strided_impl_fn_ptr_t)( + sycl::queue &, + ssize_t, + ssize_t, + const char *, + const char *, + char *, + int, + ssize_t const *, + ssize_t, + ssize_t, + int, + ssize_t const *, + ssize_t, + ssize_t, + const std::vector &); + +template +class masked_extract_some_slices_strided_impl_krn; + +template +sycl::event masked_extract_some_slices_strided_impl( + sycl::queue &exec_q, + ssize_t orthog_nelems, + ssize_t masked_nelems, + const char *src_p, + const char *cumsum_p, + char *dst_p, + int orthog_nd, + // [ortho_shape, ortho_src_strides, // ortho_dst_strides], + // length 3*ortho_nd + const ssize_t *packed_ortho_src_dst_shape_strides, + ssize_t ortho_src_offset, + ssize_t ortho_dst_offset, + int masked_nd, + // [masked_src_shape, masked_src_strides], + // length 2*masked_nd, mask_dst is 1D + const ssize_t *packed_masked_src_shape_strides, + ssize_t masked_dst_size, + ssize_t masked_dst_stride, + const std::vector &depends = {}) +{ + const TwoOffsets_StridedIndexer orthog_src_dst_indexer{ + orthog_nd, ortho_src_offset, ortho_dst_offset, + packed_ortho_src_dst_shape_strides}; + + const StridedIndexer masked_src_indexer{masked_nd, 0, + packed_masked_src_shape_strides}; + const Strided1DIndexer masked_dst_indexer{/* size */ masked_dst_size, + /* step */ masked_dst_stride}; + + using KernelName = class masked_extract_some_slices_strided_impl_krn< + TwoOffsets_StridedIndexer, StridedIndexer, Strided1DIndexer, dataT, + indT>; + + using LocalAccessorT = sycl::local_accessor; + using Impl = + struct MaskedExtractStridedFunctor; + + const std::size_t masked_extent = masked_nelems; + + const std::size_t lws = detail::get_lws(masked_extent); + + const std::size_t n_groups = ((masked_extent + lws - 1) / lws); + const std::size_t orthog_extent = static_cast(orthog_nelems); + + sycl::range<2> gRange{orthog_extent, n_groups * lws}; + sycl::range<2> lRange{1, lws}; + + 
sycl::nd_range<2> ndRange(gRange, lRange); + + const dataT *src_tp = reinterpret_cast(src_p); + const indT *cumsum_tp = reinterpret_cast(cumsum_p); + dataT *dst_tp = reinterpret_cast(dst_p); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t lacc_size = + std::min(lws, masked_extent) + 1; + LocalAccessorT lacc(lacc_size, cgh); + + cgh.parallel_for( + ndRange, Impl(src_tp, cumsum_tp, dst_tp, masked_nelems, + orthog_src_dst_indexer, masked_src_indexer, + masked_dst_indexer, lacc)); + }); + + return comp_ev; +} + +template +struct MaskExtractAllSlicesContigFactoryForInt32 +{ + fnT get() + { + fnT fn = masked_extract_all_slices_contig_impl; + return fn; + } +}; + +template +struct MaskExtractAllSlicesContigFactoryForInt64 +{ + fnT get() + { + fnT fn = masked_extract_all_slices_contig_impl; + return fn; + } +}; + +template +struct MaskExtractAllSlicesStridedFactoryForInt32 +{ + fnT get() + { + fnT fn = masked_extract_all_slices_strided_impl; + return fn; + } +}; + +template +struct MaskExtractAllSlicesStridedFactoryForInt64 +{ + fnT get() + { + fnT fn = masked_extract_all_slices_strided_impl; + return fn; + } +}; + +template +struct MaskExtractSomeSlicesStridedFactoryForInt32 +{ + fnT get() + { + fnT fn = masked_extract_some_slices_strided_impl; + return fn; + } +}; + +template +struct MaskExtractSomeSlicesStridedFactoryForInt64 +{ + fnT get() + { + fnT fn = masked_extract_some_slices_strided_impl; + return fn; + } +}; + +// Masked placement + +template +class masked_place_all_slices_strided_impl_krn; + +typedef sycl::event (*masked_place_all_slices_strided_impl_fn_ptr_t)( + sycl::queue &, + ssize_t, + char *, + const char *, + const char *, + int, + ssize_t const *, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event masked_place_all_slices_strided_impl( + sycl::queue &exec_q, + ssize_t iteration_size, + char *dst_p, + const char *cumsum_p, + const char *rhs_p, + int nd, + const ssize_t 
+ *packed_dst_shape_strides, // [dst_shape, dst_strides], length 2*nd + ssize_t rhs_size, // rhs is 1D + ssize_t rhs_stride, + const std::vector &depends = {}) +{ + static constexpr TwoZeroOffsets_Indexer orthog_dst_rhs_indexer{}; + + /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const + * *_packed_shape_strides) */ + const StridedIndexer masked_dst_indexer(nd, 0, packed_dst_shape_strides); + const Strided1DCyclicIndexer masked_rhs_indexer(0, rhs_size, rhs_stride); + + using KernelName = class masked_place_all_slices_strided_impl_krn< + TwoZeroOffsets_Indexer, StridedIndexer, Strided1DCyclicIndexer, dataT, + indT>; + + static constexpr std::size_t nominal_lws = 256; + const std::size_t masked_extent = iteration_size; + const std::size_t lws = std::min(masked_extent, nominal_lws); + + const std::size_t n_groups = (masked_extent + lws - 1) / lws; + + sycl::range<2> gRange{1, n_groups * lws}; + sycl::range<2> lRange{1, lws}; + sycl::nd_range<2> ndRange{gRange, lRange}; + + using LocalAccessorT = sycl::local_accessor; + using Impl = + MaskedPlaceStridedFunctor; + + dataT *dst_tp = reinterpret_cast(dst_p); + const dataT *rhs_tp = reinterpret_cast(rhs_p); + const indT *cumsum_tp = reinterpret_cast(cumsum_p); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t lacc_size = std::min(masked_extent, lws) + 1; + LocalAccessorT lacc(lacc_size, cgh); + + cgh.parallel_for( + ndRange, Impl(dst_tp, cumsum_tp, rhs_tp, iteration_size, + orthog_dst_rhs_indexer, masked_dst_indexer, + masked_rhs_indexer, lacc)); + }); + + return comp_ev; +} + +typedef sycl::event (*masked_place_some_slices_strided_impl_fn_ptr_t)( + sycl::queue &, + ssize_t, + ssize_t, + char *, + const char *, + const char *, + int, + ssize_t const *, + ssize_t, + ssize_t, + int, + ssize_t const *, + ssize_t, + ssize_t, + const std::vector &); + +template +class masked_place_some_slices_strided_impl_krn; + +template +sycl::event 
masked_place_some_slices_strided_impl( + sycl::queue &exec_q, + ssize_t orthog_nelems, + ssize_t masked_nelems, + char *dst_p, + const char *cumsum_p, + const char *rhs_p, + int orthog_nd, + // [ortho_shape, ortho_dst_strides, ortho_rhs_strides], + // length 3*ortho_nd + const ssize_t *packed_ortho_dst_rhs_shape_strides, + ssize_t ortho_dst_offset, + ssize_t ortho_rhs_offset, + int masked_nd, + // [masked_dst_shape, masked_dst_strides], + // length 2*masked_nd, mask_dst is 1D + const ssize_t *packed_masked_dst_shape_strides, + ssize_t masked_rhs_size, + ssize_t masked_rhs_stride, + const std::vector &depends = {}) +{ + const TwoOffsets_StridedIndexer orthog_dst_rhs_indexer{ + orthog_nd, ortho_dst_offset, ortho_rhs_offset, + packed_ortho_dst_rhs_shape_strides}; + + /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const + * *_packed_shape_strides) */ + const StridedIndexer masked_dst_indexer{masked_nd, 0, + packed_masked_dst_shape_strides}; + const Strided1DCyclicIndexer masked_rhs_indexer{0, masked_rhs_size, + masked_rhs_stride}; + + using KernelName = class masked_place_some_slices_strided_impl_krn< + TwoOffsets_StridedIndexer, StridedIndexer, Strided1DCyclicIndexer, + dataT, indT>; + + static constexpr std::size_t nominal_lws = 256; + const std::size_t orthog_extent = orthog_nelems; + const std::size_t masked_extent = masked_nelems; + const std::size_t lws = std::min(masked_extent, nominal_lws); + + const std::size_t n_groups = (masked_extent + lws - 1) / lws; + + sycl::range<2> gRange{orthog_extent, n_groups * lws}; + sycl::range<2> lRange{1, lws}; + sycl::nd_range<2> ndRange{gRange, lRange}; + + using LocalAccessorT = sycl::local_accessor; + using Impl = + MaskedPlaceStridedFunctor; + + dataT *dst_tp = reinterpret_cast(dst_p); + const dataT *rhs_tp = reinterpret_cast(rhs_p); + const indT *cumsum_tp = reinterpret_cast(cumsum_p); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t lacc_size = 
std::min(masked_extent, lws) + 1; + LocalAccessorT lacc(lacc_size, cgh); + + cgh.parallel_for( + ndRange, Impl(dst_tp, cumsum_tp, rhs_tp, masked_nelems, + orthog_dst_rhs_indexer, masked_dst_indexer, + masked_rhs_indexer, lacc)); + }); + + return comp_ev; +} + +template +struct MaskPlaceAllSlicesStridedFactoryForInt32 +{ + fnT get() + { + fnT fn = masked_place_all_slices_strided_impl; + return fn; + } +}; + +template +struct MaskPlaceAllSlicesStridedFactoryForInt64 +{ + fnT get() + { + fnT fn = masked_place_all_slices_strided_impl; + return fn; + } +}; + +template +struct MaskPlaceSomeSlicesStridedFactoryForInt32 +{ + fnT get() + { + fnT fn = masked_place_some_slices_strided_impl; + return fn; + } +}; + +template +struct MaskPlaceSomeSlicesStridedFactoryForInt64 +{ + fnT get() + { + fnT fn = masked_place_some_slices_strided_impl; + return fn; + } +}; + +// Non-zero + +template +class non_zero_indexes_krn; + +typedef sycl::event (*non_zero_indexes_fn_ptr_t)( + sycl::queue &, + ssize_t, + ssize_t, + int, + const char *, + char *, + const ssize_t *, + std::vector const &); + +template +sycl::event non_zero_indexes_impl(sycl::queue &exec_q, + ssize_t iter_size, + ssize_t nz_elems, + int nd, + const char *cumsum_cp, + char *indexes_cp, + const ssize_t *mask_shape, + std::vector const &depends) +{ + const indT1 *cumsum_data = reinterpret_cast(cumsum_cp); + indT2 *indexes_data = reinterpret_cast(indexes_cp); + + static constexpr std::size_t nominal_lws = 256u; + const std::size_t masked_extent = iter_size; + const std::size_t lws = std::min(masked_extent, nominal_lws); + + const std::size_t n_groups = (masked_extent + lws - 1) / lws; + sycl::range<1> gRange{n_groups * lws}; + sycl::range<1> lRange{lws}; + + sycl::nd_range<1> ndRange{gRange, lRange}; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t lacc_size = std::min(lws, masked_extent) + 1; + sycl::local_accessor lacc(lacc_size, cgh); + + using KernelName 
= class non_zero_indexes_krn; + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> ndit) { + const std::size_t group_i = ndit.get_group(0); + const std::uint32_t l_i = ndit.get_local_id(0); + const std::uint32_t lws = ndit.get_local_range(0); + + const std::size_t masked_block_start = group_i * lws; + + for (std::uint32_t i = l_i; i < lacc.size(); i += lws) { + const std::size_t offset = masked_block_start + i; + lacc[i] = (offset == 0) ? indT1(0) + : (offset - 1 < masked_extent) + ? cumsum_data[offset - 1] + : cumsum_data[masked_extent - 1] + 1; + } + + sycl::group_barrier(ndit.get_group()); + + const std::size_t i = masked_block_start + l_i; + const auto cs_val = lacc[l_i]; + const bool cond = (lacc[l_i + 1] == cs_val + 1); + + if (cond && (i < masked_extent)) { + ssize_t i_ = static_cast(i); + for (int dim = nd; --dim > 0;) { + const auto sd = mask_shape[dim]; + const ssize_t q = i_ / sd; + const ssize_t r = (i_ - q * sd); + indexes_data[cs_val + dim * nz_elems] = + static_cast(r); + i_ = q; + } + indexes_data[cs_val] = static_cast(i_); + } + }); + }); + + return comp_ev; +} + +} // namespace dpctl::tensor::kernels::indexing diff --git a/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.cpp new file mode 100644 index 000000000000..a78cb1750b81 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.cpp @@ -0,0 +1,859 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines implementation functions of dpctl.tensor.place and +/// dpctl.tensor.extract, dpctl.tensor.nonzero +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "boolean_advanced_indexing.hpp" +#include "kernels/boolean_advanced_indexing.hpp" + +namespace dpctl::tensor::py_internal +{ + +// Masked extraction + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::indexing:: + masked_extract_all_slices_strided_impl_fn_ptr_t; + +static masked_extract_all_slices_strided_impl_fn_ptr_t + masked_extract_all_slices_strided_i32_impl_dispatch_vector + [td_ns::num_types]; +static masked_extract_all_slices_strided_impl_fn_ptr_t + masked_extract_all_slices_strided_i64_impl_dispatch_vector + [td_ns::num_types]; + +using dpctl::tensor::kernels::indexing:: + masked_extract_all_slices_contig_impl_fn_ptr_t; + +static masked_extract_all_slices_contig_impl_fn_ptr_t + masked_extract_all_slices_contig_i32_impl_dispatch_vector[td_ns::num_types]; +static masked_extract_all_slices_contig_impl_fn_ptr_t + masked_extract_all_slices_contig_i64_impl_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::indexing:: + masked_extract_some_slices_strided_impl_fn_ptr_t; + +static masked_extract_some_slices_strided_impl_fn_ptr_t + masked_extract_some_slices_strided_i32_impl_dispatch_vector + [td_ns::num_types]; +static 
masked_extract_some_slices_strided_impl_fn_ptr_t + masked_extract_some_slices_strided_i64_impl_dispatch_vector + [td_ns::num_types]; + +void populate_masked_extract_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::indexing:: + MaskExtractAllSlicesStridedFactoryForInt32; + td_ns::DispatchVectorBuilder< + masked_extract_all_slices_strided_impl_fn_ptr_t, + MaskExtractAllSlicesStridedFactoryForInt32, td_ns::num_types> + dvb1; + dvb1.populate_dispatch_vector( + masked_extract_all_slices_strided_i32_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskExtractAllSlicesStridedFactoryForInt64; + td_ns::DispatchVectorBuilder< + masked_extract_all_slices_strided_impl_fn_ptr_t, + MaskExtractAllSlicesStridedFactoryForInt64, td_ns::num_types> + dvb2; + dvb2.populate_dispatch_vector( + masked_extract_all_slices_strided_i64_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskExtractSomeSlicesStridedFactoryForInt32; + td_ns::DispatchVectorBuilder< + masked_extract_some_slices_strided_impl_fn_ptr_t, + MaskExtractSomeSlicesStridedFactoryForInt32, td_ns::num_types> + dvb3; + dvb3.populate_dispatch_vector( + masked_extract_some_slices_strided_i32_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskExtractSomeSlicesStridedFactoryForInt64; + td_ns::DispatchVectorBuilder< + masked_extract_some_slices_strided_impl_fn_ptr_t, + MaskExtractSomeSlicesStridedFactoryForInt64, td_ns::num_types> + dvb4; + dvb4.populate_dispatch_vector( + masked_extract_some_slices_strided_i64_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskExtractAllSlicesContigFactoryForInt32; + td_ns::DispatchVectorBuilder + dvb5; + dvb5.populate_dispatch_vector( + masked_extract_all_slices_contig_i32_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskExtractAllSlicesContigFactoryForInt64; + td_ns::DispatchVectorBuilder + dvb6; + dvb6.populate_dispatch_vector( + 
masked_extract_all_slices_contig_i64_impl_dispatch_vector); +} + +std::pair + py_extract(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + int src_nd = src.get_ndim(); + if ((axis_start < 0 || axis_end > src_nd || axis_start >= axis_end)) { + throw py::value_error("Specified axes_start and axes_end are invalid."); + } + int mask_span_sz = axis_end - axis_start; + + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd + (mask_span_sz - 1)) { + throw py::value_error("Number of dimensions of source and destination " + "arrays is not consistent"); + } + + if (!cumsum.is_c_contiguous() || cumsum.get_ndim() != 1) { + throw py::value_error("cumsum array must be a C-contiguous vector"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, cumsum, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + py::ssize_t cumsum_sz = cumsum.get_size(); + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool same_ortho_dims(true); + std::size_t ortho_nelems(1); // number of orthogonal iterations + + for (auto i = 0; i < axis_start; ++i) { + auto src_sh_i = src_shape[i]; + ortho_nelems *= src_sh_i; + same_ortho_dims = same_ortho_dims && (src_sh_i == dst_shape[i]); + } + for (auto i = axis_end; i < src_nd; ++i) { + auto src_sh_i = src_shape[i]; + ortho_nelems *= src_sh_i; + same_ortho_dims = + same_ortho_dims && (src_sh_i == dst_shape[i - (mask_span_sz - 1)]); + } + + std::size_t masked_src_nelems(1); + std::size_t masked_dst_nelems(dst_shape[axis_start]); + for (auto i = axis_start; i < axis_end; ++i) { + masked_src_nelems *= src_shape[i]; + } + + // masked_dst_nelems is number of set elements in 
the mask, or last element + // in cumsum + if (!same_ortho_dims || + (masked_src_nelems != static_cast(cumsum_sz))) + { + throw py::value_error("Inconsistent array dimensions"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, ortho_nelems * masked_dst_nelems); + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src, not with cumsum. + if (overlap(dst, cumsum) || overlap(dst, src)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + static constexpr int int32_typeid = + static_cast(td_ns::typenum_t::INT32); + static constexpr int int64_typeid = + static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) { + throw py::value_error("Unexpected data type of cumsum array, expecting " + "'int32' or 'int64'"); + } + + const bool use_i32 = (cumsum_typeid == int32_typeid); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data types"); + } + + char *src_data_p = src.get_data(); + char *dst_data_p = dst.get_data(); + char *cumsum_data_p = cumsum.get_data(); + + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + sycl::event extract_ev; + std::vector host_task_events{}; + if (axis_start == 0 && axis_end == src_nd) { + assert(dst_shape_vec.size() == 1); + assert(dst_strides_vec.size() == 1); + + if (src.is_c_contiguous()) { + auto fn = 
+ (use_i32) + ? masked_extract_all_slices_contig_i32_impl_dispatch_vector + [src_typeid] + : masked_extract_all_slices_contig_i64_impl_dispatch_vector + [src_typeid]; + + extract_ev = + fn(exec_q, cumsum_sz, src_data_p, cumsum_data_p, dst_data_p, + dst_shape_vec[0], dst_strides_vec[0], depends); + + // + host_task_events.push_back(extract_ev); + } + else { + // empty orthogonal directions + auto fn = + (use_i32) + ? masked_extract_all_slices_strided_i32_impl_dispatch_vector + [src_typeid] + : masked_extract_all_slices_strided_i64_impl_dispatch_vector + [src_typeid]; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_src_shape_strides_ev = + std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shape_strides = + packed_src_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_src_shape_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + extract_ev = fn(exec_q, cumsum_sz, src_data_p, cumsum_data_p, + dst_data_p, src_nd, packed_src_shape_strides, + dst_shape_vec[0], dst_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {extract_ev}, packed_src_shape_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + } + else { + // non-empty orthogonal directions + auto fn = + (use_i32) + ? 
masked_extract_some_slices_strided_i32_impl_dispatch_vector + [src_typeid] + : masked_extract_some_slices_strided_i64_impl_dispatch_vector + [src_typeid]; + + int masked_src_nd = mask_span_sz; + int ortho_nd = src_nd - masked_src_nd; + + using shT = std::vector; + + shT ortho_src_shape; + shT masked_src_shape; + shT ortho_src_strides; + shT masked_src_strides; + dpctl::tensor::py_internal::split_iteration_space( + src_shape_vec, src_strides_vec, axis_start, axis_end, + ortho_src_shape, + masked_src_shape, // 4 vectors modified + ortho_src_strides, masked_src_strides); + + shT ortho_dst_shape; + shT masked_dst_shape; + shT ortho_dst_strides; + shT masked_dst_strides; + dpctl::tensor::py_internal::split_iteration_space( + dst_shape_vec, dst_strides_vec, axis_start, axis_start + 1, + ortho_dst_shape, + masked_dst_shape, // 4 vectors modified + ortho_dst_strides, masked_dst_strides); + + assert(ortho_src_shape.size() == static_cast(ortho_nd)); + assert(ortho_dst_shape.size() == static_cast(ortho_nd)); + assert(std::equal(ortho_src_shape.begin(), ortho_src_shape.end(), + ortho_dst_shape.begin())); + + std::vector simplified_ortho_shape; + std::vector simplified_ortho_src_strides; + std::vector simplified_ortho_dst_strides; + + const py::ssize_t *_shape = ortho_src_shape.data(); + + py::ssize_t ortho_src_offset(0); + py::ssize_t ortho_dst_offset(0); + + dpctl::tensor::py_internal::simplify_iteration_space( + ortho_nd, _shape, ortho_src_strides, ortho_dst_strides, + // output + simplified_ortho_shape, simplified_ortho_src_strides, + simplified_ortho_dst_strides, ortho_src_offset, ortho_dst_offset); + + assert(masked_dst_shape.size() == 1); + assert(masked_dst_strides.size() == 1); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_ortho_shape, + simplified_ortho_src_strides, simplified_ortho_dst_strides, + masked_src_shape, masked_src_strides); + auto 
packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); + + const py::ssize_t *packed_ortho_src_dst_shape_strides = + packed_shapes_strides; + const py::ssize_t *packed_masked_src_shape_strides = + packed_shapes_strides + (3 * ortho_nd); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + // OrthogIndexerT orthog_src_dst_indexer_, MaskedIndexerT + // masked_src_indexer_, MaskedIndexerT masked_dst_indexer_ + extract_ev = fn(exec_q, ortho_nelems, masked_src_nelems, src_data_p, + cumsum_data_p, dst_data_p, + // data to build orthog_src_dst_indexer + ortho_nd, packed_ortho_src_dst_shape_strides, + ortho_src_offset, ortho_dst_offset, + // data to build masked_src_indexer + masked_src_nd, packed_masked_src_shape_strides, + // data to build masked_dst_indexer, + masked_dst_shape[0], masked_dst_strides[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {extract_ev}, packed_shapes_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {src, cumsum, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, extract_ev); +} + +// Masked placement + +using dpctl::tensor::kernels::indexing:: + masked_place_all_slices_strided_impl_fn_ptr_t; + +static masked_place_all_slices_strided_impl_fn_ptr_t + masked_place_all_slices_strided_i32_impl_dispatch_vector[td_ns::num_types]; +static masked_place_all_slices_strided_impl_fn_ptr_t + masked_place_all_slices_strided_i64_impl_dispatch_vector[td_ns::num_types]; + +using 
dpctl::tensor::kernels::indexing:: + masked_place_some_slices_strided_impl_fn_ptr_t; + +static masked_place_some_slices_strided_impl_fn_ptr_t + masked_place_some_slices_strided_i32_impl_dispatch_vector[td_ns::num_types]; +static masked_place_some_slices_strided_impl_fn_ptr_t + masked_place_some_slices_strided_i64_impl_dispatch_vector[td_ns::num_types]; + +void populate_masked_place_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::indexing:: + MaskPlaceAllSlicesStridedFactoryForInt32; + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector( + masked_place_all_slices_strided_i32_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskPlaceAllSlicesStridedFactoryForInt64; + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector( + masked_place_all_slices_strided_i64_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskPlaceSomeSlicesStridedFactoryForInt32; + td_ns::DispatchVectorBuilder + dvb3; + dvb3.populate_dispatch_vector( + masked_place_some_slices_strided_i32_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskPlaceSomeSlicesStridedFactoryForInt64; + td_ns::DispatchVectorBuilder + dvb4; + dvb4.populate_dispatch_vector( + masked_place_some_slices_strided_i64_impl_dispatch_vector); +} + +/* + * @brief Copy dst[i, ortho_id] = rhs[cumsum[i] - 1, ortho_id] if cumsum[i] == + * ((i > 0) ? 
cumsum[i-1] + 1 : 1) + */ +std::pair + py_place(const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + const dpctl::tensor::usm_ndarray &rhs, + sycl::queue &exec_q, + const std::vector &depends) +{ + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + int dst_nd = dst.get_ndim(); + if ((axis_start < 0 || axis_end > dst_nd || axis_start >= axis_end)) { + throw py::value_error("Specified axes_start and axes_end are invalid."); + } + int mask_span_sz = axis_end - axis_start; + + int rhs_nd = rhs.get_ndim(); + if (dst_nd != rhs_nd + (mask_span_sz - 1)) { + throw py::value_error("Number of dimensions of source and destination " + "arrays is not consistent"); + } + + if (!cumsum.is_c_contiguous() || cumsum.get_ndim() != 1) { + throw py::value_error("cumsum array must be a C-contiguous vector"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst, cumsum, rhs})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + py::ssize_t cumsum_sz = cumsum.get_size(); + + const py::ssize_t *dst_shape = dst.get_shape_raw(); + const py::ssize_t *rhs_shape = rhs.get_shape_raw(); + bool same_ortho_dims(true); + std::size_t ortho_nelems(1); // number of orthogonal iterations + + for (auto i = 0; i < axis_start; ++i) { + auto dst_sh_i = dst_shape[i]; + ortho_nelems *= dst_sh_i; + same_ortho_dims = same_ortho_dims && (dst_sh_i == rhs_shape[i]); + } + for (auto i = axis_end; i < dst_nd; ++i) { + auto dst_sh_i = dst_shape[i]; + ortho_nelems *= dst_sh_i; + same_ortho_dims = + same_ortho_dims && (dst_sh_i == rhs_shape[i - (mask_span_sz - 1)]); + } + + std::size_t masked_dst_nelems(1); + for (auto i = axis_start; i < axis_end; ++i) { + masked_dst_nelems *= dst_shape[i]; + } + + if (!same_ortho_dims || + (masked_dst_nelems != static_cast(cumsum_sz))) + { + throw py::value_error("Inconsistent array dimensions"); + } + + 
dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, ortho_nelems * masked_dst_nelems); + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src, not with cumsum. + if (overlap(dst, rhs) || overlap(dst, cumsum)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int dst_typenum = dst.get_typenum(); + int rhs_typenum = rhs.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + int rhs_typeid = array_types.typenum_to_lookup_id(rhs_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + static constexpr int int32_typeid = + static_cast(td_ns::typenum_t::INT32); + static constexpr int int64_typeid = + static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) { + throw py::value_error("Unexpected data type of cumsum array, expecting " + "'int32' or 'int64'"); + } + + const bool use_i32 = (cumsum_typeid == int32_typeid); + + if (dst_typeid != rhs_typeid) { + throw py::value_error( + "Destination array must have the same elemental data types"); + } + + char *dst_data_p = dst.get_data(); + char *rhs_data_p = rhs.get_data(); + char *cumsum_data_p = cumsum.get_data(); + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + auto rhs_shape_vec = rhs.get_shape_vector(); + auto rhs_strides_vec = rhs.get_strides_vector(); + + sycl::event place_ev; + std::vector host_task_events{}; + if (axis_start == 0 && axis_end == dst_nd) { + // empty orthogonal directions + auto fn = (use_i32) + ? 
masked_place_all_slices_strided_i32_impl_dispatch_vector + [dst_typeid] + : masked_place_all_slices_strided_i64_impl_dispatch_vector + [dst_typeid]; + + assert(rhs_shape_vec.size() == 1); + assert(rhs_strides_vec.size() == 1); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, dst_shape_vec, dst_strides_vec); + auto packed_dst_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_dst_shape_strides_ev = + std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_dst_shape_strides = + packed_dst_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_dst_shape_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + place_ev = fn(exec_q, cumsum_sz, dst_data_p, cumsum_data_p, rhs_data_p, + dst_nd, packed_dst_shape_strides, rhs_shape_vec[0], + rhs_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {place_ev}, packed_dst_shape_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + else { + // non-empty orthogonal directions + auto fn = + (use_i32) + ? 
masked_place_some_slices_strided_i32_impl_dispatch_vector + [dst_typeid] + : masked_place_some_slices_strided_i64_impl_dispatch_vector + [dst_typeid]; + + int masked_dst_nd = mask_span_sz; + int ortho_nd = dst_nd - masked_dst_nd; + + using shT = std::vector; + + shT ortho_dst_shape; + shT masked_dst_shape; + shT ortho_dst_strides; + shT masked_dst_strides; + dpctl::tensor::py_internal::split_iteration_space( + dst_shape_vec, dst_strides_vec, axis_start, axis_end, + ortho_dst_shape, + masked_dst_shape, // 4 vectors modified + ortho_dst_strides, masked_dst_strides); + + shT ortho_rhs_shape; + shT masked_rhs_shape; + shT ortho_rhs_strides; + shT masked_rhs_strides; + dpctl::tensor::py_internal::split_iteration_space( + rhs_shape_vec, rhs_strides_vec, axis_start, axis_start + 1, + ortho_rhs_shape, + masked_rhs_shape, // 4 vectors modified + ortho_rhs_strides, masked_rhs_strides); + + assert(ortho_dst_shape.size() == static_cast(ortho_nd)); + assert(ortho_rhs_shape.size() == static_cast(ortho_nd)); + assert(std::equal(ortho_dst_shape.begin(), ortho_dst_shape.end(), + ortho_rhs_shape.begin())); + + std::vector simplified_ortho_shape; + std::vector simplified_ortho_dst_strides; + std::vector simplified_ortho_rhs_strides; + + const py::ssize_t *_shape = ortho_dst_shape.data(); + + py::ssize_t ortho_dst_offset(0); + py::ssize_t ortho_rhs_offset(0); + + dpctl::tensor::py_internal::simplify_iteration_space( + ortho_nd, _shape, ortho_dst_strides, ortho_rhs_strides, + simplified_ortho_shape, simplified_ortho_dst_strides, + simplified_ortho_rhs_strides, ortho_dst_offset, ortho_rhs_offset); + + assert(masked_rhs_shape.size() == 1); + assert(masked_rhs_strides.size() == 1); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_ortho_shape, + simplified_ortho_dst_strides, simplified_ortho_rhs_strides, + masked_dst_shape, masked_dst_strides); + auto packed_shapes_strides_owner 
= + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); + + const py::ssize_t *packed_ortho_dst_rhs_shape_strides = + packed_shapes_strides; + const py::ssize_t *packed_masked_dst_shape_strides = + packed_shapes_strides + (3 * ortho_nd); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + place_ev = fn(exec_q, ortho_nelems, masked_dst_nelems, dst_data_p, + cumsum_data_p, rhs_data_p, + // data to build orthog_dst_rhs_indexer + ortho_nd, packed_ortho_dst_rhs_shape_strides, + ortho_dst_offset, ortho_rhs_offset, + // data to build masked_dst_indexer + masked_dst_nd, packed_masked_dst_shape_strides, + // data to build masked_dst_indexer, + masked_rhs_shape[0], masked_rhs_strides[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {place_ev}, packed_shapes_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {dst, cumsum, rhs}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, place_ev); +} + +// Non-zero + +std::pair + py_nonzero(const dpctl::tensor::usm_ndarray + &cumsum, // int32/int64 input array, 1D, C-contiguous + const dpctl::tensor::usm_ndarray + &indexes, // int32/int64 2D output array, C-contiguous + const std::vector + &mask_shape, // shape of array from which cumsum was computed + sycl::queue &exec_q, + const std::vector &depends) +{ + if (!dpctl::utils::queues_are_compatible(exec_q, {cumsum, indexes})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + 
dpctl::tensor::validation::CheckWritable::throw_if_not_writable(indexes); + + int cumsum_nd = cumsum.get_ndim(); + if (cumsum_nd != 1 || !cumsum.is_c_contiguous()) { + throw py::value_error("Cumsum array must be a C-contiguous vector"); + } + + int indexes_nd = indexes.get_ndim(); + if (indexes_nd != 2 || !indexes.is_c_contiguous()) { + throw py::value_error("Index array must be a C-contiguous matrix"); + } + + std::size_t _ndim = mask_shape.size(); + if (_ndim > std::numeric_limits::max()) { + throw py::value_error("Shape is too large"); + } + int ndim = static_cast(_ndim); + + const py::ssize_t *indexes_shape = indexes.get_shape_raw(); + + if (ndim != indexes_shape[0]) { + throw py::value_error( + "Length of shape must equal width of index matrix"); + } + + auto cumsum_sz = cumsum.get_size(); + py::ssize_t shape_nelems = + std::accumulate(mask_shape.begin(), mask_shape.end(), py::ssize_t(1), + std::multiplies()); + + if (cumsum_sz != shape_nelems) { + throw py::value_error("Shape and cumsum size are not consistent"); + } + + py::ssize_t nz_elems = indexes_shape[1]; + + int indexes_typenum = indexes.get_typenum(); + auto const &array_types = td_ns::usm_ndarray_types(); + int indexes_typeid = array_types.typenum_to_lookup_id(indexes_typenum); + + int cumsum_typenum = cumsum.get_typenum(); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + constexpr int int32_typeid = static_cast(td_ns::typenum_t::INT32); + constexpr int int64_typeid = static_cast(td_ns::typenum_t::INT64); + + // cumsum must be int32_t or int64_t only + if ((cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) || + (indexes_typeid != int32_typeid && indexes_typeid != int64_typeid)) + { + throw py::value_error("Cumulative sum array and index array must have " + "int32 or int64 data-type"); + } + + if (cumsum_sz == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(cumsum, 
indexes)) { + throw py::value_error("Arrays are expected to ave no memory overlap"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + indexes, nz_elems * _ndim); + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto mask_shape_copying_tuple = device_allocate_and_pack( + exec_q, host_task_events, mask_shape); + auto src_shape_device_owner = + std::move(std::get<0>(mask_shape_copying_tuple)); + sycl::event copy_ev = std::get<2>(mask_shape_copying_tuple); + const py::ssize_t *src_shape_device_ptr = src_shape_device_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_ev); + + using dpctl::tensor::kernels::indexing::non_zero_indexes_fn_ptr_t; + using dpctl::tensor::kernels::indexing::non_zero_indexes_impl; + + int fn_index = ((cumsum_typeid == int64_typeid) ? 1 : 0) + + ((indexes_typeid == int64_typeid) ? 
2 : 0); + std::array fn_impls = { + non_zero_indexes_impl, + non_zero_indexes_impl, + non_zero_indexes_impl, + non_zero_indexes_impl}; + auto fn = fn_impls[fn_index]; + + sycl::event non_zero_indexes_ev = + fn(exec_q, cumsum_sz, nz_elems, ndim, cumsum.get_data(), + indexes.get_data(), src_shape_device_ptr, all_deps); + + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {non_zero_indexes_ev}, src_shape_device_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {cumsum, indexes}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, non_zero_indexes_ev); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.hpp new file mode 100644 index 000000000000..71eafc77b00c --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.hpp @@ -0,0 +1,81 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + py_extract(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void populate_masked_extract_dispatch_vectors(void); + +extern std::pair + py_place(const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + const dpctl::tensor::usm_ndarray &rhs, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void populate_masked_place_dispatch_vectors(void); + 
+extern std::pair + py_nonzero(const dpctl::tensor::usm_ndarray + &cumsum, // int32 input array, 1D, C-contiguous + const dpctl::tensor::usm_ndarray + &indexes, // int32 2D output array, C-contiguous + const std::vector + &mask_shape, // shape of array from which cumsum was computed + sycl::queue &exec_q, + const std::vector &depends = {}); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 3c096df8e367..901af48074be 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -44,7 +44,7 @@ #include "dpnp4pybind11.hpp" #include "accumulators.hpp" -// #include "boolean_advanced_indexing.hpp" +#include "boolean_advanced_indexing.hpp" // #include "clip.hpp" #include "copy_and_cast_usm_to_usm.hpp" #include "copy_as_contig.hpp" @@ -112,10 +112,10 @@ using dpctl::tensor::py_internal::usm_ndarray_zeros; using dpctl::tensor::py_internal::usm_ndarray_put; using dpctl::tensor::py_internal::usm_ndarray_take; -// using dpctl::tensor::py_internal::py_extract; +using dpctl::tensor::py_internal::py_extract; using dpctl::tensor::py_internal::py_mask_positions; -// using dpctl::tensor::py_internal::py_nonzero; -// using dpctl::tensor::py_internal::py_place; +using dpctl::tensor::py_internal::py_nonzero; +using dpctl::tensor::py_internal::py_place; /* ================= Repeat ====================*/ using dpctl::tensor::py_internal::py_cumsum_1d; @@ -163,8 +163,8 @@ void init_dispatch_vectors(void) // init_eye_ctor_dispatch_vectors(); init_triul_ctor_dispatch_vectors(); - // populate_masked_extract_dispatch_vectors(); - // populate_masked_place_dispatch_vectors(); + populate_masked_extract_dispatch_vectors(); + populate_masked_place_dispatch_vectors(); populate_mask_positions_dispatch_vectors(); @@ -415,9 +415,9 @@ PYBIND11_MODULE(_tensor_impl, m) m.def("_cumsum_1d", &py_cumsum_1d, "", py::arg("src"), 
py::arg("cumsum"), py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"), - // py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"), + py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); auto overlap = [](const dpctl::tensor::usm_ndarray &x1, const dpctl::tensor::usm_ndarray &x2) -> bool { @@ -438,13 +438,13 @@ PYBIND11_MODULE(_tensor_impl, m) "Determines if the memory regions indexed by each array are the same", py::arg("array1"), py::arg("array2")); - // m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"), - // py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"), + py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_nonzero", &py_nonzero, "", py::arg("cumsum"), py::arg("indexes"), - // py::arg("mask_shape"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + m.def("_nonzero", &py_nonzero, "", py::arg("cumsum"), py::arg("indexes"), + py::arg("mask_shape"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); // m.def("_where", &py_where, "", py::arg("condition"), py::arg("x1"), // py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"), From 70a0fc6ccf80c8a5490a8bfdaca8731d8aa89c2d Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 05:58:22 -0800 Subject: [PATCH 052/145] Move place() to dpctl_ext/tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_indexing_functions.py | 68 +++++++++++++++++++++++++ dpnp/dpnp_iface_indexing.py | 2 +- dpnp/dpnp_iface_manipulation.py | 2 +- 4 files 
changed, 72 insertions(+), 2 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index edb2c096bad1..d4153b340596 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -40,6 +40,7 @@ triu, ) from dpctl_ext.tensor._indexing_functions import ( + place, put, take, ) @@ -54,6 +55,7 @@ "copy", "from_numpy", "full", + "place", "put", "reshape", "roll", diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py index df4f3e953042..97aac6caf20f 100644 --- a/dpctl_ext/tensor/_indexing_functions.py +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -50,6 +50,74 @@ def _get_indexing_mode(name): ) +def place(arr, mask, vals): + """place(arr, mask, vals) + + Change elements of an array based on conditional and input values. + + If ``mask`` is boolean ``dpctl.tensor.place`` is + equivalent to ``arr[condition] = vals``. + + Args: + arr (usm_ndarray): + Array to put data into. + mask (usm_ndarray): + Boolean mask array. Must have the same size as ``arr``. + vals (usm_ndarray, sequence): + Values to put into ``arr``. Only the first N elements are + used, where N is the number of True values in ``mask``. If + ``vals`` is smaller than N, it will be repeated, and if + elements of ``arr`` are to be masked, this sequence must be + non-empty. Array ``vals`` must be one dimensional. 
+ """ + if not isinstance(arr, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}" + ) + if not isinstance(mask, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(mask)}" + ) + if not isinstance(vals, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(vals)}" + ) + exec_q = dpctl.utils.get_execution_queue( + ( + arr.sycl_queue, + mask.sycl_queue, + vals.sycl_queue, + ) + ) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError + if arr.shape != mask.shape or vals.ndim != 1: + raise ValueError("Array sizes are not as required") + cumsum = dpt.empty(mask.size, dtype="i8", sycl_queue=exec_q) + _manager = dpctl.utils.SequentialOrderManager[exec_q] + deps_ev = _manager.submitted_events + nz_count = ti.mask_positions( + mask, cumsum, sycl_queue=exec_q, depends=deps_ev + ) + if nz_count == 0: + return + if vals.size == 0: + raise ValueError("Cannot insert from an empty array!") + if vals.dtype == arr.dtype: + rhs = vals + else: + rhs = dpt.astype(vals, arr.dtype) + hev, pl_ev = ti._place( + dst=arr, + cumsum=cumsum, + axis_start=0, + axis_end=mask.ndim, + rhs=rhs, + sycl_queue=exec_q, + ) + _manager.add_event_pair(hev, pl_ev) + + def put(x, indices, vals, /, *, axis=None, mode="wrap"): """put(x, indices, vals, axis=None, mode="wrap") diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index b0769337c38b..a92370a69525 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -1619,7 +1619,7 @@ def place(a, mask, vals): usm_vals, usm_a.dtype, casting="safe", copy=False ) - dpt.place(usm_a, usm_mask, usm_vals) + dpt_ext.place(usm_a, usm_mask, usm_vals) def put(a, ind, v, /, *, axis=None, mode="wrap"): diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 4866c912ab6a..e988bbaa237b 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ 
b/dpnp/dpnp_iface_manipulation.py @@ -415,7 +415,7 @@ def _get_first_nan_index(usm_a): if first_nan is not None: # all NaNs are collapsed, so need to replace the indices with # the index of the first NaN value in result array of unique values - dpt.place( + dpt_ext.place( usm_res.inverse_indices, usm_res.inverse_indices > first_nan, dpt_ext.reshape(first_nan, 1), From afa5411c6c8f5d1caf5cd0a134ad28725128c55a Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 06:10:07 -0800 Subject: [PATCH 053/145] Move extract() to dpctl_ext/tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_copy_utils.py | 70 +++++++++++++++++++++++++ dpctl_ext/tensor/_indexing_functions.py | 48 +++++++++++++++++ dpnp/dpnp_iface_indexing.py | 2 +- 4 files changed, 121 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index d4153b340596..8fb164828eb5 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -40,6 +40,7 @@ triu, ) from dpctl_ext.tensor._indexing_functions import ( + extract, place, put, take, @@ -53,6 +54,7 @@ "asnumpy", "astype", "copy", + "extract", "from_numpy", "full", "place", diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py index c62218893a2c..4be6475ec4a3 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -27,6 +27,7 @@ # ***************************************************************************** import builtins +import operator import dpctl import dpctl.memory as dpm @@ -41,6 +42,8 @@ # when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti +from ._numpy_helper import normalize_axis_index + __doc__ = ( "Implementation module for copy- and cast- operations on " ":class:`dpctl.tensor.usm_ndarray`." 
@@ -130,6 +133,73 @@ def _copy_from_numpy_into(dst, np_ary): ) +def _extract_impl(ary, ary_mask, axis=0): + """ + Extract elements of ary by applying mask starting from slot + dimension axis + """ + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError( + f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}" + ) + if isinstance(ary_mask, dpt.usm_ndarray): + dst_usm_type = dpctl.utils.get_coerced_usm_type( + (ary.usm_type, ary_mask.usm_type) + ) + exec_q = dpctl.utils.get_execution_queue( + (ary.sycl_queue, ary_mask.sycl_queue) + ) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError( + "arrays have different associated queues. " + "Use `y.to_device(x.device)` to migrate." + ) + elif isinstance(ary_mask, np.ndarray): + dst_usm_type = ary.usm_type + exec_q = ary.sycl_queue + ary_mask = dpt.asarray( + ary_mask, usm_type=dst_usm_type, sycl_queue=exec_q + ) + else: + raise TypeError( + "Expecting type dpctl.tensor.usm_ndarray or numpy.ndarray, got " + f"{type(ary_mask)}" + ) + ary_nd = ary.ndim + pp = normalize_axis_index(operator.index(axis), ary_nd) + mask_nd = ary_mask.ndim + if pp < 0 or pp + mask_nd > ary_nd: + raise ValueError( + "Parameter p is inconsistent with input array dimensions" + ) + mask_nelems = ary_mask.size + cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64 + cumsum = dpt.empty(mask_nelems, dtype=cumsum_dt, device=ary_mask.device) + exec_q = cumsum.sycl_queue + _manager = dpctl.utils.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + mask_count = ti.mask_positions( + ary_mask, cumsum, sycl_queue=exec_q, depends=dep_evs + ) + dst_shape = ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :] + dst = dpt.empty( + dst_shape, dtype=ary.dtype, usm_type=dst_usm_type, device=ary.device + ) + if dst.size == 0: + return dst + hev, ev = ti._extract( + src=ary, + cumsum=cumsum, + axis_start=pp, + axis_end=pp + mask_nd, + dst=dst, + sycl_queue=exec_q, + depends=dep_evs, + ) + 
_manager.add_event_pair(hev, ev) + return dst + + def from_numpy(np_ary, /, *, device=None, usm_type="device", sycl_queue=None): """ from_numpy(arg, device=None, usm_type="device", sycl_queue=None) diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py index 97aac6caf20f..de889f239d1f 100644 --- a/dpctl_ext/tensor/_indexing_functions.py +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -37,6 +37,9 @@ import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti +from ._copy_utils import ( + _extract_impl, +) from ._numpy_helper import normalize_axis_index @@ -50,6 +53,51 @@ def _get_indexing_mode(name): ) +def extract(condition, arr): + """extract(condition, arr) + + Returns the elements of an array that satisfies the condition. + + If ``condition`` is boolean ``dpctl.tensor.extract`` is + equivalent to ``arr[condition]``. + + Note that ``dpctl.tensor.place`` does the opposite of + ``dpctl.tensor.extract``. + + Args: + conditions (usm_ndarray): + An array whose non-zero or ``True`` entries indicate the element + of ``arr`` to extract. + + arr (usm_ndarray): + Input array of the same size as ``condition``. + + Returns: + usm_ndarray: + Rank 1 array of values from ``arr`` where ``condition`` is + ``True``. 
+ """ + if not isinstance(condition, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(condition)}" + ) + if not isinstance(arr, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}" + ) + exec_q = dpctl.utils.get_execution_queue( + ( + condition.sycl_queue, + arr.sycl_queue, + ) + ) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError + if condition.shape != arr.shape: + raise ValueError("Arrays are not of the same size") + return _extract_impl(arr, condition) + + def place(arr, mask, vals): """place(arr, mask, vals) diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index a92370a69525..c462c1d5854a 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -823,7 +823,7 @@ def extract(condition, a): usm_a = dpt_ext.reshape(usm_a, -1) usm_cond = dpt_ext.reshape(usm_cond, -1) - usm_res = dpt.extract(usm_cond, usm_a) + usm_res = dpt_ext.extract(usm_cond, usm_a) return dpnp_array._create_from_usm_ndarray(usm_res) From 7feb4ee30381ef3935e19a25263b1ad3de3895e3 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 06:18:09 -0800 Subject: [PATCH 054/145] Move nonzero() to dpctl_ext/tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 ++ dpctl_ext/tensor/_copy_utils.py | 31 +++++++++++++++++++++++++ dpctl_ext/tensor/_indexing_functions.py | 28 ++++++++++++++++++++++ dpnp/dpnp_iface_indexing.py | 4 ++-- 4 files changed, 63 insertions(+), 2 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 8fb164828eb5..a14d86079b0a 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -41,6 +41,7 @@ ) from dpctl_ext.tensor._indexing_functions import ( extract, + nonzero, place, put, take, @@ -57,6 +58,7 @@ "extract", "from_numpy", "full", + "nonzero", "place", "put", "reshape", diff --git a/dpctl_ext/tensor/_copy_utils.py 
b/dpctl_ext/tensor/_copy_utils.py index 4be6475ec4a3..bef2b5a5cf16 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -200,6 +200,37 @@ def _extract_impl(ary, ary_mask, axis=0): return dst +def _nonzero_impl(ary): + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError( + f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}" + ) + exec_q = ary.sycl_queue + usm_type = ary.usm_type + mask_nelems = ary.size + cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64 + cumsum = dpt.empty( + mask_nelems, dtype=cumsum_dt, sycl_queue=exec_q, order="C" + ) + _manager = dpctl.utils.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + mask_count = ti.mask_positions( + ary, cumsum, sycl_queue=exec_q, depends=dep_evs + ) + indexes_dt = ti.default_device_index_type(exec_q.sycl_device) + indexes = dpt.empty( + (ary.ndim, mask_count), + dtype=indexes_dt, + usm_type=usm_type, + sycl_queue=exec_q, + order="C", + ) + hev, nz_ev = ti._nonzero(cumsum, indexes, ary.shape, exec_q) + res = tuple(indexes[i, :] for i in range(ary.ndim)) + _manager.add_event_pair(hev, nz_ev) + return res + + def from_numpy(np_ary, /, *, device=None, usm_type="device", sycl_queue=None): """ from_numpy(arg, device=None, usm_type="device", sycl_queue=None) diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py index de889f239d1f..4feb2f9adbeb 100644 --- a/dpctl_ext/tensor/_indexing_functions.py +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -39,6 +39,7 @@ from ._copy_utils import ( _extract_impl, + _nonzero_impl, ) from ._numpy_helper import normalize_axis_index @@ -98,6 +99,33 @@ def extract(condition, arr): return _extract_impl(arr, condition) +def nonzero(arr): + """nonzero(arr) + + Return the indices of non-zero elements. + + Returns a tuple of usm_ndarrays, one for each dimension + of ``arr``, containing the indices of the non-zero elements + in that dimension. 
The values of ``arr`` are always tested in + row-major, C-style order. + + Args: + arr (usm_ndarray): + Input array, which has non-zero array rank. + + Returns: + Tuple[usm_ndarray, ...]: + Indices of non-zero array elements. + """ + if not isinstance(arr, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}" + ) + if arr.ndim == 0: + raise ValueError("Array of positive rank is expected") + return _nonzero_impl(arr) + + def place(arr, mask, vals): """place(arr, mask, vals) diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index c462c1d5854a..5e1a6a0ecc3d 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -817,7 +817,7 @@ def extract(condition, a): usm_a = dpt_ext.reshape(usm_a, -1) usm_cond = dpt_ext.reshape(usm_cond, -1) - usm_res = dpt_ext.take(usm_a, dpt.nonzero(usm_cond)[0]) + usm_res = dpt_ext.take(usm_a, dpt_ext.nonzero(usm_cond)[0]) else: if usm_cond.shape != usm_a.shape: usm_a = dpt_ext.reshape(usm_a, -1) @@ -1546,7 +1546,7 @@ def nonzero(a): usm_a = dpnp.get_usm_ndarray(a) return tuple( - dpnp_array._create_from_usm_ndarray(y) for y in dpt.nonzero(usm_a) + dpnp_array._create_from_usm_ndarray(y) for y in dpt_ext.nonzero(usm_a) ) From f63f2f05540ad0dc7d2f04957446f4cf8429dd83 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 06:32:06 -0800 Subject: [PATCH 055/145] Move put_along_axis to dpctl_ext/tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_copy_utils.py | 153 ++++++++++++++++++++++++ dpctl_ext/tensor/_indexing_functions.py | 87 ++++++++++++++ dpnp/dpnp_iface_indexing.py | 2 +- 4 files changed, 243 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index a14d86079b0a..e3f50236f261 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -44,6 +44,7 @@ nonzero, place, put, + put_along_axis, take, ) from 
dpctl_ext.tensor._manipulation_functions import ( @@ -61,6 +62,7 @@ "nonzero", "place", "put", + "put_along_axis", "reshape", "roll", "take", diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py index bef2b5a5cf16..95608ace0c3b 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -28,6 +28,7 @@ import builtins import operator +from numbers import Integral import dpctl import dpctl.memory as dpm @@ -40,6 +41,7 @@ # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti from ._numpy_helper import normalize_axis_index @@ -200,6 +202,42 @@ def _extract_impl(ary, ary_mask, axis=0): return dst +def _get_indices_queue_usm_type(inds, queue, usm_type): + """ + Utility for validating indices are NumPy ndarray or usm_ndarray of integral + dtype or Python integers. At least one must be an array. + + For each array, the queue and usm type are appended to `queue_list` and + `usm_type_list`, respectively. 
+ """ + queues = [queue] + usm_types = [usm_type] + any_array = False + for ind in inds: + if isinstance(ind, (np.ndarray, dpt.usm_ndarray)): + any_array = True + if ind.dtype.kind not in "ui": + raise IndexError( + "arrays used as indices must be of integer (or boolean) " + "type" + ) + if isinstance(ind, dpt.usm_ndarray): + queues.append(ind.sycl_queue) + usm_types.append(ind.usm_type) + elif not isinstance(ind, Integral): + raise TypeError( + "all elements of `ind` expected to be usm_ndarrays, " + f"NumPy arrays, or integers, found {type(ind)}" + ) + if not any_array: + raise TypeError( + "at least one element of `inds` expected to be an array" + ) + usm_type = dpctl.utils.get_coerced_usm_type(usm_types) + q = dpctl.utils.get_execution_queue(queues) + return q, usm_type + + def _nonzero_impl(ary): if not isinstance(ary, dpt.usm_ndarray): raise TypeError( @@ -231,6 +269,121 @@ def _nonzero_impl(ary): return res +def _prepare_indices_arrays(inds, q, usm_type): + """ + Utility taking a mix of usm_ndarray and possibly Python int scalar indices, + a queue (assumed to be common to arrays in inds), and a usm type. + + Python scalar integers are promoted to arrays on the provided queue and + with the provided usm type. All arrays are then promoted to a common + integral type (if possible) before being broadcast to a common shape. 
+ """ + # scalar integers -> arrays + inds = tuple( + map( + lambda ind: ( + ind + if isinstance(ind, dpt.usm_ndarray) + else dpt.asarray(ind, usm_type=usm_type, sycl_queue=q) + ), + inds, + ) + ) + + # promote to a common integral type if possible + ind_dt = dpt.result_type(*inds) + if ind_dt.kind not in "ui": + raise ValueError( + "cannot safely promote indices to an integer data type" + ) + inds = tuple( + map( + lambda ind: ( + ind if ind.dtype == ind_dt else dpt.astype(ind, ind_dt) + ), + inds, + ) + ) + + # broadcast + inds = dpt.broadcast_arrays(*inds) + + return inds + + +def _put_multi_index(ary, inds, p, vals, mode=0): + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError( + f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}" + ) + ary_nd = ary.ndim + p = normalize_axis_index(operator.index(p), ary_nd) + mode = operator.index(mode) + if mode not in [0, 1]: + raise ValueError( + "Invalid value for mode keyword, only 0 or 1 is supported" + ) + if not isinstance(inds, (list, tuple)): + inds = (inds,) + + exec_q, coerced_usm_type = _get_indices_queue_usm_type( + inds, ary.sycl_queue, ary.usm_type + ) + + if exec_q is not None: + if not isinstance(vals, dpt.usm_ndarray): + vals = dpt.asarray( + vals, + dtype=ary.dtype, + usm_type=coerced_usm_type, + sycl_queue=exec_q, + ) + else: + exec_q = dpctl.utils.get_execution_queue((exec_q, vals.sycl_queue)) + coerced_usm_type = dpctl.utils.get_coerced_usm_type( + ( + coerced_usm_type, + vals.usm_type, + ) + ) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError( + "Can not automatically determine where to allocate the " + "result or performance execution. " + "Use `usm_ndarray.to_device` method to migrate data to " + "be associated with the same queue." 
+ ) + + inds = _prepare_indices_arrays(inds, exec_q, coerced_usm_type) + + ind0 = inds[0] + ary_sh = ary.shape + p_end = p + len(inds) + if 0 in ary_sh[p:p_end] and ind0.size != 0: + raise IndexError( + "cannot put into non-empty indices along an empty axis" + ) + expected_vals_shape = ary_sh[:p] + ind0.shape + ary_sh[p_end:] + if vals.dtype == ary.dtype: + rhs = vals + else: + rhs = dpt_ext.astype(vals, ary.dtype) + rhs = dpt.broadcast_to(rhs, expected_vals_shape) + _manager = dpctl.utils.SequentialOrderManager[exec_q] + dep_ev = _manager.submitted_events + hev, put_ev = ti._put( + dst=ary, + ind=inds, + val=rhs, + axis_start=p, + mode=mode, + sycl_queue=exec_q, + depends=dep_ev, + ) + _manager.add_event_pair(hev, put_ev) + return + + def from_numpy(np_ary, /, *, device=None, usm_type="device", sycl_queue=None): """ from_numpy(arg, device=None, usm_type="device", sycl_queue=None) diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py index 4feb2f9adbeb..8bc512bfe9d4 100644 --- a/dpctl_ext/tensor/_indexing_functions.py +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -40,6 +40,7 @@ from ._copy_utils import ( _extract_impl, _nonzero_impl, + _put_multi_index, ) from ._numpy_helper import normalize_axis_index @@ -54,6 +55,12 @@ def _get_indexing_mode(name): ) +def _range(sh_i, i, nd, q, usm_t, dt): + ind = dpt.arange(sh_i, dtype=dt, usm_type=usm_t, sycl_queue=q) + ind.shape = tuple(sh_i if i == j else 1 for j in range(nd)) + return ind + + def extract(condition, arr): """extract(condition, arr) @@ -343,6 +350,86 @@ def put_vec_duplicates(vec, ind, vals): _manager.add_event_pair(hev, put_ev) +def put_along_axis(x, indices, vals, /, *, axis=-1, mode="wrap"): + """ + Puts elements into an array at the one-dimensional indices specified by + ``indices`` along a provided ``axis``. + + Args: + x (usm_ndarray): + input array. Must be compatible with ``indices``, except for the + axis (dimension) specified by ``axis``. 
+ indices (usm_ndarray): + array indices. Must have the same rank (i.e., number of dimensions) + as ``x``. + vals (usm_ndarray): + Array of values to be put into ``x``. + Must be broadcastable to the shape of ``indices``. + axis: int + axis along which to select values. If ``axis`` is negative, the + function determines the axis along which to select values by + counting from the last dimension. Default: ``-1``. + mode (str, optional): + How out-of-bounds indices will be handled. Possible values + are: + + - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps + negative indices. + - ``"clip"``: clips indices to (``0 <= i < n``). + + Default: ``"wrap"``. + + .. note:: + + If input array ``indices`` contains duplicates, a race condition + occurs, and the value written into corresponding positions in ``x`` + may vary from run to run. Preserving sequential semantics in handing + the duplicates to achieve deterministic behavior requires additional + work. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + if not isinstance(indices, dpt.usm_ndarray): + raise TypeError( + f"Expected dpctl.tensor.usm_ndarray, got {type(indices)}" + ) + x_nd = x.ndim + if x_nd != indices.ndim: + raise ValueError( + "Number of dimensions in the first and the second " + "argument arrays must be equal" + ) + pp = normalize_axis_index(operator.index(axis), x_nd) + if isinstance(vals, dpt.usm_ndarray): + queues_ = [x.sycl_queue, indices.sycl_queue, vals.sycl_queue] + usm_types_ = [x.usm_type, indices.usm_type, vals.usm_type] + else: + queues_ = [x.sycl_queue, indices.sycl_queue] + usm_types_ = [x.usm_type, indices.usm_type] + exec_q = dpctl.utils.get_execution_queue(queues_) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments. 
" + ) + out_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) + mode_i = _get_indexing_mode(mode) + indexes_dt = ( + dpt.uint64 + if indices.dtype == dpt.uint64 + else ti.default_device_index_type(exec_q.sycl_device) + ) + _ind = tuple( + ( + indices + if i == pp + else _range(x.shape[i], i, x_nd, exec_q, out_usm_type, indexes_dt) + ) + for i in range(x_nd) + ) + return _put_multi_index(x, _ind, 0, vals, mode=mode_i) + + def take(x, indices, /, *, axis=None, out=None, mode="wrap"): """take(x, indices, axis=None, out=None, mode="wrap") diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 5e1a6a0ecc3d..6ccc655796a4 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -1807,7 +1807,7 @@ def put_along_axis(a, ind, values, axis, mode="wrap"): values, usm_type=a.usm_type, sycl_queue=a.sycl_queue ) - dpt.put_along_axis(usm_a, usm_ind, usm_vals, axis=axis, mode=mode) + dpt_ext.put_along_axis(usm_a, usm_ind, usm_vals, axis=axis, mode=mode) def putmask(x1, mask, values): From 6fecefe2ee88c04bd1e2a8978f4facac316628f8 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 06:37:40 -0800 Subject: [PATCH 056/145] Move take_along_axis() to dpctl_ext/tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_copy_utils.py | 52 +++++++++++++++++ dpctl_ext/tensor/_indexing_functions.py | 78 +++++++++++++++++++++++++ dpnp/dpnp_iface_indexing.py | 2 +- 4 files changed, 133 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index e3f50236f261..69aca2b5143d 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -46,6 +46,7 @@ put, put_along_axis, take, + take_along_axis, ) from dpctl_ext.tensor._manipulation_functions import ( roll, @@ -66,6 +67,7 @@ "reshape", "roll", "take", + "take_along_axis", "to_numpy", "tril", "triu", diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py index 
95608ace0c3b..5d1ac209c86b 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -384,6 +384,58 @@ def _put_multi_index(ary, inds, p, vals, mode=0): return +def _take_multi_index(ary, inds, p, mode=0): + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError( + f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}" + ) + ary_nd = ary.ndim + p = normalize_axis_index(operator.index(p), ary_nd) + mode = operator.index(mode) + if mode not in [0, 1]: + raise ValueError( + "Invalid value for mode keyword, only 0 or 1 is supported" + ) + if not isinstance(inds, (list, tuple)): + inds = (inds,) + + exec_q, res_usm_type = _get_indices_queue_usm_type( + inds, ary.sycl_queue, ary.usm_type + ) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError( + "Can not automatically determine where to allocate the " + "result or performance execution. " + "Use `usm_ndarray.to_device` method to migrate data to " + "be associated with the same queue." + ) + + inds = _prepare_indices_arrays(inds, exec_q, res_usm_type) + + ind0 = inds[0] + ary_sh = ary.shape + p_end = p + len(inds) + if 0 in ary_sh[p:p_end] and ind0.size != 0: + raise IndexError("cannot take non-empty indices from an empty axis") + res_shape = ary_sh[:p] + ind0.shape + ary_sh[p_end:] + res = dpt.empty( + res_shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q + ) + _manager = dpctl.utils.SequentialOrderManager[exec_q] + dep_ev = _manager.submitted_events + hev, take_ev = ti._take( + src=ary, + ind=inds, + dst=res, + axis_start=p, + mode=mode, + sycl_queue=exec_q, + depends=dep_ev, + ) + _manager.add_event_pair(hev, take_ev) + return res + + def from_numpy(np_ary, /, *, device=None, usm_type="device", sycl_queue=None): """ from_numpy(arg, device=None, usm_type="device", sycl_queue=None) diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py index 8bc512bfe9d4..6ca327192f73 100644 --- 
a/dpctl_ext/tensor/_indexing_functions.py +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -41,6 +41,7 @@ _extract_impl, _nonzero_impl, _put_multi_index, + _take_multi_index, ) from ._numpy_helper import normalize_axis_index @@ -561,3 +562,80 @@ def take(x, indices, /, *, axis=None, out=None, mode="wrap"): out = orig_out return out + + +def take_along_axis(x, indices, /, *, axis=-1, mode="wrap"): + """ + Returns elements from an array at the one-dimensional indices specified + by ``indices`` along a provided ``axis``. + + Args: + x (usm_ndarray): + input array. Must be compatible with ``indices``, except for the + axis (dimension) specified by ``axis``. + indices (usm_ndarray): + array indices. Must have the same rank (i.e., number of dimensions) + as ``x``. + axis: int + axis along which to select values. If ``axis`` is negative, the + function determines the axis along which to select values by + counting from the last dimension. Default: ``-1``. + mode (str, optional): + How out-of-bounds indices will be handled. Possible values + are: + + - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps + negative indices. + - ``"clip"``: clips indices to (``0 <= i < n``). + + Default: ``"wrap"``. + + Returns: + usm_ndarray: + an array having the same data type as ``x``. The returned array has + the same rank (i.e., number of dimensions) as ``x`` and a shape + determined according to broadcasting rules, except for the axis + (dimension) specified by ``axis`` whose size must equal the size + of the corresponding axis (dimension) in ``indices``. + + Note: + Treatment of the out-of-bound indices in ``indices`` array is controlled + by the value of ``mode`` keyword. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + if not isinstance(indices, dpt.usm_ndarray): + raise TypeError( + f"Expected dpctl.tensor.usm_ndarray, got {type(indices)}" + ) + x_nd = x.ndim + if x_nd != indices.ndim: + raise ValueError( + "Number of dimensions in the first and the second " + "argument arrays must be equal" + ) + pp = normalize_axis_index(operator.index(axis), x_nd) + out_usm_type = dpctl.utils.get_coerced_usm_type( + (x.usm_type, indices.usm_type) + ) + exec_q = dpctl.utils.get_execution_queue((x.sycl_queue, indices.sycl_queue)) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments. " + ) + mode_i = _get_indexing_mode(mode) + indexes_dt = ( + dpt.uint64 + if indices.dtype == dpt.uint64 + else ti.default_device_index_type(exec_q.sycl_device) + ) + _ind = tuple( + ( + indices + if i == pp + else _range(x.shape[i], i, x_nd, exec_q, out_usm_type, indexes_dt) + ) + for i in range(x_nd) + ) + return _take_multi_index(x, _ind, 0, mode=mode_i) diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 6ccc655796a4..95998dbdda4b 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -2295,7 +2295,7 @@ def take_along_axis(a, indices, axis=-1, mode="wrap"): usm_a = dpnp.get_usm_ndarray(a) usm_ind = dpnp.get_usm_ndarray(indices) - usm_res = dpt.take_along_axis(usm_a, usm_ind, axis=axis, mode=mode) + usm_res = dpt_ext.take_along_axis(usm_a, usm_ind, axis=axis, mode=mode) return dpnp_array._create_from_usm_ndarray(usm_res) From b60d095ff6bbac9c96f8d139697333b169cc5ae1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 07:06:03 -0800 Subject: [PATCH 057/145] Move ti._eye to() to dpctl_ext/tensor/libtensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../include/kernels/constructors.hpp | 96 +++++++++++- 
.../tensor/libtensor/source/eye_ctor.cpp | 142 ++++++++++++++++++ .../tensor/libtensor/source/eye_ctor.hpp | 57 +++++++ .../tensor/libtensor/source/tensor_ctors.cpp | 24 +-- 5 files changed, 307 insertions(+), 14 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/eye_ctor.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/eye_ctor.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 88507aa2192f..0b166a202735 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -54,7 +54,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index 22189ee3129c..26ae46707a6b 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -56,7 +56,8 @@ using dpctl::tensor::ssize_t; template class full_strided_kernel; -// template class eye_kernel; +template +class eye_kernel; using namespace dpctl::tensor::offset_utils; @@ -162,6 +163,99 @@ sycl::event full_strided_impl(sycl::queue &q, return fill_ev; } +/* ================ Eye ================== */ + +typedef sycl::event (*eye_fn_ptr_t)(sycl::queue &, + std::size_t nelems, // num_elements + ssize_t start, + ssize_t end, + ssize_t step, + char *, // dst_data_ptr + const std::vector &); + +template +class EyeFunctor +{ +private: + Ty *p = nullptr; + 
ssize_t start_v; + ssize_t end_v; + ssize_t step_v; + +public: + EyeFunctor(char *dst_p, + const ssize_t v0, + const ssize_t v1, + const ssize_t dv) + : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), step_v(dv) + { + } + + void operator()(sycl::id<1> wiid) const + { + Ty set_v = 0; + ssize_t i = static_cast(wiid.get(0)); + if (i >= start_v and i <= end_v) { + if ((i - start_v) % step_v == 0) { + set_v = 1; + } + } + p[i] = set_v; + } +}; + +/*! + * @brief Function to populate 2D array with eye matrix. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Number of elements to assign. + * @param start Position of the first non-zero value. + * @param end Position of the last non-zero value. + * @param step Number of array elements between non-zeros. + * @param array_data Kernel accessible USM pointer for the destination array. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event eye_impl(sycl::queue &exec_q, + std::size_t nelems, + const ssize_t start, + const ssize_t end, + const ssize_t step, + char *array_data, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + sycl::event eye_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = eye_kernel; + using Impl = EyeFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(array_data, start, end, step)); + }); + + return eye_event; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. 
+ * @ingroup CtorKernels + */ +template +struct EyeFactory +{ + fnT get() + { + fnT f = eye_impl; + return f; + } +}; + /* =========================== Tril and triu ============================== */ // define function type diff --git a/dpctl_ext/tensor/libtensor/source/eye_ctor.cpp b/dpctl_ext/tensor/libtensor/source/eye_ctor.cpp new file mode 100644 index 000000000000..025a7d58d06e --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/eye_ctor.cpp @@ -0,0 +1,142 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "eye_ctor.hpp" +#include "kernels/constructors.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl::tensor::py_internal +{ + +using dpctl::utils::keep_args_alive; + +using dpctl::tensor::kernels::constructors::eye_fn_ptr_t; +static eye_fn_ptr_t eye_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_eye(py::ssize_t k, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + // dst must be 2D + + if (dst.get_ndim() != 2) { + throw py::value_error( + "usm_ndarray_eye: Expecting 2D array to populate"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error("Execution queue is not compatible with the " + "allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + 
int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + const py::ssize_t nelem = dst.get_size(); + const py::ssize_t rows = dst.get_shape(0); + const py::ssize_t cols = dst.get_shape(1); + if (rows == 0 || cols == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + if (!is_dst_c_contig && !is_dst_f_contig) { + throw py::value_error("USM array is not contiguous"); + } + + py::ssize_t start; + if (is_dst_c_contig) { + start = (k < 0) ? -k * cols : k; + } + else { + start = (k < 0) ? -k : k * rows; + } + + const py::ssize_t *strides = dst.get_strides_raw(); + py::ssize_t step; + if (strides == nullptr) { + step = (is_dst_c_contig) ? cols + 1 : rows + 1; + } + else { + step = strides[0] + strides[1]; + } + + const py::ssize_t length = std::min({rows, cols, rows + k, cols - k}); + const py::ssize_t end = start + step * (length - 1); + + char *dst_data = dst.get_data(); + sycl::event eye_event; + + auto fn = eye_dispatch_vector[dst_typeid]; + + eye_event = fn(exec_q, static_cast(nelem), start, end, step, + dst_data, depends); + + return std::make_pair(keep_args_alive(exec_q, {dst}, {eye_event}), + eye_event); +} + +void init_eye_ctor_dispatch_vectors(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::constructors::EyeFactory; + + DispatchVectorBuilder dvb; + dvb.populate_dispatch_vector(eye_dispatch_vector); + + return; +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/eye_ctor.hpp b/dpctl_ext/tensor/libtensor/source/eye_ctor.hpp new file mode 100644 index 000000000000..dda7f2c4813a --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/eye_ctor.hpp @@ -0,0 +1,57 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + usm_ndarray_eye(py::ssize_t k, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_eye_ctor_dispatch_vectors(void); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 901af48074be..98ab488e5879 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -52,7 +52,7 @@ #include "copy_for_roll.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" -// #include "eye_ctor.hpp" +#include "eye_ctor.hpp" #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include "kernels/dpctl_tensor_types.hpp" @@ -124,7 +124,7 @@ using dpctl::tensor::py_internal::py_cumsum_1d; /* ================ Eye ================== */ -// using dpctl::tensor::py_internal::usm_ndarray_eye; +using dpctl::tensor::py_internal::usm_ndarray_eye; /* =========================== Tril and triu ============================== */ @@ -160,7 +160,7 @@ void init_dispatch_vectors(void) // init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); init_zeros_ctor_dispatch_vectors(); - // init_eye_ctor_dispatch_vectors(); + init_eye_ctor_dispatch_vectors(); init_triul_ctor_dispatch_vectors(); populate_masked_extract_dispatch_vectors(); @@ -348,15 +348,15 @@ PYBIND11_MODULE(_tensor_impl, m) 
py::arg("mode"), py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_eye", &usm_ndarray_eye, - // "Fills input 2D contiguous usm_ndarray `dst` with " - // "zeros outside of the diagonal " - // "specified by " - // "the diagonal index `k` " - // "which is filled with ones." - // "Returns a tuple of events: (ht_event, comp_event)", - // py::arg("k"), py::arg("dst"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + m.def("_eye", &usm_ndarray_eye, + "Fills input 2D contiguous usm_ndarray `dst` with " + "zeros outside of the diagonal " + "specified by " + "the diagonal index `k` " + "which is filled with ones." + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("k"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); m.def("default_device_fp_type", dpctl::tensor::py_internal::default_device_fp_type, From a1eea4e730f9dde47d7fd2eae0fa338e1e500299 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 09:26:18 -0800 Subject: [PATCH 058/145] Move eye() to dpctl_ext/tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_ctors.py | 128 +++++++++++++++++++++++++++++++++++ dpnp/dpnp_container.py | 2 +- 3 files changed, 131 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 69aca2b5143d..fa76faccc632 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -35,6 +35,7 @@ to_numpy, ) from dpctl_ext.tensor._ctors import ( + eye, full, tril, triu, @@ -58,6 +59,7 @@ "astype", "copy", "extract", + "eye", "from_numpy", "full", "nonzero", diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index 5a39e9367e9c..5a9e07c73346 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -58,6 +58,38 @@ def _cast_fill_val(fill_val, dt): return fill_val +def _ensure_native_dtype_device_support(dtype, dev) -> None: + """Check that dtype is natively supported by 
device. + + Arg: + dtype: + Elemental data-type + dev (:class:`dpctl.SyclDevice`): + The device about which the query is being made. + Returns: + None + Raise: + ValueError: + if device does not natively support this `dtype`. + """ + if dtype in [dpt.float64, dpt.complex128] and not dev.has_aspect_fp64: + raise ValueError( + f"Device {dev.name} does not provide native support " + "for double-precision floating point type." + ) + if ( + dtype + in [ + dpt.float16, + ] + and not dev.has_aspect_fp16 + ): + raise ValueError( + f"Device {dev.name} does not provide native support " + "for half-precision floating point type." + ) + + def _to_scalar(obj, sc_ty): """A way to convert object to NumPy scalar type. Raises OverflowError if obj can not be represented @@ -67,6 +99,102 @@ def _to_scalar(obj, sc_ty): return zd_arr[()] +def eye( + n_rows, + n_cols=None, + /, + *, + k=0, + dtype=None, + order="C", + device=None, + usm_type="device", + sycl_queue=None, +): + """ + eye(n_rows, n_cols=None, /, *, k=0, dtype=None, \ + device=None, usm_type="device", sycl_queue=None) + + Creates :class:`dpctl.tensor.usm_ndarray` with ones on the `k`-th + diagonal. + + Args: + n_rows (int): + number of rows in the output array. + n_cols (int, optional): + number of columns in the output array. If ``None``, + ``n_cols = n_rows``. Default: ``None`` + k (int): + index of the diagonal, with ``0`` as the main diagonal. + A positive value of ``k`` is a superdiagonal, a negative value + is a subdiagonal. + Raises :exc:`TypeError` if ``k`` is not an integer. + Default: ``0`` + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, or + a NumPy scalar type. Default: ``None`` + order ("C" or "F"): + memory layout for the array. Default: ``"C"`` + device (optional): + array API concept of device where the output array + is created. 
``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + A diagonal matrix. + """ + if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'F' or 'C'." 
+ ) + order = order[0].upper() + n_rows = operator.index(n_rows) + n_cols = n_rows if n_cols is None else operator.index(n_cols) + k = operator.index(k) + if k >= n_cols or -k >= n_rows: + return dpt.zeros( + (n_rows, n_cols), + dtype=dtype, + order=order, + device=device, + usm_type=usm_type, + sycl_queue=sycl_queue, + ) + dpctl.utils.validate_usm_type(usm_type, allow_none=False) + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dtype = _get_dtype(dtype, sycl_queue) + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + res = dpt.usm_ndarray( + (n_rows, n_cols), + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + if n_rows != 0 and n_cols != 0: + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + hev, eye_ev = ti._eye(k, dst=res, sycl_queue=sycl_queue) + _manager.add_event_pair(hev, eye_ev) + return res + + def _validate_fill_value(fill_val): """Validates that `fill_val` is a numeric or boolean scalar.""" # TODO: verify if `np.True_` and `np.False_` should be instances of diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index acda579a5f5e..0727b9bfd775 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -196,7 +196,7 @@ def eye( order = "C" """Creates `dpnp_array` with ones on the `k`th diagonal.""" - array_obj = dpt.eye( + array_obj = dpt_ext.eye( N, M, k=k, From 95ccf708ffcce4bcfcf1c1f934bf92d986911cca Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 13:17:17 -0800 Subject: [PATCH 059/145] Move ti._repeat... 
to dpctl_ext/tensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../libtensor/include/kernels/repeat.hpp | 462 ++++++++++ dpctl_ext/tensor/libtensor/source/repeat.cpp | 820 ++++++++++++++++++ dpctl_ext/tensor/libtensor/source/repeat.hpp | 83 ++ .../tensor/libtensor/source/tensor_ctors.cpp | 84 +- 5 files changed, 1407 insertions(+), 44 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/repeat.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/repeat.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/repeat.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 0b166a202735..797dbd6f4d1b 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -60,7 +60,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/repeat.hpp b/dpctl_ext/tensor/libtensor/include/kernels/repeat.hpp new file mode 100644 index 000000000000..aab9a709f010 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/repeat.hpp @@ -0,0 +1,462 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor repeating operations. 
+//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::repeat +{ + +using dpctl::tensor::ssize_t; +using namespace dpctl::tensor::offset_utils; + +template +class repeat_by_sequence_kernel; + +template +class RepeatSequenceFunctor +{ +private: + const T *src = nullptr; + T *dst = nullptr; + const repT *reps = nullptr; + const repT *cumsum = nullptr; + std::size_t src_axis_nelems = 1; + OrthogIndexer orthog_strider; + SrcAxisIndexer src_axis_strider; + DstAxisIndexer dst_axis_strider; + RepIndexer reps_strider; + +public: + RepeatSequenceFunctor(const T *src_, + T *dst_, + const repT *reps_, + const repT *cumsum_, + std::size_t src_axis_nelems_, + const OrthogIndexer &orthog_strider_, + const SrcAxisIndexer &src_axis_strider_, + const DstAxisIndexer &dst_axis_strider_, + const RepIndexer &reps_strider_) + : src(src_), dst(dst_), reps(reps_), cumsum(cumsum_), + src_axis_nelems(src_axis_nelems_), orthog_strider(orthog_strider_), + src_axis_strider(src_axis_strider_), + dst_axis_strider(dst_axis_strider_), reps_strider(reps_strider_) + { + } + + void operator()(sycl::id<1> idx) const + { + std::size_t id = idx[0]; + auto i_orthog = id / src_axis_nelems; + auto i_along = id - (i_orthog * src_axis_nelems); + + auto orthog_offsets = orthog_strider(i_orthog); + auto src_offset = orthog_offsets.get_first_offset(); + auto dst_offset = orthog_offsets.get_second_offset(); + + auto val = src[src_offset + src_axis_strider(i_along)]; + auto last = cumsum[i_along]; + auto first = last - reps[reps_strider(i_along)]; + for (auto i = first; i < last; ++i) { + dst[dst_offset + dst_axis_strider(i)] = val; + } + } +}; + +typedef sycl::event (*repeat_by_sequence_fn_ptr_t)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + const 
char *, + const char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event + repeat_by_sequence_impl(sycl::queue &q, + std::size_t orthog_nelems, + std::size_t src_axis_nelems, + const char *src_cp, + char *dst_cp, + const char *reps_cp, + const char *cumsum_cp, + int orthog_nd, + const ssize_t *orthog_src_dst_shape_and_strides, + ssize_t src_offset, + ssize_t dst_offset, + ssize_t src_axis_shape, + ssize_t src_axis_stride, + ssize_t dst_axis_shape, + ssize_t dst_axis_stride, + ssize_t reps_shape, + ssize_t reps_stride, + const std::vector &depends) +{ + sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const T *src_tp = reinterpret_cast(src_cp); + const repT *reps_tp = reinterpret_cast(reps_cp); + const repT *cumsum_tp = reinterpret_cast(cumsum_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + // orthog ndim indexer + const TwoOffsets_StridedIndexer orthog_indexer{ + orthog_nd, src_offset, dst_offset, + orthog_src_dst_shape_and_strides}; + // indexers along repeated axis + const Strided1DIndexer src_axis_indexer{/* size */ src_axis_shape, + /* step */ src_axis_stride}; + const Strided1DIndexer dst_axis_indexer{/* size */ dst_axis_shape, + /* step */ dst_axis_stride}; + // indexer along reps array + const Strided1DIndexer reps_indexer{/* size */ reps_shape, + /* step */ reps_stride}; + + const std::size_t gws = orthog_nelems * src_axis_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + RepeatSequenceFunctor( + src_tp, dst_tp, reps_tp, cumsum_tp, src_axis_nelems, + orthog_indexer, src_axis_indexer, dst_axis_indexer, + reps_indexer)); + }); + + return repeat_ev; +} + +template +struct RepeatSequenceFactory +{ + fnT get() + { + fnT fn = repeat_by_sequence_impl; + return fn; + } +}; + +typedef sycl::event (*repeat_by_sequence_1d_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + const char *, + 
const char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event repeat_by_sequence_1d_impl(sycl::queue &q, + std::size_t src_nelems, + const char *src_cp, + char *dst_cp, + const char *reps_cp, + const char *cumsum_cp, + int src_nd, + const ssize_t *src_shape_strides, + ssize_t dst_shape, + ssize_t dst_stride, + ssize_t reps_shape, + ssize_t reps_stride, + const std::vector &depends) +{ + sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const T *src_tp = reinterpret_cast(src_cp); + const repT *reps_tp = reinterpret_cast(reps_cp); + const repT *cumsum_tp = reinterpret_cast(cumsum_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + // orthog ndim indexer + static constexpr TwoZeroOffsets_Indexer orthog_indexer{}; + // indexers along repeated axis + const StridedIndexer src_indexer{src_nd, 0, src_shape_strides}; + const Strided1DIndexer dst_indexer{/* size */ dst_shape, + /* step */ dst_stride}; + // indexer along reps array + const Strided1DIndexer reps_indexer{/* size */ reps_shape, + /* step */ reps_stride}; + + const std::size_t gws = src_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + RepeatSequenceFunctor( + src_tp, dst_tp, reps_tp, cumsum_tp, src_nelems, orthog_indexer, + src_indexer, dst_indexer, reps_indexer)); + }); + + return repeat_ev; +} + +template +struct RepeatSequence1DFactory +{ + fnT get() + { + fnT fn = repeat_by_sequence_1d_impl; + return fn; + } +}; + +template +class repeat_by_scalar_kernel; + +template +class RepeatScalarFunctor +{ +private: + const T *src = nullptr; + T *dst = nullptr; + ssize_t reps = 1; + std::size_t dst_axis_nelems = 0; + OrthogIndexer orthog_strider; + SrcAxisIndexer src_axis_strider; + DstAxisIndexer dst_axis_strider; + +public: + RepeatScalarFunctor(const T *src_, + T *dst_, + const ssize_t reps_, + std::size_t dst_axis_nelems_, + const OrthogIndexer &orthog_strider_, + const SrcAxisIndexer 
&src_axis_strider_, + const DstAxisIndexer &dst_axis_strider_) + : src(src_), dst(dst_), reps(reps_), dst_axis_nelems(dst_axis_nelems_), + orthog_strider(orthog_strider_), src_axis_strider(src_axis_strider_), + dst_axis_strider(dst_axis_strider_) + { + } + + void operator()(sycl::id<1> idx) const + { + std::size_t id = idx[0]; + auto i_orthog = id / dst_axis_nelems; + auto i_along = id - (i_orthog * dst_axis_nelems); + + auto orthog_offsets = orthog_strider(i_orthog); + auto src_offset = orthog_offsets.get_first_offset(); + auto dst_offset = orthog_offsets.get_second_offset(); + + auto dst_axis_offset = dst_axis_strider(i_along); + auto src_axis_offset = src_axis_strider(i_along / reps); + dst[dst_offset + dst_axis_offset] = src[src_offset + src_axis_offset]; + } +}; + +typedef sycl::event (*repeat_by_scalar_fn_ptr_t)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + const ssize_t, + int, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event repeat_by_scalar_impl(sycl::queue &q, + std::size_t orthog_nelems, + std::size_t dst_axis_nelems, + const char *src_cp, + char *dst_cp, + const ssize_t reps, + int orthog_nd, + const ssize_t *orthog_shape_and_strides, + ssize_t src_offset, + ssize_t dst_offset, + ssize_t src_axis_shape, + ssize_t src_axis_stride, + ssize_t dst_axis_shape, + ssize_t dst_axis_stride, + const std::vector &depends) +{ + sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const T *src_tp = reinterpret_cast(src_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + // orthog ndim indexer + const TwoOffsets_StridedIndexer orthog_indexer{ + orthog_nd, src_offset, dst_offset, orthog_shape_and_strides}; + // indexers along repeated axis + const Strided1DIndexer src_axis_indexer{/* size */ src_axis_shape, + /* step */ src_axis_stride}; + const Strided1DIndexer dst_axis_indexer{/* size */ dst_axis_shape, + /* step */ 
dst_axis_stride}; + + const std::size_t gws = orthog_nelems * dst_axis_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + RepeatScalarFunctor( + src_tp, dst_tp, reps, dst_axis_nelems, orthog_indexer, + src_axis_indexer, dst_axis_indexer)); + }); + + return repeat_ev; +} + +template +struct RepeatScalarFactory +{ + fnT get() + { + fnT fn = repeat_by_scalar_impl; + return fn; + } +}; + +typedef sycl::event (*repeat_by_scalar_1d_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + const ssize_t, + int, + const ssize_t *, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event repeat_by_scalar_1d_impl(sycl::queue &q, + std::size_t dst_nelems, + const char *src_cp, + char *dst_cp, + const ssize_t reps, + int src_nd, + const ssize_t *src_shape_strides, + ssize_t dst_shape, + ssize_t dst_stride, + const std::vector &depends) +{ + sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const T *src_tp = reinterpret_cast(src_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + // orthog ndim indexer + static constexpr TwoZeroOffsets_Indexer orthog_indexer{}; + // indexers along repeated axis + const StridedIndexer src_indexer(src_nd, 0, src_shape_strides); + const Strided1DIndexer dst_indexer{/* size */ dst_shape, + /* step */ dst_stride}; + + const std::size_t gws = dst_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + RepeatScalarFunctor(src_tp, dst_tp, reps, + dst_nelems, orthog_indexer, + src_indexer, dst_indexer)); + }); + + return repeat_ev; +} + +template +struct RepeatScalar1DFactory +{ + fnT get() + { + fnT fn = repeat_by_scalar_1d_impl; + return fn; + } +}; + +} // namespace dpctl::tensor::kernels::repeat diff --git a/dpctl_ext/tensor/libtensor/source/repeat.cpp b/dpctl_ext/tensor/libtensor/source/repeat.cpp new file mode 100644 index 000000000000..4bba1d35a08e --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/repeat.cpp @@ -0,0 +1,820 @@ 
+//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/repeat.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "simplify_iteration_space.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::repeat::repeat_by_sequence_fn_ptr_t; +static repeat_by_sequence_fn_ptr_t + repeat_by_sequence_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::repeat::repeat_by_sequence_1d_fn_ptr_t; +static repeat_by_sequence_1d_fn_ptr_t + repeat_by_sequence_1d_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::repeat::repeat_by_scalar_fn_ptr_t; +static repeat_by_scalar_fn_ptr_t + repeat_by_scalar_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::repeat::repeat_by_scalar_1d_fn_ptr_t; +static repeat_by_scalar_1d_fn_ptr_t + repeat_by_scalar_1d_dispatch_vector[td_ns::num_types]; + +void init_repeat_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::repeat::RepeatSequenceFactory; + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(repeat_by_sequence_dispatch_vector); + + using dpctl::tensor::kernels::repeat::RepeatSequence1DFactory; + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(repeat_by_sequence_1d_dispatch_vector); + + using dpctl::tensor::kernels::repeat::RepeatScalarFactory; + td_ns::DispatchVectorBuilder + dvb3; + 
dvb3.populate_dispatch_vector(repeat_by_scalar_dispatch_vector); + + using dpctl::tensor::kernels::repeat::RepeatScalar1DFactory; + td_ns::DispatchVectorBuilder + dvb4; + dvb4.populate_dispatch_vector(repeat_by_scalar_1d_dispatch_vector); +} + +std::pair + py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &reps, + const dpctl::tensor::usm_ndarray &cumsum, + int axis, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + if (axis < 0 || (axis + 1 > src_nd && src_nd > 0) || + (axis > 0 && src_nd == 0)) { + throw py::value_error("Specified axis is invalid."); + } + + int dst_nd = dst.get_ndim(); + if ((src_nd != dst_nd && src_nd > 0) || (src_nd == 0 && dst_nd > 1)) { + throw py::value_error("Number of dimensions of source and destination " + "arrays is not consistent"); + } + + int reps_nd = reps.get_ndim(); + if (reps_nd != 1) { + throw py::value_error("`reps` array must be 1-dimensional"); + } + + if (cumsum.get_ndim() != 1) { + throw py::value_error("`cumsum` array must be 1-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array to be C-contiguous."); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, reps, cumsum, dst})) + { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + std::size_t reps_sz = reps.get_size(); + std::size_t cumsum_sz = cumsum.get_size(); + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool same_orthog_dims(true); + std::size_t orthog_nelems(1); // number of orthogonal iterations + for (auto i = 0; i < axis; ++i) { + auto src_sh_i = src_shape[i]; + orthog_nelems *= src_sh_i; + same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]); + } + for (auto i = axis + 1; i < src_nd; 
++i) { + auto src_sh_i = src_shape[i]; + orthog_nelems *= src_sh_i; + same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]); + } + + std::size_t src_axis_nelems(1); + if (src_nd > 0) { + src_axis_nelems = src_shape[axis]; + } + std::size_t dst_axis_nelems(dst_shape[axis]); + + // shape at repeated axis must be equal to the sum of reps + if (!same_orthog_dims || src_axis_nelems != reps_sz || + src_axis_nelems != cumsum_sz) + { + throw py::value_error("Inconsistent array dimensions"); + } + + if (orthog_nelems == 0 || src_axis_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, orthog_nelems * dst_axis_nelems); + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src or reps + if (overlap(dst, src) || overlap(dst, reps) || overlap(dst, cumsum)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int reps_typenum = reps.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + int reps_typeid = array_types.typenum_to_lookup_id(reps_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data type"); + } + + static constexpr int int64_typeid = + static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Unexpected data type of `cumsum` array, expecting " + "'int64'"); + } + + if (reps_typeid != cumsum_typeid) { + throw py::value_error("`reps` array must have the same elemental " + "data type as cumsum"); + } + + const char *src_data_p = 
src.get_data(); + const char *reps_data_p = reps.get_data(); + const char *cumsum_data_p = cumsum.get_data(); + char *dst_data_p = dst.get_data(); + + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + auto reps_shape_vec = reps.get_shape_vector(); + auto reps_strides_vec = reps.get_strides_vector(); + + sycl::event repeat_ev; + std::vector host_task_events{}; + if (axis == 0 && src_nd < 2) { + // empty orthogonal directions + + auto fn = repeat_by_sequence_1d_dispatch_vector[src_typeid]; + + assert(dst_shape_vec.size() == 1); + assert(dst_strides_vec.size() == 1); + + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shape_strides = + packed_src_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + repeat_ev = + fn(exec_q, src_axis_nelems, src_data_p, dst_data_p, reps_data_p, + cumsum_data_p, src_nd, packed_src_shape_strides, + dst_shape_vec[0], dst_strides_vec[0], reps_shape_vec[0], + reps_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_src_shape_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + else { + // non-empty orthogonal directions + + auto fn = 
repeat_by_sequence_dispatch_vector[src_typeid]; + + int orthog_nd = src_nd - 1; + + using shT = std::vector; + shT orthog_src_shape; + shT orthog_src_strides; + shT axis_src_shape; + shT axis_src_stride; + dpctl::tensor::py_internal::split_iteration_space( + src_shape_vec, src_strides_vec, axis, axis + 1, orthog_src_shape, + axis_src_shape, orthog_src_strides, axis_src_stride); + + shT orthog_dst_shape; + shT orthog_dst_strides; + shT axis_dst_shape; + shT axis_dst_stride; + dpctl::tensor::py_internal::split_iteration_space( + dst_shape_vec, dst_strides_vec, axis, axis + 1, orthog_dst_shape, + axis_dst_shape, orthog_dst_strides, axis_dst_stride); + + assert(orthog_src_shape.size() == static_cast(orthog_nd)); + assert(orthog_dst_shape.size() == static_cast(orthog_nd)); + assert(std::equal(orthog_src_shape.begin(), orthog_src_shape.end(), + orthog_dst_shape.begin())); + + shT simplified_orthog_shape; + shT simplified_orthog_src_strides; + shT simplified_orthog_dst_strides; + + const py::ssize_t *_shape = orthog_src_shape.data(); + + py::ssize_t orthog_src_offset(0); + py::ssize_t orthog_dst_offset(0); + dpctl::tensor::py_internal::simplify_iteration_space( + orthog_nd, _shape, orthog_src_strides, orthog_dst_strides, + // output + simplified_orthog_shape, simplified_orthog_src_strides, + simplified_orthog_dst_strides, orthog_src_offset, + orthog_dst_offset); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_orthog_shape, + simplified_orthog_src_strides, simplified_orthog_dst_strides); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + 
all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + repeat_ev = fn(exec_q, orthog_nelems, src_axis_nelems, src_data_p, + dst_data_p, reps_data_p, cumsum_data_p, + // data to build orthog indexer + orthog_nd, packed_shapes_strides, orthog_src_offset, + orthog_dst_offset, + // data to build indexers along repeated axis in src + axis_src_shape[0], axis_src_stride[0], + // data to build indexer along repeated axis in dst + axis_dst_shape[0], axis_dst_stride[0], + // data to build indexer for reps array + reps_shape_vec[0], reps_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_shapes_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {src, reps, cumsum, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, repeat_ev); +} + +std::pair + py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &reps, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + const std::vector &depends) +{ + + int dst_nd = dst.get_ndim(); + if (dst_nd != 1) { + throw py::value_error( + "`dst` array must be 1-dimensional when repeating a full array"); + } + + int reps_nd = reps.get_ndim(); + if (reps_nd != 1) { + throw py::value_error("`reps` array must be 1-dimensional"); + } + + if (cumsum.get_ndim() != 1) { + throw py::value_error("`cumsum` array must be 1-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array to be C-contiguous."); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, reps, cumsum, dst})) + { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + 
dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + std::size_t src_sz = src.get_size(); + std::size_t reps_sz = reps.get_size(); + std::size_t cumsum_sz = cumsum.get_size(); + + // shape at repeated axis must be equal to the sum of reps + if (src_sz != reps_sz || src_sz != cumsum_sz) { + throw py::value_error("Inconsistent array dimensions"); + } + + if (src_sz == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, + dst.get_size()); + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src, cumsum, or reps + if (overlap(dst, src) || overlap(dst, reps) || overlap(dst, cumsum)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int reps_typenum = reps.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + int reps_typeid = array_types.typenum_to_lookup_id(reps_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data type"); + } + + static constexpr int int64_typeid = + static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Unexpected data type of `cumsum` array, expecting " + "'int64'"); + } + + if (reps_typeid != cumsum_typeid) { + throw py::value_error("`reps` array must have the same elemental " + "data type as cumsum"); + } + + const char *src_data_p = src.get_data(); + const char *reps_data_p = reps.get_data(); + const char *cumsum_data_p = cumsum.get_data(); + char *dst_data_p = dst.get_data(); + + int src_nd = 
src.get_ndim(); + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; + } + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + auto reps_shape_vec = reps.get_shape_vector(); + auto reps_strides_vec = reps.get_strides_vector(); + + std::vector host_task_events{}; + + auto fn = repeat_by_sequence_1d_dispatch_vector[src_typeid]; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shapes_strides = + packed_src_shapes_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + sycl::event repeat_ev = fn( + exec_q, src_sz, src_data_p, dst_data_p, reps_data_p, cumsum_data_p, + src_nd, packed_src_shapes_strides, dst_shape_vec[0], dst_strides_vec[0], + reps_shape_vec[0], reps_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_src_shapes_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {src, reps, cumsum, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, repeat_ev); +} + +std::pair + py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const py::ssize_t reps, + int axis, + sycl::queue &exec_q, + const 
std::vector &depends) +{ + int src_nd = src.get_ndim(); + if (axis < 0 || (axis + 1 > src_nd && src_nd > 0) || + (axis > 0 && src_nd == 0)) { + throw py::value_error("Specified axis is invalid."); + } + + int dst_nd = dst.get_ndim(); + if ((src_nd != dst_nd && src_nd > 0) || (src_nd == 0 && dst_nd > 1)) { + throw py::value_error("Number of dimensions of source and destination " + "arrays is not consistent"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool same_orthog_dims(true); + std::size_t orthog_nelems(1); // number of orthogonal iterations + for (auto i = 0; i < axis; ++i) { + auto src_sh_i = src_shape[i]; + orthog_nelems *= src_sh_i; + same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]); + } + for (auto i = axis + 1; i < src_nd; ++i) { + auto src_sh_i = src_shape[i]; + orthog_nelems *= src_sh_i; + same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]); + } + + std::size_t src_axis_nelems(1); + if (src_nd > 0) { + src_axis_nelems = src_shape[axis]; + } + std::size_t dst_axis_nelems(dst_shape[axis]); + + // shape at repeated axis must be equal to the shape of src at the axis * + // reps + if (!same_orthog_dims || (src_axis_nelems * reps) != dst_axis_nelems) { + throw py::value_error("Inconsistent array dimensions"); + } + + if (orthog_nelems == 0 || src_axis_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, orthog_nelems * (src_axis_nelems * reps)); + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src + if (overlap(dst, src)) { + throw py::value_error("Destination array overlaps 
with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data type"); + } + + const char *src_data_p = src.get_data(); + char *dst_data_p = dst.get_data(); + + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + sycl::event repeat_ev; + std::vector host_task_events{}; + if (axis == 0 && src_nd < 2) { + // empty orthogonal directions + + auto fn = repeat_by_scalar_1d_dispatch_vector[src_typeid]; + + assert(dst_shape_vec.size() == 1); + assert(dst_strides_vec.size() == 1); + + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shape_strides = + packed_src_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + repeat_ev = fn(exec_q, dst_axis_nelems, src_data_p, dst_data_p, reps, + src_nd, packed_src_shape_strides, dst_shape_vec[0], + dst_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, 
packed_src_shape_strides_owner); + + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + else { + // non-empty orthogonal directions + + auto fn = repeat_by_scalar_dispatch_vector[src_typeid]; + + int orthog_nd = src_nd - 1; + + using shT = std::vector; + shT orthog_src_shape; + shT orthog_src_strides; + shT axis_src_shape; + shT axis_src_stride; + dpctl::tensor::py_internal::split_iteration_space( + src_shape_vec, src_strides_vec, axis, axis + 1, orthog_src_shape, + axis_src_shape, orthog_src_strides, axis_src_stride); + + shT orthog_dst_shape; + shT orthog_dst_strides; + shT axis_dst_shape; + shT axis_dst_stride; + dpctl::tensor::py_internal::split_iteration_space( + dst_shape_vec, dst_strides_vec, axis, axis + 1, orthog_dst_shape, + axis_dst_shape, orthog_dst_strides, axis_dst_stride); + + assert(orthog_src_shape.size() == static_cast(orthog_nd)); + assert(orthog_dst_shape.size() == static_cast(orthog_nd)); + assert(std::equal(orthog_src_shape.begin(), orthog_src_shape.end(), + orthog_dst_shape.begin())); + + shT simplified_orthog_shape; + shT simplified_orthog_src_strides; + shT simplified_orthog_dst_strides; + + const py::ssize_t *_shape = orthog_src_shape.data(); + + py::ssize_t orthog_src_offset(0); + py::ssize_t orthog_dst_offset(0); + + dpctl::tensor::py_internal::simplify_iteration_space( + orthog_nd, _shape, orthog_src_strides, orthog_dst_strides, + // output + simplified_orthog_shape, simplified_orthog_src_strides, + simplified_orthog_dst_strides, orthog_src_offset, + orthog_dst_offset); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_orthog_shape, + simplified_orthog_src_strides, simplified_orthog_dst_strides); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + 
packed_shapes_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + repeat_ev = fn(exec_q, orthog_nelems, dst_axis_nelems, src_data_p, + dst_data_p, reps, + // data to build orthog indexer + orthog_nd, packed_shapes_strides, orthog_src_offset, + orthog_dst_offset, + // data to build indexer along repeated axis in src + axis_src_shape[0], axis_src_stride[0], + // data to build indexer along repeated axis in dst + axis_dst_shape[0], axis_dst_stride[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_shapes_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + + sycl::event py_obj_management_host_task_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, repeat_ev); +} + +std::pair + py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const py::ssize_t reps, + sycl::queue &exec_q, + const std::vector &depends) +{ + int dst_nd = dst.get_ndim(); + if (dst_nd != 1) { + throw py::value_error( + "`dst` array must be 1-dimensional when repeating a full array"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + std::size_t src_sz = src.get_size(); + std::size_t dst_sz = dst.get_size(); + + // shape at repeated axis must be equal to the shape of src at the axis * + // reps + if ((src_sz * reps) != dst_sz) { + throw py::value_error("Inconsistent array dimensions"); + } + + if (src_sz == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + 
dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, + src_sz * reps); + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src + if (overlap(dst, src)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data type"); + } + + const char *src_data_p = src.get_data(); + char *dst_data_p = dst.get_data(); + + int src_nd = src.get_ndim(); + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; + } + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + std::vector host_task_events{}; + + auto fn = repeat_by_scalar_1d_dispatch_vector[src_typeid]; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shape_strides = + packed_src_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + sycl::event repeat_ev = fn(exec_q, dst_sz, src_data_p, dst_data_p, reps, + src_nd, packed_src_shape_strides, + dst_shape_vec[0], dst_strides_vec[0], 
all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_src_shape_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + + sycl::event py_obj_management_host_task_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, repeat_ev); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/repeat.hpp b/dpctl_ext/tensor/libtensor/source/repeat.hpp new file mode 100644 index 000000000000..5835377fb29c --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/repeat.hpp @@ -0,0 +1,83 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_repeat_dispatch_vectors(void); + +extern std::pair + py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &reps, + const dpctl::tensor::usm_ndarray &cumsum, + int axis, + sycl::queue &exec_q, + const std::vector &depends); + +extern std::pair + py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &reps, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + const std::vector &depends); + +extern std::pair + py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const py::ssize_t reps, + int axis, + sycl::queue &exec_q, + const std::vector &depends); + +extern std::pair + py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const py::ssize_t reps, + 
sycl::queue &exec_q, + const std::vector &depends); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 98ab488e5879..48e65c4deb84 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -57,7 +57,7 @@ #include "integer_advanced_indexing.hpp" #include "kernels/dpctl_tensor_types.hpp" // #include "linear_sequences.hpp" -// #include "repeat.hpp" +#include "repeat.hpp" #include "simplify_iteration_space.hpp" #include "triul_ctor.hpp" #include "utils/memory_overlap.hpp" @@ -119,8 +119,8 @@ using dpctl::tensor::py_internal::py_place; /* ================= Repeat ====================*/ using dpctl::tensor::py_internal::py_cumsum_1d; -// using dpctl::tensor::py_internal::py_repeat_by_scalar; -// using dpctl::tensor::py_internal::py_repeat_by_sequence; +using dpctl::tensor::py_internal::py_repeat_by_scalar; +using dpctl::tensor::py_internal::py_repeat_by_sequence; /* ================ Eye ================== */ @@ -169,7 +169,7 @@ void init_dispatch_vectors(void) populate_mask_positions_dispatch_vectors(); populate_cumsum_1d_dispatch_vectors(); - // init_repeat_dispatch_vectors(); + init_repeat_dispatch_vectors(); // init_clip_dispatch_vectors(); @@ -450,45 +450,43 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"), // py::arg("depends") = py::list()); - // auto repeat_sequence = [](const dpctl::tensor::usm_ndarray &src, - // const dpctl::tensor::usm_ndarray &dst, - // const dpctl::tensor::usm_ndarray &reps, - // const dpctl::tensor::usm_ndarray &cumsum, - // std::optional axis, sycl::queue &exec_q, - // const std::vector depends) - // -> std::pair { - // if (axis) { - // return py_repeat_by_sequence(src, dst, reps, cumsum, - // axis.value(), - // exec_q, depends); - // } - // else { - // return py_repeat_by_sequence(src, dst, reps, cumsum, exec_q, - // 
depends); - // } - // }; - // m.def("_repeat_by_sequence", repeat_sequence, py::arg("src"), - // py::arg("dst"), py::arg("reps"), py::arg("cumsum"), - // py::arg("axis"), py::arg("sycl_queue"), py::arg("depends") = - // py::list()); - - // auto repeat_scalar = [](const dpctl::tensor::usm_ndarray &src, - // const dpctl::tensor::usm_ndarray &dst, - // const py::ssize_t reps, std::optional axis, - // sycl::queue &exec_q, - // const std::vector depends) - // -> std::pair { - // if (axis) { - // return py_repeat_by_scalar(src, dst, reps, axis.value(), exec_q, - // depends); - // } - // else { - // return py_repeat_by_scalar(src, dst, reps, exec_q, depends); - // } - // }; - // m.def("_repeat_by_scalar", repeat_scalar, py::arg("src"), py::arg("dst"), - // py::arg("reps"), py::arg("axis"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + auto repeat_sequence = [](const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &reps, + const dpctl::tensor::usm_ndarray &cumsum, + std::optional axis, sycl::queue &exec_q, + const std::vector depends) + -> std::pair { + if (axis) { + return py_repeat_by_sequence(src, dst, reps, cumsum, axis.value(), + exec_q, depends); + } + else { + return py_repeat_by_sequence(src, dst, reps, cumsum, exec_q, + depends); + } + }; + m.def("_repeat_by_sequence", repeat_sequence, py::arg("src"), + py::arg("dst"), py::arg("reps"), py::arg("cumsum"), py::arg("axis"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto repeat_scalar = [](const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const py::ssize_t reps, std::optional axis, + sycl::queue &exec_q, + const std::vector depends) + -> std::pair { + if (axis) { + return py_repeat_by_scalar(src, dst, reps, axis.value(), exec_q, + depends); + } + else { + return py_repeat_by_scalar(src, dst, reps, exec_q, depends); + } + }; + m.def("_repeat_by_scalar", repeat_scalar, py::arg("src"), 
py::arg("dst"), + py::arg("reps"), py::arg("axis"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); // m.def("_clip", &py_clip, // "Clamps elements of array `x` to the range " From 43f97cf6b9382a51acbb327e94242ad6a7c775f1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 13:24:53 -0800 Subject: [PATCH 060/145] Move repeat() to dpctl_ext/tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_manipulation_functions.py | 232 +++++++++++++++++++- dpnp/dpnp_iface_manipulation.py | 2 +- 3 files changed, 234 insertions(+), 2 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index fa76faccc632..033004bb0095 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -50,6 +50,7 @@ take_along_axis, ) from dpctl_ext.tensor._manipulation_functions import ( + repeat, roll, ) from dpctl_ext.tensor._reshape import reshape @@ -66,6 +67,7 @@ "place", "put", "put_along_axis", + "repeat", "reshape", "roll", "take", diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index fa8fc27876b3..85b4f029156b 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -28,6 +28,7 @@ import operator +import dpctl import dpctl.tensor as dpt import dpctl.utils as dputils import numpy as np @@ -36,7 +37,7 @@ # when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti -from ._numpy_helper import normalize_axis_tuple +from ._numpy_helper import normalize_axis_index, normalize_axis_tuple __doc__ = ( "Implementation module for array manipulation " @@ -44,6 +45,235 @@ ) +def repeat(x, repeats, /, *, axis=None): + """repeat(x, repeats, axis=None) + + Repeat elements of an array on a per-element basis. + + Args: + x (usm_ndarray): input array + + repeats (Union[int, Sequence[int, ...], usm_ndarray]): + The number of repetitions for each element. 
+ + `repeats` must be broadcast-compatible with `N` where `N` is + `prod(x.shape)` if `axis` is `None` and `x.shape[axis]` + otherwise. + + If `repeats` is an array, it must have an integer data type. + Otherwise, `repeats` must be a Python integer or sequence of + Python integers (i.e., a tuple, list, or range). + + axis (Optional[int]): + The axis along which to repeat values. If `axis` is `None`, the + function repeats elements of the flattened array. Default: `None`. + + Returns: + usm_ndarray: + output array with repeated elements. + + If `axis` is `None`, the returned array is one-dimensional, + otherwise, it has the same shape as `x`, except for the axis along + which elements were repeated. + + The returned array will have the same data type as `x`. + The returned array will be located on the same device as `x` and + have the same USM allocation type as `x`. + + Raises: + AxisError: if `axis` value is invalid. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(x)}.") + + x_ndim = x.ndim + x_shape = x.shape + if axis is not None: + axis = normalize_axis_index(operator.index(axis), x_ndim) + axis_size = x_shape[axis] + else: + axis_size = x.size + + scalar = False + if isinstance(repeats, int): + if repeats < 0: + raise ValueError("`repeats` must be a positive integer") + usm_type = x.usm_type + exec_q = x.sycl_queue + scalar = True + elif isinstance(repeats, dpt.usm_ndarray): + if repeats.ndim > 1: + raise ValueError( + "`repeats` array must be 0- or 1-dimensional, got " + f"{repeats.ndim}" + ) + exec_q = dpctl.utils.get_execution_queue( + (x.sycl_queue, repeats.sycl_queue) + ) + if exec_q is None: + raise dputils.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." 
+ ) + usm_type = dpctl.utils.get_coerced_usm_type( + ( + x.usm_type, + repeats.usm_type, + ) + ) + dpctl.utils.validate_usm_type(usm_type, allow_none=False) + if not dpt.can_cast(repeats.dtype, dpt.int64, casting="same_kind"): + raise TypeError( + f"'repeats' data type {repeats.dtype} cannot be cast to " + "'int64' according to the casting rule ''safe.''" + ) + if repeats.size == 1: + scalar = True + # bring the single element to the host + if repeats.ndim == 0: + repeats = int(repeats) + else: + # Get the single element explicitly + # since non-0D arrays can not be converted to scalars + repeats = int(repeats[0]) + if repeats < 0: + raise ValueError("`repeats` elements must be positive") + else: + if repeats.size != axis_size: + raise ValueError( + "'repeats' array must be broadcastable to the size of " + "the repeated axis" + ) + if not dpt.all(repeats >= 0): + raise ValueError("'repeats' elements must be positive") + + elif isinstance(repeats, (tuple, list, range)): + usm_type = x.usm_type + exec_q = x.sycl_queue + + len_reps = len(repeats) + if len_reps == 1: + repeats = repeats[0] + if repeats < 0: + raise ValueError("`repeats` elements must be positive") + scalar = True + else: + if len_reps != axis_size: + raise ValueError( + "`repeats` sequence must have the same length as the " + "repeated axis" + ) + repeats = dpt.asarray( + repeats, dtype=dpt.int64, usm_type=usm_type, sycl_queue=exec_q + ) + if not dpt.all(repeats >= 0): + raise ValueError("`repeats` elements must be positive") + else: + raise TypeError( + "Expected int, sequence, or `usm_ndarray` for second argument," + f"got {type(repeats)}" + ) + + _manager = dputils.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if scalar: + res_axis_size = repeats * axis_size + if axis is not None: + res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + else: + res_shape = (res_axis_size,) + res = dpt.empty( + res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q + ) 
+ if res_axis_size > 0: + ht_rep_ev, rep_ev = ti._repeat_by_scalar( + src=x, + dst=res, + reps=repeats, + axis=axis, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_rep_ev, rep_ev) + else: + if repeats.dtype != dpt.int64: + rep_buf = dpt.empty( + repeats.shape, + dtype=dpt.int64, + usm_type=usm_type, + sycl_queue=exec_q, + ) + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=repeats, dst=rep_buf, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + cumsum = dpt.empty( + (axis_size,), + dtype=dpt.int64, + usm_type=usm_type, + sycl_queue=exec_q, + ) + # _cumsum_1d synchronizes so `depends` ends here safely + res_axis_size = ti._cumsum_1d( + rep_buf, cumsum, sycl_queue=exec_q, depends=[copy_ev] + ) + if axis is not None: + res_shape = ( + x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + ) + else: + res_shape = (res_axis_size,) + res = dpt.empty( + res_shape, + dtype=x.dtype, + usm_type=usm_type, + sycl_queue=exec_q, + ) + if res_axis_size > 0: + ht_rep_ev, rep_ev = ti._repeat_by_sequence( + src=x, + dst=res, + reps=rep_buf, + cumsum=cumsum, + axis=axis, + sycl_queue=exec_q, + ) + _manager.add_event_pair(ht_rep_ev, rep_ev) + else: + cumsum = dpt.empty( + (axis_size,), + dtype=dpt.int64, + usm_type=usm_type, + sycl_queue=exec_q, + ) + res_axis_size = ti._cumsum_1d( + repeats, cumsum, sycl_queue=exec_q, depends=dep_evs + ) + if axis is not None: + res_shape = ( + x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + ) + else: + res_shape = (res_axis_size,) + res = dpt.empty( + res_shape, + dtype=x.dtype, + usm_type=usm_type, + sycl_queue=exec_q, + ) + if res_axis_size > 0: + ht_rep_ev, rep_ev = ti._repeat_by_sequence( + src=x, + dst=res, + reps=repeats, + cumsum=cumsum, + axis=axis, + sycl_queue=exec_q, + ) + _manager.add_event_pair(ht_rep_ev, rep_ev) + return res + + def roll(x, /, shift, *, axis=None): """ roll(x, shift, axis) diff --git a/dpnp/dpnp_iface_manipulation.py 
b/dpnp/dpnp_iface_manipulation.py index e988bbaa237b..8aa5b6832b3d 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -2837,7 +2837,7 @@ def repeat(a, repeats, axis=None): a = dpnp.ravel(a) usm_arr = dpnp.get_usm_ndarray(a) - usm_res = dpt.repeat(usm_arr, repeats, axis=axis) + usm_res = dpt_ext.repeat(usm_arr, repeats, axis=axis) return dpnp_array._create_from_usm_ndarray(usm_res) From 9b248f4bff68c5ccb153a1fb51cebcd582fd6cf7 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 13:48:50 -0800 Subject: [PATCH 061/145] Move ti._where() to dpctl_ext/tensor/libtensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../libtensor/include/kernels/where.hpp | 339 ++++++++++++++++++ .../tensor/libtensor/source/tensor_ctors.cpp | 10 +- dpctl_ext/tensor/libtensor/source/where.cpp | 266 ++++++++++++++ dpctl_ext/tensor/libtensor/source/where.hpp | 57 +++ 5 files changed, 668 insertions(+), 6 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/where.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/where.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/where.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 797dbd6f4d1b..cec88aed44c8 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -58,7 +58,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/where.hpp b/dpctl_ext/tensor/libtensor/include/kernels/where.hpp new 
file mode 100644 index 000000000000..b92a3a76c9c1 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/where.hpp @@ -0,0 +1,339 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for dpctl.tensor.where. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "kernels/alignment.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::search +{ + +using dpctl::tensor::ssize_t; +using namespace dpctl::tensor::offset_utils; + +using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + +using dpctl::tensor::sycl_utils::sub_group_load; +using dpctl::tensor::sycl_utils::sub_group_store; + +template +class where_strided_kernel; +template +class where_contig_kernel; + +template +class WhereContigFunctor +{ +private: + std::size_t nelems = 0; + const condT *cond_p = nullptr; + const T *x1_p = nullptr; + const T *x2_p = nullptr; + T *dst_p = nullptr; + +public: + WhereContigFunctor(std::size_t nelems_, + const condT *cond_p_, + const T *x1_p_, + const T *x2_p_, + T *dst_p_) + : nelems(nelems_), cond_p(cond_p_), x1_p(x1_p_), x2_p(x2_p_), + dst_p(dst_p_) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz; + + using dpctl::tensor::type_utils::is_complex; + if constexpr (!enable_sg_loadstore || is_complex::value || + is_complex::value) + { + const std::uint16_t sgSize = + ndit.get_sub_group().get_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + + const std::uint16_t nelems_per_sg = sgSize * nelems_per_wi; + const std::size_t start = + (gid / sgSize) * 
(nelems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems, start + nelems_per_sg); + for (std::size_t offset = start; offset < end; offset += sgSize) { + using dpctl::tensor::type_utils::convert_impl; + const bool check = convert_impl(cond_p[offset]); + dst_p[offset] = check ? x1_p[offset] : x2_p[offset]; + } + } + else { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + + const std::size_t base = + nelems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + nelems_per_wi * sgSize < nelems) { + sycl::vec dst_vec; + +#pragma unroll + for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + const std::size_t idx = base + it * sgSize; + auto x1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&x1_p[idx]); + auto x2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&x2_p[idx]); + auto cond_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&cond_p[idx]); + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&dst_p[idx]); + + const sycl::vec x1_vec = + sub_group_load(sg, x1_multi_ptr); + const sycl::vec x2_vec = + sub_group_load(sg, x2_multi_ptr); + const sycl::vec cond_vec = + sub_group_load(sg, cond_multi_ptr); +#pragma unroll + for (std::uint8_t k = 0; k < vec_sz; ++k) { + dst_vec[k] = cond_vec[k] ? x1_vec[k] : x2_vec[k]; + } + sub_group_store(sg, dst_vec, dst_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems; k += sgSize) { + dst_p[k] = cond_p[k] ? 
x1_p[k] : x2_p[k]; + } + } + } + } +}; + +typedef sycl::event (*where_contig_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + const char *, + const char *, + char *, + const std::vector &); + +template +sycl::event where_contig_impl(sycl::queue &q, + std::size_t nelems, + const char *cond_cp, + const char *x1_cp, + const char *x2_cp, + char *dst_cp, + const std::vector &depends) +{ + const condT *cond_tp = reinterpret_cast(cond_cp); + const T *x1_tp = reinterpret_cast(x1_cp); + const T *x2_tp = reinterpret_cast(x2_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + sycl::event where_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + std::size_t lws = 64; + static constexpr std::uint8_t vec_sz = 4u; + static constexpr std::uint8_t n_vecs = 2u; + const std::size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + if (is_aligned(cond_cp) && + is_aligned(x1_cp) && + is_aligned(x2_cp) && + is_aligned(dst_cp)) + { + static constexpr bool enable_sg_loadstore = true; + using KernelName = where_contig_kernel; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + WhereContigFunctor(nelems, cond_tp, x1_tp, + x2_tp, dst_tp)); + } + else { + static constexpr bool disable_sg_loadstore = false; + using InnerKernelName = + where_contig_kernel; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + WhereContigFunctor(nelems, cond_tp, x1_tp, + x2_tp, dst_tp)); + } + }); + + return where_ev; +} + +template +class WhereStridedFunctor +{ +private: + const T *x1_p = nullptr; + const T *x2_p = nullptr; + T *dst_p = nullptr; + const condT *cond_p = nullptr; + IndexerT indexer; + +public: + WhereStridedFunctor(const condT *cond_p_, + const T *x1_p_, + const T *x2_p_, + T *dst_p_, + const IndexerT &indexer_) + : x1_p(x1_p_), x2_p(x2_p_), 
dst_p(dst_p_), cond_p(cond_p_), + indexer(indexer_) + { + } + + void operator()(sycl::id<1> id) const + { + std::size_t gid = id[0]; + auto offsets = indexer(static_cast(gid)); + + using dpctl::tensor::type_utils::convert_impl; + bool check = + convert_impl(cond_p[offsets.get_first_offset()]); + + dst_p[offsets.get_fourth_offset()] = + check ? x1_p[offsets.get_second_offset()] + : x2_p[offsets.get_third_offset()]; + } +}; + +typedef sycl::event (*where_strided_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const char *, + const char *, + const char *, + char *, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event where_strided_impl(sycl::queue &q, + std::size_t nelems, + int nd, + const char *cond_cp, + const char *x1_cp, + const char *x2_cp, + char *dst_cp, + const ssize_t *shape_strides, + ssize_t x1_offset, + ssize_t x2_offset, + ssize_t cond_offset, + ssize_t dst_offset, + const std::vector &depends) +{ + const condT *cond_tp = reinterpret_cast(cond_cp); + const T *x1_tp = reinterpret_cast(x1_cp); + const T *x2_tp = reinterpret_cast(x2_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + sycl::event where_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const FourOffsets_StridedIndexer indexer{ + nd, cond_offset, x1_offset, x2_offset, dst_offset, shape_strides}; + + cgh.parallel_for< + where_strided_kernel>( + sycl::range<1>(nelems), + WhereStridedFunctor( + cond_tp, x1_tp, x2_tp, dst_tp, indexer)); + }); + + return where_ev; +} + +template +struct WhereStridedFactory +{ + fnT get() + { + fnT fn = where_strided_impl; + return fn; + } +}; + +template +struct WhereContigFactory +{ + fnT get() + { + fnT fn = where_contig_impl; + return fn; + } +}; + +} // namespace dpctl::tensor::kernels::search diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 48e65c4deb84..1619357aaf64 100644 --- 
a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -62,7 +62,7 @@ #include "triul_ctor.hpp" #include "utils/memory_overlap.hpp" #include "utils/strided_iters.hpp" -// #include "where.hpp" +#include "where.hpp" #include "zeros_ctor.hpp" namespace py = pybind11; @@ -132,7 +132,7 @@ using dpctl::tensor::py_internal::usm_ndarray_triul; /* =========================== Where ============================== */ -// using dpctl::tensor::py_internal::py_where; +using dpctl::tensor::py_internal::py_where; /* =========================== Clip ============================== */ // using dpctl::tensor::py_internal::py_clip; @@ -446,9 +446,9 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("mask_shape"), py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_where", &py_where, "", py::arg("condition"), py::arg("x1"), - // py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + m.def("_where", &py_where, "", py::arg("condition"), py::arg("x1"), + py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); auto repeat_sequence = [](const dpctl::tensor::usm_ndarray &src, const dpctl::tensor::usm_ndarray &dst, diff --git a/dpctl_ext/tensor/libtensor/source/where.cpp b/dpctl_ext/tensor/libtensor/source/where.cpp new file mode 100644 index 000000000000..1afdbf45c66b --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/where.cpp @@ -0,0 +1,266 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines Python API for implementation functions of +/// dpctl.tensor.where +//===---------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/where.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "simplify_iteration_space.hpp" +#include "where.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::search::where_contig_impl_fn_ptr_t; +using dpctl::tensor::kernels::search::where_strided_impl_fn_ptr_t; + +static where_contig_impl_fn_ptr_t where_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static where_strided_impl_fn_ptr_t + where_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::utils::keep_args_alive; + +std::pair + py_where(const dpctl::tensor::usm_ndarray &condition, + const dpctl::tensor::usm_ndarray &x1, + const dpctl::tensor::usm_ndarray &x2, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + + if (!dpctl::utils::queues_are_compatible(exec_q, {x1, x2, condition, dst})) + { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + int nd = condition.get_ndim(); + int x1_nd = x1.get_ndim(); + int x2_nd = x2.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (nd != x1_nd || nd != x2_nd) { + throw py::value_error( + "Input arrays are not of appropriate dimension for 
where kernel."); + } + + if (nd != dst_nd) { + throw py::value_error( + "Destination is not of appropriate dimension for where kernel."); + } + + const py::ssize_t *x1_shape = x1.get_shape_raw(); + const py::ssize_t *x2_shape = x2.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + const py::ssize_t *cond_shape = condition.get_shape_raw(); + + bool shapes_equal(true); + std::size_t nelems(1); + for (int i = 0; i < nd; ++i) { + const auto &sh_i = dst_shape[i]; + nelems *= static_cast(sh_i); + shapes_equal = shapes_equal && (x1_shape[i] == sh_i) && + (x2_shape[i] == sh_i) && (cond_shape[i] == sh_i); + } + + if (!shapes_equal) { + throw py::value_error("Axes are not of matching shapes."); + } + + if (nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + auto const &same_logical_tensors = + dpctl::tensor::overlap::SameLogicalTensors(); + if ((overlap(dst, condition) && !same_logical_tensors(dst, condition)) || + (overlap(dst, x1) && !same_logical_tensors(dst, x1)) || + (overlap(dst, x2) && !same_logical_tensors(dst, x2))) + { + throw py::value_error("Destination array overlaps with input."); + } + + int x1_typenum = x1.get_typenum(); + int x2_typenum = x2.get_typenum(); + int cond_typenum = condition.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int cond_typeid = array_types.typenum_to_lookup_id(cond_typenum); + int x1_typeid = array_types.typenum_to_lookup_id(x1_typenum); + int x2_typeid = array_types.typenum_to_lookup_id(x2_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (x1_typeid != x2_typeid || x1_typeid != dst_typeid) { + throw py::value_error("Value arrays must have the same data type"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, nelems); + + char *cond_data = condition.get_data(); + char *x1_data = x1.get_data(); + char 
*x2_data = x2.get_data(); + char *dst_data = dst.get_data(); + + bool is_x1_c_contig = x1.is_c_contiguous(); + bool is_x1_f_contig = x1.is_f_contiguous(); + + bool is_x2_c_contig = x2.is_c_contiguous(); + bool is_x2_f_contig = x2.is_f_contiguous(); + + bool is_cond_c_contig = condition.is_c_contiguous(); + bool is_cond_f_contig = condition.is_f_contiguous(); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + bool all_c_contig = (is_x1_c_contig && is_x2_c_contig && is_cond_c_contig && + is_dst_c_contig); + bool all_f_contig = (is_x1_f_contig && is_x2_f_contig && is_cond_f_contig && + is_dst_f_contig); + + if (all_c_contig || all_f_contig) { + auto contig_fn = where_contig_dispatch_table[x1_typeid][cond_typeid]; + + auto where_ev = contig_fn(exec_q, nelems, cond_data, x1_data, x2_data, + dst_data, depends); + sycl::event ht_ev = + keep_args_alive(exec_q, {x1, x2, dst, condition}, {where_ev}); + + return std::make_pair(ht_ev, where_ev); + } + + auto const &cond_strides = condition.get_strides_vector(); + auto const &x1_strides = x1.get_strides_vector(); + auto const &x2_strides = x2.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_cond_strides; + shT simplified_x1_strides; + shT simplified_x2_strides; + shT simplified_dst_strides; + py::ssize_t cond_offset(0); + py::ssize_t x1_offset(0); + py::ssize_t x2_offset(0); + py::ssize_t dst_offset(0); + + dpctl::tensor::py_internal::simplify_iteration_space_4( + nd, x1_shape, cond_strides, x1_strides, x2_strides, dst_strides, + // outputs + simplified_shape, simplified_cond_strides, simplified_x1_strides, + simplified_x2_strides, simplified_dst_strides, cond_offset, x1_offset, + x2_offset, dst_offset); + + auto fn = where_strided_dispatch_table[x1_typeid][cond_typeid]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using 
dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, + // common shape and strides + simplified_shape, simplified_cond_strides, simplified_x1_strides, + simplified_x2_strides, simplified_dst_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + sycl::event copy_shape_strides_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shape_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + sycl::event where_ev = fn(exec_q, nelems, nd, cond_data, x1_data, x2_data, + dst_data, packed_shape_strides, cond_offset, + x1_offset, x2_offset, dst_offset, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {where_ev}, packed_shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {x1, x2, condition, dst}, host_task_events); + + return std::make_pair(arg_cleanup_ev, where_ev); +} + +void init_where_dispatch_tables(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::search::WhereContigFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(where_contig_dispatch_table); + + using dpctl::tensor::kernels::search::WhereStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(where_strided_dispatch_table); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/where.hpp b/dpctl_ext/tensor/libtensor/source/where.hpp new file mode 100644 index 000000000000..ba81d8b11642 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/where.hpp @@ -0,0 +1,57 @@ 
+//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file declares Python API for implementation functions of +/// dpctl.tensor.where +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + py_where(const dpctl::tensor::usm_ndarray &, + const dpctl::tensor::usm_ndarray &, + const dpctl::tensor::usm_ndarray &, + const dpctl::tensor::usm_ndarray &, + sycl::queue &, + const std::vector &); + +extern void init_where_dispatch_tables(void); + +} // namespace dpctl::tensor::py_internal From 24425cf5b3991ecf0be082a65ceb9909d75ca0b2 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 14:29:57 -0800 Subject: [PATCH 062/145] Move _type_utils.py to dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 7 + dpctl_ext/tensor/_type_utils.py | 998 ++++++++++++++++++++++++++++++++ 2 files changed, 1005 insertions(+) create mode 100644 dpctl_ext/tensor/_type_utils.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 033004bb0095..d7f2a785802c 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -55,20 +55,27 @@ ) from dpctl_ext.tensor._reshape import reshape +from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type + __all__ = [ "asnumpy", "astype", + "can_cast", "copy", "extract", "eye", + "finfo", "from_numpy", "full", + "iinfo", + "isdtype", "nonzero", "place", "put", "put_along_axis", "repeat", "reshape", + "result_type", "roll", "take", "take_along_axis", diff --git a/dpctl_ext/tensor/_type_utils.py b/dpctl_ext/tensor/_type_utils.py new file mode 100644 index 000000000000..bcfec499f355 --- /dev/null +++ b/dpctl_ext/tensor/_type_utils.py @@ -0,0 +1,998 @@ +# 
***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from __future__ import annotations + +import dpctl.tensor as dpt +import numpy as np + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._tensor_impl as ti + + +def _all_data_types(_fp16, _fp64): + _non_fp_types = [ + dpt.bool, + dpt.int8, + dpt.uint8, + dpt.int16, + dpt.uint16, + dpt.int32, + dpt.uint32, + dpt.int64, + dpt.uint64, + ] + if _fp64: + if _fp16: + return _non_fp_types + [ + dpt.float16, + dpt.float32, + dpt.float64, + dpt.complex64, + dpt.complex128, + ] + else: + return _non_fp_types + [ + dpt.float32, + dpt.float64, + dpt.complex64, + dpt.complex128, + ] + else: + if _fp16: + return _non_fp_types + [ + dpt.float16, + dpt.float32, + dpt.complex64, + ] + else: + return _non_fp_types + [ + dpt.float32, + dpt.complex64, + ] + + +def _acceptance_fn_default_binary( + arg1_dtype, arg2_dtype, ret_buf1_dt, ret_buf2_dt, res_dt, sycl_dev +): + return True + + +def _acceptance_fn_default_unary(arg_dtype, ret_buf_dt, res_dt, sycl_dev): + return True + + +def _acceptance_fn_divide( + arg1_dtype, arg2_dtype, ret_buf1_dt, ret_buf2_dt, res_dt, sycl_dev +): + # both are being promoted, if the kind of result is + # different than the kind of original input dtypes, + # we use default dtype for the resulting kind. + # This covers, e.g. 
(array_dtype_i1 / array_dtype_u1) + # result of which in divide is double (in NumPy), but + # regular type promotion rules peg at float16 + if (ret_buf1_dt.kind != arg1_dtype.kind) and ( + ret_buf2_dt.kind != arg2_dtype.kind + ): + default_dt = _get_device_default_dtype(res_dt.kind, sycl_dev) + if res_dt == default_dt: + return True + else: + return False + else: + return True + + +def _acceptance_fn_negative(arg_dtype, buf_dt, res_dt, sycl_dev): + # negative is not defined for boolean data type + if arg_dtype.char == "?": + raise ValueError( + "The `negative` function, the `-` operator, is not supported " + "for inputs of data type bool, use the `~` operator or the " + "`logical_not` function instead" + ) + else: + return True + + +def _acceptance_fn_reciprocal(arg_dtype, buf_dt, res_dt, sycl_dev): + # if the kind of result is different from the kind of input, we use the + # default floating-point dtype for the resulting kind. This guarantees + # alignment of reciprocal and divide output types. + if buf_dt.kind != arg_dtype.kind: + default_dt = _get_device_default_dtype(res_dt.kind, sycl_dev) + if res_dt == default_dt: + return True + else: + return False + else: + return True + + +def _acceptance_fn_subtract( + arg1_dtype, arg2_dtype, buf1_dt, buf2_dt, res_dt, sycl_dev +): + # subtract is not defined for boolean data type + if arg1_dtype.char == "?" and arg2_dtype.char == "?": + raise ValueError( + "The `subtract` function, the `-` operator, is not supported " + "for inputs of data type bool, use the `^` operator, the " + "`bitwise_xor`, or the `logical_xor` function instead" + ) + else: + return True + + +def _can_cast( + from_: dpt.dtype, to_: dpt.dtype, _fp16: bool, _fp64: bool, casting="safe" +) -> bool: + """ + Can `from_` be cast to `to_` safely on a device with + fp16 and fp64 aspects as given? 
+ """ + if not _dtype_supported_by_device_impl(to_, _fp16, _fp64): + return False + can_cast_v = np.can_cast(from_, to_, casting=casting) # ask NumPy + if _fp16 and _fp64: + return can_cast_v + if not can_cast_v: + if ( + from_.kind in "biu" + and to_.kind in "fc" + and _is_maximal_inexact_type(to_, _fp16, _fp64) + ): + return True + + return can_cast_v + + +def _dtype_supported_by_device_impl( + dt: dpt.dtype, has_fp16: bool, has_fp64: bool +) -> bool: + if has_fp64: + if not has_fp16: + if dt is dpt.float16: + return False + else: + if dt is dpt.float64: + return False + elif dt is dpt.complex128: + return False + if not has_fp16 and dt is dpt.float16: + return False + return True + + +def _find_buf_dtype(arg_dtype, query_fn, sycl_dev, acceptance_fn): + res_dt = query_fn(arg_dtype) + if res_dt: + return None, res_dt + + _fp16 = sycl_dev.has_aspect_fp16 + _fp64 = sycl_dev.has_aspect_fp64 + all_dts = _all_data_types(_fp16, _fp64) + for buf_dt in all_dts: + if _can_cast(arg_dtype, buf_dt, _fp16, _fp64): + res_dt = query_fn(buf_dt) + if res_dt: + acceptable = acceptance_fn(arg_dtype, buf_dt, res_dt, sycl_dev) + if acceptable: + return buf_dt, res_dt + else: + continue + + return None, None + + +def _find_buf_dtype2(arg1_dtype, arg2_dtype, query_fn, sycl_dev, acceptance_fn): + res_dt = query_fn(arg1_dtype, arg2_dtype) + if res_dt: + return None, None, res_dt + + _fp16 = sycl_dev.has_aspect_fp16 + _fp64 = sycl_dev.has_aspect_fp64 + all_dts = _all_data_types(_fp16, _fp64) + for buf1_dt in all_dts: + for buf2_dt in all_dts: + if _can_cast(arg1_dtype, buf1_dt, _fp16, _fp64) and _can_cast( + arg2_dtype, buf2_dt, _fp16, _fp64 + ): + res_dt = query_fn(buf1_dt, buf2_dt) + if res_dt: + ret_buf1_dt = None if buf1_dt == arg1_dtype else buf1_dt + ret_buf2_dt = None if buf2_dt == arg2_dtype else buf2_dt + if ret_buf1_dt is None or ret_buf2_dt is None: + return ret_buf1_dt, ret_buf2_dt, res_dt + else: + acceptable = acceptance_fn( + arg1_dtype, + arg2_dtype, + ret_buf1_dt, + 
ret_buf2_dt, + res_dt, + sycl_dev, + ) + if acceptable: + return ret_buf1_dt, ret_buf2_dt, res_dt + else: + continue + + return None, None, None + + +def _find_buf_dtype_in_place_op(arg1_dtype, arg2_dtype, query_fn, sycl_dev): + res_dt = query_fn(arg1_dtype, arg2_dtype) + if res_dt: + return None, res_dt + + _fp16 = sycl_dev.has_aspect_fp16 + _fp64 = sycl_dev.has_aspect_fp64 + if _can_cast(arg2_dtype, arg1_dtype, _fp16, _fp64, casting="same_kind"): + res_dt = query_fn(arg1_dtype, arg1_dtype) + if res_dt: + return arg1_dtype, res_dt + + return None, None + + +def _get_device_default_dtype(dt_kind, sycl_dev): + if dt_kind == "b": + return dpt.dtype(ti.default_device_bool_type(sycl_dev)) + elif dt_kind == "i": + return dpt.dtype(ti.default_device_int_type(sycl_dev)) + elif dt_kind == "u": + return dpt.dtype(ti.default_device_uint_type(sycl_dev)) + elif dt_kind == "f": + return dpt.dtype(ti.default_device_fp_type(sycl_dev)) + elif dt_kind == "c": + return dpt.dtype(ti.default_device_complex_type(sycl_dev)) + raise RuntimeError + + +def _is_maximal_inexact_type(dt: dpt.dtype, _fp16: bool, _fp64: bool): + """ + Return True if data type `dt` is the + maximal size inexact data type + """ + if _fp64: + return dt in [dpt.float64, dpt.complex128] + return dt in [dpt.float32, dpt.complex64] + + +def _to_device_supported_dtype(dt, dev): + has_fp16 = dev.has_aspect_fp16 + has_fp64 = dev.has_aspect_fp64 + + return _to_device_supported_dtype_impl(dt, has_fp16, has_fp64) + + +def _to_device_supported_dtype_impl(dt, has_fp16, has_fp64): + if has_fp64: + if not has_fp16: + if dt is dpt.float16: + return dpt.float32 + else: + if dt is dpt.float64: + return dpt.float32 + elif dt is dpt.complex128: + return dpt.complex64 + if not has_fp16 and dt is dpt.float16: + return dpt.float32 + return dt + + +class WeakBooleanType: + """Python type representing type of Python boolean objects""" + + def __init__(self, o): + self.o_ = o + + def get(self): + return self.o_ + + +class 
WeakIntegralType: + """Python type representing type of Python integral objects""" + + def __init__(self, o): + self.o_ = o + + def get(self): + return self.o_ + + +class WeakFloatingType: + """Python type representing type of Python floating point objects""" + + def __init__(self, o): + self.o_ = o + + def get(self): + return self.o_ + + +class WeakComplexType: + """Python type representing type of Python complex floating point objects""" + + def __init__(self, o): + self.o_ = o + + def get(self): + return self.o_ + + +def _weak_type_num_kind(o): + _map = {"?": 0, "i": 1, "f": 2, "c": 3} + if isinstance(o, WeakBooleanType): + return _map["?"] + if isinstance(o, WeakIntegralType): + return _map["i"] + if isinstance(o, WeakFloatingType): + return _map["f"] + if isinstance(o, WeakComplexType): + return _map["c"] + raise TypeError( + f"Unexpected type {o} while expecting " + "`WeakBooleanType`, `WeakIntegralType`," + "`WeakFloatingType`, or `WeakComplexType`." + ) + + +def _strong_dtype_num_kind(o): + _map = {"b": 0, "i": 1, "u": 1, "f": 2, "c": 3} + if not isinstance(o, dpt.dtype): + raise TypeError + k = o.kind + if k in _map: + return _map[k] + raise ValueError(f"Unrecognized kind {k} for dtype {o}") + + +def _is_weak_dtype(dtype): + return isinstance( + dtype, + (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType), + ) + + +def _resolve_weak_types(o1_dtype, o2_dtype, dev): + """Resolves weak data type per NEP-0050""" + if _is_weak_dtype(o1_dtype): + if _is_weak_dtype(o2_dtype): + raise ValueError + o1_kind_num = _weak_type_num_kind(o1_dtype) + o2_kind_num = _strong_dtype_num_kind(o2_dtype) + if o1_kind_num > o2_kind_num: + if isinstance(o1_dtype, WeakIntegralType): + return dpt.dtype(ti.default_device_int_type(dev)), o2_dtype + if isinstance(o1_dtype, WeakComplexType): + if o2_dtype is dpt.float16 or o2_dtype is dpt.float32: + return dpt.complex64, o2_dtype + return ( + _to_device_supported_dtype(dpt.complex128, dev), + o2_dtype, + ) + return 
_to_device_supported_dtype(dpt.float64, dev), o2_dtype + else: + return o2_dtype, o2_dtype + elif _is_weak_dtype(o2_dtype): + o1_kind_num = _strong_dtype_num_kind(o1_dtype) + o2_kind_num = _weak_type_num_kind(o2_dtype) + if o2_kind_num > o1_kind_num: + if isinstance(o2_dtype, WeakIntegralType): + return o1_dtype, dpt.dtype(ti.default_device_int_type(dev)) + if isinstance(o2_dtype, WeakComplexType): + if o1_dtype is dpt.float16 or o1_dtype is dpt.float32: + return o1_dtype, dpt.complex64 + return o1_dtype, _to_device_supported_dtype(dpt.complex128, dev) + return ( + o1_dtype, + _to_device_supported_dtype(dpt.float64, dev), + ) + else: + return o1_dtype, o1_dtype + else: + return o1_dtype, o2_dtype + + +def _resolve_weak_types_all_py_ints(o1_dtype, o2_dtype, dev): + """ + Resolves weak data type per NEP-0050 for comparisons and + divide, where result type is known and special behavior + is needed to handle mixed integer kinds and Python integers + without overflow + """ + if _is_weak_dtype(o1_dtype): + if _is_weak_dtype(o2_dtype): + raise ValueError + o1_kind_num = _weak_type_num_kind(o1_dtype) + o2_kind_num = _strong_dtype_num_kind(o2_dtype) + if o1_kind_num > o2_kind_num: + if isinstance(o1_dtype, WeakIntegralType): + return dpt.dtype(ti.default_device_int_type(dev)), o2_dtype + if isinstance(o1_dtype, WeakComplexType): + if o2_dtype is dpt.float16 or o2_dtype is dpt.float32: + return dpt.complex64, o2_dtype + return ( + _to_device_supported_dtype(dpt.complex128, dev), + o2_dtype, + ) + return _to_device_supported_dtype(dpt.float64, dev), o2_dtype + else: + if o1_kind_num == o2_kind_num and isinstance( + o1_dtype, WeakIntegralType + ): + o1_val = o1_dtype.get() + o2_iinfo = dpt.iinfo(o2_dtype) + if (o1_val < o2_iinfo.min) or (o1_val > o2_iinfo.max): + return dpt.dtype(np.min_scalar_type(o1_val)), o2_dtype + return o2_dtype, o2_dtype + elif _is_weak_dtype(o2_dtype): + o1_kind_num = _strong_dtype_num_kind(o1_dtype) + o2_kind_num = _weak_type_num_kind(o2_dtype) + if 
o2_kind_num > o1_kind_num: + if isinstance(o2_dtype, WeakIntegralType): + return o1_dtype, dpt.dtype(ti.default_device_int_type(dev)) + if isinstance(o2_dtype, WeakComplexType): + if o1_dtype is dpt.float16 or o1_dtype is dpt.float32: + return o1_dtype, dpt.complex64 + return o1_dtype, _to_device_supported_dtype(dpt.complex128, dev) + return ( + o1_dtype, + _to_device_supported_dtype(dpt.float64, dev), + ) + else: + if o1_kind_num == o2_kind_num and isinstance( + o2_dtype, WeakIntegralType + ): + o2_val = o2_dtype.get() + o1_iinfo = dpt.iinfo(o1_dtype) + if (o2_val < o1_iinfo.min) or (o2_val > o1_iinfo.max): + return o1_dtype, dpt.dtype(np.min_scalar_type(o2_val)) + return o1_dtype, o1_dtype + else: + return o1_dtype, o2_dtype + + +def _resolve_one_strong_two_weak_types(st_dtype, dtype1, dtype2, dev): + """ + Resolves weak data types per NEP-0050, + where the second and third arguments are + permitted to be weak types + """ + if _is_weak_dtype(st_dtype): + raise ValueError + if _is_weak_dtype(dtype1): + if _is_weak_dtype(dtype2): + kind_num1 = _weak_type_num_kind(dtype1) + kind_num2 = _weak_type_num_kind(dtype2) + st_kind_num = _strong_dtype_num_kind(st_dtype) + + if kind_num1 > st_kind_num: + if isinstance(dtype1, WeakIntegralType): + ret_dtype1 = dpt.dtype(ti.default_device_int_type(dev)) + elif isinstance(dtype1, WeakComplexType): + if st_dtype is dpt.float16 or st_dtype is dpt.float32: + ret_dtype1 = dpt.complex64 + ret_dtype1 = _to_device_supported_dtype(dpt.complex128, dev) + else: + ret_dtype1 = _to_device_supported_dtype(dpt.float64, dev) + else: + ret_dtype1 = st_dtype + + if kind_num2 > st_kind_num: + if isinstance(dtype2, WeakIntegralType): + ret_dtype2 = dpt.dtype(ti.default_device_int_type(dev)) + elif isinstance(dtype2, WeakComplexType): + if st_dtype is dpt.float16 or st_dtype is dpt.float32: + ret_dtype2 = dpt.complex64 + ret_dtype2 = _to_device_supported_dtype(dpt.complex128, dev) + else: + ret_dtype2 = _to_device_supported_dtype(dpt.float64, dev) 
+ else: + ret_dtype2 = st_dtype + + return ret_dtype1, ret_dtype2 + + max_dt_num_kind, max_dtype = max( + [ + (_strong_dtype_num_kind(st_dtype), st_dtype), + (_strong_dtype_num_kind(dtype2), dtype2), + ] + ) + dt1_kind_num = _weak_type_num_kind(dtype1) + if dt1_kind_num > max_dt_num_kind: + if isinstance(dtype1, WeakIntegralType): + return dpt.dtype(ti.default_device_int_type(dev)), dtype2 + if isinstance(dtype1, WeakComplexType): + if max_dtype is dpt.float16 or max_dtype is dpt.float32: + return dpt.complex64, dtype2 + return ( + _to_device_supported_dtype(dpt.complex128, dev), + dtype2, + ) + return _to_device_supported_dtype(dpt.float64, dev), dtype2 + else: + return max_dtype, dtype2 + elif _is_weak_dtype(dtype2): + max_dt_num_kind, max_dtype = max( + [ + (_strong_dtype_num_kind(st_dtype), st_dtype), + (_strong_dtype_num_kind(dtype1), dtype1), + ] + ) + dt2_kind_num = _weak_type_num_kind(dtype2) + if dt2_kind_num > max_dt_num_kind: + if isinstance(dtype2, WeakIntegralType): + return dtype1, dpt.dtype(ti.default_device_int_type(dev)) + if isinstance(dtype2, WeakComplexType): + if max_dtype is dpt.float16 or max_dtype is dpt.float32: + return dtype1, dpt.complex64 + return ( + dtype1, + _to_device_supported_dtype(dpt.complex128, dev), + ) + return dtype1, _to_device_supported_dtype(dpt.float64, dev) + else: + return dtype1, max_dtype + else: + # both are strong dtypes + # return unmodified + return dtype1, dtype2 + + +def _resolve_one_strong_one_weak_types(st_dtype, dtype, dev): + """Resolves one weak data type with one strong data type per NEP-0050""" + if _is_weak_dtype(st_dtype): + raise ValueError + if _is_weak_dtype(dtype): + st_kind_num = _strong_dtype_num_kind(st_dtype) + kind_num = _weak_type_num_kind(dtype) + if kind_num > st_kind_num: + if isinstance(dtype, WeakIntegralType): + return dpt.dtype(ti.default_device_int_type(dev)) + if isinstance(dtype, WeakComplexType): + if st_dtype is dpt.float16 or st_dtype is dpt.float32: + return dpt.complex64 + 
return _to_device_supported_dtype(dpt.complex128, dev) + return _to_device_supported_dtype(dpt.float64, dev) + else: + return st_dtype + else: + return dtype + + +class finfo_object: + """ + `numpy.finfo` subclass which returns Python floating-point scalars for + `eps`, `max`, `min`, and `smallest_normal` attributes. + """ + + def __init__(self, dtype): + _supported_dtype([dpt.dtype(dtype)]) + self._finfo = np.finfo(dtype) + + @property + def bits(self): + """Number of bits occupied by the real-valued floating-point data type.""" + return int(self._finfo.bits) + + @property + def smallest_normal(self): + """ + Smallest positive real-valued floating-point number with full + precision. + """ + return float(self._finfo.smallest_normal) + + @property + def tiny(self): + """An alias for `smallest_normal`""" + return float(self._finfo.tiny) + + @property + def eps(self): + """ + Difference between 1.0 and the next smallest representable real-valued + floating-point number larger than 1.0 according to the IEEE-754 + standard. + """ + return float(self._finfo.eps) + + @property + def epsneg(self): + """ + Difference between 1.0 and the next smallest representable real-valued + floating-point number smaller than 1.0 according to the IEEE-754 + standard. + """ + return float(self._finfo.epsneg) + + @property + def min(self): + """Smallest representable real-valued number.""" + return float(self._finfo.min) + + @property + def max(self): + """Largest representable real-valued number.""" + return float(self._finfo.max) + + @property + def resolution(self): + """The approximate decimal resolution of this type.""" + return float(self._finfo.resolution) + + @property + def precision(self): + """ + The approximate number of decimal digits to which this kind of + floating point type is precise. + """ + return float(self._finfo.precision) + + @property + def dtype(self): + """ + The dtype for which finfo returns information. 
For complex input, the + returned dtype is the associated floating point dtype for its real and + complex components. + """ + return self._finfo.dtype + + def __str__(self): + return self._finfo.__str__() + + def __repr__(self): + return self._finfo.__repr__() + + +def can_cast(from_, to, /, *, casting="safe") -> bool: + """can_cast(from, to, casting="safe") + + Determines if one data type can be cast to another data type according \ + to Type Promotion Rules. + + Args: + from_ (Union[usm_ndarray, dtype]): + source data type. If `from_` is an array, a device-specific type + promotion rules apply. + to (dtype): + target data type + casting (Optional[str]): + controls what kind of data casting may occur. + + * "no" means data types should not be cast at all. + * "safe" means only casts that preserve values are allowed. + * "same_kind" means only safe casts and casts within a kind, + like `float64` to `float32`, are allowed. + * "unsafe" means any data conversion can be done. + + Default: `"safe"`. + + Returns: + bool: + Gives `True` if cast can occur according to the casting rule. + + Device-specific type promotion rules take into account which data type are + and are not supported by a specific device. + """ + if isinstance(to, dpt.usm_ndarray): + raise TypeError(f"Expected `dpt.dtype` type, got {type(to)}.") + + dtype_to = dpt.dtype(to) + _supported_dtype([dtype_to]) + + if isinstance(from_, dpt.usm_ndarray): + dtype_from = from_.dtype + return _can_cast( + dtype_from, + dtype_to, + from_.sycl_device.has_aspect_fp16, + from_.sycl_device.has_aspect_fp64, + casting=casting, + ) + else: + dtype_from = dpt.dtype(from_) + _supported_dtype([dtype_from]) + # query casting as if all dtypes are supported + return _can_cast(dtype_from, dtype_to, True, True, casting=casting) + + +def result_type(*arrays_and_dtypes): + """ + result_type(*arrays_and_dtypes) + + Returns the dtype that results from applying the Type Promotion Rules to \ + the arguments. 
+ + Args: + arrays_and_dtypes (Union[usm_ndarray, dtype]): + An arbitrary length sequence of usm_ndarray objects or dtypes. + + Returns: + dtype: + The dtype resulting from an operation involving the + input arrays and dtypes. + """ + dtypes = [] + devices = [] + weak_dtypes = [] + for arg_i in arrays_and_dtypes: + if isinstance(arg_i, dpt.usm_ndarray): + devices.append(arg_i.sycl_device) + dtypes.append(arg_i.dtype) + elif isinstance(arg_i, int): + weak_dtypes.append(WeakIntegralType(arg_i)) + elif isinstance(arg_i, float): + weak_dtypes.append(WeakFloatingType(arg_i)) + elif isinstance(arg_i, complex): + weak_dtypes.append(WeakComplexType(arg_i)) + elif isinstance(arg_i, bool): + weak_dtypes.append(WeakBooleanType(arg_i)) + else: + dt = dpt.dtype(arg_i) + _supported_dtype([dt]) + dtypes.append(dt) + + has_fp16 = True + has_fp64 = True + target_dev = None + if devices: + inspected = False + for d in devices: + if inspected: + unsame_fp16_support = d.has_aspect_fp16 != has_fp16 + unsame_fp64_support = d.has_aspect_fp64 != has_fp64 + if unsame_fp16_support or unsame_fp64_support: + raise ValueError( + "Input arrays reside on devices " + "with different device supports; " + "unable to determine which " + "device-specific type promotion rules " + "to use." 
+ ) + else: + has_fp16 = d.has_aspect_fp16 + has_fp64 = d.has_aspect_fp64 + target_dev = d + inspected = True + + if not dtypes and weak_dtypes: + dtypes.append(weak_dtypes[0].get()) + + if not (has_fp16 and has_fp64): + for dt in dtypes: + if not _dtype_supported_by_device_impl(dt, has_fp16, has_fp64): + raise ValueError( + f"Argument {dt} is not supported by the device" + ) + res_dt = np.result_type(*dtypes) + res_dt = _to_device_supported_dtype_impl(res_dt, has_fp16, has_fp64) + for wdt in weak_dtypes: + pair = _resolve_weak_types(wdt, res_dt, target_dev) + res_dt = np.result_type(*pair) + res_dt = _to_device_supported_dtype_impl(res_dt, has_fp16, has_fp64) + else: + res_dt = np.result_type(*dtypes) + if weak_dtypes: + weak_dt_obj = [wdt.get() for wdt in weak_dtypes] + res_dt = np.result_type(res_dt, *weak_dt_obj) + + return res_dt + + +def iinfo(dtype, /): + """iinfo(dtype) + + Returns machine limits for integer data types. + + Args: + dtype (dtype, usm_ndarray): + integer dtype or + an array with integer dtype. + + Returns: + iinfo_object: + An object with the following attributes: + + * bits: int + number of bits occupied by the data type + * max: int + largest representable number. + * min: int + smallest representable number. + * dtype: dtype + integer data type. + """ + if isinstance(dtype, dpt.usm_ndarray): + dtype = dtype.dtype + _supported_dtype([dpt.dtype(dtype)]) + return np.iinfo(dtype) + + +def finfo(dtype, /): + """finfo(type) + + Returns machine limits for floating-point data types. + + Args: + dtype (dtype, usm_ndarray): floating-point dtype or + an array with floating point data type. + If complex, the information is about its component + data type. + + Returns: + finfo_object: + an object have the following attributes: + + * bits: int + number of bits occupied by dtype. + * eps: float + difference between 1.0 and the next smallest representable + real-valued floating-point number larger than 1.0 according + to the IEEE-754 standard. 
+ * max: float + largest representable real-valued number. + * min: float + smallest representable real-valued number. + * smallest_normal: float + smallest positive real-valued floating-point number with + full precision. + * dtype: dtype + real-valued floating-point data type. + + """ + if isinstance(dtype, dpt.usm_ndarray): + dtype = dtype.dtype + _supported_dtype([dpt.dtype(dtype)]) + return finfo_object(dtype) + + +def _supported_dtype(dtypes): + for dtype in dtypes: + if dtype.char not in "?bBhHiIlLqQefdFD": + raise ValueError(f"Dpctl doesn't support dtype {dtype}.") + return True + + +def isdtype(dtype, kind): + """isdtype(dtype, kind) + + Returns a boolean indicating whether a provided `dtype` is + of a specified data type `kind`. + + See [array API](array_api) for more information. + + [array_api]: https://data-apis.org/array-api/latest/ + """ + + if not isinstance(dtype, np.dtype): + raise TypeError(f"Expected instance of `dpt.dtype`, got {dtype}") + + if isinstance(kind, np.dtype): + return dtype == kind + + elif isinstance(kind, str): + if kind == "bool": + return dtype == np.dtype("bool") + elif kind == "signed integer": + return dtype.kind == "i" + elif kind == "unsigned integer": + return dtype.kind == "u" + elif kind == "integral": + return dtype.kind in "iu" + elif kind == "real floating": + return dtype.kind == "f" + elif kind == "complex floating": + return dtype.kind == "c" + elif kind == "numeric": + return dtype.kind in "iufc" + else: + raise ValueError(f"Unrecognized data type kind: {kind}") + + elif isinstance(kind, tuple): + return any(isdtype(dtype, k) for k in kind) + + else: + raise TypeError(f"Unsupported data type kind: {kind}") + + +def _default_accumulation_dtype(inp_dt, q): + """Gives default output data type for given input data + type `inp_dt` when accumulation is performed on queue `q` + """ + inp_kind = inp_dt.kind + if inp_kind in "bi": + res_dt = dpt.dtype(ti.default_device_int_type(q)) + if inp_dt.itemsize > res_dt.itemsize: 
+ res_dt = inp_dt + elif inp_kind in "u": + res_dt = dpt.dtype(ti.default_device_uint_type(q)) + res_ii = dpt.iinfo(res_dt) + inp_ii = dpt.iinfo(inp_dt) + if inp_ii.min >= res_ii.min and inp_ii.max <= res_ii.max: + pass + else: + res_dt = inp_dt + elif inp_kind in "fc": + res_dt = inp_dt + + return res_dt + + +def _default_accumulation_dtype_fp_types(inp_dt, q): + """Gives default output data type for given input data + type `inp_dt` when accumulation is performed on queue `q` + and the accumulation supports only floating-point data types + """ + inp_kind = inp_dt.kind + if inp_kind in "biu": + res_dt = dpt.dtype(ti.default_device_fp_type(q)) + can_cast_v = dpt.can_cast(inp_dt, res_dt) + if not can_cast_v: + _fp64 = q.sycl_device.has_aspect_fp64 + res_dt = dpt.float64 if _fp64 else dpt.float32 + elif inp_kind in "f": + res_dt = inp_dt + elif inp_kind in "c": + raise ValueError("function not defined for complex types") + + return res_dt + + +__all__ = [ + "_find_buf_dtype", + "_find_buf_dtype2", + "_to_device_supported_dtype", + "_acceptance_fn_default_unary", + "_acceptance_fn_reciprocal", + "_acceptance_fn_default_binary", + "_acceptance_fn_divide", + "_acceptance_fn_negative", + "_acceptance_fn_subtract", + "_resolve_one_strong_one_weak_types", + "_resolve_one_strong_two_weak_types", + "_resolve_weak_types", + "_resolve_weak_types_all_py_ints", + "_weak_type_num_kind", + "_strong_dtype_num_kind", + "can_cast", + "finfo", + "iinfo", + "isdtype", + "result_type", + "WeakBooleanType", + "WeakIntegralType", + "WeakFloatingType", + "WeakComplexType", + "_default_accumulation_dtype", + "_default_accumulation_dtype_fp_types", + "_find_buf_dtype_in_place_op", +] From c88297c6f57f9b3accb523d409f2272f00489cbb Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 20 Feb 2026 02:15:56 -0800 Subject: [PATCH 063/145] Use import _type_utils from dpctl_ext --- dpnp/dpnp_array.py | 2 +- dpnp/dpnp_iface_mathematical.py | 8 +++++--- dpnp/dpnp_iface_trigonometric.py | 4 
+++- dpnp/dpnp_utils/dpnp_utils_common.py | 5 +++-- dpnp/tests/test_indexing.py | 7 +++++-- 5 files changed, 17 insertions(+), 9 deletions(-) diff --git a/dpnp/dpnp_array.py b/dpnp/dpnp_array.py index 0b6d882c53db..564627bacf2f 100644 --- a/dpnp/dpnp_array.py +++ b/dpnp/dpnp_array.py @@ -38,12 +38,12 @@ import warnings import dpctl.tensor as dpt -import dpctl.tensor._type_utils as dtu from dpctl.tensor._numpy_helper import AxisError # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor._type_utils as dtu import dpnp from . import memory as dpm diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py index e339c24d384c..f2779b2fdcd1 100644 --- a/dpnp/dpnp_iface_mathematical.py +++ b/dpnp/dpnp_iface_mathematical.py @@ -40,6 +40,7 @@ """ # pylint: disable=protected-access +# pylint: disable=duplicate-code # pylint: disable=no-name-in-module @@ -48,15 +49,16 @@ import dpctl.tensor as dpt import dpctl.tensor._tensor_elementwise_impl as ti -import dpctl.tensor._type_utils as dtu import dpctl.utils as dpu import numpy from dpctl.tensor._numpy_helper import ( normalize_axis_index, normalize_axis_tuple, ) -from dpctl.tensor._type_utils import _acceptance_fn_divide +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._type_utils as dtu import dpnp import dpnp.backend.extensions.ufunc._ufunc_impl as ufi @@ -1561,7 +1563,7 @@ def diff(a, n=1, axis=-1, prepend=None, append=None): mkl_fn_to_call="_mkl_div_to_call", mkl_impl_fn="_div", binary_inplace_fn=ti._divide_inplace, - acceptance_fn=_acceptance_fn_divide, + acceptance_fn=dtu._acceptance_fn_divide, ) diff --git a/dpnp/dpnp_iface_trigonometric.py b/dpnp/dpnp_iface_trigonometric.py index a46f06c10e08..9894bd304701 100644 --- a/dpnp/dpnp_iface_trigonometric.py +++ b/dpnp/dpnp_iface_trigonometric.py @@ -45,8 +45,10 @@ import dpctl.tensor as dpt import 
dpctl.tensor._tensor_elementwise_impl as ti -import dpctl.tensor._type_utils as dtu +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._type_utils as dtu import dpnp import dpnp.backend.extensions.ufunc._ufunc_impl as ufi diff --git a/dpnp/dpnp_utils/dpnp_utils_common.py b/dpnp/dpnp_utils/dpnp_utils_common.py index e4bde2e1ec86..aa294fefe275 100644 --- a/dpnp/dpnp_utils/dpnp_utils_common.py +++ b/dpnp/dpnp_utils/dpnp_utils_common.py @@ -29,8 +29,9 @@ from collections.abc import Iterable -import dpctl.tensor._type_utils as dtu - +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._type_utils as dtu import dpnp from dpnp.dpnp_utils import map_dtype_to_device diff --git a/dpnp/tests/test_indexing.py b/dpnp/tests/test_indexing.py index 9a55efe138b7..79c41a2f45f7 100644 --- a/dpnp/tests/test_indexing.py +++ b/dpnp/tests/test_indexing.py @@ -4,8 +4,6 @@ import dpctl.tensor as dpt import numpy import pytest -from dpctl.tensor._numpy_helper import AxisError -from dpctl.tensor._type_utils import _to_device_supported_dtype from dpctl.utils import ExecutionPlacementError from numpy.testing import ( assert_, @@ -16,6 +14,11 @@ ) import dpnp + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +from dpctl_ext.tensor._numpy_helper import AxisError +from dpctl_ext.tensor._type_utils import _to_device_supported_dtype from dpnp.dpnp_array import dpnp_array from .helper import ( From 9207fc57be52aa8aa724290efe0c3da194906894 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 20 Feb 2026 02:19:22 -0800 Subject: [PATCH 064/145] Move _broadcast_shape_impl to dpctl_ext/tensor/_manipulation_functions.py --- dpctl_ext/tensor/_copy_utils.py | 2 +- dpctl_ext/tensor/_manipulation_functions.py | 40 +++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/_copy_utils.py 
b/dpctl_ext/tensor/_copy_utils.py index 5d1ac209c86b..30a25dd1e2f6 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -37,12 +37,12 @@ import numpy as np from dpctl.tensor._data_types import _get_dtype from dpctl.tensor._device import normalize_queue_device -from dpctl.tensor._type_utils import _dtype_supported_by_device_impl # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti +from dpctl_ext.tensor._type_utils import _dtype_supported_by_device_impl from ._numpy_helper import normalize_axis_index diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 85b4f029156b..572b2a99f872 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -26,6 +26,7 @@ # THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** +import itertools import operator import dpctl @@ -45,6 +46,45 @@ ) +def _broadcast_shape_impl(shapes): + if len(set(shapes)) == 1: + return shapes[0] + mutable_shapes = False + nds = [len(s) for s in shapes] + biggest = max(nds) + sh_len = len(shapes) + for i in range(sh_len): + diff = biggest - nds[i] + if diff > 0: + ty = type(shapes[i]) + shapes[i] = ty( + itertools.chain(itertools.repeat(1, diff), shapes[i]) + ) + common_shape = [] + for axis in range(biggest): + lengths = [s[axis] for s in shapes] + unique = set(lengths + [1]) + if len(unique) > 2: + raise ValueError( + "Shape mismatch: two or more arrays have " + f"incompatible dimensions on axis ({axis},)" + ) + elif len(unique) == 2: + unique.remove(1) + new_length = unique.pop() + common_shape.append(new_length) + for i in range(sh_len): + if shapes[i][axis] == 1: + if not mutable_shapes: + shapes = [list(s) for s in shapes] + mutable_shapes = True + shapes[i][axis] = new_length + else: + 
common_shape.append(1) + + return tuple(common_shape) + + def repeat(x, repeats, /, *, axis=None): """repeat(x, repeats, axis=None) From ebbce3483b2a337398ae1ccdbb94722635ab7cc5 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 20 Feb 2026 02:35:21 -0800 Subject: [PATCH 065/145] Reuse can_cast() from dpctl_ext/tensor --- dpctl_ext/tensor/_copy_utils.py | 2 +- dpctl_ext/tensor/_manipulation_functions.py | 3 ++- dpctl_ext/tensor/_type_utils.py | 3 ++- dpnp/dpnp_iface_manipulation.py | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py index 30a25dd1e2f6..e14d35a0e485 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -1013,7 +1013,7 @@ def astype( else: target_dtype = _get_dtype(newdtype, usm_ary.sycl_queue) - if not dpt.can_cast(ary_dtype, target_dtype, casting=casting): + if not dpt_ext.can_cast(ary_dtype, target_dtype, casting=casting): raise TypeError( f"Can not cast from {ary_dtype} to {newdtype} " f"according to rule {casting}." 
diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 572b2a99f872..f1b8b46dbcbc 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -36,6 +36,7 @@ # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti from ._numpy_helper import normalize_axis_index, normalize_axis_tuple @@ -162,7 +163,7 @@ def repeat(x, repeats, /, *, axis=None): ) ) dpctl.utils.validate_usm_type(usm_type, allow_none=False) - if not dpt.can_cast(repeats.dtype, dpt.int64, casting="same_kind"): + if not dpt_ext.can_cast(repeats.dtype, dpt.int64, casting="same_kind"): raise TypeError( f"'repeats' data type {repeats.dtype} cannot be cast to " "'int64' according to the casting rule ''safe.''" diff --git a/dpctl_ext/tensor/_type_utils.py b/dpctl_ext/tensor/_type_utils.py index bcfec499f355..9715aa306e1f 100644 --- a/dpctl_ext/tensor/_type_utils.py +++ b/dpctl_ext/tensor/_type_utils.py @@ -33,6 +33,7 @@ # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti @@ -955,7 +956,7 @@ def _default_accumulation_dtype_fp_types(inp_dt, q): inp_kind = inp_dt.kind if inp_kind in "biu": res_dt = dpt.dtype(ti.default_device_fp_type(q)) - can_cast_v = dpt.can_cast(inp_dt, res_dt) + can_cast_v = dpt_ext.can_cast(inp_dt, res_dt) if not can_cast_v: _fp64 = q.sycl_device.has_aspect_fp64 res_dt = dpt.float64 if _fp64 else dpt.float32 diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 8aa5b6832b3d..66cf9f6f844c 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -1270,7 +1270,7 @@ def can_cast(from_, to, casting="safe"): if dpnp.is_supported_array_type(from_) else dpnp.dtype(from_) ) - return dpt.can_cast(dtype_from, to, casting=casting) 
+ return dpt_ext.can_cast(dtype_from, to, casting=casting) def column_stack(tup): From 6a133337d7cf47c5ac4406e689a49c0b452f0d64 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 20 Feb 2026 02:44:41 -0800 Subject: [PATCH 066/145] Reuse finfo(), iinfo(), isdtype() from dpctl_ext/tensor --- dpctl_ext/tensor/_type_utils.py | 8 ++++---- dpnp/dpnp_iface_types.py | 9 ++++++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/dpctl_ext/tensor/_type_utils.py b/dpctl_ext/tensor/_type_utils.py index 9715aa306e1f..1e386e15dfa3 100644 --- a/dpctl_ext/tensor/_type_utils.py +++ b/dpctl_ext/tensor/_type_utils.py @@ -450,7 +450,7 @@ def _resolve_weak_types_all_py_ints(o1_dtype, o2_dtype, dev): o1_dtype, WeakIntegralType ): o1_val = o1_dtype.get() - o2_iinfo = dpt.iinfo(o2_dtype) + o2_iinfo = dpt_ext.iinfo(o2_dtype) if (o1_val < o2_iinfo.min) or (o1_val > o2_iinfo.max): return dpt.dtype(np.min_scalar_type(o1_val)), o2_dtype return o2_dtype, o2_dtype @@ -473,7 +473,7 @@ def _resolve_weak_types_all_py_ints(o1_dtype, o2_dtype, dev): o2_dtype, WeakIntegralType ): o2_val = o2_dtype.get() - o1_iinfo = dpt.iinfo(o1_dtype) + o1_iinfo = dpt_ext.iinfo(o1_dtype) if (o2_val < o1_iinfo.min) or (o2_val > o1_iinfo.max): return o1_dtype, dpt.dtype(np.min_scalar_type(o2_val)) return o1_dtype, o1_dtype @@ -936,8 +936,8 @@ def _default_accumulation_dtype(inp_dt, q): res_dt = inp_dt elif inp_kind in "u": res_dt = dpt.dtype(ti.default_device_uint_type(q)) - res_ii = dpt.iinfo(res_dt) - inp_ii = dpt.iinfo(inp_dt) + res_ii = dpt_ext.iinfo(res_dt) + inp_ii = dpt_ext.iinfo(inp_dt) if inp_ii.min >= res_ii.min and inp_ii.max <= res_ii.max: pass else: diff --git a/dpnp/dpnp_iface_types.py b/dpnp/dpnp_iface_types.py index 8fdb9e1d3d38..f133333d6b83 100644 --- a/dpnp/dpnp_iface_types.py +++ b/dpnp/dpnp_iface_types.py @@ -40,6 +40,9 @@ import dpctl.tensor as dpt import numpy +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor 
as dpt_ext import dpnp from .dpnp_array import dpnp_array @@ -211,7 +214,7 @@ def finfo(dtype): """ if isinstance(dtype, dpnp_array): dtype = dtype.dtype - return dpt.finfo(dtype) + return dpt_ext.finfo(dtype) # pylint: disable=redefined-outer-name @@ -244,7 +247,7 @@ def iinfo(dtype): if isinstance(dtype, dpnp_array): dtype = dtype.dtype - return dpt.iinfo(dtype) + return dpt_ext.iinfo(dtype) def isdtype(dtype, kind): @@ -298,7 +301,7 @@ def isdtype(dtype, kind): elif isinstance(kind, tuple): kind = tuple(dpt.dtype(k) if isinstance(k, type) else k for k in kind) - return dpt.isdtype(dtype, kind) + return dpt_ext.isdtype(dtype, kind) def issubdtype(arg1, arg2): From de2c429f7c384757c0c871301c441cdb1a089e92 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 20 Feb 2026 02:47:36 -0800 Subject: [PATCH 067/145] Reuse result_type() from dpctl_ext/tensor --- dpctl_ext/tensor/_copy_utils.py | 2 +- dpnp/dpnp_iface_indexing.py | 2 +- dpnp/dpnp_iface_manipulation.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py index e14d35a0e485..af72544a8b0d 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -291,7 +291,7 @@ def _prepare_indices_arrays(inds, q, usm_type): ) # promote to a common integral type if possible - ind_dt = dpt.result_type(*inds) + ind_dt = dpt_ext.result_type(*inds) if ind_dt.kind not in "ui": raise ValueError( "cannot safely promote indices to an integer data type" diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 95998dbdda4b..35739df9a12b 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -250,7 +250,7 @@ def choose(a, choices, out=None, mode="wrap"): res_usm_type, exec_q = get_usm_allocations(choices + [inds]) # apply type promotion to input choices - res_dt = dpt.result_type(*choices) + res_dt = dpt_ext.result_type(*choices) if len(choices) > 1: choices = tuple( map( 
diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 66cf9f6f844c..08fd55c58ace 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -3195,7 +3195,7 @@ def result_type(*arrays_and_dtypes): ) for X in arrays_and_dtypes ] - return dpt.result_type(*usm_arrays_and_dtypes) + return dpt_ext.result_type(*usm_arrays_and_dtypes) def roll(x, shift, axis=None): From 9a970021f27db0d340c8a1907dfc6470de1361a7 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 20 Feb 2026 02:50:50 -0800 Subject: [PATCH 068/145] Move _scalar_utils.py to dpctl_ext/tensor --- dpctl_ext/tensor/_scalar_utils.py | 122 ++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 dpctl_ext/tensor/_scalar_utils.py diff --git a/dpctl_ext/tensor/_scalar_utils.py b/dpctl_ext/tensor/_scalar_utils.py new file mode 100644 index 000000000000..86787baea8cc --- /dev/null +++ b/dpctl_ext/tensor/_scalar_utils.py @@ -0,0 +1,122 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import numbers + +import dpctl.memory as dpm +import dpctl.tensor as dpt +import numpy as np +from dpctl.tensor._usmarray import _is_object_with_buffer_protocol as _is_buffer + +from ._type_utils import ( + WeakBooleanType, + WeakComplexType, + WeakFloatingType, + WeakIntegralType, + _to_device_supported_dtype, +) + + +def _get_queue_usm_type(o): + """Return SYCL device where object `o` allocated memory, or None.""" + if isinstance(o, dpt.usm_ndarray): + return o.sycl_queue, o.usm_type + elif hasattr(o, "__sycl_usm_array_interface__"): + try: + m = dpm.as_usm_memory(o) + return m.sycl_queue, m.get_usm_type() + except Exception: + return None, None + return None, None + + +def _get_dtype(o, dev): + if isinstance(o, dpt.usm_ndarray): + return o.dtype + if hasattr(o, "__sycl_usm_array_interface__"): + return dpt.asarray(o).dtype + if _is_buffer(o): + host_dt = np.array(o).dtype + dev_dt = _to_device_supported_dtype(host_dt, dev) + return dev_dt + if hasattr(o, "dtype"): + dev_dt = _to_device_supported_dtype(o.dtype, dev) + return dev_dt + if isinstance(o, bool): + return WeakBooleanType(o) + if isinstance(o, 
int): + return WeakIntegralType(o) + if isinstance(o, float): + return WeakFloatingType(o) + if isinstance(o, complex): + return WeakComplexType(o) + return np.object_ + + +def _validate_dtype(dt) -> bool: + return isinstance( + dt, + (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType), + ) or ( + isinstance(dt, dpt.dtype) + and dt + in [ + dpt.bool, + dpt.int8, + dpt.uint8, + dpt.int16, + dpt.uint16, + dpt.int32, + dpt.uint32, + dpt.int64, + dpt.uint64, + dpt.float16, + dpt.float32, + dpt.float64, + dpt.complex64, + dpt.complex128, + ] + ) + + +def _get_shape(o): + if isinstance(o, dpt.usm_ndarray): + return o.shape + if _is_buffer(o): + return memoryview(o).shape + if isinstance(o, numbers.Number): + return () + return getattr(o, "shape", tuple()) + + +__all__ = [ + "_get_dtype", + "_get_queue_usm_type", + "_get_shape", + "_validate_dtype", +] From 15c467ddb5dcdc4edbf7d901fac63999fb23f512 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 20 Feb 2026 02:57:06 -0800 Subject: [PATCH 069/145] Move where() to dpctl_ext/tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 3 + dpctl_ext/tensor/_search_functions.py | 419 ++++++++++++++++++++++++++ dpnp/dpnp_algo/dpnp_arraycreation.py | 2 +- dpnp/dpnp_iface_searching.py | 5 +- 4 files changed, 427 insertions(+), 2 deletions(-) create mode 100644 dpctl_ext/tensor/_search_functions.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index d7f2a785802c..2ca989bc8c14 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -27,6 +27,8 @@ # ***************************************************************************** +from dpctl.tensor._search_functions import where + from dpctl_ext.tensor._copy_utils import ( asnumpy, astype, @@ -82,4 +84,5 @@ "to_numpy", "tril", "triu", + "where", ] diff --git a/dpctl_ext/tensor/_search_functions.py b/dpctl_ext/tensor/_search_functions.py new file mode 100644 index 000000000000..053c68e1857d --- 
/dev/null +++ b/dpctl_ext/tensor/_search_functions.py @@ -0,0 +1,419 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import dpctl +import dpctl.tensor as dpt +from dpctl.utils import ExecutionPlacementError, SequentialOrderManager + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor._tensor_impl as ti +from dpctl_ext.tensor._manipulation_functions import _broadcast_shape_impl + +from ._copy_utils import _empty_like_orderK, _empty_like_triple_orderK +from ._scalar_utils import ( + _get_dtype, + _get_queue_usm_type, + _get_shape, + _validate_dtype, +) +from ._type_utils import ( + WeakBooleanType, + WeakComplexType, + WeakFloatingType, + WeakIntegralType, + _all_data_types, + _can_cast, + _is_weak_dtype, + _strong_dtype_num_kind, + _to_device_supported_dtype, + _weak_type_num_kind, +) + + +def _default_dtype_from_weak_type(dt, dev): + if isinstance(dt, WeakBooleanType): + return dpt.bool + if isinstance(dt, WeakIntegralType): + return dpt.dtype(ti.default_device_int_type(dev)) + if isinstance(dt, WeakFloatingType): + return dpt.dtype(ti.default_device_fp_type(dev)) + if isinstance(dt, WeakComplexType): + return dpt.dtype(ti.default_device_complex_type(dev)) + + +def _resolve_two_weak_types(o1_dtype, o2_dtype, dev): + """Resolves two weak data types per NEP-0050""" + if _is_weak_dtype(o1_dtype): + if _is_weak_dtype(o2_dtype): + return _default_dtype_from_weak_type( + o1_dtype, dev + ), _default_dtype_from_weak_type(o2_dtype, dev) + o1_kind_num = _weak_type_num_kind(o1_dtype) + o2_kind_num = _strong_dtype_num_kind(o2_dtype) + if o1_kind_num > o2_kind_num: + if isinstance(o1_dtype, WeakIntegralType): + return dpt.dtype(ti.default_device_int_type(dev)), o2_dtype + if isinstance(o1_dtype, WeakComplexType): + if o2_dtype is dpt.float16 or o2_dtype is dpt.float32: + return dpt.complex64, o2_dtype + return ( + _to_device_supported_dtype(dpt.complex128, dev), + o2_dtype, + ) + return 
_to_device_supported_dtype(dpt.float64, dev), o2_dtype + else: + return o2_dtype, o2_dtype + elif _is_weak_dtype(o2_dtype): + o1_kind_num = _strong_dtype_num_kind(o1_dtype) + o2_kind_num = _weak_type_num_kind(o2_dtype) + if o2_kind_num > o1_kind_num: + if isinstance(o2_dtype, WeakIntegralType): + return o1_dtype, dpt.dtype(ti.default_device_int_type(dev)) + if isinstance(o2_dtype, WeakComplexType): + if o1_dtype is dpt.float16 or o1_dtype is dpt.float32: + return o1_dtype, dpt.complex64 + return o1_dtype, _to_device_supported_dtype(dpt.complex128, dev) + return ( + o1_dtype, + _to_device_supported_dtype(dpt.float64, dev), + ) + else: + return o1_dtype, o1_dtype + else: + return o1_dtype, o2_dtype + + +def _where_result_type(dt1, dt2, dev): + res_dtype = dpt_ext.result_type(dt1, dt2) + fp16 = dev.has_aspect_fp16 + fp64 = dev.has_aspect_fp64 + + all_dts = _all_data_types(fp16, fp64) + if res_dtype in all_dts: + return res_dtype + else: + for res_dtype_ in all_dts: + if _can_cast(dt1, res_dtype_, fp16, fp64) and _can_cast( + dt2, res_dtype_, fp16, fp64 + ): + return res_dtype_ + return None + + +def where(condition, x1, x2, /, *, order="K", out=None): + """ + Returns :class:`dpctl.tensor.usm_ndarray` with elements chosen + from ``x1`` or ``x2`` depending on ``condition``. + + Args: + condition (usm_ndarray): When ``True`` yields from ``x1``, + and otherwise yields from ``x2``. + Must be compatible with ``x1`` and ``x2`` according + to broadcasting rules. + x1 (Union[usm_ndarray, bool, int, float, complex]): + Array from which values are chosen when ``condition`` is ``True``. + Must be compatible with ``condition`` and ``x2`` according + to broadcasting rules. + x2 (Union[usm_ndarray, bool, int, float, complex]): + Array from which values are chosen when ``condition`` is not + ``True``. + Must be compatible with ``condition`` and ``x2`` according + to broadcasting rules. 
+ order (``"K"``, ``"C"``, ``"F"``, ``"A"``, optional): + Memory layout of the new output array, + if parameter ``out`` is ``None``. + Default: ``"K"``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of `out` must match the expected shape and the + expected data type of the result. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + An array with elements from ``x1`` where ``condition`` is ``True``, + and elements from ``x2`` elsewhere. + + The data type of the returned array is determined by applying + the Type Promotion Rules to ``x1`` and ``x2``. + """ + if not isinstance(condition, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(condition)}" + ) + if order not in ["K", "C", "F", "A"]: + order = "K" + q1, condition_usm_type = condition.sycl_queue, condition.usm_type + q2, x1_usm_type = _get_queue_usm_type(x1) + q3, x2_usm_type = _get_queue_usm_type(x2) + if q2 is None and q3 is None: + exec_q = q1 + out_usm_type = condition_usm_type + elif q3 is None: + exec_q = dpctl.utils.get_execution_queue((q1, q2)) + if exec_q is None: + raise ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + out_usm_type = dpctl.utils.get_coerced_usm_type( + ( + condition_usm_type, + x1_usm_type, + ) + ) + elif q2 is None: + exec_q = dpctl.utils.get_execution_queue((q1, q3)) + if exec_q is None: + raise ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + out_usm_type = dpctl.utils.get_coerced_usm_type( + ( + condition_usm_type, + x2_usm_type, + ) + ) + else: + exec_q = dpctl.utils.get_execution_queue((q1, q2, q3)) + if exec_q is None: + raise ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." 
+ ) + out_usm_type = dpctl.utils.get_coerced_usm_type( + ( + condition_usm_type, + x1_usm_type, + x2_usm_type, + ) + ) + dpctl.utils.validate_usm_type(out_usm_type, allow_none=False) + condition_shape = condition.shape + x1_shape = _get_shape(x1) + x2_shape = _get_shape(x2) + if not all( + isinstance(s, (tuple, list)) + for s in ( + x1_shape, + x2_shape, + ) + ): + raise TypeError( + "Shape of arguments can not be inferred. " + "Arguments are expected to be " + "lists, tuples, or both" + ) + try: + res_shape = _broadcast_shape_impl( + [ + condition_shape, + x1_shape, + x2_shape, + ] + ) + except ValueError: + raise ValueError( + "operands could not be broadcast together with shapes " + f"{condition_shape}, {x1_shape}, and {x2_shape}" + ) + sycl_dev = exec_q.sycl_device + x1_dtype = _get_dtype(x1, sycl_dev) + x2_dtype = _get_dtype(x2, sycl_dev) + if not all(_validate_dtype(o) for o in (x1_dtype, x2_dtype)): + raise ValueError("Operands have unsupported data types") + x1_dtype, x2_dtype = _resolve_two_weak_types(x1_dtype, x2_dtype, sycl_dev) + out_dtype = _where_result_type(x1_dtype, x2_dtype, sycl_dev) + if out_dtype is None: + raise TypeError( + "function 'where' does not support input " + f"types ({x1_dtype}, {x2_dtype}), " + "and the inputs could not be safely coerced " + "to any supported types according to the casting rule ''safe''." + ) + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + "output array must be of usm_ndarray type, got " f"{type(out)}" + ) + + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are " + f"inconsistent. 
Expected output shape is {res_shape}, " + f"got {out.shape}" + ) + + if out_dtype != out.dtype: + raise ValueError( + f"Output array of type {out_dtype} is needed, " + f"got {out.dtype}" + ) + + if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if ti._array_overlap(condition, out) and not ti._same_logical_tensors( + condition, out + ): + out = dpt.empty_like(out) + + if isinstance(x1, dpt.usm_ndarray): + if ( + ti._array_overlap(x1, out) + and not ti._same_logical_tensors(x1, out) + and x1_dtype == out_dtype + ): + out = dpt.empty_like(out) + + if isinstance(x2, dpt.usm_ndarray): + if ( + ti._array_overlap(x2, out) + and not ti._same_logical_tensors(x2, out) + and x2_dtype == out_dtype + ): + out = dpt.empty_like(out) + + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + condition, + x1, + x2, + ) + ) + else "C" + ) + if not isinstance(x1, dpt.usm_ndarray): + x1 = dpt.asarray(x1, dtype=x1_dtype, sycl_queue=exec_q) + if not isinstance(x2, dpt.usm_ndarray): + x2 = dpt.asarray(x2, dtype=x2_dtype, sycl_queue=exec_q) + + if condition.size == 0: + if out is not None: + return out + else: + if order == "K": + return _empty_like_triple_orderK( + condition, + x1, + x2, + out_dtype, + res_shape, + out_usm_type, + exec_q, + ) + else: + return dpt.empty( + res_shape, + dtype=out_dtype, + order=order, + usm_type=out_usm_type, + sycl_queue=exec_q, + ) + + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if x1_dtype != out_dtype: + if order == "K": + _x1 = _empty_like_orderK(x1, out_dtype) + else: + _x1 = dpt.empty_like(x1, dtype=out_dtype, order=order) + ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x1, dst=_x1, sycl_queue=exec_q, depends=dep_evs + ) + x1 = _x1 + _manager.add_event_pair(ht_copy1_ev, copy1_ev) + + if x2_dtype != out_dtype: + if order == "K": + _x2 = 
_empty_like_orderK(x2, out_dtype) + else: + _x2 = dpt.empty_like(x2, dtype=out_dtype, order=order) + ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x2, dst=_x2, sycl_queue=exec_q, depends=dep_evs + ) + x2 = _x2 + _manager.add_event_pair(ht_copy2_ev, copy2_ev) + + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + condition, x1, x2, out_dtype, res_shape, out_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=out_dtype, + order=order, + usm_type=out_usm_type, + sycl_queue=exec_q, + ) + + if condition_shape != res_shape: + condition = dpt.broadcast_to(condition, res_shape) + if x1_shape != res_shape: + x1 = dpt.broadcast_to(x1, res_shape) + if x2_shape != res_shape: + x2 = dpt.broadcast_to(x2, res_shape) + + dep_evs = _manager.submitted_events + hev, where_ev = ti._where( + condition=condition, + x1=x1, + x2=x2, + dst=out, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(hev, where_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[where_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + + return out diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py index 47edf63a68b4..f3dd18153563 100644 --- a/dpnp/dpnp_algo/dpnp_arraycreation.py +++ b/dpnp/dpnp_algo/dpnp_arraycreation.py @@ -243,7 +243,7 @@ def dpnp_linspace( # Needed a special handling for denormal numbers (when step == 0), # see numpy#5437 for more details. # Note, dpt.where() is used to avoid a synchronization branch. 
- usm_res = dpt.where( + usm_res = dpt_ext.where( step == 0, (usm_res / step_num) * delta, usm_res * step ) else: diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py index 16ab633d506b..a2389978d506 100644 --- a/dpnp/dpnp_iface_searching.py +++ b/dpnp/dpnp_iface_searching.py @@ -44,6 +44,7 @@ # pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as dti import dpnp @@ -473,5 +474,7 @@ def where(condition, x=None, y=None, /, *, order="K", out=None): usm_condition = dpnp.get_usm_ndarray(condition) usm_out = None if out is None else dpnp.get_usm_ndarray(out) - usm_res = dpt.where(usm_condition, usm_x, usm_y, order=order, out=usm_out) + usm_res = dpt_ext.where( + usm_condition, usm_x, usm_y, order=order, out=usm_out + ) return dpnp.get_result_array(usm_res, out) From 134bb35006aaf2db54c474427b50e330e8470e1d Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 20 Feb 2026 03:18:31 -0800 Subject: [PATCH 070/145] Update imports in dpnp/dpnp_iface_indexing.py --- dpnp/dpnp_iface_indexing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 95998dbdda4b..f305b106221f 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -47,9 +47,6 @@ import dpctl.tensor as dpt import dpctl.utils as dpu import numpy -from dpctl.tensor._copy_utils import _nonzero_impl -from dpctl.tensor._indexing_functions import _get_indexing_mode -from dpctl.tensor._numpy_helper import normalize_axis_index # pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` @@ -60,6 +57,9 @@ # pylint: disable=no-name-in-module import dpnp.backend.extensions.indexing._indexing_impl as indexing_ext +from dpctl_ext.tensor._copy_utils import _nonzero_impl +from dpctl_ext.tensor._indexing_functions import _get_indexing_mode 
+from dpctl_ext.tensor._numpy_helper import normalize_axis_index # pylint: disable=no-name-in-module from .dpnp_algo import ( From c58b531481fc9b83b995fced56510d7e5b17bbc8 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 20 Feb 2026 03:44:23 -0800 Subject: [PATCH 071/145] Enable -fno-fast-math compiler flag --- dpctl_ext/tensor/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index cec88aed44c8..06efd2aa7b8b 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -92,10 +92,10 @@ endif() set(_no_fast_math_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ) #list( #APPEND _no_fast_math_sources From 4f00632551ecbcfd9a467e7fdf63d2ff239d5ed5 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 20 Feb 2026 03:59:54 -0800 Subject: [PATCH 072/145] Move _clip to dpctl_ext/tensor/libtensor --- dpctl_ext/tensor/CMakeLists.txt | 4 +- .../tensor/libtensor/include/kernels/clip.hpp | 357 ++++++++++++++++++ dpctl_ext/tensor/libtensor/source/clip.cpp | 267 +++++++++++++ dpctl_ext/tensor/libtensor/source/clip.hpp | 57 +++ .../tensor/libtensor/source/tensor_ctors.cpp | 20 +- 5 files changed, 693 insertions(+), 12 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/clip.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/clip.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/clip.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 06efd2aa7b8b..6f823a818ce7 100644 
--- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -61,7 +61,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ) set(_static_lib_trgt simplify_iteration_space) @@ -94,7 +94,7 @@ set(_no_fast_math_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ) #list( diff --git a/dpctl_ext/tensor/libtensor/include/kernels/clip.hpp b/dpctl_ext/tensor/libtensor/include/kernels/clip.hpp new file mode 100644 index 000000000000..7ce50b2b62bc --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/clip.hpp @@ -0,0 +1,357 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for dpctl.tensor.clip. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "kernels/alignment.hpp" +#include "utils/math_utils.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::clip +{ + +using dpctl::tensor::ssize_t; +using namespace dpctl::tensor::offset_utils; + +using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + +using dpctl::tensor::sycl_utils::sub_group_load; +using dpctl::tensor::sycl_utils::sub_group_store; + +template +T clip(const T &x, const T &min, const T &max) +{ + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::max_complex; + using dpctl::tensor::math_utils::min_complex; + return min_complex(max_complex(x, min), max); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + auto tmp = (std::isnan(x) || x > min) ? x : min; + return (std::isnan(tmp) || tmp < max) ? tmp : max; + } + else if constexpr (std::is_same_v) { + return (x || min) && max; + } + else { + auto tmp = (x > min) ? x : min; + return (tmp < max) ? 
tmp : max; + } +} + +template +class ClipContigFunctor +{ +private: + std::size_t nelems = 0; + const T *x_p = nullptr; + const T *min_p = nullptr; + const T *max_p = nullptr; + T *dst_p = nullptr; + +public: + ClipContigFunctor(std::size_t nelems_, + const T *x_p_, + const T *min_p_, + const T *max_p_, + T *dst_p_) + : nelems(nelems_), x_p(x_p_), min_p(min_p_), max_p(max_p_), + dst_p(dst_p_) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz; + + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value || !enable_sg_loadstore) { + const std::uint16_t sgSize = + ndit.get_sub_group().get_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + const std::uint16_t nelems_per_sg = sgSize * nelems_per_wi; + + const std::size_t start = + (gid / sgSize) * (nelems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems, start + nelems_per_sg); + + for (std::size_t offset = start; offset < end; offset += sgSize) { + dst_p[offset] = clip(x_p[offset], min_p[offset], max_p[offset]); + } + } + else { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + + const std::size_t base = + nelems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + nelems_per_wi * sgSize < nelems) { + sycl::vec dst_vec; +#pragma unroll + for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + const std::size_t idx = base + it * sgSize; + auto x_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&x_p[idx]); + auto min_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&min_p[idx]); + auto max_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&max_p[idx]); + auto dst_multi_ptr = 
sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&dst_p[idx]); + + const sycl::vec x_vec = + sub_group_load(sg, x_multi_ptr); + const sycl::vec min_vec = + sub_group_load(sg, min_multi_ptr); + const sycl::vec max_vec = + sub_group_load(sg, max_multi_ptr); +#pragma unroll + for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) { + dst_vec[vec_id] = clip(x_vec[vec_id], min_vec[vec_id], + max_vec[vec_id]); + } + sub_group_store(sg, dst_vec, dst_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems; k += sgSize) { + dst_p[k] = clip(x_p[k], min_p[k], max_p[k]); + } + } + } + } +}; + +template +class clip_contig_kernel; + +typedef sycl::event (*clip_contig_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + const char *, + const char *, + char *, + const std::vector &); + +template +sycl::event clip_contig_impl(sycl::queue &q, + std::size_t nelems, + const char *x_cp, + const char *min_cp, + const char *max_cp, + char *dst_cp, + const std::vector &depends) +{ + const T *x_tp = reinterpret_cast(x_cp); + const T *min_tp = reinterpret_cast(min_cp); + const T *max_tp = reinterpret_cast(max_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + sycl::event clip_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + std::size_t lws = 64; + static constexpr std::uint8_t vec_sz = 4; + static constexpr std::uint8_t n_vecs = 2; + const std::size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + if (is_aligned(x_cp) && + is_aligned(min_cp) && + is_aligned(max_cp) && + is_aligned(dst_cp)) + { + static constexpr bool enable_sg_loadstore = true; + using KernelName = clip_contig_kernel; + using Impl = + ClipContigFunctor; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + 
Impl(nelems, x_tp, min_tp, max_tp, dst_tp)); + } + else { + static constexpr bool disable_sg_loadstore = false; + using InnerKernelName = clip_contig_kernel; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + using Impl = + ClipContigFunctor; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + Impl(nelems, x_tp, min_tp, max_tp, dst_tp)); + } + }); + + return clip_ev; +} + +template +class ClipStridedFunctor +{ +private: + const T *x_p = nullptr; + const T *min_p = nullptr; + const T *max_p = nullptr; + T *dst_p = nullptr; + IndexerT indexer; + +public: + ClipStridedFunctor(const T *x_p_, + const T *min_p_, + const T *max_p_, + T *dst_p_, + const IndexerT &indexer_) + : x_p(x_p_), min_p(min_p_), max_p(max_p_), dst_p(dst_p_), + indexer(indexer_) + { + } + + void operator()(sycl::id<1> id) const + { + std::size_t gid = id[0]; + auto offsets = indexer(static_cast(gid)); + dst_p[offsets.get_fourth_offset()] = clip( + x_p[offsets.get_first_offset()], min_p[offsets.get_second_offset()], + max_p[offsets.get_third_offset()]); + } +}; + +template +class clip_strided_kernel; + +typedef sycl::event (*clip_strided_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const char *, + const char *, + const char *, + char *, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event clip_strided_impl(sycl::queue &q, + std::size_t nelems, + int nd, + const char *x_cp, + const char *min_cp, + const char *max_cp, + char *dst_cp, + const ssize_t *shape_strides, + ssize_t x_offset, + ssize_t min_offset, + ssize_t max_offset, + ssize_t dst_offset, + const std::vector &depends) +{ + const T *x_tp = reinterpret_cast(x_cp); + const T *min_tp = reinterpret_cast(min_cp); + const T *max_tp = reinterpret_cast(max_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + sycl::event clip_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const FourOffsets_StridedIndexer indexer{ + nd, x_offset, 
min_offset, max_offset, dst_offset, shape_strides}; + + using KernelName = clip_strided_kernel; + using Impl = ClipStridedFunctor; + + cgh.parallel_for( + sycl::range<1>(nelems), + Impl(x_tp, min_tp, max_tp, dst_tp, indexer)); + }); + + return clip_ev; +} + +template +struct ClipStridedFactory +{ + fnT get() + { + fnT fn = clip_strided_impl; + return fn; + } +}; + +template +struct ClipContigFactory +{ + fnT get() + { + + fnT fn = clip_contig_impl; + return fn; + } +}; + +} // namespace dpctl::tensor::kernels::clip diff --git a/dpctl_ext/tensor/libtensor/source/clip.cpp b/dpctl_ext/tensor/libtensor/source/clip.cpp new file mode 100644 index 000000000000..1414689bc4b7 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/clip.cpp @@ -0,0 +1,267 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines Python API for implementation functions of +/// dpctl.tensor.clip +//===---------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "clip.hpp" +#include "kernels/clip.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::clip::clip_contig_impl_fn_ptr_t; +using dpctl::tensor::kernels::clip::clip_strided_impl_fn_ptr_t; + +static clip_contig_impl_fn_ptr_t clip_contig_dispatch_vector[td_ns::num_types]; +static clip_strided_impl_fn_ptr_t + clip_strided_dispatch_vector[td_ns::num_types]; + +void init_clip_dispatch_vectors(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::clip::ClipContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(clip_contig_dispatch_vector); + + using dpctl::tensor::kernels::clip::ClipStridedFactory; 
+ DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(clip_strided_dispatch_vector); +} + +using dpctl::utils::keep_args_alive; + +std::pair + py_clip(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &min, + const dpctl::tensor::usm_ndarray &max, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, min, max, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + int nd = src.get_ndim(); + int min_nd = min.get_ndim(); + int max_nd = max.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (nd != min_nd || nd != max_nd) { + throw py::value_error( + "Input arrays are not of appropriate dimension for clip kernel."); + } + + if (nd != dst_nd) { + throw py::value_error( + "Destination is not of appropriate dimension for clip kernel."); + } + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *min_shape = min.get_shape_raw(); + const py::ssize_t *max_shape = max.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool shapes_equal(true); + std::size_t nelems(1); + for (int i = 0; i < nd; ++i) { + const auto &sh_i = dst_shape[i]; + nelems *= static_cast(sh_i); + shapes_equal = shapes_equal && (min_shape[i] == sh_i) && + (max_shape[i] == sh_i) && (src_shape[i] == sh_i); + } + + if (!shapes_equal) { + throw py::value_error("Arrays are not of matching shapes."); + } + + if (nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + auto const &same_logical_tensors = + dpctl::tensor::overlap::SameLogicalTensors(); + if ((overlap(dst, src) && !same_logical_tensors(dst, src)) || + (overlap(dst, min) && !same_logical_tensors(dst, min)) || + (overlap(dst, max) && !same_logical_tensors(dst, max))) + 
{ + throw py::value_error("Destination array overlaps with input."); + } + + int min_typenum = min.get_typenum(); + int max_typenum = max.get_typenum(); + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int min_typeid = array_types.typenum_to_lookup_id(min_typenum); + int max_typeid = array_types.typenum_to_lookup_id(max_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_typeid != dst_typeid || src_typeid != min_typeid || + src_typeid != max_typeid) + { + throw py::value_error("Input, min, max, and destination arrays must " + "have the same data type"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, nelems); + + char *src_data = src.get_data(); + char *min_data = min.get_data(); + char *max_data = max.get_data(); + char *dst_data = dst.get_data(); + + bool is_min_c_contig = min.is_c_contiguous(); + bool is_min_f_contig = min.is_f_contiguous(); + + bool is_max_c_contig = max.is_c_contiguous(); + bool is_max_f_contig = max.is_f_contiguous(); + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + bool all_c_contig = (is_min_c_contig && is_max_c_contig && + is_src_c_contig && is_dst_c_contig); + bool all_f_contig = (is_min_f_contig && is_max_f_contig && + is_src_f_contig && is_dst_f_contig); + + if (all_c_contig || all_f_contig) { + auto fn = clip_contig_dispatch_vector[src_typeid]; + + sycl::event clip_ev = + fn(exec_q, nelems, src_data, min_data, max_data, dst_data, depends); + sycl::event ht_ev = + keep_args_alive(exec_q, {src, min, max, dst}, {clip_ev}); + + return std::make_pair(ht_ev, clip_ev); + } + + auto const &src_strides = src.get_strides_vector(); + auto const &min_strides = min.get_strides_vector(); + 
auto const &max_strides = max.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_min_strides; + shT simplified_max_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t min_offset(0); + py::ssize_t max_offset(0); + py::ssize_t dst_offset(0); + + dpctl::tensor::py_internal::simplify_iteration_space_4( + nd, src_shape, src_strides, min_strides, max_strides, dst_strides, + // outputs + simplified_shape, simplified_src_strides, simplified_min_strides, + simplified_max_strides, simplified_dst_strides, src_offset, min_offset, + max_offset, dst_offset); + + auto fn = clip_strided_dispatch_vector[src_typeid]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, + // common shape and strides + simplified_shape, simplified_src_strides, simplified_min_strides, + simplified_max_strides, simplified_dst_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + sycl::event copy_shape_strides_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shape_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + sycl::event clip_ev = fn(exec_q, nelems, nd, src_data, min_data, max_data, + dst_data, packed_shape_strides, src_offset, + min_offset, max_offset, dst_offset, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {clip_ev}, packed_shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + 
sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {src, min, max, dst}, host_task_events); + + return std::make_pair(arg_cleanup_ev, clip_ev); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/clip.hpp b/dpctl_ext/tensor/libtensor/source/clip.hpp new file mode 100644 index 000000000000..de8f0e559b6e --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/clip.hpp @@ -0,0 +1,57 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines Python API for implementation functions of +/// dpctl.tensor.clip +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + py_clip(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &min, + const dpctl::tensor::usm_ndarray &max, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends); + +extern void init_clip_dispatch_vectors(void); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 1619357aaf64..0a3e111c8003 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -45,7 +45,7 @@ #include "accumulators.hpp" #include "boolean_advanced_indexing.hpp" -// #include "clip.hpp" +#include "clip.hpp" #include "copy_and_cast_usm_to_usm.hpp" #include "copy_as_contig.hpp" #include "copy_for_reshape.hpp" @@ -135,7 +135,7 @@ using dpctl::tensor::py_internal::usm_ndarray_triul; using dpctl::tensor::py_internal::py_where; /* 
=========================== Clip ============================== */ -// using dpctl::tensor::py_internal::py_clip; +using dpctl::tensor::py_internal::py_clip; // populate dispatch tables void init_dispatch_tables(void) @@ -171,7 +171,7 @@ void init_dispatch_vectors(void) populate_cumsum_1d_dispatch_vectors(); init_repeat_dispatch_vectors(); - // init_clip_dispatch_vectors(); + init_clip_dispatch_vectors(); return; } @@ -488,11 +488,11 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("reps"), py::arg("axis"), py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_clip", &py_clip, - // "Clamps elements of array `x` to the range " - // "[`min`, `max] and writes the result to the " - // "array `dst` for each element of `x`, `min`, and `max`." - // "Returns a tuple of events: (hev, ev)", - // py::arg("src"), py::arg("min"), py::arg("max"), py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_clip", &py_clip, + "Clamps elements of array `x` to the range " + "[`min`, `max] and writes the result to the " + "array `dst` for each element of `x`, `min`, and `max`." 
+ "Returns a tuple of events: (hev, ev)", + py::arg("src"), py::arg("min"), py::arg("max"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); } From 5d45d85548bb53915e4536bd92579c794d8dd575 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 20 Feb 2026 04:09:16 -0800 Subject: [PATCH 073/145] Move clip() to dpctl_ext/tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_clip.py | 781 ++++++++++++++++++++++++++++++++ dpnp/dpnp_iface_mathematical.py | 3 +- 3 files changed, 785 insertions(+), 1 deletion(-) create mode 100644 dpctl_ext/tensor/_clip.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 2ca989bc8c14..8cd8a1896b28 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -57,6 +57,7 @@ ) from dpctl_ext.tensor._reshape import reshape +from ._clip import clip from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type __all__ = [ @@ -64,6 +65,7 @@ "astype", "can_cast", "copy", + "clip", "extract", "eye", "finfo", diff --git a/dpctl_ext/tensor/_clip.py b/dpctl_ext/tensor/_clip.py new file mode 100644 index 000000000000..50d3ecd568e3 --- /dev/null +++ b/dpctl_ext/tensor/_clip.py @@ -0,0 +1,781 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import dpctl +import dpctl.tensor as dpt +import dpctl.tensor._tensor_elementwise_impl as tei +from dpctl.utils import ExecutionPlacementError, SequentialOrderManager + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor._tensor_impl as ti +from dpctl_ext.tensor._copy_utils import ( + _empty_like_orderK, + _empty_like_pair_orderK, + _empty_like_triple_orderK, +) +from dpctl_ext.tensor._manipulation_functions import _broadcast_shape_impl +from dpctl_ext.tensor._type_utils import _can_cast + +from ._scalar_utils import ( + _get_dtype, + _get_queue_usm_type, + _get_shape, + _validate_dtype, +) +from ._type_utils import ( + _resolve_one_strong_one_weak_types, + _resolve_one_strong_two_weak_types, +) + + +def _check_clip_dtypes(res_dtype, arg1_dtype, arg2_dtype, sycl_dev): + """ + Checks if both types `arg1_dtype` and 
`arg2_dtype` can be + cast to `res_dtype` according to the rule `safe` + """ + if arg1_dtype == res_dtype and arg2_dtype == res_dtype: + return None, None, res_dtype + + _fp16 = sycl_dev.has_aspect_fp16 + _fp64 = sycl_dev.has_aspect_fp64 + if _can_cast(arg1_dtype, res_dtype, _fp16, _fp64) and _can_cast( + arg2_dtype, res_dtype, _fp16, _fp64 + ): + # prevent unnecessary casting + ret_buf1_dt = None if res_dtype == arg1_dtype else res_dtype + ret_buf2_dt = None if res_dtype == arg2_dtype else res_dtype + return ret_buf1_dt, ret_buf2_dt, res_dtype + else: + return None, None, None + + +def _clip_none(x, val, out, order, _binary_fn): + q1, x_usm_type = x.sycl_queue, x.usm_type + q2, val_usm_type = _get_queue_usm_type(val) + if q2 is None: + exec_q = q1 + res_usm_type = x_usm_type + else: + exec_q = dpctl.utils.get_execution_queue((q1, q2)) + if exec_q is None: + raise ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = dpctl.utils.get_coerced_usm_type( + ( + x_usm_type, + val_usm_type, + ) + ) + dpctl.utils.validate_usm_type(res_usm_type, allow_none=False) + x_shape = x.shape + val_shape = _get_shape(val) + if not isinstance(val_shape, (tuple, list)): + raise TypeError( + "Shape of arguments can not be inferred. 
" + "Arguments are expected to be " + "lists, tuples, or both" + ) + try: + res_shape = _broadcast_shape_impl( + [ + x_shape, + val_shape, + ] + ) + except ValueError: + raise ValueError( + "operands could not be broadcast together with shapes " + f"{x_shape} and {val_shape}" + ) + sycl_dev = exec_q.sycl_device + x_dtype = x.dtype + val_dtype = _get_dtype(val, sycl_dev) + if not _validate_dtype(val_dtype): + raise ValueError("Operands have unsupported data types") + + val_dtype = _resolve_one_strong_one_weak_types(x_dtype, val_dtype, sycl_dev) + + res_dt = x.dtype + _fp16 = sycl_dev.has_aspect_fp16 + _fp64 = sycl_dev.has_aspect_fp64 + if not _can_cast(val_dtype, res_dt, _fp16, _fp64): + raise ValueError( + f"function 'clip' does not support input types " + f"({x_dtype}, {val_dtype}), " + "and the inputs could not be safely coerced to any " + "supported types according to the casting rule ''safe''." + ) + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. 
" + f"Expected output shape is {res_shape}, got {out.shape}" + ) + + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, got {out.dtype}" + ) + + if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if ti._array_overlap(x, out): + if not ti._same_logical_tensors(x, out): + out = dpt.empty_like(out) + + if isinstance(val, dpt.usm_ndarray): + if ( + ti._array_overlap(val, out) + and not ti._same_logical_tensors(val, out) + and val_dtype == res_dt + ): + out = dpt.empty_like(out) + + if isinstance(val, dpt.usm_ndarray): + val_ary = val + else: + val_ary = dpt.asarray(val, dtype=val_dtype, sycl_queue=exec_q) + + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + val_ary, + ) + ) + else "C" + ) + if val_dtype == res_dt: + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + x, val_ary, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + if val_ary.shape != res_shape: + val_ary = dpt.broadcast_to(val_ary, res_shape) + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + ht_binary_ev, binary_ev = _binary_fn( + src1=x, src2=val_ary, dst=out, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, copy_ev) + out = orig_out + return out + else: + if order == "K": + buf = _empty_like_orderK(val_ary, res_dt) + else: + buf = 
dpt.empty_like(val_ary, dtype=res_dt, order=order) + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=val_ary, dst=buf, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + x, buf, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + buf = dpt.broadcast_to(buf, res_shape) + ht_binary_ev, binary_ev = _binary_fn( + src1=x, + src2=buf, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + return out + + +def clip(x, /, min=None, max=None, out=None, order="K"): + """clip(x, min=None, max=None, out=None, order="K") + + Clips to the range [`min_i`, `max_i`] for each element `x_i` + in `x`. + + Args: + x (usm_ndarray): Array containing elements to clip. + Must be compatible with `min` and `max` according + to broadcasting rules. + min ({None, Union[usm_ndarray, bool, int, float, complex]}, optional): + Array containing minimum values. + Must be compatible with `x` and `max` according + to broadcasting rules. + max ({None, Union[usm_ndarray, bool, int, float, complex]}, optional): + Array containing maximum values. + Must be compatible with `x` and `min` according + to broadcasting rules. + out ({None, usm_ndarray}, optional): + Output array to populate. + Array must have the correct shape and the expected data type. 
+ order ("C","F","A","K", optional): + Memory layout of the newly output array, if parameter `out` is + `None`. + Default: "K". + + Returns: + usm_ndarray: + An array with elements clipped to the range [`min`, `max`]. + The returned array has the same data type as `x`. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected `x` to be of dpctl.tensor.usm_ndarray type, got " + f"{type(x)}" + ) + if order not in ["K", "C", "F", "A"]: + order = "K" + if x.dtype.kind in "iu": + if isinstance(min, int) and min <= dpt_ext.iinfo(x.dtype).min: + min = None + if isinstance(max, int) and max >= dpt_ext.iinfo(x.dtype).max: + max = None + if min is None and max is None: + exec_q = x.sycl_queue + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + "output array must be of usm_ndarray type, got " + f"{type(out)}" + ) + + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + if out.shape != x.shape: + raise ValueError( + "The shape of input and output arrays are " + f"inconsistent. 
Expected output shape is {x.shape}, " + f"got {out.shape}" + ) + + if x.dtype != out.dtype: + raise ValueError( + f"Output array of type {x.dtype} is needed, " + f"got {out.dtype}" + ) + + if ( + dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) + is None + ): + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if ti._array_overlap(x, out): + if not ti._same_logical_tensors(x, out): + out = dpt.empty_like(out) + else: + return out + else: + if order == "K": + out = _empty_like_orderK(x, x.dtype) + else: + out = dpt.empty_like(x, order=order) + + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x, dst=out, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_copy_ev, cpy_ev) + out = orig_out + return out + elif max is None: + return _clip_none(x, min, out, order, tei._maximum) + elif min is None: + return _clip_none(x, max, out, order, tei._minimum) + else: + q1, x_usm_type = x.sycl_queue, x.usm_type + q2, min_usm_type = _get_queue_usm_type(min) + q3, max_usm_type = _get_queue_usm_type(max) + if q2 is None and q3 is None: + exec_q = q1 + res_usm_type = x_usm_type + elif q3 is None: + exec_q = dpctl.utils.get_execution_queue((q1, q2)) + if exec_q is None: + raise ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." 
+ ) + res_usm_type = dpctl.utils.get_coerced_usm_type( + ( + x_usm_type, + min_usm_type, + ) + ) + elif q2 is None: + exec_q = dpctl.utils.get_execution_queue((q1, q3)) + if exec_q is None: + raise ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = dpctl.utils.get_coerced_usm_type( + ( + x_usm_type, + max_usm_type, + ) + ) + else: + exec_q = dpctl.utils.get_execution_queue((q1, q2, q3)) + if exec_q is None: + raise ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = dpctl.utils.get_coerced_usm_type( + ( + x_usm_type, + min_usm_type, + max_usm_type, + ) + ) + dpctl.utils.validate_usm_type(res_usm_type, allow_none=False) + x_shape = x.shape + min_shape = _get_shape(min) + max_shape = _get_shape(max) + if not all( + isinstance(s, (tuple, list)) + for s in ( + min_shape, + max_shape, + ) + ): + raise TypeError( + "Shape of arguments can not be inferred. 
" + "Arguments are expected to be " + "lists, tuples, or both" + ) + try: + res_shape = _broadcast_shape_impl( + [ + x_shape, + min_shape, + max_shape, + ] + ) + except ValueError: + raise ValueError( + "operands could not be broadcast together with shapes " + f"{x_shape}, {min_shape}, and {max_shape}" + ) + sycl_dev = exec_q.sycl_device + x_dtype = x.dtype + min_dtype = _get_dtype(min, sycl_dev) + max_dtype = _get_dtype(max, sycl_dev) + if not all(_validate_dtype(o) for o in (min_dtype, max_dtype)): + raise ValueError("Operands have unsupported data types") + + min_dtype, max_dtype = _resolve_one_strong_two_weak_types( + x_dtype, min_dtype, max_dtype, sycl_dev + ) + + buf1_dt, buf2_dt, res_dt = _check_clip_dtypes( + x_dtype, + min_dtype, + max_dtype, + sycl_dev, + ) + + if res_dt is None: + raise ValueError( + f"function '{clip}' does not support input types " + f"({x_dtype}, {min_dtype}, {max_dtype}), " + "and the inputs could not be safely coerced to any " + "supported types according to the casting rule ''safe''." + ) + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + "output array must be of usm_ndarray type, got " + f"{type(out)}" + ) + + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are " + f"inconsistent. 
Expected output shape is {res_shape}, " + f"got {out.shape}" + ) + + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, " + f"got {out.dtype}" + ) + + if ( + dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) + is None + ): + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if ti._array_overlap(x, out): + if not ti._same_logical_tensors(x, out): + out = dpt.empty_like(out) + + if isinstance(min, dpt.usm_ndarray): + if ( + ti._array_overlap(min, out) + and not ti._same_logical_tensors(min, out) + and buf1_dt is None + ): + out = dpt.empty_like(out) + + if isinstance(max, dpt.usm_ndarray): + if ( + ti._array_overlap(max, out) + and not ti._same_logical_tensors(max, out) + and buf2_dt is None + ): + out = dpt.empty_like(out) + + if isinstance(min, dpt.usm_ndarray): + a_min = min + else: + a_min = dpt.asarray(min, dtype=min_dtype, sycl_queue=exec_q) + if isinstance(max, dpt.usm_ndarray): + a_max = max + else: + a_max = dpt.asarray(max, dtype=max_dtype, sycl_queue=exec_q) + + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + a_min, + a_max, + ) + ) + else "C" + ) + if buf1_dt is None and buf2_dt is None: + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + a_min, + a_max, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + if a_min.shape != res_shape: + a_min = dpt.broadcast_to(a_min, res_shape) + if a_max.shape != res_shape: + a_max = dpt.broadcast_to(a_max, res_shape) + _manager = SequentialOrderManager[exec_q] + dep_ev = _manager.submitted_events + ht_binary_ev, binary_ev = ti._clip( + src=x, + min=a_min, + max=a_max, + dst=out, + sycl_queue=exec_q, + depends=dep_ev, + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + 
if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + return out + + elif buf1_dt is None: + if order == "K": + buf2 = _empty_like_orderK(a_max, buf2_dt) + else: + buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) + _manager = SequentialOrderManager[exec_q] + dep_ev = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_max, dst=buf2, sycl_queue=exec_q, depends=dep_ev + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + a_min, + buf2, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + x = dpt.broadcast_to(x, res_shape) + if a_min.shape != res_shape: + a_min = dpt.broadcast_to(a_min, res_shape) + buf2 = dpt.broadcast_to(buf2, res_shape) + ht_binary_ev, binary_ev = ti._clip( + src=x, + min=a_min, + max=buf2, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + return out + + elif buf2_dt is None: + if order == "K": + buf1 = _empty_like_orderK(a_min, buf1_dt) + else: + buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + _manager = SequentialOrderManager[exec_q] + dep_ev = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_min, dst=buf1, 
sycl_queue=exec_q, depends=dep_ev + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + buf1, + a_max, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + x = dpt.broadcast_to(x, res_shape) + buf1 = dpt.broadcast_to(buf1, res_shape) + if a_max.shape != res_shape: + a_max = dpt.broadcast_to(a_max, res_shape) + ht_binary_ev, binary_ev = ti._clip( + src=x, + min=buf1, + max=a_max, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + return out + + if order == "K": + if ( + x.flags.c_contiguous + and a_min.flags.c_contiguous + and a_max.flags.c_contiguous + ): + order = "C" + elif ( + x.flags.f_contiguous + and a_min.flags.f_contiguous + and a_max.flags.f_contiguous + ): + order = "F" + if order == "K": + buf1 = _empty_like_orderK(a_min, buf1_dt) + else: + buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_min, dst=buf1, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy1_ev, copy1_ev) + if order == "K": + buf2 = _empty_like_orderK(a_max, buf2_dt) + else: + buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) + ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_max, dst=buf2, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy2_ev, copy2_ev) + if out is None: + if order == 
"K": + out = _empty_like_triple_orderK( + x, buf1, buf2, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + x = dpt.broadcast_to(x, res_shape) + buf1 = dpt.broadcast_to(buf1, res_shape) + buf2 = dpt.broadcast_to(buf2, res_shape) + ht_, clip_ev = ti._clip( + src=x, + min=buf1, + max=buf2, + dst=out, + sycl_queue=exec_q, + depends=[copy1_ev, copy2_ev], + ) + _manager.add_event_pair(ht_, clip_ev) + return out diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py index f2779b2fdcd1..3dbf07be080a 100644 --- a/dpnp/dpnp_iface_mathematical.py +++ b/dpnp/dpnp_iface_mathematical.py @@ -58,6 +58,7 @@ # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._type_utils as dtu import dpnp import dpnp.backend.extensions.ufunc._ufunc_impl as ufi @@ -729,7 +730,7 @@ def clip(a, /, min=None, max=None, *, out=None, order="K", **kwargs): usm_max = None if max is None else dpnp.get_usm_ndarray_or_scalar(max) usm_out = None if out is None else dpnp.get_usm_ndarray(out) - usm_res = dpt.clip(usm_arr, usm_min, usm_max, out=usm_out, order=order) + usm_res = dpt_ext.clip(usm_arr, usm_min, usm_max, out=usm_out, order=order) if out is not None and isinstance(out, dpnp_array): return out return dpnp_array._create_from_usm_ndarray(usm_res) From e70425b9f66f9edc3faaede735384e31fe918621 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 20 Feb 2026 04:51:33 -0800 Subject: [PATCH 074/145] Populate dispatch tables with where --- dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 0a3e111c8003..e6bc3b5dfb61 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ 
b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -145,7 +145,7 @@ void init_dispatch_tables(void) init_copy_and_cast_usm_to_usm_dispatch_tables(); init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); init_advanced_indexing_dispatch_tables(); - // init_where_dispatch_tables(); + init_where_dispatch_tables(); return; } From 36919c14ecd1b6e0d34d5ca51a7524221231277d Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 02:06:38 -0800 Subject: [PATCH 075/145] Move _linspace_step/_linspace_affine to dpctl_ext/tensor/libtensor --- dpctl_ext/tensor/CMakeLists.txt | 4 +- .../include/kernels/constructors.hpp | 177 ++++++++++ .../libtensor/source/linear_sequences.cpp | 308 ++++++++++++++++++ .../libtensor/source/linear_sequences.hpp | 66 ++++ .../tensor/libtensor/source/tensor_ctors.cpp | 38 ++- 5 files changed, 573 insertions(+), 20 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 6f823a818ce7..864e34ddaba4 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -51,7 +51,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp @@ -93,7 +93,7 @@ endif() set(_no_fast_math_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index 26ae46707a6b..27074cd2d246 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -54,6 +54,10 @@ using dpctl::tensor::ssize_t; @defgroup CtorKernels */ +template +class linear_sequence_step_kernel; +template +class linear_sequence_affine_kernel; template class full_strided_kernel; template @@ -61,6 +65,179 @@ class eye_kernel; using namespace dpctl::tensor::offset_utils; +template +class LinearSequenceStepFunctor +{ +private: + Ty *p = nullptr; + Ty start_v; + Ty step_v; + +public: + LinearSequenceStepFunctor(char *dst_p, Ty v0, Ty dv) + : p(reinterpret_cast(dst_p)), start_v(v0), step_v(dv) + { + } + + void operator()(sycl::id<1> wiid) const + { + auto i = wiid.get(0); + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + p[i] = Ty{start_v.real() + i * step_v.real(), + start_v.imag() + i * step_v.imag()}; + } + else { + p[i] = start_v + i * step_v; + } + } +}; + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by typed starting value and + * increment. + * + * @param q Sycl queue to which the kernel is submitted + * @param nelems Length of the sequence + * @param start_v Typed starting value of the sequence + * @param step_v Typed increment of the sequence + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. 
+ * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event lin_space_step_impl(sycl::queue &exec_q, + std::size_t nelems, + Ty start_v, + Ty step_v, + char *array_data, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + sycl::event lin_space_step_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for>( + sycl::range<1>{nelems}, + LinearSequenceStepFunctor(array_data, start_v, step_v)); + }); + + return lin_space_step_event; +} + +// Constructor to populate tensor with linear sequence defined by +// start and and data + +template +class LinearSequenceAffineFunctor +{ +private: + Ty *p = nullptr; + Ty start_v; + Ty end_v; + std::size_t n; + +public: + LinearSequenceAffineFunctor(char *dst_p, Ty v0, Ty v1, std::size_t den) + : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), + n((den == 0) ? 
1 : den) + { + } + + void operator()(sycl::id<1> wiid) const + { + auto i = wiid.get(0); + wTy wc = wTy(i) / n; + wTy w = wTy(n - i) / n; + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using reT = typename Ty::value_type; + auto _w = static_cast(w); + auto _wc = static_cast(wc); + auto re_comb = sycl::fma(start_v.real(), _w, reT(0)); + re_comb = + sycl::fma(end_v.real(), _wc, + re_comb); // start_v.real() * _w + end_v.real() * _wc; + auto im_comb = + sycl::fma(start_v.imag(), _w, + reT(0)); // start_v.imag() * _w + end_v.imag() * _wc; + im_comb = sycl::fma(end_v.imag(), _wc, im_comb); + Ty affine_comb = Ty{re_comb, im_comb}; + p[i] = affine_comb; + } + else if constexpr (std::is_floating_point::value) { + Ty _w = static_cast(w); + Ty _wc = static_cast(wc); + auto affine_comb = + sycl::fma(start_v, _w, Ty(0)); // start_v * w + end_v * wc; + affine_comb = sycl::fma(end_v, _wc, affine_comb); + p[i] = affine_comb; + } + else { + using dpctl::tensor::type_utils::convert_impl; + auto affine_comb = start_v * w + end_v * wc; + p[i] = convert_impl(affine_comb); + } + } +}; + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by typed starting and end values. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence. + * @param start_v Stating value of the sequence. + * @param end_v End-value of the sequence. + * @param include_endpoint Whether the end-value is included in the sequence. + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event lin_space_affine_impl(sycl::queue &exec_q, + std::size_t nelems, + Ty start_v, + Ty end_v, + bool include_endpoint, + char *array_data, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const bool device_supports_doubles = + exec_q.get_device().has(sycl::aspect::fp64); + const std::size_t den = (include_endpoint) ? nelems - 1 : nelems; + + sycl::event lin_space_affine_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + if (device_supports_doubles) { + using KernelName = linear_sequence_affine_kernel; + using Impl = LinearSequenceAffineFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(array_data, start_v, end_v, den)); + } + else { + using KernelName = linear_sequence_affine_kernel; + using Impl = LinearSequenceAffineFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(array_data, start_v, end_v, den)); + } + }); + + return lin_space_affine_event; +} + /* ================ Full ================== */ /*! diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp new file mode 100644 index 000000000000..5204f24b3724 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp @@ -0,0 +1,308 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===---------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/constructors.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "linear_sequences.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +// Constructor to populate tensor with linear sequence defined by +// start and step data + +typedef sycl::event (*lin_space_step_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + const py::object &start, + const py::object &step, + char *, // dst_data_ptr + const std::vector &); + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by starting value and increment + * given as Python objects. + * + * @param q Sycl queue to which the kernel is submitted + * @param nelems Length of the sequence + * @param start Starting value of the sequence as Python object. Must be + * convertible to array element data type `Ty`. + * @param step Increment of the sequence as Python object. Must be convertible + * to array element data type `Ty`. + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event lin_space_step_impl(sycl::queue &exec_q, + std::size_t nelems, + const py::object &start, + const py::object &step, + char *array_data, + const std::vector &depends) +{ + Ty start_v = py::cast(start); + Ty step_v = py::cast(step); + + using dpctl::tensor::kernels::constructors::lin_space_step_impl; + + auto lin_space_step_event = lin_space_step_impl( + exec_q, nelems, start_v, step_v, array_data, depends); + + return lin_space_step_event; +} + +typedef sycl::event (*lin_space_affine_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + const py::object &start, + const py::object &end, + bool include_endpoint, + char *, // dst_data_ptr + const std::vector &); + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by starting and end values given + * as Python objects. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param start Stating value of the sequence as Python object. Must be + * convertible to array data element type `Ty`. + * @param end End-value of the sequence as Python object. Must be convertible + * to array data element type `Ty`. + * @param include_endpoint Whether the end-value is included in the sequence + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event lin_space_affine_impl(sycl::queue &exec_q, + std::size_t nelems, + const py::object &start, + const py::object &end, + bool include_endpoint, + char *array_data, + const std::vector &depends) +{ + Ty start_v = py::cast(start); + Ty end_v = py::cast(end); + + using dpctl::tensor::kernels::constructors::lin_space_affine_impl; + + auto lin_space_affine_event = lin_space_affine_impl( + exec_q, nelems, start_v, end_v, include_endpoint, array_data, depends); + + return lin_space_affine_event; +} + +using dpctl::utils::keep_args_alive; + +static lin_space_step_fn_ptr_t lin_space_step_dispatch_vector[td_ns::num_types]; + +static lin_space_affine_fn_ptr_t + lin_space_affine_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_linear_sequence_step(const py::object &start, + const py::object &dt, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + // dst must be 1D and C-contiguous + // start, end should be coercible into data type of dst + + if (dst.get_ndim() != 1) { + throw py::value_error( + "usm_ndarray_linspace: Expecting 1D array to populate"); + } + + if (!dst.is_c_contiguous()) { + throw py::value_error( + "usm_ndarray_linspace: Non-contiguous arrays are not supported"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + py::ssize_t len = dst.get_shape(0); + if (len == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *dst_data = dst.get_data(); + sycl::event linspace_step_event; + + auto fn = lin_space_step_dispatch_vector[dst_typeid]; + + linspace_step_event = + fn(exec_q, 
static_cast(len), start, dt, dst_data, depends); + + return std::make_pair(keep_args_alive(exec_q, {dst}, {linspace_step_event}), + linspace_step_event); +} + +std::pair + usm_ndarray_linear_sequence_affine(const py::object &start, + const py::object &end, + const dpctl::tensor::usm_ndarray &dst, + bool include_endpoint, + sycl::queue &exec_q, + const std::vector &depends) +{ + // dst must be 1D and C-contiguous + // start, end should be coercible into data type of dst + + if (dst.get_ndim() != 1) { + throw py::value_error( + "usm_ndarray_linspace: Expecting 1D array to populate"); + } + + if (!dst.is_c_contiguous()) { + throw py::value_error( + "usm_ndarray_linspace: Non-contiguous arrays are not supported"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue context is not the same as allocation context"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + py::ssize_t len = dst.get_shape(0); + if (len == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *dst_data = dst.get_data(); + sycl::event linspace_affine_event; + + auto fn = lin_space_affine_dispatch_vector[dst_typeid]; + + linspace_affine_event = fn(exec_q, static_cast(len), start, + end, include_endpoint, dst_data, depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, {linspace_affine_event}), + linspace_affine_event); +} + +/*! + * @brief Factor to get function pointer of type `fnT` for array with elements + * of type `Ty`. + * @defgroup CtorKernels + */ +template +struct LinSpaceStepFactory +{ + fnT get() + { + fnT f = lin_space_step_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for array data type + * `Ty`. 
+ */ +template +struct LinSpaceAffineFactory +{ + fnT get() + { + fnT f = lin_space_affine_impl; + return f; + } +}; + +void init_linear_sequences_dispatch_vectors(void) +{ + using namespace td_ns; + + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(lin_space_step_dispatch_vector); + + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(lin_space_affine_dispatch_vector); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp new file mode 100644 index 000000000000..45cf45153462 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp @@ -0,0 +1,66 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern std::pair usm_ndarray_linear_sequence_step( + const py::object &start, + const py::object &dt, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern std::pair usm_ndarray_linear_sequence_affine( + const py::object &start, + const py::object &end, + const dpctl::tensor::usm_ndarray &dst, + bool include_endpoint, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_linear_sequences_dispatch_vectors(void); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index e6bc3b5dfb61..7bed4df01d29 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -56,7 +56,7 @@ #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include "kernels/dpctl_tensor_types.hpp" -// #include 
"linear_sequences.hpp" +#include "linear_sequences.hpp" #include "repeat.hpp" #include "simplify_iteration_space.hpp" #include "triul_ctor.hpp" @@ -97,8 +97,8 @@ using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray; /* ============= linear-sequence ==================== */ -// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; -// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; +using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; +using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; /* ================ Full ================== */ @@ -157,7 +157,7 @@ void init_dispatch_vectors(void) init_copy_as_contig_dispatch_vectors(); init_copy_for_reshape_dispatch_vectors(); init_copy_for_roll_dispatch_vectors(); - // init_linear_sequences_dispatch_vectors(); + init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); init_zeros_ctor_dispatch_vectors(); init_eye_ctor_dispatch_vectors(); @@ -300,20 +300,22 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("src"), py::arg("dst"), py::arg("shifts"), py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_linspace_step", &usm_ndarray_linear_sequence_step, - // "Fills input 1D contiguous usm_ndarray `dst` with linear - // sequence " "specified by " "starting point `start` and step - // `dt`. " "Returns a tuple of events: (ht_event, comp_event)", - // py::arg("start"), py::arg("dt"), py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); - - // m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, - // "Fills input 1D contiguous usm_ndarray `dst` with linear - // sequence " "specified by " "starting point `start` and end - // point `end`. 
" "Returns a tuple of events: (ht_event, - // comp_event)", py::arg("start"), py::arg("end"), py::arg("dst"), - // py::arg("include_endpoint"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + m.def("_linspace_step", &usm_ndarray_linear_sequence_step, + "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " + "specified by " + "starting point `start` and step `dt`. " + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("start"), py::arg("dt"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, + "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " + "specified by " + "starting point `start` and end point `end`. " + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("start"), py::arg("end"), py::arg("dst"), + py::arg("include_endpoint"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); m.def("_copy_numpy_ndarray_into_usm_ndarray", ©_numpy_ndarray_into_usm_ndarray, From 95ac4f7876cb006946f2d878a8a90767f74b17b0 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 02:40:07 -0800 Subject: [PATCH 076/145] Move ti.linspace() and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_ctors.py | 206 +++++++++++++++++++++++++++ dpnp/dpnp_algo/dpnp_arraycreation.py | 2 +- dpnp/fft/dpnp_utils_fft.py | 12 +- 4 files changed, 212 insertions(+), 10 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 8cd8a1896b28..78ba47b635e0 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -39,6 +39,7 @@ from dpctl_ext.tensor._ctors import ( eye, full, + linspace, tril, triu, ) @@ -73,6 +74,7 @@ "full", "iinfo", "isdtype", + "linspace", "nonzero", "place", "put", diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index 5a9e07c73346..ed97b260dd7d 100644 --- a/dpctl_ext/tensor/_ctors.py +++ 
b/dpctl_ext/tensor/_ctors.py @@ -30,17 +30,84 @@ from numbers import Number import dpctl +import dpctl.memory as dpm import dpctl.tensor as dpt import dpctl.utils import numpy as np from dpctl.tensor._data_types import _get_dtype from dpctl.tensor._device import normalize_queue_device +from dpctl.tensor._usmarray import _is_object_with_buffer_protocol # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti +__doc__ = "Implementation of creation functions in :module:`dpctl.tensor`" + +_empty_tuple = () +_host_set = frozenset([None]) + + +def _array_info_dispatch(obj): + if isinstance(obj, dpt.usm_ndarray): + return obj.shape, obj.dtype, frozenset([obj.sycl_queue]) + if isinstance(obj, np.ndarray): + return obj.shape, obj.dtype, _host_set + if isinstance(obj, range): + return (len(obj),), int, _host_set + if isinstance(obj, bool): + return _empty_tuple, bool, _host_set + if isinstance(obj, float): + return _empty_tuple, float, _host_set + if isinstance(obj, int): + return _empty_tuple, int, _host_set + if isinstance(obj, complex): + return _empty_tuple, complex, _host_set + if isinstance( + obj, + ( + list, + tuple, + ), + ): + return _array_info_sequence(obj) + if _is_object_with_buffer_protocol(obj): + np_obj = np.array(obj) + return np_obj.shape, np_obj.dtype, _host_set + if hasattr(obj, "__usm_ndarray__"): + usm_ar = obj.__usm_ndarray__ + if isinstance(usm_ar, dpt.usm_ndarray): + return usm_ar.shape, usm_ar.dtype, frozenset([usm_ar.sycl_queue]) + if hasattr(obj, "__sycl_usm_array_interface__"): + usm_ar = _usm_ndarray_from_suai(obj) + return usm_ar.shape, usm_ar.dtype, frozenset([usm_ar.sycl_queue]) + + +def _array_info_sequence(li): + if not isinstance(li, (list, tuple, range)): + raise TypeError(f"Expected list, tuple, or range, got {type(li)}") + n = len(li) + dim = None + dt = None + device = frozenset() + for el in li: + el_dim, el_dt, el_dev = 
_array_info_dispatch(el) + if dim is None: + dim = el_dim + dt = np.promote_types(el_dt, el_dt) + device = device.union(el_dev) + elif el_dim == dim: + dt = np.promote_types(dt, el_dt) + device = device.union(el_dev) + else: + raise ValueError(f"Inconsistent dimensions, {dim} and {el_dim}") + if dim is None: + dim = () + dt = float + device = _host_set + return (n,) + dim, dt, device + def _cast_fill_val(fill_val, dt): """ @@ -58,6 +125,23 @@ def _cast_fill_val(fill_val, dt): return fill_val +def _coerce_and_infer_dt(*args, dt, sycl_queue, err_msg, allow_bool=False): + """Deduce arange type from sequence spec""" + nd, seq_dt, d = _array_info_sequence(args) + if d != _host_set or nd != (len(args),): + raise ValueError(err_msg) + dt = _get_dtype(dt, sycl_queue, ref_type=seq_dt) + if np.issubdtype(dt, np.integer): + return tuple(int(v) for v in args), dt + if np.issubdtype(dt, np.floating): + return tuple(float(v) for v in args), dt + if np.issubdtype(dt, np.complexfloating): + return tuple(complex(v) for v in args), dt + if allow_bool and dt.char == "?": + return tuple(bool(v) for v in args), dt + raise ValueError(f"Data type {dt} is not supported") + + def _ensure_native_dtype_device_support(dtype, dev) -> None: """Check that dtype is natively supported by device. 
@@ -99,6 +183,25 @@ def _to_scalar(obj, sc_ty): return zd_arr[()] +def _usm_ndarray_from_suai(obj): + sua_iface = obj.__sycl_usm_array_interface__ + membuf = dpm.as_usm_memory(obj) + ary = dpt.usm_ndarray( + sua_iface["shape"], + dtype=sua_iface["typestr"], + buffer=membuf, + strides=sua_iface.get("strides", None), + ) + _data_field = sua_iface["data"] + if isinstance(_data_field, tuple) and len(_data_field) > 1: + ro_field = _data_field[1] + else: + ro_field = False + if ro_field: + ary.flags["W"] = False + return ary + + def eye( n_rows, n_cols=None, @@ -301,6 +404,109 @@ def full( return res +def linspace( + start, + stop, + /, + num, + *, + dtype=None, + device=None, + endpoint=True, + sycl_queue=None, + usm_type="device", +): + """ + linspace(start, stop, num, dtype=None, device=None, endpoint=True, \ + sycl_queue=None, usm_type="device") + + Returns :class:`dpctl.tensor.usm_ndarray` array populated with + evenly spaced numbers of specified interval. + + Args: + start: + the start of the interval. + stop: + the end of the interval. If the ``endpoint`` is ``False``, the + function generates ``num+1`` evenly spaced points starting + with ``start`` and ending with ``stop`` and exclude the + ``stop`` from the returned array such that the returned array + consists of evenly spaced numbers over the half-open interval + ``[start, stop)``. If ``endpoint`` is ``True``, the output + array consists of evenly spaced numbers over the closed + interval ``[start, stop]``. Default: ``True`` + num (int): + number of samples. Must be a non-negative integer; otherwise, + the function raises ``ValueError`` exception. + dtype: + output array data type. Should be a floating data type. + If ``dtype`` is ``None``, the output array must be the default + floating point data type for target device. + Default: ``None`` + device (optional): + array API concept of device where the output array + is created. 
``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + endpoint: boolean indicating whether to include ``stop`` in the + interval. Default: ``True`` + + Returns: + usm_ndarray: + Array populated with evenly spaced numbers in the requested + interval. 
+ """ + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dpctl.utils.validate_usm_type(usm_type, allow_none=False) + if endpoint not in [True, False]: + raise TypeError("endpoint keyword argument must be of boolean type") + + num = operator.index(num) + if num < 0: + raise ValueError("Number of points must be non-negative") + + _, dt = _coerce_and_infer_dt( + start, + stop, + dt=dtype, + sycl_queue=sycl_queue, + err_msg="start and stop must be Python scalars.", + allow_bool=True, + ) + + int_dt = None + if np.issubdtype(dt, np.integer): + if dtype is not None: + int_dt = dt + dt = ti.default_device_fp_type(sycl_queue) + dt = dpt.dtype(dt) + start = float(start) + stop = float(stop) + + res = dpt.empty(num, dtype=dt, usm_type=usm_type, sycl_queue=sycl_queue) + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + hev, la_ev = ti._linspace_affine( + start, stop, dst=res, include_endpoint=endpoint, sycl_queue=sycl_queue + ) + _manager.add_event_pair(hev, la_ev) + + return res if int_dt is None else dpt.astype(res, int_dt) + + def tril(x, /, *, k=0): """ Returns the lower triangular part of a matrix (or a stack of matrices) diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py index f3dd18153563..4e05d57c6872 100644 --- a/dpnp/dpnp_algo/dpnp_arraycreation.py +++ b/dpnp/dpnp_algo/dpnp_arraycreation.py @@ -196,7 +196,7 @@ def dpnp_linspace( if dpnp.isscalar(start) and dpnp.isscalar(stop): # Call linspace() function for scalars. 
- usm_res = dpt.linspace( + usm_res = dpt_ext.linspace( start, stop, num, diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py index 534b9404254f..e3f35a0201ec 100644 --- a/dpnp/fft/dpnp_utils_fft.py +++ b/dpnp/fft/dpnp_utils_fft.py @@ -42,11 +42,6 @@ from collections.abc import Sequence import dpctl - -# pylint: disable=no-name-in-module -# TODO: remove it when ti.__linspace_step -# is migrated to dpctl_ext/tensor -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._numpy_helper import ( @@ -55,10 +50,9 @@ ) from dpctl.utils import ExecutionPlacementError -# pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor -import dpctl_ext.tensor._tensor_impl as ti_ext +import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.fft._fft_impl as fi @@ -205,7 +199,7 @@ def _compute_result(dsc, a, out, forward, c2c, out_strides): if ( out is not None and out.strides == tuple(out_strides) - and not ti_ext._array_overlap(a_usm, dpnp.get_usm_ndarray(out)) + and not ti._array_overlap(a_usm, dpnp.get_usm_ndarray(out)) ): res_usm = out_usm result = out @@ -538,7 +532,7 @@ def _truncate_or_pad(a, shape, axes): ) _manager = dpu.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events - ht_copy_ev, copy_ev = ti_ext._copy_usm_ndarray_into_usm_ndarray( + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=dpnp.get_usm_ndarray(a), dst=z.get_array()[tuple(index)], sycl_queue=exec_q, From 1054e2d4780c585857f32a0c64b2e4b0ec6241e0 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 02:56:11 -0800 Subject: [PATCH 077/145] Move ti.empty() and reuse it in dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_clip.py | 12 ++-- dpctl_ext/tensor/_copy_utils.py | 36 +++++----- dpctl_ext/tensor/_ctors.py | 75 +++++++++++++++++++-- dpctl_ext/tensor/_indexing_functions.py | 6 +- 
dpctl_ext/tensor/_manipulation_functions.py | 16 ++--- dpctl_ext/tensor/_search_functions.py | 14 ++-- 7 files changed, 117 insertions(+), 44 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 78ba47b635e0..e57bb35ce993 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -37,6 +37,7 @@ to_numpy, ) from dpctl_ext.tensor._ctors import ( + empty, eye, full, linspace, @@ -67,6 +68,7 @@ "can_cast", "copy", "clip", + "empty", "extract", "eye", "finfo", diff --git a/dpctl_ext/tensor/_clip.py b/dpctl_ext/tensor/_clip.py index 50d3ecd568e3..1cd360b753ad 100644 --- a/dpctl_ext/tensor/_clip.py +++ b/dpctl_ext/tensor/_clip.py @@ -197,7 +197,7 @@ def _clip_none(x, val, out, order, _binary_fn): x, val_ary, res_dt, res_shape, res_usm_type, exec_q ) else: - out = dpt.empty( + out = dpt_ext.empty( res_shape, dtype=res_dt, usm_type=res_usm_type, @@ -242,7 +242,7 @@ def _clip_none(x, val, out, order, _binary_fn): x, buf, res_dt, res_shape, res_usm_type, exec_q ) else: - out = dpt.empty( + out = dpt_ext.empty( res_shape, dtype=res_dt, usm_type=res_usm_type, @@ -572,7 +572,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): exec_q, ) else: - out = dpt.empty( + out = dpt_ext.empty( res_shape, dtype=res_dt, usm_type=res_usm_type, @@ -631,7 +631,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): exec_q, ) else: - out = dpt.empty( + out = dpt_ext.empty( res_shape, dtype=res_dt, usm_type=res_usm_type, @@ -687,7 +687,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): exec_q, ) else: - out = dpt.empty( + out = dpt_ext.empty( res_shape, dtype=res_dt, usm_type=res_usm_type, @@ -758,7 +758,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): x, buf1, buf2, res_dt, res_shape, res_usm_type, exec_q ) else: - out = dpt.empty( + out = dpt_ext.empty( res_shape, dtype=res_dt, usm_type=res_usm_type, diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py index 
af72544a8b0d..a1c14cf0f69b 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -91,7 +91,7 @@ def _copy_from_numpy(np_ary, usm_type="device", sycl_queue=None): ) else: Xusm_dtype = dt - Xusm = dpt.empty( + Xusm = dpt_ext.empty( Xnp.shape, dtype=Xusm_dtype, usm_type=usm_type, sycl_queue=sycl_queue ) _copy_from_numpy_into(Xusm, Xnp) @@ -176,7 +176,7 @@ def _extract_impl(ary, ary_mask, axis=0): ) mask_nelems = ary_mask.size cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64 - cumsum = dpt.empty(mask_nelems, dtype=cumsum_dt, device=ary_mask.device) + cumsum = dpt_ext.empty(mask_nelems, dtype=cumsum_dt, device=ary_mask.device) exec_q = cumsum.sycl_queue _manager = dpctl.utils.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events @@ -184,7 +184,7 @@ def _extract_impl(ary, ary_mask, axis=0): ary_mask, cumsum, sycl_queue=exec_q, depends=dep_evs ) dst_shape = ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :] - dst = dpt.empty( + dst = dpt_ext.empty( dst_shape, dtype=ary.dtype, usm_type=dst_usm_type, device=ary.device ) if dst.size == 0: @@ -247,7 +247,7 @@ def _nonzero_impl(ary): usm_type = ary.usm_type mask_nelems = ary.size cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64 - cumsum = dpt.empty( + cumsum = dpt_ext.empty( mask_nelems, dtype=cumsum_dt, sycl_queue=exec_q, order="C" ) _manager = dpctl.utils.SequentialOrderManager[exec_q] @@ -256,7 +256,7 @@ def _nonzero_impl(ary): ary, cumsum, sycl_queue=exec_q, depends=dep_evs ) indexes_dt = ti.default_device_index_type(exec_q.sycl_device) - indexes = dpt.empty( + indexes = dpt_ext.empty( (ary.ndim, mask_count), dtype=indexes_dt, usm_type=usm_type, @@ -418,7 +418,7 @@ def _take_multi_index(ary, inds, p, mode=0): if 0 in ary_sh[p:p_end] and ind0.size != 0: raise IndexError("cannot take non-empty indices from an empty axis") res_shape = ary_sh[:p] + ind0.shape + ary_sh[p_end:] - res = dpt.empty( + res = dpt_ext.empty( res_shape, 
dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q ) _manager = dpctl.utils.SequentialOrderManager[exec_q] @@ -681,7 +681,9 @@ def _make_empty_like_orderK(x, dt, usm_type, dev): inv_perm = sorted(range(x.ndim), key=lambda i: perm[i]) sh = x.shape sh_sorted = tuple(sh[i] for i in perm) - R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + R = dpt_ext.empty( + sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) if min(st) < 0: st_sorted = [st[i] for i in perm] sl = tuple( @@ -734,11 +736,11 @@ def _from_numpy_empty_like_orderK(x, dt, usm_type, dev): raise TypeError(f"Expected numpy.ndarray, got {type(x)}") fl = x.flags if fl["C"] or x.size <= 1: - return dpt.empty( + return dpt_ext.empty( x.shape, dtype=dt, usm_type=usm_type, device=dev, order="C" ) elif fl["F"]: - return dpt.empty( + return dpt_ext.empty( x.shape, dtype=dt, usm_type=usm_type, device=dev, order="F" ) return _make_empty_like_orderK(x, dt, usm_type, dev) @@ -758,11 +760,11 @@ def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev): fl1 = X1.flags fl2 = X2.flags if fl1["C"] or fl2["C"]: - return dpt.empty( + return dpt_ext.empty( res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C" ) if fl1["F"] and fl2["F"]: - return dpt.empty( + return dpt_ext.empty( res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F" ) st1 = list(X1.strides) @@ -785,7 +787,9 @@ def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev): st2_sorted = [st2[i] for i in perm] sh = res_shape sh_sorted = tuple(sh[i] for i in perm) - R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + R = dpt_ext.empty( + sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) if max(min(st1_sorted), min(st2_sorted)) < 0: sl = tuple( ( @@ -823,11 +827,11 @@ def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev): fl2 = X2.flags fl3 = X3.flags if fl1["C"] or fl2["C"] or fl3["C"]: - return dpt.empty( + return dpt_ext.empty( 
res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C" ) if fl1["F"] and fl2["F"] and fl3["F"]: - return dpt.empty( + return dpt_ext.empty( res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F" ) st1 = list(X1.strides) @@ -855,7 +859,9 @@ def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev): st3_sorted = [st3[i] for i in perm] sh = res_shape sh_sorted = tuple(sh[i] for i in perm) - R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + R = dpt_ext.empty( + sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) if max(min(st1_sorted), min(st2_sorted), min(st3_sorted)) < 0: sl = tuple( ( diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index ed97b260dd7d..dc7da78a9975 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -202,6 +202,71 @@ def _usm_ndarray_from_suai(obj): return ary +def empty( + shape, + *, + dtype=None, + order="C", + device=None, + usm_type="device", + sycl_queue=None, +): + """ + Creates :class:`dpctl.tensor.usm_ndarray` from uninitialized + USM allocation. + + Args: + shape (Tuple[int], int): + Dimensions of the array to be created. + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, + or a NumPy scalar type. The ``None`` value creates an + array of floating point data type. Default: ``None`` + order (``"C"``, or ``F"``): + memory layout for the array. Default: ``"C"`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. 
+ Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + Created empty array. + """ + if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'F' or 'C'." + ) + order = order[0].upper() + dpctl.utils.validate_usm_type(usm_type, allow_none=False) + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dtype = _get_dtype(dtype, sycl_queue) + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + res = dpt.usm_ndarray( + shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + return res + + def eye( n_rows, n_cols=None, @@ -497,7 +562,7 @@ def linspace( start = float(start) stop = float(stop) - res = dpt.empty(num, dtype=dt, usm_type=usm_type, sycl_queue=sycl_queue) + res = dpt_ext.empty(num, dtype=dt, usm_type=usm_type, sycl_queue=sycl_queue) _manager = dpctl.utils.SequentialOrderManager[sycl_queue] hev, la_ev = ti._linspace_affine( start, stop, dst=res, include_endpoint=endpoint, sycl_queue=sycl_queue @@ -546,7 +611,7 @@ def tril(x, /, *, k=0): q = x.sycl_queue if k >= shape[nd - 1] - 1: - res = dpt.empty( + res = dpt_ext.empty( x.shape, dtype=x.dtype, order=order, @@ -568,7 +633,7 @@ def tril(x, /, *, k=0): sycl_queue=q, ) else: - res = dpt.empty( + res = dpt_ext.empty( x.shape, dtype=x.dtype, order=order, @@ -632,7 
+697,7 @@ def triu(x, /, *, k=0): sycl_queue=q, ) elif k <= -shape[nd - 2] + 1: - res = dpt.empty( + res = dpt_ext.empty( x.shape, dtype=x.dtype, order=order, @@ -646,7 +711,7 @@ def triu(x, /, *, k=0): ) _manager.add_event_pair(hev, cpy_ev) else: - res = dpt.empty( + res = dpt_ext.empty( x.shape, dtype=x.dtype, order=order, diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py index 6ca327192f73..17a4aefbd4ca 100644 --- a/dpctl_ext/tensor/_indexing_functions.py +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -177,7 +177,7 @@ def place(arr, mask, vals): raise dpctl.utils.ExecutionPlacementError if arr.shape != mask.shape or vals.ndim != 1: raise ValueError("Array sizes are not as required") - cumsum = dpt.empty(mask.size, dtype="i8", sycl_queue=exec_q) + cumsum = dpt_ext.empty(mask.size, dtype="i8", sycl_queue=exec_q) _manager = dpctl.utils.SequentialOrderManager[exec_q] deps_ev = _manager.submitted_events nz_count = ti.mask_positions( @@ -540,9 +540,9 @@ def take(x, indices, /, *, axis=None, out=None, mode="wrap"): "Input and output allocation queues are not compatible" ) if ti._array_overlap(x, out): - out = dpt.empty_like(out) + out = dpt_ext.empty_like(out) else: - out = dpt.empty( + out = dpt_ext.empty( res_shape, dtype=dt, usm_type=res_usm_type, sycl_queue=exec_q ) diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index f1b8b46dbcbc..470bf3ef3876 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -223,7 +223,7 @@ def repeat(x, repeats, /, *, axis=None): res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] else: res_shape = (res_axis_size,) - res = dpt.empty( + res = dpt_ext.empty( res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q ) if res_axis_size > 0: @@ -238,7 +238,7 @@ def repeat(x, repeats, /, *, axis=None): _manager.add_event_pair(ht_rep_ev, rep_ev) else: if repeats.dtype != 
dpt.int64: - rep_buf = dpt.empty( + rep_buf = dpt_ext.empty( repeats.shape, dtype=dpt.int64, usm_type=usm_type, @@ -248,7 +248,7 @@ def repeat(x, repeats, /, *, axis=None): src=repeats, dst=rep_buf, sycl_queue=exec_q, depends=dep_evs ) _manager.add_event_pair(ht_copy_ev, copy_ev) - cumsum = dpt.empty( + cumsum = dpt_ext.empty( (axis_size,), dtype=dpt.int64, usm_type=usm_type, @@ -264,7 +264,7 @@ def repeat(x, repeats, /, *, axis=None): ) else: res_shape = (res_axis_size,) - res = dpt.empty( + res = dpt_ext.empty( res_shape, dtype=x.dtype, usm_type=usm_type, @@ -281,7 +281,7 @@ def repeat(x, repeats, /, *, axis=None): ) _manager.add_event_pair(ht_rep_ev, rep_ev) else: - cumsum = dpt.empty( + cumsum = dpt_ext.empty( (axis_size,), dtype=dpt.int64, usm_type=usm_type, @@ -296,7 +296,7 @@ def repeat(x, repeats, /, *, axis=None): ) else: res_shape = (res_axis_size,) - res = dpt.empty( + res = dpt_ext.empty( res_shape, dtype=x.dtype, usm_type=usm_type, @@ -353,7 +353,7 @@ def roll(x, /, shift, *, axis=None): _manager = dputils.SequentialOrderManager[exec_q] if axis is None: shift = operator.index(shift) - res = dpt.empty( + res = dpt_ext.empty( x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q ) sz = operator.index(x.size) @@ -380,7 +380,7 @@ def roll(x, /, shift, *, axis=None): n_i = operator.index(shape[ax]) shifted = shifts[ax] + operator.index(sh) shifts[ax] = (shifted % n_i) if n_i > 0 else 0 - res = dpt.empty( + res = dpt_ext.empty( x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q ) dep_evs = _manager.submitted_events diff --git a/dpctl_ext/tensor/_search_functions.py b/dpctl_ext/tensor/_search_functions.py index 053c68e1857d..e7eb33a5f3fe 100644 --- a/dpctl_ext/tensor/_search_functions.py +++ b/dpctl_ext/tensor/_search_functions.py @@ -291,7 +291,7 @@ def where(condition, x1, x2, /, *, order="K", out=None): if ti._array_overlap(condition, out) and not ti._same_logical_tensors( condition, out ): - out = dpt.empty_like(out) + out = 
dpt_ext.empty_like(out) if isinstance(x1, dpt.usm_ndarray): if ( @@ -299,7 +299,7 @@ def where(condition, x1, x2, /, *, order="K", out=None): and not ti._same_logical_tensors(x1, out) and x1_dtype == out_dtype ): - out = dpt.empty_like(out) + out = dpt_ext.empty_like(out) if isinstance(x2, dpt.usm_ndarray): if ( @@ -307,7 +307,7 @@ def where(condition, x1, x2, /, *, order="K", out=None): and not ti._same_logical_tensors(x2, out) and x2_dtype == out_dtype ): - out = dpt.empty_like(out) + out = dpt_ext.empty_like(out) if order == "A": order = ( @@ -342,7 +342,7 @@ def where(condition, x1, x2, /, *, order="K", out=None): exec_q, ) else: - return dpt.empty( + return dpt_ext.empty( res_shape, dtype=out_dtype, order=order, @@ -356,7 +356,7 @@ def where(condition, x1, x2, /, *, order="K", out=None): if order == "K": _x1 = _empty_like_orderK(x1, out_dtype) else: - _x1 = dpt.empty_like(x1, dtype=out_dtype, order=order) + _x1 = dpt_ext.empty_like(x1, dtype=out_dtype, order=order) ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=x1, dst=_x1, sycl_queue=exec_q, depends=dep_evs ) @@ -367,7 +367,7 @@ def where(condition, x1, x2, /, *, order="K", out=None): if order == "K": _x2 = _empty_like_orderK(x2, out_dtype) else: - _x2 = dpt.empty_like(x2, dtype=out_dtype, order=order) + _x2 = dpt_ext.empty_like(x2, dtype=out_dtype, order=order) ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=x2, dst=_x2, sycl_queue=exec_q, depends=dep_evs ) @@ -380,7 +380,7 @@ def where(condition, x1, x2, /, *, order="K", out=None): condition, x1, x2, out_dtype, res_shape, out_usm_type, exec_q ) else: - out = dpt.empty( + out = dpt_ext.empty( res_shape, dtype=out_dtype, order=order, From d493454439edbc2eee87a379f34d02424106fbad Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 03:16:23 -0800 Subject: [PATCH 078/145] Reuse dpctl_ext.tensor.empty() in dpnp --- dpnp/dpnp_algo/dpnp_elementwise_common.py | 2 +- dpnp/dpnp_container.py | 2 +- 
dpnp/dpnp_iface_indexing.py | 6 ++++-- dpnp/tests/test_fft.py | 9 ++++++--- dpnp/tests/test_mathematical.py | 5 ++++- dpnp/tests/test_nanfunctions.py | 9 ++++++--- dpnp/tests/test_search.py | 5 ++++- dpnp/tests/test_statistics.py | 5 ++++- 8 files changed, 30 insertions(+), 13 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index 55d74e8c1803..a7b083f2d734 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -1146,7 +1146,7 @@ def __call__( x1, x2, res_dt, res_shape, res_usm_type, exec_q ) else: - out[i] = dpt.empty( + out[i] = dpt_ext.empty( res_shape, dtype=res_dt, order=order, diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index 0727b9bfd775..9a2afe77f3cb 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -165,7 +165,7 @@ def empty( order = "C" """Creates `dpnp_array` from uninitialized USM allocation.""" - array_obj = dpt.empty( + array_obj = dpt_ext.empty( shape, dtype=dtype, order=order, diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index bc190db70c4e..9f9b16d2ad9b 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -143,7 +143,7 @@ def _choose_run(inds, chcs, q, usm_type, out=None, mode=0): # Allocate a temporary buffer to avoid memory overlapping. out = dpt.empty_like(out) else: - out = dpt.empty( + out = dpt_ext.empty( inds.shape, dtype=chcs[0].dtype, usm_type=usm_type, sycl_queue=q ) @@ -303,7 +303,9 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): # Allocate a temporary buffer to avoid memory overlapping. 
out = dpt.empty_like(out) else: - out = dpt.empty(res_sh, dtype=x.dtype, usm_type=usm_type, sycl_queue=q) + out = dpt_ext.empty( + res_sh, dtype=x.dtype, usm_type=usm_type, sycl_queue=q + ) _manager = dpu.SequentialOrderManager[q] dep_evs = _manager.submitted_events diff --git a/dpnp/tests/test_fft.py b/dpnp/tests/test_fft.py index 226420057748..cf6da5830f10 100644 --- a/dpnp/tests/test_fft.py +++ b/dpnp/tests/test_fft.py @@ -5,6 +5,9 @@ from dpctl.utils import ExecutionPlacementError from numpy.testing import assert_raises +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from dpnp.dpnp_utils import map_dtype_to_device @@ -80,7 +83,7 @@ def test_usm_ndarray(self, n): a_usm = dpt.asarray(a) expected = numpy.fft.fft(a, n=n) - out = dpt.empty(expected.shape, dtype=a_usm.dtype) + out = dpt_ext.empty(expected.shape, dtype=a_usm.dtype) result = dpnp.fft.fft(a_usm, n=n, out=out) assert out is result.get_array() assert_dtype_allclose(result, expected) @@ -639,7 +642,7 @@ def test_usm_ndarray(self, n): a_usm = dpt.asarray(a) expected = numpy.fft.irfft(a, n=n) - out = dpt.empty(expected.shape, dtype=a_usm.real.dtype) + out = dpt_ext.empty(expected.shape, dtype=a_usm.real.dtype) result = dpnp.fft.irfft(a_usm, n=n, out=out) assert out is result.get_array() assert_dtype_allclose(result, expected) @@ -727,7 +730,7 @@ def test_usm_ndarray(self, n): expected = numpy.fft.rfft(a, n=n) out_dt = map_dtype_to_device(dpnp.complex128, a_usm.sycl_device) - out = dpt.empty(expected.shape, dtype=out_dt) + out = dpt_ext.empty(expected.shape, dtype=out_dt) result = dpnp.fft.rfft(a_usm, n=n, out=out) assert out is result.get_array() diff --git a/dpnp/tests/test_mathematical.py b/dpnp/tests/test_mathematical.py index 760c1a0ceb2e..7b6d39875d67 100644 --- a/dpnp/tests/test_mathematical.py +++ b/dpnp/tests/test_mathematical.py @@ -15,6 +15,9 @@ assert_raises_regex, ) +# TODO: revert to `import dpctl.tensor...` +# 
when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from dpnp.dpnp_array import dpnp_array from dpnp.dpnp_utils import map_dtype_to_device @@ -1575,7 +1578,7 @@ def test_out(self): assert_allclose(result, expected) # output is usm_ndarray - dpt_out = dpt.empty(expected.shape, dtype=expected.dtype) + dpt_out = dpt_ext.empty(expected.shape, dtype=expected.dtype) result = dpnp.prod(ia, axis=0, out=dpt_out) assert dpt_out is result.get_array() assert_allclose(result, expected) diff --git a/dpnp/tests/test_nanfunctions.py b/dpnp/tests/test_nanfunctions.py index d92cee045a72..2d3dbd864a2e 100644 --- a/dpnp/tests/test_nanfunctions.py +++ b/dpnp/tests/test_nanfunctions.py @@ -12,6 +12,9 @@ assert_raises_regex, ) +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from .helper import ( @@ -55,7 +58,7 @@ def test_out(self, func): assert_allclose(result, expected) # out is usm_ndarray - dpt_out = dpt.empty(expected.shape, dtype=expected.dtype) + dpt_out = dpt_ext.empty(expected.shape, dtype=expected.dtype) result = getattr(dpnp, func)(ia, axis=0, out=dpt_out) assert dpt_out is result.get_array() assert_allclose(result, expected) @@ -236,7 +239,7 @@ def test_out(self, func): assert_allclose(result, expected) # out is usm_ndarray - dpt_out = dpt.empty(expected.shape, dtype=expected.dtype) + dpt_out = dpt_ext.empty(expected.shape, dtype=expected.dtype) result = getattr(dpnp, func)(ia, axis=0, out=dpt_out) assert dpt_out is result.get_array() assert_allclose(result, expected) @@ -545,7 +548,7 @@ def test_out(self, func): assert_allclose(result, expected) # out is usm_ndarray - dpt_out = dpt.empty(expected.shape, dtype=expected.dtype) + dpt_out = dpt_ext.empty(expected.shape, dtype=expected.dtype) result = getattr(dpnp, func)(ia, axis=0, out=dpt_out) assert dpt_out is result.get_array() assert_allclose(result, expected) diff --git a/dpnp/tests/test_search.py 
b/dpnp/tests/test_search.py index 64c4eb75f906..05bc56b11d0b 100644 --- a/dpnp/tests/test_search.py +++ b/dpnp/tests/test_search.py @@ -3,6 +3,9 @@ import pytest from numpy.testing import assert_array_equal, assert_equal, assert_raises +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from .helper import ( @@ -36,7 +39,7 @@ def test_out(self, func): assert_array_equal(result, expected) # out is usm_ndarray - dpt_out = dpt.empty(expected.shape, dtype=expected.dtype) + dpt_out = dpt_ext.empty(expected.shape, dtype=expected.dtype) result = getattr(dpnp, func)(ia, axis=0, out=dpt_out) assert dpt_out is result.get_array() assert_array_equal(result, expected) diff --git a/dpnp/tests/test_statistics.py b/dpnp/tests/test_statistics.py index cf436087b607..6944d6bd5bdd 100644 --- a/dpnp/tests/test_statistics.py +++ b/dpnp/tests/test_statistics.py @@ -9,6 +9,9 @@ assert_raises_regex, ) +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from .helper import ( @@ -812,7 +815,7 @@ def test_out(self, func): assert_allclose(result, expected) # out is usm_ndarray - dpt_out = dpt.empty(expected.shape, dtype=expected.dtype) + dpt_out = dpt_ext.empty(expected.shape, dtype=expected.dtype) result = getattr(dpnp, func)(ia, axis=0, out=dpt_out) assert dpt_out is result.get_array() assert_allclose(result, expected) From 186ae3ce7e84752b8fc1096cb4570664816abe58 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 03:22:45 -0800 Subject: [PATCH 079/145] Move ti.empty_like() and reuse it in dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_clip.py | 24 ++++----- dpctl_ext/tensor/_copy_utils.py | 4 +- dpctl_ext/tensor/_ctors.py | 94 +++++++++++++++++++++++++++++++++ 4 files changed, 110 insertions(+), 14 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py 
b/dpctl_ext/tensor/__init__.py index e57bb35ce993..7ef7f132158e 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -38,6 +38,7 @@ ) from dpctl_ext.tensor._ctors import ( empty, + empty_like, eye, full, linspace, @@ -69,6 +70,7 @@ "copy", "clip", "empty", + "empty_like", "extract", "eye", "finfo", diff --git a/dpctl_ext/tensor/_clip.py b/dpctl_ext/tensor/_clip.py index 1cd360b753ad..8b53552b034b 100644 --- a/dpctl_ext/tensor/_clip.py +++ b/dpctl_ext/tensor/_clip.py @@ -163,7 +163,7 @@ def _clip_none(x, val, out, order, _binary_fn): if ti._array_overlap(x, out): if not ti._same_logical_tensors(x, out): - out = dpt.empty_like(out) + out = dpt_ext.empty_like(out) if isinstance(val, dpt.usm_ndarray): if ( @@ -171,7 +171,7 @@ def _clip_none(x, val, out, order, _binary_fn): and not ti._same_logical_tensors(val, out) and val_dtype == res_dt ): - out = dpt.empty_like(out) + out = dpt_ext.empty_like(out) if isinstance(val, dpt.usm_ndarray): val_ary = val @@ -229,7 +229,7 @@ def _clip_none(x, val, out, order, _binary_fn): if order == "K": buf = _empty_like_orderK(val_ary, res_dt) else: - buf = dpt.empty_like(val_ary, dtype=res_dt, order=order) + buf = dpt_ext.empty_like(val_ary, dtype=res_dt, order=order) _manager = SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( @@ -353,14 +353,14 @@ def clip(x, /, min=None, max=None, out=None, order="K"): if ti._array_overlap(x, out): if not ti._same_logical_tensors(x, out): - out = dpt.empty_like(out) + out = dpt_ext.empty_like(out) else: return out else: if order == "K": out = _empty_like_orderK(x, x.dtype) else: - out = dpt.empty_like(x, order=order) + out = dpt_ext.empty_like(x, order=order) _manager = SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events @@ -519,7 +519,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): if ti._array_overlap(x, out): if not ti._same_logical_tensors(x, out): - out = 
dpt.empty_like(out) + out = dpt_ext.empty_like(out) if isinstance(min, dpt.usm_ndarray): if ( @@ -527,7 +527,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): and not ti._same_logical_tensors(min, out) and buf1_dt is None ): - out = dpt.empty_like(out) + out = dpt_ext.empty_like(out) if isinstance(max, dpt.usm_ndarray): if ( @@ -535,7 +535,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): and not ti._same_logical_tensors(max, out) and buf2_dt is None ): - out = dpt.empty_like(out) + out = dpt_ext.empty_like(out) if isinstance(min, dpt.usm_ndarray): a_min = min @@ -612,7 +612,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): if order == "K": buf2 = _empty_like_orderK(a_max, buf2_dt) else: - buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) + buf2 = dpt_ext.empty_like(a_max, dtype=buf2_dt, order=order) _manager = SequentialOrderManager[exec_q] dep_ev = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( @@ -668,7 +668,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): if order == "K": buf1 = _empty_like_orderK(a_min, buf1_dt) else: - buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + buf1 = dpt_ext.empty_like(a_min, dtype=buf1_dt, order=order) _manager = SequentialOrderManager[exec_q] dep_ev = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( @@ -736,7 +736,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): if order == "K": buf1 = _empty_like_orderK(a_min, buf1_dt) else: - buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + buf1 = dpt_ext.empty_like(a_min, dtype=buf1_dt, order=order) _manager = SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events @@ -747,7 +747,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): if order == "K": buf2 = _empty_like_orderK(a_max, buf2_dt) else: - buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) + buf2 = dpt_ext.empty_like(a_max, dtype=buf2_dt, 
order=order) ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=a_max, dst=buf2, sycl_queue=exec_q, depends=dep_evs ) diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py index a1c14cf0f69b..1b54587ee393 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -714,11 +714,11 @@ def _empty_like_orderK(x, dt, usm_type=None, dev=None): dev = x.device fl = x.flags if fl["C"] or x.size <= 1: - return dpt.empty_like( + return dpt_ext.empty_like( x, dtype=dt, usm_type=usm_type, device=dev, order="C" ) elif fl["F"]: - return dpt.empty_like( + return dpt_ext.empty_like( x, dtype=dt, usm_type=usm_type, device=dev, order="F" ) return _make_empty_like_orderK(x, dt, usm_type, dev) diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index dc7da78a9975..84e95f178e13 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -42,6 +42,9 @@ # when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti +from dpctl_ext.tensor._copy_utils import ( + _empty_like_orderK, +) __doc__ = "Implementation of creation functions in :module:`dpctl.tensor`" @@ -174,6 +177,21 @@ def _ensure_native_dtype_device_support(dtype, dev) -> None: ) +def _normalize_order(order, arr): + """ + Utility function for processing the `order` keyword of array-like + constructors, which support `"K"` and `"A"` orders. + """ + arr_flags = arr.flags + f_contig = arr_flags["F"] + c_contig = arr_flags["C"] + if order == "A": + order = "F" if f_contig and not c_contig else "C" + if order == "K" and (f_contig or c_contig): + order = "C" if c_contig else "F" + return order + + def _to_scalar(obj, sc_ty): """A way to convert object to NumPy scalar type. 
Raises OverflowError if obj can not be represented @@ -267,6 +285,82 @@ def empty( return res +def empty_like( + x, /, *, dtype=None, order="K", device=None, usm_type=None, sycl_queue=None +): + """ + Returns an uninitialized :class:`dpctl.tensor.usm_ndarray` with the + same `shape` as the input array `x`. + + Args: + x (usm_ndarray): + Input array from which to derive the output array shape. + dtype (optional): + data type of the array. Can be a typestring, + a :class:`numpy.dtype` object, NumPy char string, + or a NumPy scalar type. Default: ``None`` + order ("C", "F", "A", or "K"): + memory layout for the array. Default: ``"K"`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation. Default: ``None`` + + Returns: + usm_ndarray: + Created empty array with uninitialized memory. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected instance of dpt.usm_ndarray, got {type(x)}.") + if ( + not isinstance(order, str) + or len(order) == 0 + or order[0] not in "CcFfAaKk" + ): + raise ValueError( + "Unrecognized order keyword value, expecting 'C', 'F', 'A', or 'K'." + ) + order = order[0].upper() + if dtype is None: + dtype = x.dtype + if usm_type is None: + usm_type = x.usm_type + dpctl.utils.validate_usm_type(usm_type, allow_none=False) + if device is None and sycl_queue is None: + device = x.device + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dtype = dpt.dtype(dtype) + order = _normalize_order(order, x) + if order == "K": + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + return _empty_like_orderK(x, dtype, usm_type, sycl_queue) + else: + shape = x.shape + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + res = dpt.usm_ndarray( + shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + return res + + def eye( n_rows, n_cols=None, From 5503b9a2e8e0d007a0e99d16b3bdd8bb9d3bec03 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 03:25:22 -0800 Subject: [PATCH 080/145] Reuse dpctl_ext.tensor.empty_like() in dpnp --- dpnp/dpnp_algo/dpnp_elementwise_common.py | 14 +++++++------- dpnp/dpnp_iface_indexing.py | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index a7b083f2d734..2e280b58c1d2 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -467,7 +467,7 @@ def __call__( ) # Allocate a temporary buffer with the required dtype - out[i] = dpt.empty_like(res, dtype=res_dt) + out[i] = dpt_ext.empty_like(res, dtype=res_dt) elif ( buf_dt is None and dti._array_overlap(x, res) @@ -476,7 +476,7 @@ def __call__( # Allocate a temporary 
buffer to avoid memory overlapping. # Note if `buf_dt` is not None, a temporary copy of `x` will be # created, so the array overlap check isn't needed. - out[i] = dpt.empty_like(res) + out[i] = dpt_ext.empty_like(res) _manager = dpu.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events @@ -486,7 +486,7 @@ def __call__( if order == "K": buf = dtc._empty_like_orderK(x, buf_dt) else: - buf = dpt.empty_like(x, dtype=buf_dt, order=order) + buf = dpt_ext.empty_like(x, dtype=buf_dt, order=order) ht_copy_ev, copy_ev = dti._copy_usm_ndarray_into_usm_ndarray( src=x, dst=buf, sycl_queue=exec_q, depends=dep_evs @@ -503,7 +503,7 @@ def __call__( if order == "K": out[i] = dtc._empty_like_orderK(x, res_dt) else: - out[i] = dpt.empty_like(x, dtype=res_dt, order=order) + out[i] = dpt_ext.empty_like(x, dtype=res_dt, order=order) # Call the unary function with input and output arrays ht_unary_ev, unary_ev = self.get_implementation_function()( @@ -1078,7 +1078,7 @@ def __call__( ) # Allocate a temporary buffer with the required dtype - out[i] = dpt.empty_like(res, dtype=res_dt) + out[i] = dpt_ext.empty_like(res, dtype=res_dt) else: # If `dt` is not None, a temporary copy of `x` will be created, # so the array overlap check isn't needed. 
@@ -1094,7 +1094,7 @@ def __call__( for x in x_to_check ): # allocate a temporary buffer to avoid memory overlapping - out[i] = dpt.empty_like(res) + out[i] = dpt_ext.empty_like(res) x1 = dpnp.as_usm_ndarray(x1, dtype=x1_dt, sycl_queue=exec_q) x2 = dpnp.as_usm_ndarray(x2, dtype=x2_dt, sycl_queue=exec_q) @@ -1127,7 +1127,7 @@ def __call__( if order == "K": buf = dtc._empty_like_orderK(x, buf_dt) else: - buf = dpt.empty_like(x, dtype=buf_dt, order=order) + buf = dpt_ext.empty_like(x, dtype=buf_dt, order=order) ht_copy_ev, copy_ev = dti._copy_usm_ndarray_into_usm_ndarray( src=x, dst=buf, sycl_queue=exec_q, depends=dep_evs diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 9f9b16d2ad9b..503c66bfec58 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -141,7 +141,7 @@ def _choose_run(inds, chcs, q, usm_type, out=None, mode=0): ti._array_overlap(out, chc) for chc in chcs ): # Allocate a temporary buffer to avoid memory overlapping. - out = dpt.empty_like(out) + out = dpt_ext.empty_like(out) else: out = dpt_ext.empty( inds.shape, dtype=chcs[0].dtype, usm_type=usm_type, sycl_queue=q @@ -301,7 +301,7 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): if ti._array_overlap(x, out): # Allocate a temporary buffer to avoid memory overlapping. 
- out = dpt.empty_like(out) + out = dpt_ext.empty_like(out) else: out = dpt_ext.empty( res_sh, dtype=x.dtype, usm_type=usm_type, sycl_queue=q From 748c5b5e15df4d4f0e618deced77b62deb38604c Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 03:32:38 -0800 Subject: [PATCH 081/145] Move ti.arange() to dpctl_ext/tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_ctors.py | 132 ++++++++++++++++++++++++ dpctl_ext/tensor/_indexing_functions.py | 2 +- dpnp/dpnp_algo/dpnp_arraycreation.py | 2 +- dpnp/dpnp_container.py | 2 +- dpnp/dpnp_iface_arraycreation.py | 2 +- dpnp/tests/test_arraycreation.py | 2 +- 7 files changed, 139 insertions(+), 5 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 7ef7f132158e..084d4db3bd26 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -37,6 +37,7 @@ to_numpy, ) from dpctl_ext.tensor._ctors import ( + arange, empty, empty_like, eye, @@ -64,6 +65,7 @@ from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type __all__ = [ + "arange", "asnumpy", "astype", "can_cast", diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index 84e95f178e13..9b80c3bc685a 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -177,6 +177,20 @@ def _ensure_native_dtype_device_support(dtype, dev) -> None: ) +def _get_arange_length(start, stop, step): + """Compute length of arange sequence""" + span = stop - start + if hasattr(step, "__float__") and hasattr(span, "__float__"): + return _round_for_arange(span / step) + tmp = span / step + if hasattr(tmp, "__complex__"): + tmp = complex(tmp) + tmp = tmp.real + else: + tmp = float(tmp) + return _round_for_arange(tmp) + + def _normalize_order(order, arr): """ Utility function for processing the `order` keyword of array-like @@ -192,6 +206,13 @@ def _normalize_order(order, arr): return order +def _round_for_arange(tmp): + k = int(tmp) + if k >= 0 
and float(k) < tmp: + tmp = tmp + 1 + return tmp + + def _to_scalar(obj, sc_ty): """A way to convert object to NumPy scalar type. Raises OverflowError if obj can not be represented @@ -220,6 +241,117 @@ def _usm_ndarray_from_suai(obj): return ary +def arange( + start, + /, + stop=None, + step=1, + *, + dtype=None, + device=None, + usm_type="device", + sycl_queue=None, +): + """ + Returns evenly spaced values within the half-open interval [start, stop) + as a one-dimensional array. + + Args: + start: + Starting point of the interval + stop: + Ending point of the interval. Default: ``None`` + step: Increment of the returned sequence. Default: ``1`` + dtype: Output array data type. Default: ``None`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + Array populated with evenly spaced values. 
+ """ + if stop is None: + stop = start + start = 0 + if step is None: + step = 1 + dpctl.utils.validate_usm_type(usm_type, allow_none=False) + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + is_bool = False + if dtype: + is_bool = (dtype is bool) or (dpt.dtype(dtype) == dpt.bool) + _, dt = _coerce_and_infer_dt( + start, + stop, + step, + dt=dpt.int8 if is_bool else dtype, + sycl_queue=sycl_queue, + err_msg="start, stop, and step must be Python scalars", + allow_bool=False, + ) + try: + tmp = _get_arange_length(start, stop, step) + sh = max(int(tmp), 0) + except TypeError: + sh = 0 + if is_bool and sh > 2: + raise ValueError("no fill-function for boolean data type") + res = dpt.usm_ndarray( + (sh,), + dtype=dt, + buffer=usm_type, + order="C", + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + sc_ty = dt.type + _first = _to_scalar(start, sc_ty) + if sh > 1: + _second = _to_scalar(start + step, sc_ty) + if dt in [dpt.uint8, dpt.uint16, dpt.uint32, dpt.uint64]: + int64_ty = dpt.int64.type + _step = int64_ty(_second) - int64_ty(_first) + else: + _step = _second - _first + _step = sc_ty(_step) + else: + _step = sc_ty(1) + _start = _first + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + # populating newly allocated array, no task dependencies + hev, lin_ev = ti._linspace_step(_start, _step, res, sycl_queue) + _manager.add_event_pair(hev, lin_ev) + if is_bool: + res_out = dpt.usm_ndarray( + (sh,), + dtype=dpt.bool, + buffer=usm_type, + order="C", + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + hev_cpy, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=res, dst=res_out, sycl_queue=sycl_queue, depends=[lin_ev] + ) + _manager.add_event_pair(hev_cpy, cpy_ev) + return res_out + return res + + def empty( shape, *, diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py index 17a4aefbd4ca..0dcc6440a275 100644 --- a/dpctl_ext/tensor/_indexing_functions.py +++ 
b/dpctl_ext/tensor/_indexing_functions.py @@ -57,7 +57,7 @@ def _get_indexing_mode(name): def _range(sh_i, i, nd, q, usm_t, dt): - ind = dpt.arange(sh_i, dtype=dt, usm_type=usm_t, sycl_queue=q) + ind = dpt_ext.arange(sh_i, dtype=dt, usm_type=usm_t, sycl_queue=q) ind.shape = tuple(sh_i if i == j else 1 for j in range(nd)) return ind diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py index 4e05d57c6872..6230054645a8 100644 --- a/dpnp/dpnp_algo/dpnp_arraycreation.py +++ b/dpnp/dpnp_algo/dpnp_arraycreation.py @@ -225,7 +225,7 @@ def dpnp_linspace( delta = usm_stop - usm_start - usm_res = dpt.arange( + usm_res = dpt_ext.arange( 0, stop=num, step=1, diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index 9a2afe77f3cb..74ddda10ba02 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -75,7 +75,7 @@ def arange( sycl_queue=sycl_queue, device=device ) - array_obj = dpt.arange( + array_obj = dpt_ext.arange( start, stop=stop, step=step, diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py index 52fc4b7f6448..290cc7aae7e6 100644 --- a/dpnp/dpnp_iface_arraycreation.py +++ b/dpnp/dpnp_iface_arraycreation.py @@ -3935,7 +3935,7 @@ def vander( tmp = m[:, ::-1] if not increasing else m dpnp.power( dpt_ext.reshape(usm_x, (-1, 1)), - dpt.arange( + dpt_ext.arange( N, dtype=_dtype, usm_type=x_usm_type, sycl_queue=x_sycl_queue ), out=tmp, diff --git a/dpnp/tests/test_arraycreation.py b/dpnp/tests/test_arraycreation.py index 88e6aacb997d..382606ec377a 100644 --- a/dpnp/tests/test_arraycreation.py +++ b/dpnp/tests/test_arraycreation.py @@ -972,7 +972,7 @@ def test_ones_like(array, dtype, order): ], ) def test_dpctl_tensor_input(func, args): - x0 = dpt_ext.reshape(dpt.arange(9), (3, 3)) + x0 = dpt_ext.reshape(dpt_ext.arange(9), (3, 3)) new_args = [eval(val, {"x0": x0}) for val in args] X = getattr(dpt, func)(*new_args) Y = getattr(dpnp, func)(*new_args) From a7370a802b51e93c78eaf1908a81fb916cba9ad4 
Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 03:47:12 -0800 Subject: [PATCH 082/145] Move ti.asarray() and reuse it in dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_clip.py | 6 +- dpctl_ext/tensor/_copy_utils.py | 6 +- dpctl_ext/tensor/_ctors.py | 530 +++++++++++++++++++- dpctl_ext/tensor/_indexing_functions.py | 2 +- dpctl_ext/tensor/_manipulation_functions.py | 2 +- dpctl_ext/tensor/_scalar_utils.py | 6 +- dpctl_ext/tensor/_search_functions.py | 4 +- 8 files changed, 546 insertions(+), 12 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 084d4db3bd26..1c4009e15fd8 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -38,6 +38,7 @@ ) from dpctl_ext.tensor._ctors import ( arange, + asarray, empty, empty_like, eye, @@ -66,6 +67,7 @@ __all__ = [ "arange", + "asarray", "asnumpy", "astype", "can_cast", diff --git a/dpctl_ext/tensor/_clip.py b/dpctl_ext/tensor/_clip.py index 8b53552b034b..bab2271c6453 100644 --- a/dpctl_ext/tensor/_clip.py +++ b/dpctl_ext/tensor/_clip.py @@ -176,7 +176,7 @@ def _clip_none(x, val, out, order, _binary_fn): if isinstance(val, dpt.usm_ndarray): val_ary = val else: - val_ary = dpt.asarray(val, dtype=val_dtype, sycl_queue=exec_q) + val_ary = dpt_ext.asarray(val, dtype=val_dtype, sycl_queue=exec_q) if order == "A": order = ( @@ -540,11 +540,11 @@ def clip(x, /, min=None, max=None, out=None, order="K"): if isinstance(min, dpt.usm_ndarray): a_min = min else: - a_min = dpt.asarray(min, dtype=min_dtype, sycl_queue=exec_q) + a_min = dpt_ext.asarray(min, dtype=min_dtype, sycl_queue=exec_q) if isinstance(max, dpt.usm_ndarray): a_max = max else: - a_max = dpt.asarray(max, dtype=max_dtype, sycl_queue=exec_q) + a_max = dpt_ext.asarray(max, dtype=max_dtype, sycl_queue=exec_q) if order == "A": order = ( diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py index 1b54587ee393..bdcd5d123072 100644 --- 
a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -159,7 +159,7 @@ def _extract_impl(ary, ary_mask, axis=0): elif isinstance(ary_mask, np.ndarray): dst_usm_type = ary.usm_type exec_q = ary.sycl_queue - ary_mask = dpt.asarray( + ary_mask = dpt_ext.asarray( ary_mask, usm_type=dst_usm_type, sycl_queue=exec_q ) else: @@ -284,7 +284,7 @@ def _prepare_indices_arrays(inds, q, usm_type): lambda ind: ( ind if isinstance(ind, dpt.usm_ndarray) - else dpt.asarray(ind, usm_type=usm_type, sycl_queue=q) + else dpt_ext.asarray(ind, usm_type=usm_type, sycl_queue=q) ), inds, ) @@ -332,7 +332,7 @@ def _put_multi_index(ary, inds, p, vals, mode=0): if exec_q is not None: if not isinstance(vals, dpt.usm_ndarray): - vals = dpt.asarray( + vals = dpt_ext.asarray( vals, dtype=ary.dtype, usm_type=coerced_usm_type, diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index 9b80c3bc685a..fb30b89c684b 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -44,6 +44,7 @@ import dpctl_ext.tensor._tensor_impl as ti from dpctl_ext.tensor._copy_utils import ( _empty_like_orderK, + _from_numpy_empty_like_orderK, ) __doc__ = "Implementation of creation functions in :module:`dpctl.tensor`" @@ -112,6 +113,209 @@ def _array_info_sequence(li): return (n,) + dim, dt, device +def _asarray_from_numpy_ndarray( + ary, dtype=None, usm_type=None, sycl_queue=None, order="K" +): + if not isinstance(ary, np.ndarray): + raise TypeError(f"Expected numpy.ndarray, got {type(ary)}") + if usm_type is None: + usm_type = "device" + copy_q = normalize_queue_device(sycl_queue=None, device=sycl_queue) + if ary.dtype.char not in "?bBhHiIlLqQefdFD": + raise TypeError( + f"Numpy array of data type {ary.dtype} is not supported. " + "Please convert the input to an array with numeric data type." 
+ ) + if dtype is None: + # deduce device-representable output data type + dtype = _map_to_device_dtype(ary.dtype, copy_q) + _ensure_native_dtype_device_support(dtype, copy_q.sycl_device) + f_contig = ary.flags["F"] + c_contig = ary.flags["C"] + fc_contig = f_contig or c_contig + if order == "A": + order = "F" if f_contig and not c_contig else "C" + if order == "K" and fc_contig: + order = "C" if c_contig else "F" + if order == "K": + # new USM allocation + res = _from_numpy_empty_like_orderK(ary, dtype, usm_type, copy_q) + else: + res = dpt.usm_ndarray( + ary.shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": copy_q}, + ) + res[...] = ary + return res + + +def _asarray_from_seq( + seq_obj, + seq_shape, + seq_dt, + alloc_q, + exec_q, + dtype=None, + usm_type=None, + order="C", +): + """`seq_obj` is a sequence""" + if usm_type is None: + usm_types_in_seq = [] + _usm_types_walker(seq_obj, usm_types_in_seq) + usm_type = dpctl.utils.get_coerced_usm_type(usm_types_in_seq) + dpctl.utils.validate_usm_type(usm_type) + if dtype is None: + dtype = _map_to_device_dtype(seq_dt, alloc_q) + else: + _mapped_dt = _map_to_device_dtype(dtype, alloc_q) + if _mapped_dt != dtype: + raise ValueError( + f"Device {alloc_q.sycl_device} " + f"does not support {dtype} natively." 
+ ) + dtype = _mapped_dt + if order in "KA": + order = "C" + if isinstance(exec_q, dpctl.SyclQueue): + res = dpt_ext.empty( + seq_shape, + dtype=dtype, + usm_type=usm_type, + sycl_queue=alloc_q, + order=order, + ) + _manager = dpctl.utils.SequentialOrderManager[exec_q] + _device_copy_walker(seq_obj, res, _manager) + return res + else: + res = dpt_ext.empty( + seq_shape, + dtype=dtype, + usm_type=usm_type, + sycl_queue=alloc_q, + order=order, + ) + _copy_through_host_walker(seq_obj, res) + return res + + +def _asarray_from_seq_single_device( + obj, + seq_shape, + seq_dt, + seq_dev, + dtype=None, + usm_type=None, + sycl_queue=None, + order="C", +): + if sycl_queue is None: + exec_q = seq_dev + alloc_q = seq_dev + else: + exec_q = dpctl.utils.get_execution_queue( + ( + sycl_queue, + seq_dev, + ) + ) + alloc_q = sycl_queue + return _asarray_from_seq( + obj, + seq_shape, + seq_dt, + alloc_q, + exec_q, + dtype=dtype, + usm_type=usm_type, + order=order, + ) + + +def _asarray_from_usm_ndarray( + usm_ndary, + dtype=None, + copy=None, + usm_type=None, + sycl_queue=None, + order="K", +): + if not isinstance(usm_ndary, dpt.usm_ndarray): + raise TypeError( + f"Expected dpctl.tensor.usm_ndarray, got {type(usm_ndary)}" + ) + if usm_type is None: + usm_type = usm_ndary.usm_type + if sycl_queue is not None: + exec_q = dpctl.utils.get_execution_queue( + [usm_ndary.sycl_queue, sycl_queue] + ) + copy_q = normalize_queue_device(sycl_queue=sycl_queue, device=exec_q) + else: + copy_q = usm_ndary.sycl_queue + if dtype is None: + dtype = _map_to_device_dtype(usm_ndary.dtype, copy_q) + # Conditions for zero copy: + can_zero_copy = copy is not True + # dtype is unchanged + can_zero_copy = can_zero_copy and dtype == usm_ndary.dtype + # USM allocation type is unchanged + can_zero_copy = can_zero_copy and usm_type == usm_ndary.usm_type + # sycl_queue is unchanged + can_zero_copy = can_zero_copy and copy_q is usm_ndary.sycl_queue + # order is unchanged + c_contig = usm_ndary.flags.c_contiguous + 
f_contig = usm_ndary.flags.f_contiguous + fc_contig = usm_ndary.flags.forc + if can_zero_copy: + if order == "C" and c_contig: + pass + elif order == "F" and f_contig: + pass + elif order == "A" and fc_contig: + pass + elif order == "K": + pass + else: + can_zero_copy = False + if copy is False and can_zero_copy is False: + raise ValueError("asarray(..., copy=False) is not possible") + if can_zero_copy: + return usm_ndary + if order == "A": + order = "F" if f_contig and not c_contig else "C" + if order == "K" and fc_contig: + order = "C" if c_contig else "F" + if order == "K": + _ensure_native_dtype_device_support(dtype, copy_q.sycl_device) + res = _empty_like_orderK(usm_ndary, dtype, usm_type, copy_q) + else: + _ensure_native_dtype_device_support(dtype, copy_q.sycl_device) + res = dpt.usm_ndarray( + usm_ndary.shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": copy_q}, + ) + eq = dpctl.utils.get_execution_queue([usm_ndary.sycl_queue, copy_q]) + if eq is not None: + _manager = dpctl.utils.SequentialOrderManager[eq] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=usm_ndary, dst=res, sycl_queue=eq, depends=dep_evs + ) + _manager.add_event_pair(hev, cpy_ev) + else: + tmp = dpt_ext.asnumpy(usm_ndary) + res[...] = tmp + return res + + def _cast_fill_val(fill_val, dt): """ Casts the Python scalar `fill_val` to another Python type coercible to the @@ -145,6 +349,82 @@ def _coerce_and_infer_dt(*args, dt, sycl_queue, err_msg, allow_bool=False): raise ValueError(f"Data type {dt} is not supported") +def _copy_through_host_walker(seq_o, usm_res): + if isinstance(seq_o, dpt.usm_ndarray): + if ( + dpctl.utils.get_execution_queue( + ( + usm_res.sycl_queue, + seq_o.sycl_queue, + ) + ) + is None + ): + usm_res[...] = dpt.asnumpy(seq_o).copy() + return + else: + usm_res[...] 
= seq_o + if hasattr(seq_o, "__usm_ndarray__"): + usm_arr = seq_o.__usm_ndarray__ + if isinstance(usm_arr, dpt.usm_ndarray): + _copy_through_host_walker(usm_arr, usm_res) + return + if hasattr(seq_o, "__sycl_usm_array_interface__"): + usm_ar = _usm_ndarray_from_suai(seq_o) + if ( + dpctl.utils.get_execution_queue( + ( + usm_res.sycl_queue, + usm_ar.sycl_queue, + ) + ) + is None + ): + usm_res[...] = dpt_ext.asnumpy(usm_ar).copy() + else: + usm_res[...] = usm_ar + return + if _is_object_with_buffer_protocol(seq_o): + np_ar = np.asarray(seq_o) + usm_res[...] = np_ar + return + if isinstance(seq_o, (list, tuple)): + for i, el in enumerate(seq_o): + _copy_through_host_walker(el, usm_res[i]) + return + usm_res[...] = np.asarray(seq_o) + + +def _device_copy_walker(seq_o, res, _manager): + if isinstance(seq_o, dpt.usm_ndarray): + exec_q = res.sycl_queue + deps = _manager.submitted_events + ht_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=seq_o, dst=res, sycl_queue=exec_q, depends=deps + ) + _manager.add_event_pair(ht_ev, cpy_ev) + return + if hasattr(seq_o, "__usm_ndarray__"): + usm_arr = seq_o.__usm_ndarray__ + if isinstance(usm_arr, dpt.usm_ndarray): + _device_copy_walker(usm_arr, res, _manager) + return + if hasattr(seq_o, "__sycl_usm_array_interface__"): + usm_ar = _usm_ndarray_from_suai(seq_o) + exec_q = res.sycl_queue + deps = _manager.submitted_events + ht_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=usm_ar, dst=res, sycl_queue=exec_q, depends=deps + ) + _manager.add_event_pair(ht_ev, cpy_ev) + return + if isinstance(seq_o, (list, tuple)): + for i, el in enumerate(seq_o): + _device_copy_walker(el, res[i], _manager) + return + raise TypeError + + def _ensure_native_dtype_device_support(dtype, dev) -> None: """Check that dtype is natively supported by device. @@ -191,6 +471,28 @@ def _get_arange_length(start, stop, step): return _round_for_arange(tmp) +def _map_to_device_dtype(dt, q): + dtc = dt.char + if dtc == "?" 
or np.issubdtype(dt, np.integer): + return dt + d = q.sycl_device + if np.issubdtype(dt, np.floating): + if dtc == "f": + return dt + if dtc == "d" and d.has_aspect_fp64: + return dt + if dtc == "e" and d.has_aspect_fp16: + return dt + return dpt.dtype("f4") + if np.issubdtype(dt, np.complexfloating): + if dtc == "F": + return dt + if dtc == "D" and d.has_aspect_fp64: + return dt + return dpt.dtype("c8") + raise RuntimeError(f"Unrecognized data type '{dt}' encountered.") + + def _normalize_order(order, arr): """ Utility function for processing the `order` keyword of array-like @@ -241,6 +543,30 @@ def _usm_ndarray_from_suai(obj): return ary +def _usm_types_walker(o, usm_types_list): + if isinstance(o, dpt.usm_ndarray): + usm_types_list.append(o.usm_type) + return + if hasattr(o, "__usm_ndarray__"): + usm_arr = o.__usm_ndarray__ + if isinstance(usm_arr, dpt.usm_ndarray): + usm_types_list.append(usm_arr.usm_type) + return + if hasattr(o, "__sycl_usm_array_interface__"): + usm_ar = _usm_ndarray_from_suai(o) + usm_types_list.append(usm_ar.usm_type) + return + if _is_object_with_buffer_protocol(o): + return + if isinstance(o, (int, bool, float, complex)): + return + if isinstance(o, (list, tuple, range)): + for el in o: + _usm_types_walker(el, usm_types_list) + return + raise TypeError + + def arange( start, /, @@ -352,6 +678,208 @@ def arange( return res +def asarray( + obj, + /, + *, + dtype=None, + device=None, + copy=None, + usm_type=None, + sycl_queue=None, + order="K", +): + """ + Converts input object to :class:`dpctl.tensor.usm_ndarray`. + + Args: + obj: Python object to convert. Can be an instance of + :class:`dpctl.tensor.usm_ndarray`, + an object representing SYCL USM allocation and implementing + ``__sycl_usm_array_interface__`` protocol, an instance + of :class:`numpy.ndarray`, an object supporting Python buffer + protocol, a Python scalar, or a (possibly nested) sequence of + Python scalars. + dtype (data type, optional): + output array data type. 
If ``dtype`` is + ``None``, the output array data type is inferred from data types in + ``obj``. Default: ``None`` + copy (`bool`, optional): + boolean indicating whether or not to copy the + input. If ``True``, always creates a copy. If ``False``, the + need to copy raises :exc:`ValueError`. If ``None``, tries to reuse + existing memory allocations if possible, but allows to perform + a copy otherwise. Default: ``None`` + order (``"C"``, ``"F"``, ``"A"``, ``"K"``, optional): + memory layout of the output array. Default: ``"K"`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + Array created from input object. + """ + # 1. Check that copy is a valid keyword + if copy not in [None, True, False]: + raise TypeError( + "Recognized copy keyword values should be True, False, or None" + ) + # 2. Check that dtype is None, or a valid dtype + if dtype is not None: + dtype = dpt.dtype(dtype) + # 3. 
Validate order + if not isinstance(order, str): + raise TypeError( + f"Expected order keyword to be of type str, got {type(order)}" + ) + if len(order) == 0 or order[0] not in "KkAaCcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'." + ) + order = order[0].upper() + # 4. Check that usm_type is None, or a valid value + dpctl.utils.validate_usm_type(usm_type, allow_none=True) + # 5. Normalize device/sycl_queue [keep it None if was None] + if device is not None or sycl_queue is not None: + sycl_queue = normalize_queue_device( + sycl_queue=sycl_queue, device=device + ) + + # handle instance(obj, usm_ndarray) + if isinstance(obj, dpt.usm_ndarray): + return _asarray_from_usm_ndarray( + obj, + dtype=dtype, + copy=copy, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + if hasattr(obj, "__usm_ndarray__"): + usm_arr = obj.__usm_ndarray__ + if isinstance(usm_arr, dpt.usm_ndarray): + return _asarray_from_usm_ndarray( + usm_arr, + dtype=dtype, + copy=copy, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + if hasattr(obj, "__sycl_usm_array_interface__"): + ary = _usm_ndarray_from_suai(obj) + return _asarray_from_usm_ndarray( + ary, + dtype=dtype, + copy=copy, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + if isinstance(obj, np.ndarray): + if copy is False: + raise ValueError( + "Converting numpy.ndarray to usm_ndarray requires a copy" + ) + return _asarray_from_numpy_ndarray( + obj, + dtype=dtype, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + if _is_object_with_buffer_protocol(obj): + if copy is False: + raise ValueError( + f"Converting {type(obj)} to usm_ndarray requires a copy" + ) + return _asarray_from_numpy_ndarray( + np.array(obj), + dtype=dtype, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + if isinstance(obj, (list, tuple, range)): + if copy is False: + raise ValueError( + "Converting Python sequence to usm_ndarray requires a copy" + 
) + seq_shape, seq_dt, devs = _array_info_sequence(obj) + if devs == _host_set: + return _asarray_from_numpy_ndarray( + np.asarray(obj, dtype=dtype, order=order), + dtype=dtype, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + elif len(devs) == 1: + seq_dev = list(devs)[0] + return _asarray_from_seq_single_device( + obj, + seq_shape, + seq_dt, + seq_dev, + dtype=dtype, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + elif len(devs) > 1: + devs = [dev for dev in devs if dev is not None] + if sycl_queue is None: + if len(devs) == 1: + alloc_q = devs[0] + else: + raise dpctl.utils.ExecutionPlacementError( + "Please specify `device` or `sycl_queue` keyword " + "argument to determine where to allocate the " + "resulting array." + ) + else: + alloc_q = sycl_queue + return _asarray_from_seq( + obj, + seq_shape, + seq_dt, + alloc_q, + # force copying via host + None, + dtype=dtype, + usm_type=usm_type, + order=order, + ) + if copy is False: + raise ValueError( + f"Converting {type(obj)} to usm_ndarray requires a copy" + ) + # obj is a scalar, create 0d array + return _asarray_from_numpy_ndarray( + np.asarray(obj, dtype=dtype), + dtype=dtype, + usm_type=usm_type, + sycl_queue=sycl_queue, + order="C", + ) + + def empty( shape, *, @@ -665,7 +1193,7 @@ def full( sycl_queue = normalize_queue_device( sycl_queue=sycl_queue, device=device ) - X = dpt.asarray( + X = dpt_ext.asarray( fill_value, dtype=dtype, order=order, diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py index 0dcc6440a275..369bd6a49022 100644 --- a/dpctl_ext/tensor/_indexing_functions.py +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -329,7 +329,7 @@ def put_vec_duplicates(vec, ind, vals): val_shape = indices.shape if not isinstance(vals, dpt.usm_ndarray): - vals = dpt.asarray( + vals = dpt_ext.asarray( vals, dtype=x.dtype, usm_type=vals_usm_type, sycl_queue=exec_q ) # choose to throw here for consistency with `place` diff --git 
a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 470bf3ef3876..37cba6828818 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -204,7 +204,7 @@ def repeat(x, repeats, /, *, axis=None): "`repeats` sequence must have the same length as the " "repeated axis" ) - repeats = dpt.asarray( + repeats = dpt_ext.asarray( repeats, dtype=dpt.int64, usm_type=usm_type, sycl_queue=exec_q ) if not dpt.all(repeats >= 0): diff --git a/dpctl_ext/tensor/_scalar_utils.py b/dpctl_ext/tensor/_scalar_utils.py index 86787baea8cc..3ab92b42ad00 100644 --- a/dpctl_ext/tensor/_scalar_utils.py +++ b/dpctl_ext/tensor/_scalar_utils.py @@ -33,6 +33,10 @@ import numpy as np from dpctl.tensor._usmarray import _is_object_with_buffer_protocol as _is_buffer +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext + from ._type_utils import ( WeakBooleanType, WeakComplexType, @@ -59,7 +63,7 @@ def _get_dtype(o, dev): if isinstance(o, dpt.usm_ndarray): return o.dtype if hasattr(o, "__sycl_usm_array_interface__"): - return dpt.asarray(o).dtype + return dpt_ext.asarray(o).dtype if _is_buffer(o): host_dt = np.array(o).dtype dev_dt = _to_device_supported_dtype(host_dt, dev) diff --git a/dpctl_ext/tensor/_search_functions.py b/dpctl_ext/tensor/_search_functions.py index e7eb33a5f3fe..cc76bfe8b543 100644 --- a/dpctl_ext/tensor/_search_functions.py +++ b/dpctl_ext/tensor/_search_functions.py @@ -323,9 +323,9 @@ def where(condition, x1, x2, /, *, order="K", out=None): else "C" ) if not isinstance(x1, dpt.usm_ndarray): - x1 = dpt.asarray(x1, dtype=x1_dtype, sycl_queue=exec_q) + x1 = dpt_ext.asarray(x1, dtype=x1_dtype, sycl_queue=exec_q) if not isinstance(x2, dpt.usm_ndarray): - x2 = dpt.asarray(x2, dtype=x2_dtype, sycl_queue=exec_q) + x2 = dpt_ext.asarray(x2, dtype=x2_dtype, sycl_queue=exec_q) if condition.size == 0: if out is not None: From 
effcbe8cab3a74dc04daf491292e16c1640f31c4 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 03:58:49 -0800 Subject: [PATCH 083/145] Reuse dpctl_ext.tensor.asarray() in dpnp --- dpnp/dpnp_algo/dpnp_arraycreation.py | 8 ++++---- dpnp/dpnp_algo/dpnp_elementwise_common.py | 4 ++-- dpnp/dpnp_container.py | 4 ++-- dpnp/dpnp_iface.py | 2 +- dpnp/dpnp_iface_arraycreation.py | 2 +- dpnp/dpnp_iface_indexing.py | 4 ++-- dpnp/dpnp_iface_manipulation.py | 2 +- dpnp/dpnp_iface_searching.py | 2 +- dpnp/tests/test_arraycreation.py | 8 ++++---- dpnp/tests/test_arraymanipulation.py | 6 +++--- dpnp/tests/test_fft.py | 9 ++++----- dpnp/tests/test_indexing.py | 5 ++--- dpnp/tests/test_linalg.py | 4 +++- dpnp/tests/test_manipulation.py | 4 +++- dpnp/tests/test_mathematical.py | 8 ++++---- dpnp/tests/test_nanfunctions.py | 9 ++++----- dpnp/tests/test_ndarray.py | 5 ++++- dpnp/tests/test_statistics.py | 5 ++--- dpnp/tests/test_utils.py | 4 +++- 19 files changed, 50 insertions(+), 45 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py index 6230054645a8..86b9642ecd5d 100644 --- a/dpnp/dpnp_algo/dpnp_arraycreation.py +++ b/dpnp/dpnp_algo/dpnp_arraycreation.py @@ -53,7 +53,7 @@ def _as_usm_ndarray(a, usm_type, sycl_queue): if isinstance(a, dpnp_array): a = a.get_array() - return dpt.asarray(a, usm_type=usm_type, sycl_queue=sycl_queue) + return dpt_ext.asarray(a, usm_type=usm_type, sycl_queue=sycl_queue) def _check_has_zero_val(a): @@ -213,13 +213,13 @@ def dpnp_linspace( else: step = dpnp.nan else: - usm_start = dpt.asarray( + usm_start = dpt_ext.asarray( start, dtype=dt, usm_type=_usm_type, sycl_queue=sycl_queue_normalized, ) - usm_stop = dpt.asarray( + usm_stop = dpt_ext.asarray( stop, dtype=dt, usm_type=_usm_type, sycl_queue=sycl_queue_normalized ) @@ -266,7 +266,7 @@ def dpnp_linspace( if retstep is True: if dpnp.isscalar(step): - step = dpt.asarray( + step = dpt_ext.asarray( step, usm_type=res.usm_type, 
sycl_queue=res.sycl_queue ) return res, dpnp_array._create_from_usm_ndarray(step) diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index 2e280b58c1d2..85dc9473206c 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -713,7 +713,7 @@ def __call__( if dtype is not None: if dpnp.isscalar(x1): - x1_usm = dpt.asarray( + x1_usm = dpt_ext.asarray( x1, dtype=dtype, sycl_queue=x2.sycl_queue, @@ -722,7 +722,7 @@ def __call__( x2_usm = dpt_ext.astype(x2_usm, dtype, copy=False) elif dpnp.isscalar(x2): x1_usm = dpt_ext.astype(x1_usm, dtype, copy=False) - x2_usm = dpt.asarray( + x2_usm = dpt_ext.asarray( x2, dtype=dtype, sycl_queue=x1.sycl_queue, diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index 74ddda10ba02..e2c5070d807e 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -103,7 +103,7 @@ def asarray( """Converts incoming 'x1' object to 'dpnp_array'.""" if isinstance(x1, (list, tuple, range)): - array_obj = dpt.asarray( + array_obj = dpt_ext.asarray( x1, dtype=dtype, copy=copy, @@ -122,7 +122,7 @@ def asarray( x1_obj, device=device, sycl_queue=sycl_queue ) - array_obj = dpt.asarray( + array_obj = dpt_ext.asarray( x1_obj, dtype=dtype, copy=copy, diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index 6c050a208981..9fca083a6413 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -191,7 +191,7 @@ def as_usm_ndarray(a, dtype=None, device=None, usm_type=None, sycl_queue=None): if is_supported_array_type(a): return get_usm_ndarray(a) - return dpt.asarray( + return dpt_ext.asarray( a, dtype=dtype, device=device, usm_type=usm_type, sycl_queue=sycl_queue ) diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py index 290cc7aae7e6..64c34ed80fce 100644 --- a/dpnp/dpnp_iface_arraycreation.py +++ b/dpnp/dpnp_iface_arraycreation.py @@ -3912,7 +3912,7 @@ def vander( if dpnp.is_supported_array_type(x): x = 
dpnp.get_usm_ndarray(x) - usm_x = dpt.asarray( + usm_x = dpt_ext.asarray( x, device=device, usm_type=usm_type, sycl_queue=sycl_queue ) diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 503c66bfec58..58c8f495c440 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -1805,7 +1805,7 @@ def put_along_axis(a, ind, values, axis, mode="wrap"): if dpnp.is_supported_array_type(values): usm_vals = dpnp.get_usm_ndarray(values) else: - usm_vals = dpt.asarray( + usm_vals = dpt_ext.asarray( values, usm_type=a.usm_type, sycl_queue=a.sycl_queue ) @@ -2153,7 +2153,7 @@ def take(a, indices, /, *, axis=None, out=None, mode="wrap"): usm_a = dpnp.get_usm_ndarray(a) if not dpnp.is_supported_array_type(indices): - usm_ind = dpt.asarray( + usm_ind = dpt_ext.asarray( indices, usm_type=a.usm_type, sycl_queue=a.sycl_queue ) else: diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 08fd55c58ace..e4a12346bc6c 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -375,7 +375,7 @@ def _get_first_nan_index(usm_a): ): if dpnp.issubdtype(usm_a.dtype, dpnp.complexfloating): # for complex all NaNs are considered equivalent - true_val = dpt.asarray( + true_val = dpt_ext.asarray( True, sycl_queue=usm_a.sycl_queue, usm_type=usm_a.usm_type ) return dpt.searchsorted(dpt.isnan(usm_a), true_val, side="left") diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py index a2389978d506..15f52338ec7e 100644 --- a/dpnp/dpnp_iface_searching.py +++ b/dpnp/dpnp_iface_searching.py @@ -376,7 +376,7 @@ def searchsorted(a, v, side="left", sorter=None): usm_a = dpnp.get_usm_ndarray(a) if dpnp.isscalar(v): - usm_v = dpt.asarray(v, sycl_queue=a.sycl_queue, usm_type=a.usm_type) + usm_v = dpt_ext.asarray(v, sycl_queue=a.sycl_queue, usm_type=a.usm_type) else: usm_v = dpnp.get_usm_ndarray(v) diff --git a/dpnp/tests/test_arraycreation.py b/dpnp/tests/test_arraycreation.py index 
382606ec377a..698e22b9f873 100644 --- a/dpnp/tests/test_arraycreation.py +++ b/dpnp/tests/test_arraycreation.py @@ -668,7 +668,7 @@ def test_tri_default_dtype(): 5, numpy.array(1), dpnp.array(2), - dpt.asarray(3), + dpt_ext.asarray(3), ], ids=[ "-3", @@ -682,7 +682,7 @@ def test_tri_default_dtype(): "5", "np.array(1)", "dpnp.array(2)", - "dpt.asarray(3)", + "dpt_ext.asarray(3)", ], ) @pytest.mark.parametrize( @@ -725,7 +725,7 @@ def test_tril(m, k, dtype): 5, numpy.array(1), dpnp.array(2), - dpt.asarray(3), + dpt_ext.asarray(3), ], ids=[ "-3", @@ -739,7 +739,7 @@ def test_tril(m, k, dtype): "5", "np.array(1)", "dpnp.array(2)", - "dpt.asarray(3)", + "dpt_ext.asarray(3)", ], ) @pytest.mark.parametrize( diff --git a/dpnp/tests/test_arraymanipulation.py b/dpnp/tests/test_arraymanipulation.py index ba83ee94d8b0..7d5d2efeebb1 100644 --- a/dpnp/tests/test_arraymanipulation.py +++ b/dpnp/tests/test_arraymanipulation.py @@ -1,11 +1,11 @@ -import warnings - -import dpctl.tensor as dpt import numpy import pytest from dpctl.tensor._numpy_helper import AxisError from numpy.testing import assert_array_equal, assert_equal, assert_raises +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt import dpnp from .helper import get_all_dtypes, get_float_complex_dtypes diff --git a/dpnp/tests/test_fft.py b/dpnp/tests/test_fft.py index cf6da5830f10..3a19a2cf3668 100644 --- a/dpnp/tests/test_fft.py +++ b/dpnp/tests/test_fft.py @@ -1,5 +1,4 @@ import dpctl -import dpctl.tensor as dpt import numpy import pytest from dpctl.utils import ExecutionPlacementError @@ -7,7 +6,7 @@ # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor -import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor as dpt import dpnp from dpnp.dpnp_utils import map_dtype_to_device @@ -83,7 +82,7 @@ def test_usm_ndarray(self, n): a_usm = dpt.asarray(a) expected = numpy.fft.fft(a, n=n) - out = dpt_ext.empty(expected.shape, 
dtype=a_usm.dtype) + out = dpt.empty(expected.shape, dtype=a_usm.dtype) result = dpnp.fft.fft(a_usm, n=n, out=out) assert out is result.get_array() assert_dtype_allclose(result, expected) @@ -642,7 +641,7 @@ def test_usm_ndarray(self, n): a_usm = dpt.asarray(a) expected = numpy.fft.irfft(a, n=n) - out = dpt_ext.empty(expected.shape, dtype=a_usm.real.dtype) + out = dpt.empty(expected.shape, dtype=a_usm.real.dtype) result = dpnp.fft.irfft(a_usm, n=n, out=out) assert out is result.get_array() assert_dtype_allclose(result, expected) @@ -730,7 +729,7 @@ def test_usm_ndarray(self, n): expected = numpy.fft.rfft(a, n=n) out_dt = map_dtype_to_device(dpnp.complex128, a_usm.sycl_device) - out = dpt_ext.empty(expected.shape, dtype=out_dt) + out = dpt.empty(expected.shape, dtype=out_dt) result = dpnp.fft.rfft(a_usm, n=n, out=out) assert out is result.get_array() diff --git a/dpnp/tests/test_indexing.py b/dpnp/tests/test_indexing.py index 79c41a2f45f7..d8822d77080b 100644 --- a/dpnp/tests/test_indexing.py +++ b/dpnp/tests/test_indexing.py @@ -1,7 +1,6 @@ import functools import dpctl -import dpctl.tensor as dpt import numpy import pytest from dpctl.utils import ExecutionPlacementError @@ -13,10 +12,10 @@ assert_raises_regex, ) -import dpnp - # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt +import dpnp from dpctl_ext.tensor._numpy_helper import AxisError from dpctl_ext.tensor._type_utils import _to_device_supported_dtype from dpnp.dpnp_array import dpnp_array diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py index 31d99d71ce49..b9673d21a161 100644 --- a/dpnp/tests/test_linalg.py +++ b/dpnp/tests/test_linalg.py @@ -1,7 +1,6 @@ import warnings import dpctl -import dpctl.tensor as dpt import numpy import pytest from dpctl.tensor._numpy_helper import AxisError @@ -14,6 +13,9 @@ assert_raises_regex, ) +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import 
dpctl_ext.tensor as dpt import dpnp from .helper import ( diff --git a/dpnp/tests/test_manipulation.py b/dpnp/tests/test_manipulation.py index 8ddba08dbb92..8095a0daa858 100644 --- a/dpnp/tests/test_manipulation.py +++ b/dpnp/tests/test_manipulation.py @@ -1,6 +1,5 @@ import itertools -import dpctl.tensor as dpt import numpy import pytest from dpctl.tensor._numpy_helper import AxisError @@ -10,6 +9,9 @@ assert_raises, ) +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt import dpnp from .helper import ( diff --git a/dpnp/tests/test_mathematical.py b/dpnp/tests/test_mathematical.py index 7b6d39875d67..841494bde1ed 100644 --- a/dpnp/tests/test_mathematical.py +++ b/dpnp/tests/test_mathematical.py @@ -669,15 +669,15 @@ def test_to_begin_to_end(self, to_begin, to_end): "to_begin, to_end", [ (-20, 20), - (dpt.asarray([-20, -30]), dpt.asarray([20, 15])), - (dpt.asarray([[-20, -30]]), dpt.asarray([[20, 15]])), + (dpt_ext.asarray([-20, -30]), dpt_ext.asarray([20, 15])), + (dpt_ext.asarray([[-20, -30]]), dpt_ext.asarray([[20, 15]])), ([1, 2], [3, 4]), ((1, 2), (3, 4)), ], ) def test_usm_ndarray(self, to_begin, to_end): a = numpy.array([[1, 2, 0]]) - dpt_a = dpt.asarray(a) + dpt_a = dpt_ext.asarray(a) if isinstance(to_begin, dpt.usm_ndarray): np_to_begin = dpt.asnumpy(to_begin) @@ -2631,7 +2631,7 @@ def test_out_float16(self, func): def test_out_usm_ndarray(self, func, dt): a = generate_random_numpy_array(10, dt) out = numpy.empty(a.shape, dtype=dt) - ia, usm_out = dpnp.array(a), dpt.asarray(out) + ia, usm_out = dpnp.array(a), dpt_ext.asarray(out) expected = getattr(numpy, func)(a, out=out) result = getattr(dpnp, func)(ia, out=usm_out) diff --git a/dpnp/tests/test_nanfunctions.py b/dpnp/tests/test_nanfunctions.py index 2d3dbd864a2e..2cb70df5954a 100644 --- a/dpnp/tests/test_nanfunctions.py +++ b/dpnp/tests/test_nanfunctions.py @@ -1,5 +1,4 @@ import dpctl -import dpctl.tensor as dpt import numpy import pytest 
from dpctl.utils import ExecutionPlacementError @@ -14,7 +13,7 @@ # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor -import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor as dpt import dpnp from .helper import ( @@ -58,7 +57,7 @@ def test_out(self, func): assert_allclose(result, expected) # out is usm_ndarray - dpt_out = dpt_ext.empty(expected.shape, dtype=expected.dtype) + dpt_out = dpt.empty(expected.shape, dtype=expected.dtype) result = getattr(dpnp, func)(ia, axis=0, out=dpt_out) assert dpt_out is result.get_array() assert_allclose(result, expected) @@ -239,7 +238,7 @@ def test_out(self, func): assert_allclose(result, expected) # out is usm_ndarray - dpt_out = dpt_ext.empty(expected.shape, dtype=expected.dtype) + dpt_out = dpt.empty(expected.shape, dtype=expected.dtype) result = getattr(dpnp, func)(ia, axis=0, out=dpt_out) assert dpt_out is result.get_array() assert_allclose(result, expected) @@ -548,7 +547,7 @@ def test_out(self, func): assert_allclose(result, expected) # out is usm_ndarray - dpt_out = dpt_ext.empty(expected.shape, dtype=expected.dtype) + dpt_out = dpt.empty(expected.shape, dtype=expected.dtype) result = getattr(dpnp, func)(ia, axis=0, out=dpt_out) assert dpt_out is result.get_array() assert_allclose(result, expected) diff --git a/dpnp/tests/test_ndarray.py b/dpnp/tests/test_ndarray.py index 4e4e42bbc85e..a27f0fe6aa14 100644 --- a/dpnp/tests/test_ndarray.py +++ b/dpnp/tests/test_ndarray.py @@ -9,6 +9,9 @@ assert_raises_regex, ) +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from .helper import ( @@ -407,7 +410,7 @@ def test_error(self): class TestUsmNdarrayProtocol: def test_basic(self): a = dpnp.arange(256, dtype=dpnp.int64) - usm_a = dpt.asarray(a) + usm_a = dpt_ext.asarray(a) assert a.sycl_queue == usm_a.sycl_queue assert a.usm_type == usm_a.usm_type diff --git a/dpnp/tests/test_statistics.py 
b/dpnp/tests/test_statistics.py index 6944d6bd5bdd..fe8848b6c858 100644 --- a/dpnp/tests/test_statistics.py +++ b/dpnp/tests/test_statistics.py @@ -1,5 +1,4 @@ import dpctl -import dpctl.tensor as dpt import numpy import pytest from numpy.testing import ( @@ -11,7 +10,7 @@ # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor -import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor as dpt import dpnp from .helper import ( @@ -815,7 +814,7 @@ def test_out(self, func): assert_allclose(result, expected) # out is usm_ndarray - dpt_out = dpt_ext.empty(expected.shape, dtype=expected.dtype) + dpt_out = dpt.empty(expected.shape, dtype=expected.dtype) result = getattr(dpnp, func)(ia, axis=0, out=dpt_out) assert dpt_out is result.get_array() assert_allclose(result, expected) diff --git a/dpnp/tests/test_utils.py b/dpnp/tests/test_utils.py index eef9132e5b55..ddbd267c2108 100644 --- a/dpnp/tests/test_utils.py +++ b/dpnp/tests/test_utils.py @@ -1,7 +1,9 @@ -import dpctl.tensor as dpt import numpy import pytest +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt import dpnp From f9f547ad05e4549090b7e8f5b043a6d24ca482ef Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 04:04:46 -0800 Subject: [PATCH 084/145] Move ti.full_like() to dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_ctors.py | 117 +++++++++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 1c4009e15fd8..bbf1f2107347 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -43,6 +43,7 @@ empty_like, eye, full, + full_like, linspace, tril, triu, @@ -80,6 +81,7 @@ "finfo", "from_numpy", "full", + "full_like", "iinfo", "isdtype", "linspace", diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index fb30b89c684b..664c238cbdbf 100644 --- 
a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -1223,6 +1223,123 @@ def full( return res +def full_like( + x, + /, + fill_value, + *, + dtype=None, + order="K", + device=None, + usm_type=None, + sycl_queue=None, +): + """full_like(x, fill_value, dtype=None, order="K", \ + device=None, usm_type=None, sycl_queue=None) + + Returns a new :class:`dpctl.tensor.usm_ndarray` filled with `fill_value` + and having the same `shape` as the input array `x`. + + Args: + x (usm_ndarray): Input array from which to derive the output array + shape. + fill_value: the value to fill output array with + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, or a + NumPy scalar type. If ``dtype`` is ``None``, the output array data + type is inferred from ``x``. Default: ``None`` + order ("C", "F", "A", or "K"): + memory layout for the array. Default: ``"K"`` + device (optional): + array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + New array initialized with given value. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected instance of dpt.usm_ndarray, got {type(x)}.") + if ( + not isinstance(order, str) + or len(order) == 0 + or order[0] not in "CcFfAaKk" + ): + raise ValueError( + "Unrecognized order keyword value, expecting 'C', 'F', 'A', or 'K'." + ) + order = order[0].upper() + if dtype is None: + dtype = x.dtype + if usm_type is None: + usm_type = x.usm_type + dpctl.utils.validate_usm_type(usm_type, allow_none=False) + if device is None and sycl_queue is None: + device = x.device + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + sh = x.shape + dtype = dpt.dtype(dtype) + order = _normalize_order(order, x) + if order == "K": + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + if isinstance(fill_value, (dpt.usm_ndarray, np.ndarray, tuple, list)): + X = dpt_ext.asarray( + fill_value, + dtype=dtype, + order=order, + usm_type=usm_type, + sycl_queue=sycl_queue, + ) + X = dpt.broadcast_to(X, sh) + res = _empty_like_orderK(x, dtype, usm_type, sycl_queue) + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + # order copy after tasks populating X + dep_evs = _manager.submitted_events + hev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=X, dst=res, sycl_queue=sycl_queue, depends=dep_evs + ) + _manager.add_event_pair(hev, copy_ev) + return res + else: + _validate_fill_value(fill_value) + + dtype = _get_dtype(dtype, sycl_queue, ref_type=type(fill_value)) + res = _empty_like_orderK(x, dtype, usm_type, sycl_queue) + fill_value = _cast_fill_val(fill_value, dtype) + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue) + _manager.add_event_pair(hev, full_ev) + return res + else: + return full( + sh, + fill_value, + dtype=dtype, + order=order, + device=device, + usm_type=usm_type, + sycl_queue=sycl_queue, + ) + + def linspace( start, stop, 
From 0d84d7b8bd9ae7b395a9b0c6ef2954cb495e3835 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 04:08:07 -0800 Subject: [PATCH 085/145] Move ti.meshgrid() to dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_ctors.py | 75 ++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index bbf1f2107347..782170bcd07f 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -45,6 +45,7 @@ full, full_like, linspace, + meshgrid, tril, triu, ) @@ -85,6 +86,7 @@ "iinfo", "isdtype", "linspace", + "meshgrid", "nonzero", "place", "put", diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index 664c238cbdbf..57f4769b6616 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -1443,6 +1443,81 @@ def linspace( return res if int_dt is None else dpt.astype(res, int_dt) +def meshgrid(*arrays, indexing="xy"): + """ + Creates list of :class:`dpctl.tensor.usm_ndarray` coordinate matrices + from vectors. + + Args: + arrays (usm_ndarray): + an arbitrary number of one-dimensional arrays + representing grid coordinates. Each array should have the same + numeric data type. + indexing (``"xy"``, or ``"ij"``): + Cartesian (``"xy"``) or matrix (``"ij"``) indexing of output. + If provided zero or one one-dimensional vector(s) (i.e., the + zero- and one-dimensional cases, respectively), the ``indexing`` + keyword has no effect and should be ignored. Default: ``"xy"`` + + Returns: + List[array]: + list of ``N`` arrays, where ``N`` is the number of + provided one-dimensional input arrays. Each returned array must + have rank ``N``. + For a set of ``n`` vectors with lengths ``N0``, ``N1``, ``N2``, ... + The cartesian indexing results in arrays of shape + ``(N1, N0, N2, ...)``, while the + matrix indexing results in arrays of shape + ``(N0, N1, N2, ...)``. + Default: ``"xy"``. 
+ + Raises: + ValueError: If vectors are not of the same data type, or are not + one-dimensional. + + """ + ref_dt = None + ref_unset = True + for array in arrays: + if not isinstance(array, dpt.usm_ndarray): + raise TypeError( + f"Expected instance of dpt.usm_ndarray, got {type(array)}." + ) + if array.ndim != 1: + raise ValueError("All arrays must be one-dimensional.") + if ref_unset: + ref_unset = False + ref_dt = array.dtype + else: + if not ref_dt == array.dtype: + raise ValueError( + "All arrays must be of the same numeric data type." + ) + if indexing not in ["xy", "ij"]: + raise ValueError( + "Unrecognized indexing keyword value, expecting 'xy' or 'ij.'" + ) + n = len(arrays) + if n == 0: + return [] + + sh = (-1,) + (1,) * (n - 1) + + res = [] + if n > 1 and indexing == "xy": + res.append(dpt_ext.reshape(arrays[0], (1, -1) + sh[2:], copy=True)) + res.append(dpt_ext.reshape(arrays[1], sh, copy=True)) + arrays, sh = arrays[2:], sh[-2:] + sh[:-2] + + for array in arrays: + res.append(dpt_ext.reshape(array, sh, copy=True)) + sh = sh[-1:] + sh[:-1] + + output = dpt.broadcast_arrays(*res) + + return output + + def tril(x, /, *, k=0): """ Returns the lower triangular part of a matrix (or a stack of matrices) From 8c15ddb7b5c4bfe555b79432cb3281f9c3654890 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 04:23:19 -0800 Subject: [PATCH 086/145] Move ti.ones() to dpctl_ext/tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_ctors.py | 68 ++++++++++++++++++++++++++++++++ dpnp/dpnp_container.py | 2 +- dpnp/dpnp_iface_arraycreation.py | 2 +- dpnp/tests/test_memory.py | 5 ++- dpnp/tests/test_sycl_queue.py | 4 +- dpnp/tests/test_usm_type.py | 4 +- 7 files changed, 82 insertions(+), 5 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 782170bcd07f..faa2cf3a07d8 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -46,6 +46,7 @@ full_like, 
linspace, meshgrid, + ones, tril, triu, ) @@ -88,6 +89,7 @@ "linspace", "meshgrid", "nonzero", + "ones", "place", "put", "put_along_axis", diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index 57f4769b6616..ede3a9e207a1 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -1518,6 +1518,74 @@ def meshgrid(*arrays, indexing="xy"): return output +def ones( + shape, + *, + dtype=None, + order="C", + device=None, + usm_type="device", + sycl_queue=None, +): + """ones(shape, dtype=None, order="C", \ + device=None, usm_type="device", sycl_queue=None) + + Returns a new :class:`dpctl.tensor.usm_ndarray` having a specified + shape and filled with ones. + + Args: + shape (Tuple[int], int): + Dimensions of the array to be created. + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, + or a NumPy scalar type. Default: ``None`` + order ("C", or "F"): memory layout for the array. Default: ``"C"`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. 
Default: ``None`` + + Returns: + usm_ndarray: + Created array initialized with ones. + """ + if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'F' or 'C'." + ) + order = order[0].upper() + dpctl.utils.validate_usm_type(usm_type, allow_none=False) + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dtype = _get_dtype(dtype, sycl_queue) + res = dpt.usm_ndarray( + shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, full_ev = ti._full_usm_ndarray(1, res, sycl_queue) + _manager.add_event_pair(hev, full_ev) + return res + + def tril(x, /, *, k=0): """ Returns the lower triangular part of a matrix (or a stack of matrices) diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index e2c5070d807e..cb2e775a9653 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -260,7 +260,7 @@ def ones( order = "C" """Creates `dpnp_array` of ones with the given shape, dtype, and order.""" - array_obj = dpt.ones( + array_obj = dpt_ext.ones( shape, dtype=dtype, order=order, diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py index 64c34ed80fce..078ac7a10f4b 100644 --- a/dpnp/dpnp_iface_arraycreation.py +++ b/dpnp/dpnp_iface_arraycreation.py @@ -3696,7 +3696,7 @@ def tri( if usm_type is None: usm_type = "device" - m = dpt.ones( + m = dpt_ext.ones( (N, M), dtype=_dtype, device=device, diff --git a/dpnp/tests/test_memory.py b/dpnp/tests/test_memory.py index 1bc0da8c1535..94aeda33f505 100644 --- a/dpnp/tests/test_memory.py +++ b/dpnp/tests/test_memory.py @@ -2,6 +2,9 @@ import numpy import pytest +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp import dpnp.memory as dpm @@ -21,7 
+24,7 @@ def test_wrong_input_type(self, x): dpm.create_data(x) def test_wrong_usm_data(self): - a = dpt.ones(10) + a = dpt_ext.ones(10) d = IntUsmData(a.shape, buffer=a) with pytest.raises(TypeError): diff --git a/dpnp/tests/test_sycl_queue.py b/dpnp/tests/test_sycl_queue.py index d1853579036a..a9c076a7c476 100644 --- a/dpnp/tests/test_sycl_queue.py +++ b/dpnp/tests/test_sycl_queue.py @@ -2,12 +2,14 @@ import tempfile import dpctl -import dpctl.tensor as dpt import numpy import pytest from dpctl.utils import ExecutionPlacementError from numpy.testing import assert_array_equal, assert_raises +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt import dpnp import dpnp.linalg from dpnp.dpnp_array import dpnp_array diff --git a/dpnp/tests/test_usm_type.py b/dpnp/tests/test_usm_type.py index 4fc0f2b958fa..8f8efd1cdd10 100644 --- a/dpnp/tests/test_usm_type.py +++ b/dpnp/tests/test_usm_type.py @@ -2,11 +2,13 @@ import tempfile from math import prod -import dpctl.tensor as dpt import dpctl.utils as du import numpy import pytest +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt import dpnp from dpnp.dpnp_utils import get_usm_allocations From 23d2229b2f643cd0583d26e640b93bb4f779d86b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 04:34:06 -0800 Subject: [PATCH 087/145] Move ti.ones_like() to dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_ctors.py | 81 ++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index faa2cf3a07d8..b87d5f435782 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -47,6 +47,7 @@ linspace, meshgrid, ones, + ones_like, tril, triu, ) @@ -90,6 +91,7 @@ "meshgrid", "nonzero", "ones", + "ones_like", "place", "put", "put_along_axis", diff --git 
a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index ede3a9e207a1..ced78b9d2dcc 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -1586,6 +1586,87 @@ def ones( return res +def ones_like( + x, /, *, dtype=None, order="K", device=None, usm_type=None, sycl_queue=None +): + """ + Returns a new :class:`dpctl.tensor.usm_ndarray` filled with ones and + having the same `shape` as the input array `x`. + + Args: + x (usm_ndarray): + Input array from which to derive the output array shape + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, + or a NumPy scalar type. Default: `None` + order ("C", "F", "A", or "K"): + memory layout for the array. Default: ``"C"`` + device (optional): + array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + New array initialized with ones. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected instance of dpt.usm_ndarray, got {type(x)}.") + if ( + not isinstance(order, str) + or len(order) == 0 + or order[0] not in "CcFfAaKk" + ): + raise ValueError( + "Unrecognized order keyword value, expecting 'C', 'F', 'A', or 'K'." + ) + order = order[0].upper() + if dtype is None: + dtype = x.dtype + if usm_type is None: + usm_type = x.usm_type + dpctl.utils.validate_usm_type(usm_type, allow_none=False) + if device is None and sycl_queue is None: + device = x.device + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dtype = dpt.dtype(dtype) + order = _normalize_order(order, x) + if order == "K": + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + res = _empty_like_orderK(x, dtype, usm_type, sycl_queue) + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, full_ev = ti._full_usm_ndarray(1, res, sycl_queue) + _manager.add_event_pair(hev, full_ev) + return res + else: + sh = x.shape + return ones( + sh, + dtype=dtype, + order=order, + device=device, + usm_type=usm_type, + sycl_queue=sycl_queue, + ) + + def tril(x, /, *, k=0): """ Returns the lower triangular part of a matrix (or a stack of matrices) From 97dc7e1e67da1aa83b83c7b99c65bfa8865bca11 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 04:39:19 -0800 Subject: [PATCH 088/145] Move ti.zeros() to dpctl_ext/tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_ctors.py | 75 ++++++++++++++++++++++++++++++++++-- dpnp/dpnp_container.py | 23 ++++++----- 3 files changed, 85 insertions(+), 15 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index b87d5f435782..7da3d622bb02 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -50,6 +50,7 @@ ones_like, tril, triu, + zeros, ) from dpctl_ext.tensor._indexing_functions 
import ( extract, @@ -105,4 +106,5 @@ "tril", "triu", "where", + "zeros", ] diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index ced78b9d2dcc..da1c3dc19017 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -1091,7 +1091,7 @@ def eye( n_cols = n_rows if n_cols is None else operator.index(n_cols) k = operator.index(k) if k >= n_cols or -k >= n_rows: - return dpt.zeros( + return dpt_ext.zeros( (n_rows, n_cols), dtype=dtype, order=order, @@ -1720,7 +1720,7 @@ def tril(x, /, *, k=0): ) _manager.add_event_pair(hev, cpy_ev) elif k < -shape[nd - 2]: - res = dpt.zeros( + res = dpt_ext.zeros( x.shape, dtype=x.dtype, order=order, @@ -1784,7 +1784,7 @@ def triu(x, /, *, k=0): q = x.sycl_queue if k > shape[nd - 1]: - res = dpt.zeros( + res = dpt_ext.zeros( x.shape, dtype=x.dtype, order=order, @@ -1821,3 +1821,72 @@ def triu(x, /, *, k=0): _manager.add_event_pair(hev, triu_ev) return res + + +def zeros( + shape, + *, + dtype=None, + order="C", + device=None, + usm_type="device", + sycl_queue=None, +): + """ + Returns a new :class:`dpctl.tensor.usm_ndarray` having a specified + shape and filled with zeros. + + Args: + shape (Tuple[int], int): + Dimensions of the array to be created. + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, + or a NumPy scalar type. Default: ``None`` + order ("C", or "F"): + memory layout for the array. Default: ``"C"`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. 
+ Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + Constructed array initialized with zeros. + """ + if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'F' or 'C'." + ) + order = order[0].upper() + dpctl.utils.validate_usm_type(usm_type, allow_none=False) + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dtype = _get_dtype(dtype, sycl_queue) + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + res = dpt.usm_ndarray( + shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, zeros_ev = ti._zeros_usm_ndarray(res, sycl_queue) + _manager.add_event_pair(hev, zeros_ev) + + return res diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index cb2e775a9653..9fe955746593 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -35,12 +35,11 @@ """ -import dpctl.tensor as dpt import dpctl.utils as dpu # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor -import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor as dpt import dpnp from dpnp.dpnp_array import dpnp_array @@ -75,7 +74,7 @@ def arange( sycl_queue=sycl_queue, device=device ) - array_obj = dpt_ext.arange( + array_obj = dpt.arange( start, stop=stop, step=step, @@ -103,7 +102,7 @@ def asarray( """Converts incoming 'x1' 
object to 'dpnp_array'.""" if isinstance(x1, (list, tuple, range)): - array_obj = dpt_ext.asarray( + array_obj = dpt.asarray( x1, dtype=dtype, copy=copy, @@ -122,7 +121,7 @@ def asarray( x1_obj, device=device, sycl_queue=sycl_queue ) - array_obj = dpt_ext.asarray( + array_obj = dpt.asarray( x1_obj, dtype=dtype, copy=copy, @@ -143,7 +142,7 @@ def copy(x1, /, *, order="K"): if order is None: order = "K" - array_obj = dpt_ext.copy(dpnp.get_usm_ndarray(x1), order=order) + array_obj = dpt.copy(dpnp.get_usm_ndarray(x1), order=order) return dpnp_array._create_from_usm_ndarray(array_obj) @@ -165,7 +164,7 @@ def empty( order = "C" """Creates `dpnp_array` from uninitialized USM allocation.""" - array_obj = dpt_ext.empty( + array_obj = dpt.empty( shape, dtype=dtype, order=order, @@ -196,7 +195,7 @@ def eye( order = "C" """Creates `dpnp_array` with ones on the `k`th diagonal.""" - array_obj = dpt_ext.eye( + array_obj = dpt.eye( N, M, k=k, @@ -231,7 +230,7 @@ def full( fill_value = fill_value.get_array() """Creates `dpnp_array` having a specified shape, filled with fill_value.""" - array_obj = dpt_ext.full( + array_obj = dpt.full( shape, fill_value, dtype=dtype, @@ -260,7 +259,7 @@ def ones( order = "C" """Creates `dpnp_array` of ones with the given shape, dtype, and order.""" - array_obj = dpt_ext.ones( + array_obj = dpt.ones( shape, dtype=dtype, order=order, @@ -272,13 +271,13 @@ def ones( def tril(x1, /, *, k=0): """Creates `dpnp_array` as lower triangular part of an input array.""" - array_obj = dpt_ext.tril(dpnp.get_usm_ndarray(x1), k=k) + array_obj = dpt.tril(dpnp.get_usm_ndarray(x1), k=k) return dpnp_array._create_from_usm_ndarray(array_obj) def triu(x1, /, *, k=0): """Creates `dpnp_array` as upper triangular part of an input array.""" - array_obj = dpt_ext.triu(dpnp.get_usm_ndarray(x1), k=k) + array_obj = dpt.triu(dpnp.get_usm_ndarray(x1), k=k) return dpnp_array._create_from_usm_ndarray(array_obj) From a6c397ea61c8013dad9a0c4c6a14ffc155cdc2d4 Mon Sep 17 00:00:00 2001 
From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 04:41:49 -0800 Subject: [PATCH 089/145] Move ti.zeros_like() to dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_ctors.py | 84 ++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 7da3d622bb02..a6c169d3fad9 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -51,6 +51,7 @@ tril, triu, zeros, + zeros_like, ) from dpctl_ext.tensor._indexing_functions import ( extract, @@ -107,4 +108,5 @@ "triu", "where", "zeros", + "zeros_like", ] diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index da1c3dc19017..2648290f36b0 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -1890,3 +1890,87 @@ def zeros( _manager.add_event_pair(hev, zeros_ev) return res + + +def zeros_like( + x, /, *, dtype=None, order="K", device=None, usm_type=None, sycl_queue=None +): + """ + Creates :class:`dpctl.tensor.usm_ndarray` from USM allocation + initialized with zeros. + + Args: + x (usm_ndarray): + Input array from which to derive the shape of the + output array. + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, or a + NumPy scalar type. If `None`, output array has the same data + type as the input array. Default: ``None`` + order ("C", or "F"): + memory layout for the array. Default: ``"C"`` + device (optional): + array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. 
+ Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + New array initialized with zeros. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected instance of dpt.usm_ndarray, got {type(x)}.") + if ( + not isinstance(order, str) + or len(order) == 0 + or order[0] not in "CcFfAaKk" + ): + raise ValueError( + "Unrecognized order keyword value, expecting 'C', 'F', 'A', or 'K'." + ) + order = order[0].upper() + if dtype is None: + dtype = x.dtype + if usm_type is None: + usm_type = x.usm_type + dpctl.utils.validate_usm_type(usm_type, allow_none=False) + if device is None and sycl_queue is None: + device = x.device + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dtype = dpt.dtype(dtype) + order = _normalize_order(order, x) + if order == "K": + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + res = _empty_like_orderK(x, dtype, usm_type, sycl_queue) + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, full_ev = ti._full_usm_ndarray(0, res, sycl_queue) + _manager.add_event_pair(hev, full_ev) + return res + else: + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + sh = x.shape + return zeros( + sh, + dtype=dtype, + order=order, + device=device, + usm_type=usm_type, + sycl_queue=sycl_queue, + ) From b8c5390868d14280c8c621c826fe110fa59bfc6b Mon 
Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 05:50:22 -0800 Subject: [PATCH 090/145] Move ti.broadcast_to() to dpctl_ext/tensor and reuse it in dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_clip.py | 32 ++++++------ dpctl_ext/tensor/_copy_utils.py | 2 +- dpctl_ext/tensor/_ctors.py | 4 +- dpctl_ext/tensor/_indexing_functions.py | 2 +- dpctl_ext/tensor/_manipulation_functions.py | 54 +++++++++++++++++++++ dpctl_ext/tensor/_search_functions.py | 6 +-- 7 files changed, 79 insertions(+), 23 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index a6c169d3fad9..240323cf45fa 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -63,6 +63,7 @@ take_along_axis, ) from dpctl_ext.tensor._manipulation_functions import ( + broadcast_to, repeat, roll, ) @@ -76,6 +77,7 @@ "asarray", "asnumpy", "astype", + "broadcast_to", "can_cast", "copy", "clip", diff --git a/dpctl_ext/tensor/_clip.py b/dpctl_ext/tensor/_clip.py index bab2271c6453..9fc42abc0d8b 100644 --- a/dpctl_ext/tensor/_clip.py +++ b/dpctl_ext/tensor/_clip.py @@ -205,9 +205,9 @@ def _clip_none(x, val, out, order, _binary_fn): order=order, ) if x_shape != res_shape: - x = dpt.broadcast_to(x, res_shape) + x = dpt_ext.broadcast_to(x, res_shape) if val_ary.shape != res_shape: - val_ary = dpt.broadcast_to(val_ary, res_shape) + val_ary = dpt_ext.broadcast_to(val_ary, res_shape) _manager = SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events ht_binary_ev, binary_ev = _binary_fn( @@ -251,8 +251,8 @@ def _clip_none(x, val, out, order, _binary_fn): ) if x_shape != res_shape: - x = dpt.broadcast_to(x, res_shape) - buf = dpt.broadcast_to(buf, res_shape) + x = dpt_ext.broadcast_to(x, res_shape) + buf = dpt_ext.broadcast_to(buf, res_shape) ht_binary_ev, binary_ev = _binary_fn( src1=x, src2=buf, @@ -580,11 +580,11 @@ def clip(x, /, min=None, max=None, out=None, order="K"): order=order, ) if x_shape != 
res_shape: - x = dpt.broadcast_to(x, res_shape) + x = dpt_ext.broadcast_to(x, res_shape) if a_min.shape != res_shape: - a_min = dpt.broadcast_to(a_min, res_shape) + a_min = dpt_ext.broadcast_to(a_min, res_shape) if a_max.shape != res_shape: - a_max = dpt.broadcast_to(a_max, res_shape) + a_max = dpt_ext.broadcast_to(a_max, res_shape) _manager = SequentialOrderManager[exec_q] dep_ev = _manager.submitted_events ht_binary_ev, binary_ev = ti._clip( @@ -639,10 +639,10 @@ def clip(x, /, min=None, max=None, out=None, order="K"): order=order, ) - x = dpt.broadcast_to(x, res_shape) + x = dpt_ext.broadcast_to(x, res_shape) if a_min.shape != res_shape: - a_min = dpt.broadcast_to(a_min, res_shape) - buf2 = dpt.broadcast_to(buf2, res_shape) + a_min = dpt_ext.broadcast_to(a_min, res_shape) + buf2 = dpt_ext.broadcast_to(buf2, res_shape) ht_binary_ev, binary_ev = ti._clip( src=x, min=a_min, @@ -695,10 +695,10 @@ def clip(x, /, min=None, max=None, out=None, order="K"): order=order, ) - x = dpt.broadcast_to(x, res_shape) - buf1 = dpt.broadcast_to(buf1, res_shape) + x = dpt_ext.broadcast_to(x, res_shape) + buf1 = dpt_ext.broadcast_to(buf1, res_shape) if a_max.shape != res_shape: - a_max = dpt.broadcast_to(a_max, res_shape) + a_max = dpt_ext.broadcast_to(a_max, res_shape) ht_binary_ev, binary_ev = ti._clip( src=x, min=buf1, @@ -766,9 +766,9 @@ def clip(x, /, min=None, max=None, out=None, order="K"): order=order, ) - x = dpt.broadcast_to(x, res_shape) - buf1 = dpt.broadcast_to(buf1, res_shape) - buf2 = dpt.broadcast_to(buf2, res_shape) + x = dpt_ext.broadcast_to(x, res_shape) + buf1 = dpt_ext.broadcast_to(buf1, res_shape) + buf2 = dpt_ext.broadcast_to(buf2, res_shape) ht_, clip_ev = ti._clip( src=x, min=buf1, diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py index bdcd5d123072..f2544e06718a 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -368,7 +368,7 @@ def _put_multi_index(ary, inds, p, vals, mode=0): rhs = vals else: 
rhs = dpt_ext.astype(vals, ary.dtype) - rhs = dpt.broadcast_to(rhs, expected_vals_shape) + rhs = dpt_ext.broadcast_to(rhs, expected_vals_shape) _manager = dpctl.utils.SequentialOrderManager[exec_q] dep_ev = _manager.submitted_events hev, put_ev = ti._put( diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index 2648290f36b0..03f97c7cd8f0 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -1200,7 +1200,7 @@ def full( usm_type=usm_type, sycl_queue=sycl_queue, ) - return dpt_ext.copy(dpt.broadcast_to(X, shape), order=order) + return dpt_ext.copy(dpt_ext.broadcast_to(X, shape), order=order) else: _validate_fill_value(fill_value) @@ -1307,7 +1307,7 @@ def full_like( usm_type=usm_type, sycl_queue=sycl_queue, ) - X = dpt.broadcast_to(X, sh) + X = dpt_ext.broadcast_to(X, sh) res = _empty_like_orderK(x, dtype, usm_type, sycl_queue) _manager = dpctl.utils.SequentialOrderManager[sycl_queue] # order copy after tasks populating X diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py index 369bd6a49022..91ffc759a920 100644 --- a/dpctl_ext/tensor/_indexing_functions.py +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -341,7 +341,7 @@ def put_vec_duplicates(vec, ind, vals): rhs = vals else: rhs = dpt_ext.astype(vals, x.dtype) - rhs = dpt.broadcast_to(rhs, val_shape) + rhs = dpt_ext.broadcast_to(rhs, val_shape) _manager = dpctl.utils.SequentialOrderManager[exec_q] deps_ev = _manager.submitted_events diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 37cba6828818..9eeca08df8e3 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -86,6 +86,60 @@ def _broadcast_shape_impl(shapes): return tuple(common_shape) +def _broadcast_strides(X_shape, X_strides, res_ndim): + """ + Broadcasts strides to match the given dimensions; + returns tuple type strides. 
+ """ + out_strides = [0] * res_ndim + X_shape_len = len(X_shape) + str_dim = -X_shape_len + for i in range(X_shape_len): + shape_value = X_shape[i] + if not shape_value == 1: + out_strides[str_dim] = X_strides[i] + str_dim += 1 + + return tuple(out_strides) + + +def broadcast_to(X, /, shape): + """broadcast_to(x, shape) + + Broadcast an array to a new `shape`; returns the broadcasted + :class:`dpctl.tensor.usm_ndarray` as a view. + + Args: + x (usm_ndarray): input array + shape (Tuple[int,...]): array shape. The `shape` must be + compatible with `x` according to broadcasting rules. + + Returns: + usm_ndarray: + An array with the specified `shape`. + The output array is a view of the input array, and + hence has the same data type, USM allocation type and + device attributes. + """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + + # Use numpy.broadcast_to to check the validity of the input + # parameter 'shape'. Raise ValueError if 'X' is not compatible + # with 'shape' according to NumPy's broadcasting rules. 
+ new_array = np.broadcast_to( + np.broadcast_to(np.empty(tuple(), dtype="u1"), X.shape), shape + ) + new_sts = _broadcast_strides(X.shape, X.strides, new_array.ndim) + return dpt.usm_ndarray( + shape=new_array.shape, + dtype=X.dtype, + buffer=X, + strides=new_sts, + offset=X._element_offset, + ) + + def repeat(x, repeats, /, *, axis=None): """repeat(x, repeats, axis=None) diff --git a/dpctl_ext/tensor/_search_functions.py b/dpctl_ext/tensor/_search_functions.py index cc76bfe8b543..26100b0479f7 100644 --- a/dpctl_ext/tensor/_search_functions.py +++ b/dpctl_ext/tensor/_search_functions.py @@ -389,11 +389,11 @@ def where(condition, x1, x2, /, *, order="K", out=None): ) if condition_shape != res_shape: - condition = dpt.broadcast_to(condition, res_shape) + condition = dpt_ext.broadcast_to(condition, res_shape) if x1_shape != res_shape: - x1 = dpt.broadcast_to(x1, res_shape) + x1 = dpt_ext.broadcast_to(x1, res_shape) if x2_shape != res_shape: - x2 = dpt.broadcast_to(x2, res_shape) + x2 = dpt_ext.broadcast_to(x2, res_shape) dep_evs = _manager.submitted_events hev, where_ev = ti._where( From a7cbfdc150e0854e0d7b4b659ea893e39da9a70b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 05:52:18 -0800 Subject: [PATCH 091/145] Reuse dpctl_ext.tensor.broadcast_to() in dpnp --- dpnp/dpnp_algo/dpnp_elementwise_common.py | 4 ++-- dpnp/dpnp_algo/dpnp_fill.py | 5 ++--- dpnp/dpnp_iface_manipulation.py | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index 85dc9473206c..b3e0c74c228e 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -1156,9 +1156,9 @@ def __call__( # Broadcast shapes of input arrays if x1.shape != res_shape: - x1 = dpt.broadcast_to(x1, res_shape) + x1 = dpt_ext.broadcast_to(x1, res_shape) if x2.shape != res_shape: - x2 = dpt.broadcast_to(x2, res_shape) + x2 = dpt_ext.broadcast_to(x2, 
res_shape) # Call the binary function with input and output arrays ht_binary_ev, binary_ev = self.get_implementation_function()( diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py index ddba9f634cb1..7e0a70f25ff9 100644 --- a/dpnp/dpnp_algo/dpnp_fill.py +++ b/dpnp/dpnp_algo/dpnp_fill.py @@ -28,13 +28,12 @@ from numbers import Number -import dpctl.tensor as dpt import dpctl.utils as dpu from dpctl.tensor._ctors import _cast_fill_val # TODO: revert to `from dpctl.tensor...` # when dpnp fully migrates dpctl/tensor -import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor as dpt import dpnp from dpctl_ext.tensor._tensor_impl import ( _copy_usm_ndarray_into_usm_ndarray, @@ -56,7 +55,7 @@ def dpnp_fill(arr, val): raise dpu.ExecutionPlacementError( "Input arrays have incompatible queues." ) - a_val = dpt_ext.astype(val, arr.dtype) + a_val = dpt.astype(val, arr.dtype) a_val = dpt.broadcast_to(a_val, arr.shape) _manager = dpu.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index e4a12346bc6c..5727d082e163 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -1178,7 +1178,7 @@ def broadcast_to(array, /, shape, subok=False): raise NotImplementedError(f"subok={subok} is currently not supported") usm_array = dpnp.get_usm_ndarray(array) - new_array = dpt.broadcast_to(usm_array, shape) + new_array = dpt_ext.broadcast_to(usm_array, shape) return dpnp_array._create_from_usm_ndarray(new_array) From bd265dabf9c6275255babae2f624ff94b2fe08d9 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 07:02:49 -0800 Subject: [PATCH 092/145] Move ti.broadcast_arrays() to dpctl_ext.tensor and reuse it --- dpctl_ext/tensor/__init__.py | 2 ++ dpctl_ext/tensor/_copy_utils.py | 2 +- dpctl_ext/tensor/_ctors.py | 2 +- dpctl_ext/tensor/_manipulation_functions.py | 40 +++++++++++++++++++++ dpnp/dpnp_iface_arraycreation.py | 2 
+- dpnp/dpnp_iface_indexing.py | 2 +- 6 files changed, 46 insertions(+), 4 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 240323cf45fa..756ab4c3a422 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -63,6 +63,7 @@ take_along_axis, ) from dpctl_ext.tensor._manipulation_functions import ( + broadcast_arrays, broadcast_to, repeat, roll, @@ -77,6 +78,7 @@ "asarray", "asnumpy", "astype", + "broadcast_arrays", "broadcast_to", "can_cast", "copy", diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py index f2544e06718a..78dbb5be17e8 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -306,7 +306,7 @@ def _prepare_indices_arrays(inds, q, usm_type): ) # broadcast - inds = dpt.broadcast_arrays(*inds) + inds = dpt_ext.broadcast_arrays(*inds) return inds diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index 03f97c7cd8f0..532802c0c519 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -1513,7 +1513,7 @@ def meshgrid(*arrays, indexing="xy"): res.append(dpt_ext.reshape(array, sh, copy=True)) sh = sh[-1:] + sh[:-1] - output = dpt.broadcast_arrays(*res) + output = dpt_ext.broadcast_arrays(*res) return output diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 9eeca08df8e3..7a2b5b6de910 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -47,6 +47,15 @@ ) +def _broadcast_shapes(*args): + """ + Broadcast the input shapes into a single shape; + returns tuple broadcasted shape. 
+ """ + array_shapes = [array.shape for array in args] + return _broadcast_shape_impl(array_shapes) + + def _broadcast_shape_impl(shapes): if len(set(shapes)) == 1: return shapes[0] @@ -103,6 +112,37 @@ def _broadcast_strides(X_shape, X_strides, res_ndim): return tuple(out_strides) +def broadcast_arrays(*args): + """broadcast_arrays(*arrays) + + Broadcasts one or more :class:`dpctl.tensor.usm_ndarrays` against + one another. + + Args: + arrays (usm_ndarray): an arbitrary number of arrays to be + broadcasted. + + Returns: + List[usm_ndarray]: + A list of broadcasted arrays. Each array + must have the same shape. Each array must have the same `dtype`, + `device` and `usm_type` attributes as its corresponding input + array. + """ + if len(args) == 0: + raise ValueError("`broadcast_arrays` requires at least one argument") + for X in args: + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + + shape = _broadcast_shapes(*args) + + if all(X.shape == shape for X in args): + return args + + return [broadcast_to(X, shape) for X in args] + + def broadcast_to(X, /, shape): """broadcast_to(x, shape) diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py index 078ac7a10f4b..d09cc17bde79 100644 --- a/dpnp/dpnp_iface_arraycreation.py +++ b/dpnp/dpnp_iface_arraycreation.py @@ -3131,7 +3131,7 @@ def meshgrid(*xi, copy=True, sparse=False, indexing="xy"): output[1] = dpt_ext.reshape(output[1], (-1, 1) + s0[2:]) if not sparse: - output = dpt.broadcast_arrays(*output) + output = dpt_ext.broadcast_arrays(*output) if copy: output = [dpt_ext.copy(x) for x in output] diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 58c8f495c440..a52196e9e4db 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -260,7 +260,7 @@ def choose(a, choices, out=None, mode="wrap"): choices, ) ) - arrs_broadcast = dpt.broadcast_arrays(inds, *choices) + arrs_broadcast = 
dpt_ext.broadcast_arrays(inds, *choices) inds = arrs_broadcast[0] choices = tuple(arrs_broadcast[1:]) From 080c7d801c29436b951dec9b7448c714107b30fc Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 07:10:11 -0800 Subject: [PATCH 093/145] Move ti.concat() to dpctl_ext.tensor and reuse it --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_manipulation_functions.py | 179 ++++++++++++++++++++ dpnp/dpnp_iface_manipulation.py | 2 +- 3 files changed, 182 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 756ab4c3a422..84ae89b9a269 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -65,6 +65,7 @@ from dpctl_ext.tensor._manipulation_functions import ( broadcast_arrays, broadcast_to, + concat, repeat, roll, ) @@ -81,6 +82,7 @@ "broadcast_arrays", "broadcast_to", "can_cast", + "concat", "copy", "clip", "empty", diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 7a2b5b6de910..2bb92c2616d6 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -40,6 +40,7 @@ import dpctl_ext.tensor._tensor_impl as ti from ._numpy_helper import normalize_axis_index, normalize_axis_tuple +from ._type_utils import _supported_dtype, _to_device_supported_dtype __doc__ = ( "Implementation module for array manipulation " @@ -47,6 +48,46 @@ ) +def _arrays_validation(arrays, check_ndim=True): + n = len(arrays) + if n == 0: + raise TypeError("Missing 1 required positional argument: 'arrays'.") + + if not isinstance(arrays, (list, tuple)): + raise TypeError(f"Expected tuple or list type, got {type(arrays)}.") + + for X in arrays: + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + + exec_q = dputils.get_execution_queue([X.sycl_queue for X in arrays]) + if exec_q is None: + raise ValueError("All the input arrays must have 
same sycl queue.") + + res_usm_type = dputils.get_coerced_usm_type([X.usm_type for X in arrays]) + if res_usm_type is None: + raise ValueError("All the input arrays must have usm_type.") + + X0 = arrays[0] + _supported_dtype(Xi.dtype for Xi in arrays) + + res_dtype = X0.dtype + dev = exec_q.sycl_device + for i in range(1, n): + res_dtype = np.promote_types(res_dtype, arrays[i]) + res_dtype = _to_device_supported_dtype(res_dtype, dev) + + if check_ndim: + for i in range(1, n): + if X0.ndim != arrays[i].ndim: + raise ValueError( + "All the input arrays must have same number of dimensions, " + f"but the array at index 0 has {X0.ndim} dimension(s) and " + f"the array at index {i} has {arrays[i].ndim} dimension(s)." + ) + return res_dtype, res_usm_type, exec_q + + def _broadcast_shapes(*args): """ Broadcast the input shapes into a single shape; @@ -112,6 +153,74 @@ def _broadcast_strides(X_shape, X_strides, res_ndim): return tuple(out_strides) +def _check_same_shapes(X0_shape, axis, n, arrays): + for i in range(1, n): + Xi_shape = arrays[i].shape + for j, X0j in enumerate(X0_shape): + if X0j != Xi_shape[j] and j != axis: + raise ValueError( + "All the input array dimensions for the concatenation " + f"axis must match exactly, but along dimension {j}, the " + f"array at index 0 has size {X0j} and the array " + f"at index {i} has size {Xi_shape[j]}." 
+ ) + + +def _concat_axis_None(arrays): + """Implementation of concat(arrays, axis=None).""" + res_dtype, res_usm_type, exec_q = _arrays_validation( + arrays, check_ndim=False + ) + res_shape = 0 + for array in arrays: + res_shape += array.size + res = dpt_ext.empty( + res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q + ) + + fill_start = 0 + _manager = dputils.SequentialOrderManager[exec_q] + deps = _manager.submitted_events + for array in arrays: + fill_end = fill_start + array.size + if array.flags.c_contiguous: + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=dpt_ext.reshape(array, -1), + dst=res[fill_start:fill_end], + sycl_queue=exec_q, + depends=deps, + ) + _manager.add_event_pair(hev, cpy_ev) + else: + src_ = array + # _copy_usm_ndarray_for_reshape requires src and dst to have + # the same data type + if not array.dtype == res_dtype: + src2_ = dpt_ext.empty_like(src_, dtype=res_dtype) + ht_copy_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=src_, dst=src2_, sycl_queue=exec_q, depends=deps + ) + _manager.add_event_pair(ht_copy_ev, cpy_ev) + hev, reshape_copy_ev = ti._copy_usm_ndarray_for_reshape( + src=src2_, + dst=res[fill_start:fill_end], + sycl_queue=exec_q, + depends=[cpy_ev], + ) + _manager.add_event_pair(hev, reshape_copy_ev) + else: + hev, cpy_ev = ti._copy_usm_ndarray_for_reshape( + src=src_, + dst=res[fill_start:fill_end], + sycl_queue=exec_q, + depends=deps, + ) + _manager.add_event_pair(hev, cpy_ev) + fill_start = fill_end + + return res + + def broadcast_arrays(*args): """broadcast_arrays(*arrays) @@ -180,6 +289,76 @@ def broadcast_to(X, /, shape): ) +def concat(arrays, /, *, axis=0): + """concat(arrays, axis) + + Joins a sequence of arrays along an existing axis. + + Args: + arrays (Union[List[usm_ndarray, Tuple[usm_ndarray,...]]]): + input arrays to join. The arrays must have the same shape, + except in the dimension specified by `axis`. 
+ axis (Optional[int]): axis along which the arrays will be joined. + If `axis` is `None`, arrays must be flattened before + concatenation. If `axis` is negative, it is understood as + being counted from the last dimension. Default: `0`. + + Returns: + usm_ndarray: + An output array containing the concatenated + values. The output array data type is determined by Type + Promotion Rules of array API. + + All input arrays must have the same device attribute. The output array + is allocated on that same device, and data movement operations are + scheduled on a queue underlying the device. The USM allocation type + of the output array is determined by USM allocation type promotion + rules. + """ + if axis is None: + return _concat_axis_None(arrays) + + res_dtype, res_usm_type, exec_q = _arrays_validation(arrays) + n = len(arrays) + X0 = arrays[0] + + axis = normalize_axis_index(axis, X0.ndim) + X0_shape = X0.shape + _check_same_shapes(X0_shape, axis, n, arrays) + + res_shape_axis = 0 + for X in arrays: + res_shape_axis = res_shape_axis + X.shape[axis] + + res_shape = tuple( + X0_shape[i] if i != axis else res_shape_axis for i in range(X0.ndim) + ) + + res = dpt_ext.empty( + res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q + ) + + _manager = dputils.SequentialOrderManager[exec_q] + deps = _manager.submitted_events + fill_start = 0 + for i in range(n): + fill_end = fill_start + arrays[i].shape[axis] + c_shapes_copy = tuple( + np.s_[fill_start:fill_end] if j == axis else np.s_[:] + for j in range(X0.ndim) + ) + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arrays[i], + dst=res[c_shapes_copy], + sycl_queue=exec_q, + depends=deps, + ) + _manager.add_event_pair(hev, cpy_ev) + fill_start = fill_end + + return res + + def repeat(x, repeats, /, *, axis=None): """repeat(x, repeats, axis=None) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 5727d082e163..e90993da7b8a 100644 --- a/dpnp/dpnp_iface_manipulation.py 
+++ b/dpnp/dpnp_iface_manipulation.py @@ -1416,7 +1416,7 @@ def concatenate( ) usm_arrays = [dpnp.get_usm_ndarray(x) for x in arrays] - usm_res = dpt.concat(usm_arrays, axis=axis) + usm_res = dpt_ext.concat(usm_arrays, axis=axis) res = dpnp_array._create_from_usm_ndarray(usm_res) if dtype is not None: From ea7ea3c8d0b392a6612f1cc175de2d4562bc75c6 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 07:13:46 -0800 Subject: [PATCH 094/145] Move ti.expand_dims() to dpctl_ext.tensor and reuse it --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_manipulation_functions.py | 46 +++++++++++++++++++++ dpnp/dpnp_iface_manipulation.py | 2 +- 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 84ae89b9a269..10030b6c2107 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -66,6 +66,7 @@ broadcast_arrays, broadcast_to, concat, + expand_dims, repeat, roll, ) @@ -88,6 +89,7 @@ "empty", "empty_like", "extract", + "expand_dims", "eye", "finfo", "from_numpy", diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 2bb92c2616d6..969f5653f68c 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -359,6 +359,52 @@ def concat(arrays, /, *, axis=0): return res +def expand_dims(X, /, *, axis=0): + """expand_dims(x, axis) + + Expands the shape of an array by inserting a new axis (dimension) + of size one at the position specified by axis. + + Args: + x (usm_ndarray): + input array + axis (Union[int, Tuple[int]]): + axis position in the expanded axes (zero-based). If `x` has rank + (i.e, number of dimensions) `N`, a valid `axis` must reside + in the closed-interval `[-N-1, N]`. If provided a negative + `axis`, the `axis` position at which to insert a singleton + dimension is computed as `N + axis + 1`. 
Hence, if + provided `-1`, the resolved axis position is `N` (i.e., + a singleton dimension must be appended to the input array `x`). + If provided `-N-1`, the resolved axis position is `0` (i.e., a + singleton dimension is prepended to the input array `x`). + + Returns: + usm_ndarray: + Returns a view, if possible, and a copy otherwise with the number + of dimensions increased. + The expanded array has the same data type as the input array `x`. + The expanded array is located on the same device as the input + array, and has the same USM allocation type. + + Raises: + IndexError: if `axis` value is invalid. + """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + + if type(axis) not in (tuple, list): + axis = (axis,) + + out_ndim = len(axis) + X.ndim + axis = normalize_axis_tuple(axis, out_ndim) + + shape_it = iter(X.shape) + shape = tuple(1 if ax in axis else next(shape_it) for ax in range(out_ndim)) + + return dpt_ext.reshape(X, shape) + + def repeat(x, repeats, /, *, axis=None): """repeat(x, repeats, axis=None) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index e90993da7b8a..95897843821d 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -1849,7 +1849,7 @@ def expand_dims(a, axis): """ usm_a = dpnp.get_usm_ndarray(a) - usm_res = dpt.expand_dims(usm_a, axis=axis) + usm_res = dpt_ext.expand_dims(usm_a, axis=axis) return dpnp_array._create_from_usm_ndarray(usm_res) From d2e9279806cdb47075c9d4d2b00c496164c65027 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 07:16:20 -0800 Subject: [PATCH 095/145] Move ti.flip() to dpctl_ext.tensor and reuse it --- dpctl_ext/tensor/__init__.py | 2 ++ dpctl_ext/tensor/_manipulation_functions.py | 32 +++++++++++++++++++++ dpnp/dpnp_iface_manipulation.py | 2 +- 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py 
index 10030b6c2107..6c9a6b17cdde 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -67,6 +67,7 @@ broadcast_to, concat, expand_dims, + flip, repeat, roll, ) @@ -92,6 +93,7 @@ "expand_dims", "eye", "finfo", + "flip", "from_numpy", "full", "full_like", diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 969f5653f68c..5b06b6815162 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -405,6 +405,38 @@ def expand_dims(X, /, *, axis=0): return dpt_ext.reshape(X, shape) +def flip(X, /, *, axis=None): + """flip(x, axis) + + Reverses the order of elements in an array `x` along the given `axis`. + The shape of the array is preserved, but the elements are reordered. + + Args: + x (usm_ndarray): input array. + axis (Optional[Union[int, Tuple[int,...]]]): axis (or axes) along + which to flip. + If `axis` is `None`, all input array axes are flipped. + If `axis` is negative, the flipped axis is counted from the + last dimension. If provided more than one axis, only the specified + axes are flipped. Default: `None`. + + Returns: + usm_ndarray: + A view of `x` with the entries of `axis` reversed. 
+ """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + X_ndim = X.ndim + if axis is None: + indexer = (np.s_[::-1],) * X_ndim + else: + axis = normalize_axis_tuple(axis, X_ndim) + indexer = tuple( + np.s_[::-1] if i in axis else np.s_[:] for i in range(X.ndim) + ) + return X[indexer] + + def repeat(x, repeats, /, *, axis=None): """repeat(x, repeats, axis=None) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 95897843821d..a46c117a8ce2 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -1920,7 +1920,7 @@ def flip(m, axis=None): """ m_usm = dpnp.get_usm_ndarray(m) - return dpnp_array._create_from_usm_ndarray(dpt.flip(m_usm, axis=axis)) + return dpnp_array._create_from_usm_ndarray(dpt_ext.flip(m_usm, axis=axis)) def fliplr(m): From 4e63cca92daf956d097cfe3c2971ff5d4c9e9ea5 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 07:20:48 -0800 Subject: [PATCH 096/145] Move ti.permute_dims() to dpctl_ext.tensor and reuse it --- dpctl_ext/tensor/__init__.py | 2 ++ dpctl_ext/tensor/_copy_utils.py | 6 ++-- dpctl_ext/tensor/_manipulation_functions.py | 37 +++++++++++++++++++++ dpctl_ext/tensor/_reshape.py | 6 +++- dpnp/dpnp_array.py | 2 +- 5 files changed, 48 insertions(+), 5 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 6c9a6b17cdde..6c237b2e26c6 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -68,6 +68,7 @@ concat, expand_dims, flip, + permute_dims, repeat, roll, ) @@ -101,6 +102,7 @@ "isdtype", "linspace", "meshgrid", + "permute_dims", "nonzero", "ones", "ones_like", diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py index 78dbb5be17e8..878dabc581d2 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -695,7 +695,7 @@ def _make_empty_like_orderK(x, dt, usm_type, dev): for i in 
range(x.ndim) ) R = R[sl] - return dpt.permute_dims(R, inv_perm) + return dpt_ext.permute_dims(R, inv_perm) def _empty_like_orderK(x, dt, usm_type=None, dev=None): @@ -800,7 +800,7 @@ def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev): for i in range(nd1) ) R = R[sl] - return dpt.permute_dims(R, inv_perm) + return dpt_ext.permute_dims(R, inv_perm) def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev): @@ -876,7 +876,7 @@ def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev): for i in range(nd1) ) R = R[sl] - return dpt.permute_dims(R, inv_perm) + return dpt_ext.permute_dims(R, inv_perm) def copy(usm_ary, /, *, order="K"): diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 5b06b6815162..4dbd524423fd 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -437,6 +437,43 @@ def flip(X, /, *, axis=None): return X[indexer] +def permute_dims(X, /, axes): + """permute_dims(x, axes) + + Permute the axes (dimensions) of an array; returns the permuted + array as a view. + + Args: + x (usm_ndarray): input array. + axes (Tuple[int, ...]): tuple containing permutation of + `(0,1,...,N-1)` where `N` is the number of axes (dimensions) + of `x`. + Returns: + usm_ndarray: + An array with permuted axes. + The returned array must has the same data type as `x`, + is created on the same device as `x` and has the same USM allocation + type as `x`. + """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + axes = normalize_axis_tuple(axes, X.ndim, "axes") + if not X.ndim == len(axes): + raise ValueError( + "The length of the passed axes does not match " + "to the number of usm_ndarray dimensions." 
+ ) + newstrides = tuple(X.strides[i] for i in axes) + newshape = tuple(X.shape[i] for i in axes) + return dpt.usm_ndarray( + shape=newshape, + dtype=X.dtype, + buffer=X, + strides=newstrides, + offset=X._element_offset, + ) + + def repeat(x, repeats, /, *, axis=None): """repeat(x, repeats, axis=None) diff --git a/dpctl_ext/tensor/_reshape.py b/dpctl_ext/tensor/_reshape.py index 6afa1dc245c3..b7b6b068bfd0 100644 --- a/dpctl_ext/tensor/_reshape.py +++ b/dpctl_ext/tensor/_reshape.py @@ -37,6 +37,10 @@ _unravel_index, ) +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext + __doc__ = "Implementation module for :func:`dpctl.tensor.reshape`." @@ -184,7 +188,7 @@ def reshape(X, /, shape, *, order="C", copy=None): src=X, dst=flat_res, sycl_queue=copy_q, depends=dep_evs ) else: - X_t = dpt.permute_dims(X, range(X.ndim - 1, -1, -1)) + X_t = dpt_ext.permute_dims(X, range(X.ndim - 1, -1, -1)) hev, r_e = _copy_usm_ndarray_for_reshape( src=X_t, dst=flat_res, sycl_queue=copy_q, depends=dep_evs ) diff --git a/dpnp/dpnp_array.py b/dpnp/dpnp_array.py index 564627bacf2f..0d05ef8d49d5 100644 --- a/dpnp/dpnp_array.py +++ b/dpnp/dpnp_array.py @@ -2283,7 +2283,7 @@ def transpose(self, *axes): # self.transpose(None).shape == self.shape[::-1] axes = tuple((ndim - x - 1) for x in range(ndim)) - usm_res = dpt.permute_dims(self._array_obj, axes) + usm_res = dpt_ext.permute_dims(self._array_obj, axes) return dpnp_array._create_from_usm_ndarray(usm_res) def var( From 9c88edbcfc45d2912fb8584fd9a567bed94bc920 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 07:23:20 -0800 Subject: [PATCH 097/145] Move ti.moveaxis() to dpctl_ext.tensor and reuse it --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_manipulation_functions.py | 51 +++++++++++++++++++++ dpnp/dpnp_algo/dpnp_arraycreation.py | 2 +- dpnp/dpnp_iface_manipulation.py | 2 +- 4 files changed, 55 insertions(+), 2 deletions(-) diff --git 
a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 6c237b2e26c6..2a6b79b715d6 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -68,6 +68,7 @@ concat, expand_dims, flip, + moveaxis, permute_dims, repeat, roll, @@ -102,6 +103,7 @@ "isdtype", "linspace", "meshgrid", + "moveaxis", "permute_dims", "nonzero", "ones", diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 4dbd524423fd..97fc344b07c9 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -437,6 +437,57 @@ def flip(X, /, *, axis=None): return X[indexer] +def moveaxis(X, source, destination, /): + """moveaxis(x, source, destination) + + Moves axes of an array to new positions. + + Args: + x (usm_ndarray): input array + + source (int or a sequence of int): + Original positions of the axes to move. + These must be unique. If `x` has rank (i.e., number of + dimensions) `N`, a valid `axis` must be in the + half-open interval `[-N, N)`. + + destination (int or a sequence of int): + Destination positions for each of the original axes. + These must also be unique. If `x` has rank + (i.e., number of dimensions) `N`, a valid `axis` must be + in the half-open interval `[-N, N)`. + + Returns: + usm_ndarray: + Array with moved axes. + The returned array must has the same data type as `x`, + is created on the same device as `x` and has the same + USM allocation type as `x`. + + Raises: + AxisError: if `axis` value is invalid. + ValueError: if `src` and `dst` have not equal number of elements. 
+ """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + + source = normalize_axis_tuple(source, X.ndim, "source") + destination = normalize_axis_tuple(destination, X.ndim, "destination") + + if len(source) != len(destination): + raise ValueError( + "`source` and `destination` arguments must have " + "the same number of elements" + ) + + ind = [n for n in range(X.ndim) if n not in source] + + for src, dst in sorted(zip(destination, source)): + ind.insert(src, dst) + + return dpt_ext.permute_dims(X, tuple(ind)) + + def permute_dims(X, /, axes): """permute_dims(x, axes) diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py index 86b9642ecd5d..4e2ee8531a18 100644 --- a/dpnp/dpnp_algo/dpnp_arraycreation.py +++ b/dpnp/dpnp_algo/dpnp_arraycreation.py @@ -256,7 +256,7 @@ def dpnp_linspace( usm_res[-1, ...] = usm_stop if axis != 0: - usm_res = dpt.moveaxis(usm_res, 0, axis) + usm_res = dpt_ext.moveaxis(usm_res, 0, axis) if dpnp.issubdtype(dtype, dpnp.integer): dpt.floor(usm_res, out=usm_res) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index a46c117a8ce2..625da379b0d5 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -2408,7 +2408,7 @@ def moveaxis(a, source, destination): usm_array = dpnp.get_usm_ndarray(a) return dpnp_array._create_from_usm_ndarray( - dpt.moveaxis(usm_array, source, destination) + dpt_ext.moveaxis(usm_array, source, destination) ) From ba5163674a38f720259143435443af23a89ff73f Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 07:25:54 -0800 Subject: [PATCH 098/145] Move ti.squeeze() to dpctl_ext.tensor and reuse it --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_manipulation_functions.py | 45 +++++++++++++++++++++ dpnp/dpnp_iface_manipulation.py | 2 +- 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/__init__.py 
b/dpctl_ext/tensor/__init__.py index 2a6b79b715d6..d971c2b862cd 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -72,6 +72,7 @@ permute_dims, repeat, roll, + squeeze, ) from dpctl_ext.tensor._reshape import reshape @@ -115,6 +116,7 @@ "reshape", "result_type", "roll", + "squeeze", "take", "take_along_axis", "to_numpy", diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 97fc344b07c9..22cda939c2c3 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -828,3 +828,48 @@ def roll(x, /, shift, *, axis=None): ) _manager.add_event_pair(ht_e, roll_ev) return res + + +def squeeze(X, /, axis=None): + """squeeze(x, axis) + + Removes singleton dimensions (axes) from array `x`. + + Args: + x (usm_ndarray): input array + axis (Union[int, Tuple[int,...]]): axis (or axes) to squeeze. + + Returns: + usm_ndarray: + Output array is a view, if possible, + and a copy otherwise, but with all or a subset of the + dimensions of length 1 removed. Output has the same data + type as the input, is allocated on the same device as the + input and has the same USM allocation type as the input + array `x`. + + Raises: + ValueError: if the specified axis has a size greater than one. + """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + X_shape = X.shape + if axis is not None: + axis = normalize_axis_tuple(axis, X.ndim if X.ndim != 0 else X.ndim + 1) + new_shape = [] + for i, x in enumerate(X_shape): + if i not in axis: + new_shape.append(x) + else: + if x != 1: + raise ValueError( + "Cannot select an axis to squeeze out " + "which has size not equal to one." 
+ ) + new_shape = tuple(new_shape) + else: + new_shape = tuple(axis for axis in X_shape if axis != 1) + if new_shape == X.shape: + return X + else: + return dpt_ext.reshape(X, new_shape) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 625da379b0d5..c495b8866578 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -3663,7 +3663,7 @@ def squeeze(a, /, axis=None): """ usm_a = dpnp.get_usm_ndarray(a) - usm_res = dpt.squeeze(usm_a, axis=axis) + usm_res = dpt_ext.squeeze(usm_a, axis=axis) return dpnp_array._create_from_usm_ndarray(usm_res) From bb16c19b6dbecfed2276a8c4a2d06df6d69f0f9c Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 07:27:53 -0800 Subject: [PATCH 099/145] Move ti.stack() to dpctl_ext.tensor and reuse it --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_manipulation_functions.py | 61 +++++++++++++++++++++ dpnp/dpnp_iface_manipulation.py | 2 +- 3 files changed, 64 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index d971c2b862cd..626e2063a43e 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -73,6 +73,7 @@ repeat, roll, squeeze, + stack, ) from dpctl_ext.tensor._reshape import reshape @@ -117,6 +118,7 @@ "result_type", "roll", "squeeze", + "stack", "take", "take_along_axis", "to_numpy", diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 22cda939c2c3..0fe05ef70998 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -873,3 +873,64 @@ def squeeze(X, /, axis=None): return X else: return dpt_ext.reshape(X, new_shape) + + +def stack(arrays, /, *, axis=0): + """ + stack(arrays, axis) + + Joins a sequence of arrays along a new axis. + + Args: + arrays (Union[List[usm_ndarray], Tuple[usm_ndarray,...]]): + input arrays to join. Each array must have the same shape. 
+ axis (int): axis along which the arrays will be joined. Providing + an `axis` specified the index of the new axis in the dimensions + of the output array. A valid axis must be on the interval + `[-N, N)`, where `N` is the rank (number of dimensions) of `x`. + Default: `0`. + + Returns: + usm_ndarray: + An output array having rank `N+1`, where `N` is + the rank (number of dimensions) of `x`. If the input arrays have + different data types, array API Type Promotion Rules apply. + + Raises: + ValueError: if not all input arrays have the same shape + IndexError: if provided an `axis` outside of the required interval. + """ + res_dtype, res_usm_type, exec_q = _arrays_validation(arrays) + + n = len(arrays) + X0 = arrays[0] + res_ndim = X0.ndim + 1 + axis = normalize_axis_index(axis, res_ndim) + X0_shape = X0.shape + + for i in range(1, n): + if X0_shape != arrays[i].shape: + raise ValueError("All input arrays must have the same shape") + + res_shape = tuple( + X0_shape[i - 1 * (i >= axis)] if i != axis else n + for i in range(res_ndim) + ) + + res = dpt_ext.empty( + res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q + ) + + _manager = dputils.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + for i in range(n): + c_shapes_copy = tuple( + i if j == axis else np.s_[:] for j in range(res_ndim) + ) + _dst = res[c_shapes_copy] + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arrays[i], dst=_dst, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(hev, cpy_ev) + + return res diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index c495b8866578..a511c31a8177 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -3751,7 +3751,7 @@ def stack(arrays, /, *, axis=0, out=None, dtype=None, casting="same_kind"): ) usm_arrays = [dpnp.get_usm_ndarray(x) for x in arrays] - usm_res = dpt.stack(usm_arrays, axis=axis) + usm_res = dpt_ext.stack(usm_arrays, axis=axis) 
res = dpnp_array._create_from_usm_ndarray(usm_res) if dtype is not None: From ccad5f06e0d345e5bf4f53ac0fd0b67b784c294b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 07:30:21 -0800 Subject: [PATCH 100/145] Move ti.swapaxes() to dpctl_ext.tensor and reuse it --- dpctl_ext/tensor/__init__.py | 2 ++ dpctl_ext/tensor/_manipulation_functions.py | 38 +++++++++++++++++++++ dpnp/dpnp_iface_manipulation.py | 2 +- 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 626e2063a43e..793ff8c7f31f 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -74,6 +74,7 @@ roll, squeeze, stack, + swapaxes, ) from dpctl_ext.tensor._reshape import reshape @@ -119,6 +120,7 @@ "roll", "squeeze", "stack", + "swapaxes", "take", "take_along_axis", "to_numpy", diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 0fe05ef70998..e07b198f5566 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -934,3 +934,41 @@ def stack(arrays, /, *, axis=0): _manager.add_event_pair(hev, cpy_ev) return res + + +def swapaxes(X, axis1, axis2): + """swapaxes(x, axis1, axis2) + + Interchanges two axes of an array. + + Args: + x (usm_ndarray): input array + + axis1 (int): First axis. + If `x` has rank (i.e., number of dimensions) `N`, + a valid `axis` must be in the half-open interval `[-N, N)`. + + axis2 (int): Second axis. + If `x` has rank (i.e., number of dimensions) `N`, + a valid `axis` must be in the half-open interval `[-N, N)`. + + Returns: + usm_ndarray: + Array with swapped axes. + The returned array must has the same data type as `x`, + is created on the same device as `x` and has the same USM + allocation type as `x`. + + Raises: + AxisError: if `axis` value is invalid. 
+ """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + + axis1 = normalize_axis_index(axis1, X.ndim, "axis1") + axis2 = normalize_axis_index(axis2, X.ndim, "axis2") + + ind = list(range(0, X.ndim)) + ind[axis1] = axis2 + ind[axis2] = axis1 + return dpt_ext.permute_dims(X, tuple(ind)) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index a511c31a8177..4f0531f8b27f 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -3812,7 +3812,7 @@ def swapaxes(a, axis1, axis2): """ usm_a = dpnp.get_usm_ndarray(a) - usm_res = dpt.swapaxes(usm_a, axis1=axis1, axis2=axis2) + usm_res = dpt_ext.swapaxes(usm_a, axis1=axis1, axis2=axis2) return dpnp_array._create_from_usm_ndarray(usm_res) From b5e3541e6ea0943df885bb2c76089be76d18a40d Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 07:33:25 -0800 Subject: [PATCH 101/145] Move ti.tile() to dpctl_ext.tensor and reuse it --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_manipulation_functions.py | 97 +++++++++++++++++++++ dpnp/dpnp_iface_manipulation.py | 2 +- 3 files changed, 100 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 793ff8c7f31f..ceefa3572ea1 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -75,6 +75,7 @@ squeeze, stack, swapaxes, + tile, ) from dpctl_ext.tensor._reshape import reshape @@ -123,6 +124,7 @@ "swapaxes", "take", "take_along_axis", + "tile", "to_numpy", "tril", "triu", diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index e07b198f5566..6e26ddb88bcb 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -972,3 +972,100 @@ def swapaxes(X, axis1, axis2): ind[axis1] = axis2 ind[axis2] = axis1 return dpt_ext.permute_dims(X, tuple(ind)) + + +def tile(x, repetitions, 
/): + """tile(x, repetitions) + + Repeat an input array `x` along each axis a number of times given by + `repetitions`. + + For `N` = len(`repetitions`) and `M` = len(`x.shape`): + + * If `M < N`, `x` will have `N - M` new axes prepended to its shape + * If `M > N`, `repetitions` will have `M - N` ones prepended to it + + Args: + x (usm_ndarray): input array + + repetitions (Union[int, Tuple[int, ...]]): + The number of repetitions along each dimension of `x`. + + Returns: + usm_ndarray: + tiled output array. + + The returned array will have rank `max(M, N)`. If `S` is the + shape of `x` after prepending dimensions and `R` is + `repetitions` after prepending ones, then the shape of the + result will be `S[i] * R[i]` for each dimension `i`. + + The returned array will have the same data type as `x`. + The returned array will be located on the same device as `x` and + have the same USM allocation type as `x`. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(x)}.") + + if not isinstance(repetitions, tuple): + if isinstance(repetitions, int): + repetitions = (repetitions,) + else: + raise TypeError( + f"Expected tuple or integer type, got {type(repetitions)}." 
+ ) + + rep_dims = len(repetitions) + x_dims = x.ndim + if rep_dims < x_dims: + repetitions = (x_dims - rep_dims) * (1,) + repetitions + elif x_dims < rep_dims: + x = dpt_ext.reshape(x, (rep_dims - x_dims) * (1,) + x.shape) + res_shape = tuple(map(lambda sh, rep: sh * rep, x.shape, repetitions)) + # case of empty input + if x.size == 0: + return dpt_ext.empty( + res_shape, + dtype=x.dtype, + usm_type=x.usm_type, + sycl_queue=x.sycl_queue, + ) + in_sh = x.shape + if res_shape == in_sh: + return dpt_ext.copy(x) + expanded_sh = [] + broadcast_sh = [] + out_sz = 1 + for i in range(len(res_shape)): + out_sz *= res_shape[i] + reps, sh = repetitions[i], in_sh[i] + if reps == 1: + # dimension will be unchanged + broadcast_sh.append(sh) + expanded_sh.append(sh) + elif sh == 1: + # dimension will be broadcast + broadcast_sh.append(reps) + expanded_sh.append(sh) + else: + broadcast_sh.extend([reps, sh]) + expanded_sh.extend([1, sh]) + exec_q = x.sycl_queue + xdt = x.dtype + xut = x.usm_type + res = dpt_ext.empty((out_sz,), dtype=xdt, usm_type=xut, sycl_queue=exec_q) + # no need to copy data for empty output + if out_sz > 0: + x = dpt_ext.broadcast_to( + # this reshape should never copy + dpt_ext.reshape(x, expanded_sh), + broadcast_sh, + ) + # copy broadcast input into flat array + _manager = dputils.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + hev, cp_ev = ti._copy_usm_ndarray_for_reshape( + src=x, dst=res, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(hev, cp_ev) + return dpt_ext.reshape(res, res_shape) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 4f0531f8b27f..d11d67c2de0b 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -3892,7 +3892,7 @@ def tile(A, reps): """ usm_a = dpnp.get_usm_ndarray(A) - usm_res = dpt.tile(usm_a, reps) + usm_res = dpt_ext.tile(usm_a, reps) return dpnp_array._create_from_usm_ndarray(usm_res) From 
325729bfbad021a18671e0815c20c18deeb869b8 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 07:37:38 -0800 Subject: [PATCH 102/145] Move ti.unstack() to dpctl_ext.tensor and reuse it --- dpctl_ext/tensor/__init__.py | 2 ++ dpctl_ext/tensor/_manipulation_functions.py | 29 +++++++++++++++++++++ dpnp/dpnp_iface_manipulation.py | 8 +++--- 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index ceefa3572ea1..d46cf4a4a65f 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -76,6 +76,7 @@ stack, swapaxes, tile, + unstack, ) from dpctl_ext.tensor._reshape import reshape @@ -128,6 +129,7 @@ "to_numpy", "tril", "triu", + "unstack", "where", "zeros", "zeros_like", diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 6e26ddb88bcb..08459dcaea76 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -974,6 +974,35 @@ def swapaxes(X, axis1, axis2): return dpt_ext.permute_dims(X, tuple(ind)) +def unstack(X, /, *, axis=0): + """unstack(x, axis=0) + + Splits an array in a sequence of arrays along the given axis. + + Args: + x (usm_ndarray): input array + + axis (int, optional): axis along which `x` is unstacked. + If `x` has rank (i.e, number of dimensions) `N`, + a valid `axis` must reside in the half-open interval `[-N, N)`. + Default: `0`. + + Returns: + Tuple[usm_ndarray,...]: + Output sequence of arrays which are views into the input array. + + Raises: + AxisError: if the `axis` value is invalid. 
+ """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + + axis = normalize_axis_index(axis, X.ndim) + Y = dpt_ext.moveaxis(X, axis, 0) + + return tuple(Y[i] for i in range(Y.shape[0])) + + def tile(x, repetitions, /): """tile(x, repetitions) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index d11d67c2de0b..c034d1c43793 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -1093,7 +1093,9 @@ def broadcast_arrays(*args, subok=False): if len(args) == 0: return [] - usm_arrays = dpt.broadcast_arrays(*[dpnp.get_usm_ndarray(a) for a in args]) + usm_arrays = dpt_ext.broadcast_arrays( + *[dpnp.get_usm_ndarray(a) for a in args] + ) return [dpnp_array._create_from_usm_ndarray(a) for a in usm_arrays] @@ -1521,7 +1523,7 @@ def copyto(dst, src, casting="same_kind", where=True): f"but got {where.dtype}" ) - dst_usm, src_usm, mask_usm = dpt.broadcast_arrays( + dst_usm, src_usm, mask_usm = dpt_ext.broadcast_arrays( dpnp.get_usm_ndarray(dst), dpnp.get_usm_ndarray(src), dpnp.get_usm_ndarray(where), @@ -4522,7 +4524,7 @@ def unstack(x, /, *, axis=0): if usm_x.ndim == 0: raise ValueError("Input array must be at least 1-d.") - res = dpt.unstack(usm_x, axis=axis) + res = dpt_ext.unstack(usm_x, axis=axis) return tuple(dpnp_array._create_from_usm_ndarray(a) for a in res) From 4552e78359b8fb67094614b106c99dd9fb8acea9 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 27 Feb 2026 08:57:58 -0800 Subject: [PATCH 103/145] Initialize _tensor_accumulation_impl extension and move _cumsum_over_axis --- dpctl_ext/tensor/CMakeLists.txt | 32 +- .../accumulators/accumulate_over_axis.hpp | 459 ++++++++++++++++++ .../accumulators/accumulators_common.cpp | 55 +++ .../accumulators/accumulators_common.hpp | 46 ++ .../source/accumulators/cumulative_sum.cpp | 357 ++++++++++++++ .../source/accumulators/cumulative_sum.hpp | 46 ++ .../libtensor/source/tensor_accumulation.cpp | 
43 ++ 7 files changed, 1030 insertions(+), 8 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/tensor_accumulation.cpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 864e34ddaba4..7cd60cda6edd 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -63,6 +63,16 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ) +set(_accumulator_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/accumulators_common.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_logsumexp.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_prod.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_sum.cpp +) +set(_tensor_accumulation_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_accumulation.cpp + ${_accumulator_sources} +) set(_static_lib_trgt simplify_iteration_space) @@ -85,6 +95,12 @@ add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_impl_sources}) target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) list(APPEND _py_trgts ${python_module_name}) +set(python_module_name _tensor_accumulation_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_accumulation_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_accumulation_impl_sources}) +target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) 
+list(APPEND _py_trgts ${python_module_name}) + set(_clang_prefix "") if(WIN32) set(_clang_prefix "/clang:") @@ -97,14 +113,14 @@ set(_no_fast_math_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ) -#list( -#APPEND _no_fast_math_sources -# ${_elementwise_sources} -# ${_reduction_sources} -# ${_sorting_sources} -# ${_linalg_sources} -# ${_accumulator_sources} -#) +list( + APPEND _no_fast_math_sources + # ${_elementwise_sources} + # ${_reduction_sources} + # ${_sorting_sources} + # ${_linalg_sources} + ${_accumulator_sources} +) foreach(_src_fn ${_no_fast_math_sources}) get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS) diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp b/dpctl_ext/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp new file mode 100644 index 000000000000..712ab180ef37 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp @@ -0,0 +1,459 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/accumulators.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +template +std::pair + py_accumulate_over_axis(const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_accumulate, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + std::vector const &depends, + const strided_fnT &strided_dispatch_table, + const contig_fnT &contig_dispatch_table) 
+{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } + int iter_nd = src_nd - trailing_dims_to_accumulate; + if (trailing_dims_to_accumulate <= 0 || iter_nd < 0) { + throw py::value_error( + "trailing_dims_to_accumulate must be positive, but no " + "greater than rank of the input array"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + std::size_t iter_nelems(1); + for (int i = 0; same_shapes && (i < iter_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + std::size_t acc_nelems(1); + for (int i = iter_nd; same_shapes && (i < src_nd); ++i) { + auto dst_shape_i = dst_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_i); + acc_nelems *= static_cast(dst_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape does not match the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if ((iter_nelems == 0) || (acc_nelems == 0)) { + return std::make_pair(sycl::event(), sycl::event()); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, acc_nelems * iter_nelems); + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int 
src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + + std::vector host_task_events; + + if ((is_src_c_contig && is_dst_c_contig) && iter_nd == 0) { + auto fn = contig_dispatch_table[src_typeid][dst_typeid]; + if (fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + + sycl::event acc_ev = fn(exec_q, acc_nelems, src_data, dst_data, + host_task_events, depends); + + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {acc_ev}), + acc_ev); + } + + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + int acc_nd = trailing_dims_to_accumulate; + + using shT = std::vector; + shT acc_shape(std::begin(src_shape_vec) + iter_nd, std::end(src_shape_vec)); + + shT acc_src_strides(std::begin(src_strides_vec) + iter_nd, + std::end(src_strides_vec)); + + shT acc_dst_strides(std::begin(dst_strides_vec) + iter_nd, + std::end(dst_strides_vec)); + + shT iter_shape(std::begin(src_shape_vec), + std::begin(src_shape_vec) + iter_nd); + + shT iter_src_strides(std::begin(src_strides_vec), + std::begin(src_strides_vec) + iter_nd); + + shT iter_dst_strides(std::begin(dst_strides_vec), + std::begin(dst_strides_vec) + iter_nd); + + shT simplified_iter_shape; + shT simplified_iter_src_strides; + shT simplified_iter_dst_strides; + py::ssize_t iter_src_offset(0); + py::ssize_t iter_dst_offset(0); + + if (iter_nd == 0) { + iter_nd = 1; + simplified_iter_shape.push_back(1); + simplified_iter_src_strides.push_back(0); + simplified_iter_dst_strides.push_back(0); + } + else { + simplify_iteration_space( + iter_nd, src_shape_ptr, iter_src_strides, iter_dst_strides, + // output + simplified_iter_shape, simplified_iter_src_strides, + simplified_iter_dst_strides, iter_src_offset, 
iter_dst_offset); + } + + // Strided implementation + auto strided_fn = strided_dispatch_table[src_typeid][dst_typeid]; + if (strided_fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_iter_shape, + simplified_iter_src_strides, simplified_iter_dst_strides, acc_shape, + acc_src_strides, acc_dst_strides); + auto packed_shapes_and_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + const auto ©_shapes_strides_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shapes_and_strides = + packed_shapes_and_strides_owner.get(); + + const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides; + const py::ssize_t *acc_shapes_and_strides = + packed_shapes_and_strides + 3 * simplified_iter_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), copy_shapes_strides_ev); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + + sycl::event acc_ev = strided_fn( + exec_q, iter_nelems, acc_nelems, src_data, iter_nd, + iter_shape_and_strides, iter_src_offset, iter_dst_offset, acc_nd, + acc_shapes_and_strides, dst_data, host_task_events, all_deps); + + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {acc_ev}, packed_shapes_and_strides_owner); + host_task_events.push_back(temp_cleanup_ev); + + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events), + acc_ev); +} + +template +std::pair py_accumulate_final_axis_include_initial( + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + std::vector const &depends, + const strided_fnT &strided_dispatch_table, + const contig_fnT &contig_dispatch_table) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if 
(src_nd != dst_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } + + static constexpr int acc_nd = 1; + + int iter_nd = src_nd - acc_nd; + if (iter_nd < 0) { + throw py::value_error("accumulation axis must not be greater than rank " + "of the input array"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + std::size_t iter_nelems(1); + for (int i = 0; same_shapes && (i < iter_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + std::size_t acc_nelems(1); + for (int i = iter_nd; same_shapes && (i < src_nd); ++i) { + auto dst_shape_i = dst_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_ptr[i] + 1 == dst_shape_i); + acc_nelems *= static_cast(dst_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape does not match the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if ((iter_nelems == 0) || (acc_nelems == 0)) { + return std::make_pair(sycl::event(), sycl::event()); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, acc_nelems * iter_nelems); + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = 
array_types.typenum_to_lookup_id(dst_typenum); + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + + std::vector host_task_events; + + if ((is_src_c_contig && is_dst_c_contig) && iter_nd == 0) { + auto fn = contig_dispatch_table[src_typeid][dst_typeid]; + if (fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + + sycl::event acc_ev = fn(exec_q, acc_nelems, src_data, dst_data, + host_task_events, depends); + + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {acc_ev}), + acc_ev); + } + + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + using shT = std::vector; + shT acc_shape(std::begin(src_shape_vec) + iter_nd, std::end(src_shape_vec)); + + shT acc_src_strides(std::begin(src_strides_vec) + iter_nd, + std::end(src_strides_vec)); + + shT acc_dst_strides(std::begin(dst_strides_vec) + iter_nd, + std::end(dst_strides_vec)); + + shT iter_shape(std::begin(src_shape_vec), + std::begin(src_shape_vec) + iter_nd); + + shT iter_src_strides(std::begin(src_strides_vec), + std::begin(src_strides_vec) + iter_nd); + + shT iter_dst_strides(std::begin(dst_strides_vec), + std::begin(dst_strides_vec) + iter_nd); + + shT simplified_iter_shape; + shT simplified_iter_src_strides; + shT simplified_iter_dst_strides; + py::ssize_t iter_src_offset(0); + py::ssize_t iter_dst_offset(0); + + if (iter_nd == 0) { + iter_nd = 1; + simplified_iter_shape.push_back(1); + simplified_iter_src_strides.push_back(0); + simplified_iter_dst_strides.push_back(0); + } + else { + simplify_iteration_space( + iter_nd, src_shape_ptr, iter_src_strides, iter_dst_strides, + // output + simplified_iter_shape, simplified_iter_src_strides, + simplified_iter_dst_strides, iter_src_offset, iter_dst_offset); + } + + // Strided implementation + auto strided_fn = strided_dispatch_table[src_typeid][dst_typeid]; + if 
(strided_fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_iter_shape, + simplified_iter_src_strides, simplified_iter_dst_strides, acc_shape, + acc_src_strides, acc_dst_strides); + auto packed_shapes_and_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + const auto ©_shapes_strides_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shapes_and_strides = + packed_shapes_and_strides_owner.get(); + + const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides; + const py::ssize_t *acc_shapes_and_strides = + packed_shapes_and_strides + 3 * simplified_iter_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), copy_shapes_strides_ev); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + + sycl::event acc_ev = strided_fn( + exec_q, iter_nelems, acc_nelems, src_data, iter_nd, + iter_shape_and_strides, iter_src_offset, iter_dst_offset, acc_nd, + acc_shapes_and_strides, dst_data, host_task_events, all_deps); + + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {acc_ev}, packed_shapes_and_strides_owner); + host_task_events.push_back(temp_cleanup_ev); + + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events), + acc_ev); +} + +/*! 
@brief Template implementing Python API for querying accumulation + * type support */ +template +bool py_accumulate_dtype_supported(const py::dtype &input_dtype, + const py::dtype &output_dtype, + const fnT &dispatch_table) +{ + int arg_tn = + input_dtype.num(); // NumPy type numbers are the same as in dpctl + int out_tn = + output_dtype.num(); // NumPy type numbers are the same as in dpctl + int arg_typeid = -1; + int out_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + arg_typeid = array_types.typenum_to_lookup_id(arg_tn); + out_typeid = array_types.typenum_to_lookup_id(out_tn); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 || + out_typeid >= td_ns::num_types) + { + throw std::runtime_error("Reduction type support check: lookup failed"); + } + + // remove_all_extents gets underlying type of table + using fn_ptrT = typename std::remove_all_extents::type; + fn_ptrT fn = nullptr; + + fn = dispatch_table[arg_typeid][out_typeid]; + + return (fn != nullptr); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp new file mode 100644 index 000000000000..c6cbce0f6ced --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp @@ -0,0 +1,55 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#include + +// #include "cumulative_logsumexp.hpp" +// #include "cumulative_prod.hpp" +#include "cumulative_sum.hpp" + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +/*! 
@brief Add accumulators to Python module */ +void init_accumulator_functions(py::module_ m) +{ + // init_cumulative_logsumexp(m); + // init_cumulative_prod(m); + init_cumulative_sum(m); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.hpp b/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.hpp new file mode 100644 index 000000000000..c33a040a7fa7 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_accumulator_functions(py::module_); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp new file mode 100644 index 000000000000..0087122ce1ac --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp @@ -0,0 +1,357 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "accumulate_over_axis.hpp" +#include "kernels/accumulators.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::accumulators::accumulate_1d_contig_impl_fn_ptr_t; +static accumulate_1d_contig_impl_fn_ptr_t + cumsum_1d_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; +static accumulate_strided_impl_fn_ptr_t + cumsum_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static accumulate_1d_contig_impl_fn_ptr_t + cumsum_1d_include_initial_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static accumulate_strided_impl_fn_ptr_t + cumsum_include_initial_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportDataForSumAccumulation +{ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + 
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +using CumSumScanOpT = std:: + conditional_t, sycl::logical_or, sycl::plus>; + +template +struct CumSum1DContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForSumAccumulation::is_defined) + { + using ScanOpT = CumSumScanOpT; + static constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumSum1DIncludeInitialContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForSumAccumulation::is_defined) + { + using ScanOpT = CumSumScanOpT; + static constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = 
dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumSumStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForSumAccumulation::is_defined) + { + using ScanOpT = CumSumScanOpT; + static constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumSumIncludeInitialStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForSumAccumulation::is_defined) + { + using ScanOpT = CumSumScanOpT; + static constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +void populate_cumsum_dispatch_tables(void) +{ + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(cumsum_1d_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(cumsum_strided_dispatch_table); + + td_ns::DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table( + cumsum_1d_include_initial_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb4; + 
dtb4.populate_dispatch_table(cumsum_include_initial_strided_dispatch_table); + + return; +} + +} // namespace impl + +void init_cumulative_sum(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + + using impl::populate_cumsum_dispatch_tables; + populate_cumsum_dispatch_tables(); + + using impl::cumsum_1d_contig_dispatch_table; + using impl::cumsum_strided_dispatch_table; + auto cumsum_pyapi = [&](const arrayT &src, int trailing_dims_to_accumulate, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_accumulate_over_axis; + return py_accumulate_over_axis( + src, trailing_dims_to_accumulate, dst, exec_q, depends, + cumsum_strided_dispatch_table, cumsum_1d_contig_dispatch_table); + }; + m.def("_cumsum_over_axis", cumsum_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_accumulate"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + using impl::cumsum_1d_include_initial_contig_dispatch_table; + using impl::cumsum_include_initial_strided_dispatch_table; + auto cumsum_include_initial_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal:: + py_accumulate_final_axis_include_initial; + return py_accumulate_final_axis_include_initial( + src, dst, exec_q, depends, + cumsum_include_initial_strided_dispatch_table, + cumsum_1d_include_initial_contig_dispatch_table); + }; + m.def("_cumsum_final_axis_include_initial", cumsum_include_initial_pyapi, + "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + auto cumsum_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + using dpctl::tensor::py_internal::py_accumulate_dtype_supported; + return py_accumulate_dtype_supported(input_dtype, output_dtype, + cumsum_strided_dispatch_table); + }; + m.def("_cumsum_dtype_supported", 
cumsum_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype")); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.hpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.hpp new file mode 100644 index 000000000000..5e06b222a3bc --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_cumulative_sum(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_accumulation.cpp b/dpctl_ext/tensor/libtensor/source/tensor_accumulation.cpp new file mode 100644 index 000000000000..faa3fc8b52c6 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/tensor_accumulation.cpp @@ -0,0 +1,43 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#include + +#include "accumulators/accumulators_common.hpp" + +PYBIND11_MODULE(_tensor_accumulation_impl, m) +{ + dpctl::tensor::py_internal::init_accumulator_functions(m); +} From 6b81e7aa6e9caf365bb5a7f7d3bf5bf71a7491d6 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 27 Feb 2026 08:59:16 -0800 Subject: [PATCH 104/145] Move ti.cumulative_sum() and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_accumulation.py | 311 ++++++++++++++++++++++++++++++ dpnp/dpnp_iface_mathematical.py | 4 +- 3 files changed, 315 insertions(+), 2 deletions(-) create mode 100644 dpctl_ext/tensor/_accumulation.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index d46cf4a4a65f..ce4a70d202f2 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -80,6 +80,7 @@ ) from dpctl_ext.tensor._reshape import reshape +from ._accumulation import cumulative_sum from ._clip import clip from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type @@ -94,6 +95,7 @@ "concat", "copy", "clip", + "cumulative_sum", "empty", "empty_like", "extract", diff --git a/dpctl_ext/tensor/_accumulation.py b/dpctl_ext/tensor/_accumulation.py new file mode 100644 index 000000000000..252b892fe481 --- /dev/null +++ b/dpctl_ext/tensor/_accumulation.py @@ -0,0 +1,311 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import dpctl +import dpctl.tensor as dpt +from dpctl.tensor._type_utils import ( # _default_accumulation_dtype_fp_types, + _default_accumulation_dtype, + _to_device_supported_dtype, +) +from dpctl.utils import ExecutionPlacementError, SequentialOrderManager + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor._tensor_accumulation_impl as tai +import dpctl_ext.tensor._tensor_impl as ti + +from ._numpy_helper import normalize_axis_index + + +def _accumulate_common( + x, + axis, + dtype, + include_initial, + out, + _accumulate_fn, + _accumulate_include_initial_fn, + _dtype_supported, + _default_accumulation_type_fn, +): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + appended_axis = False + if x.ndim == 0: + x = x[dpt.newaxis] + appended_axis = True + nd = x.ndim + if axis is None: + if nd > 1: + raise ValueError( + "`axis` cannot be `None` for array of dimension `{}`".format(nd) + ) + axis = 0 + else: + axis = normalize_axis_index(axis, nd, "axis") + sh = x.shape + res_sh = ( + sh[:axis] + (sh[axis] + 1,) + sh[axis + 1 :] if include_initial else sh + ) + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt_ext.permute_dims(x, perm) + q = x.sycl_queue + inp_dt = x.dtype + res_usm_type = x.usm_type + if dtype is None: + res_dt = _default_accumulation_type_fn(inp_dt, q) + else: + res_dt = dpt.dtype(dtype) + res_dt = _to_device_supported_dtype(res_dt, q.sycl_device) + + # checking now avoids unnecessary allocations + implemented_types = _dtype_supported(inp_dt, res_dt) + if dtype is None and not implemented_types: + raise RuntimeError( + "Automatically determined accumulation data type does not " + "have direct implementation" + ) + 
orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + out_sh = out.shape + # append an axis to `out` if scalar + if appended_axis and not include_initial: + out = out[dpt.newaxis, ...] + orig_out = out + final_res_sh = res_sh[1:] + else: + final_res_sh = res_sh + if not out_sh == final_res_sh: + raise ValueError( + "The shape of input and output arrays are inconsistent. " + f"Expected output shape is {final_res_sh}, got {out_sh}" + ) + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, " f"got {out.dtype}" + ) + if dpctl.utils.get_execution_queue((q, out.sycl_queue)) is None: + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + # permute out array dims if necessary + if a1 != nd: + out = dpt_ext.permute_dims(out, perm) + orig_out = out + if ti._array_overlap(x, out) and implemented_types: + out = dpt_ext.empty_like(out) + else: + out = dpt_ext.empty( + res_sh, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + if a1 != nd: + out = dpt_ext.permute_dims(out, perm) + + _manager = SequentialOrderManager[q] + depends = _manager.submitted_events + if implemented_types: + if not include_initial: + ht_e, acc_ev = _accumulate_fn( + src=arr, + trailing_dims_to_accumulate=1, + dst=out, + sycl_queue=q, + depends=depends, + ) + else: + ht_e, acc_ev = _accumulate_include_initial_fn( + src=arr, dst=out, sycl_queue=q, depends=depends + ) + _manager.add_event_pair(ht_e, acc_ev) + if not (orig_out is None or out is orig_out): + # Copy the out data from temporary buffer to original memory + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=q, depends=[acc_ev] + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + out = orig_out + else: + if _dtype_supported(res_dt, 
res_dt): + tmp = dpt_ext.empty( + arr.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=q, depends=depends + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + if not include_initial: + ht_e, acc_ev = _accumulate_fn( + src=tmp, + trailing_dims_to_accumulate=1, + dst=out, + sycl_queue=q, + depends=[cpy_e], + ) + else: + ht_e, acc_ev = _accumulate_include_initial_fn( + src=tmp, + dst=out, + sycl_queue=q, + depends=[cpy_e], + ) + _manager.add_event_pair(ht_e, acc_ev) + else: + buf_dt = _default_accumulation_type_fn(inp_dt, q) + tmp = dpt_ext.empty( + arr.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=q, depends=depends + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + tmp_res = dpt_ext.empty( + res_sh, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + if a1 != nd: + tmp_res = dpt_ext.permute_dims(tmp_res, perm) + if not include_initial: + ht_e, acc_ev = _accumulate_fn( + src=tmp, + trailing_dims_to_accumulate=1, + dst=tmp_res, + sycl_queue=q, + depends=[cpy_e], + ) + else: + ht_e, acc_ev = _accumulate_include_initial_fn( + src=tmp, + dst=tmp_res, + sycl_queue=q, + depends=[cpy_e], + ) + _manager.add_event_pair(ht_e, acc_ev) + ht_e_cpy2, cpy_e2 = ti._copy_usm_ndarray_into_usm_ndarray( + src=tmp_res, dst=out, sycl_queue=q, depends=[acc_ev] + ) + _manager.add_event_pair(ht_e_cpy2, cpy_e2) + + if appended_axis: + out = dpt_ext.squeeze(out) + if a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + out = dpt_ext.permute_dims(out, inv_perm) + + return out + + +def cumulative_sum( + x, /, *, axis=None, dtype=None, include_initial=False, out=None +): + """ + cumulative_sum(x, /, *, axis=None, dtype=None, include_initial=False, + out=None) + + Calculates the cumulative sum of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. 
+ axis (Optional[int]): + axis along which cumulative sum must be computed. + If `None`, the sum is computed over the entire array. + If `x` is a one-dimensional array, providing an `axis` is optional; + however, if `x` has more than one dimension, providing an `axis` + is required. + Default: `None`. + dtype (Optional[dtype]): + data type of the returned array. If `None`, the default data + type is inferred from the "kind" of the input array data type. + + * If `x` has a real- or complex-valued floating-point data + type, the returned array will have the same data type as + `x`. + * If `x` has signed integral data type, the returned array + will have the default signed integral type for the device + where input array `x` is allocated. + * If `x` has unsigned integral data type, the returned array + will have the default unsigned integral type for the device + where input array `x` is allocated. + * If `x` has a boolean data type, the returned array will + have the default signed integral type for the device + where input array `x` is allocated. + + If the data type (either specified or resolved) differs from the + data type of `x`, the input array elements are cast to the + specified data type before computing the cumulative sum. + Default: `None`. + include_initial (bool): + boolean indicating whether to include the initial value (i.e., the + additive identity, zero) as the first value along the provided axis + in the output. Default: `False`. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of `out` must match the expected shape and the + expected data type of the result or (if provided) `dtype`. + If `None` then a new array is returned. Default: `None`. + + Returns: + usm_ndarray: + an array containing cumulative sums. The returned array has the data + type as described in the `dtype` parameter description above. 
+ + The returned array shape is determined as follows: + + * If `include_initial` is `False`, the returned array will + have the same shape as `x` + * If `include_initial` is `True`, the returned array will + have the same shape as `x` except the axis along which the + cumulative sum is calculated, which will have size `N+1` + + where `N` is the size of the axis the cumulative sums are computed + along. + """ + return _accumulate_common( + x, + axis, + dtype, + include_initial, + out, + tai._cumsum_over_axis, + tai._cumsum_final_axis_include_initial, + tai._cumsum_dtype_supported, + _default_accumulation_dtype, + ) diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py index 3dbf07be080a..a6ff69c08ca7 100644 --- a/dpnp/dpnp_iface_mathematical.py +++ b/dpnp/dpnp_iface_mathematical.py @@ -1218,7 +1218,7 @@ def cumsum(a, axis=None, dtype=None, out=None): return dpnp_wrap_reduction_call( usm_a, out, - dpt.cumulative_sum, + dpt_ext.cumulative_sum, _get_reduction_res_dt(a, dtype), axis=axis, dtype=dtype, @@ -1403,7 +1403,7 @@ def cumulative_sum( return dpnp_wrap_reduction_call( dpnp.get_usm_ndarray(x), out, - dpt.cumulative_sum, + dpt_ext.cumulative_sum, _get_reduction_res_dt(x, dtype), axis=axis, dtype=dtype, From 5af94c87b47ccf73d55c42bc5584f3d583b0dddc Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 02:17:34 -0800 Subject: [PATCH 105/145] Move _cumprod_over_axis to dpctl_ext.tensor._tensor_accumulation_impl --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../accumulators/accumulators_common.cpp | 4 +- .../source/accumulators/cumulative_prod.cpp | 359 ++++++++++++++++++ .../source/accumulators/cumulative_prod.hpp | 46 +++ 4 files changed, 408 insertions(+), 3 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt 
index 7cd60cda6edd..a64a47f8e7e0 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -66,7 +66,7 @@ set(_tensor_impl_sources set(_accumulator_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/accumulators_common.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_logsumexp.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_prod.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_prod.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_sum.cpp ) set(_tensor_accumulation_impl_sources diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp index c6cbce0f6ced..96c1b6d12cbc 100644 --- a/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp +++ b/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp @@ -36,7 +36,7 @@ #include // #include "cumulative_logsumexp.hpp" -// #include "cumulative_prod.hpp" +#include "cumulative_prod.hpp" #include "cumulative_sum.hpp" namespace py = pybind11; @@ -48,7 +48,7 @@ namespace dpctl::tensor::py_internal void init_accumulator_functions(py::module_ m) { // init_cumulative_logsumexp(m); - // init_cumulative_prod(m); + init_cumulative_prod(m); init_cumulative_sum(m); } diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp new file mode 100644 index 000000000000..ca92db16cb05 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp @@ -0,0 +1,359 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "accumulate_over_axis.hpp" +#include "kernels/accumulators.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::accumulators::accumulate_1d_contig_impl_fn_ptr_t; +static accumulate_1d_contig_impl_fn_ptr_t + cumprod_1d_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; +static accumulate_strided_impl_fn_ptr_t + cumprod_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static accumulate_1d_contig_impl_fn_ptr_t + cumprod_1d_include_initial_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static accumulate_strided_impl_fn_ptr_t + cumprod_include_initial_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportDataForProdAccumulation +{ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + 
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +using CumProdScanOpT = std::conditional_t, + sycl::logical_and, + sycl::multiplies>; + +template +struct CumProd1DContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForProdAccumulation::is_defined) + { + using ScanOpT = CumProdScanOpT; + static constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumProd1DIncludeInitialContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForProdAccumulation::is_defined) + { + using ScanOpT = CumProdScanOpT; + static constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = 
dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumProdStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForProdAccumulation::is_defined) + { + using ScanOpT = CumProdScanOpT; + static constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumProdIncludeInitialStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForProdAccumulation::is_defined) + { + using ScanOpT = CumProdScanOpT; + static constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +void populate_cumprod_dispatch_tables(void) +{ + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(cumprod_1d_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(cumprod_strided_dispatch_table); + + td_ns::DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table( + cumprod_1d_include_initial_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table( + 
cumprod_include_initial_strided_dispatch_table); + + return; +} + +} // namespace impl + +void init_cumulative_prod(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + + using impl::populate_cumprod_dispatch_tables; + populate_cumprod_dispatch_tables(); + + using impl::cumprod_1d_contig_dispatch_table; + using impl::cumprod_strided_dispatch_table; + auto cumprod_pyapi = [&](const arrayT &src, int trailing_dims_to_accumulate, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_accumulate_over_axis; + return py_accumulate_over_axis( + src, trailing_dims_to_accumulate, dst, exec_q, depends, + cumprod_strided_dispatch_table, cumprod_1d_contig_dispatch_table); + }; + m.def("_cumprod_over_axis", cumprod_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_accumulate"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + using impl::cumprod_1d_include_initial_contig_dispatch_table; + using impl::cumprod_include_initial_strided_dispatch_table; + auto cumprod_include_initial_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal:: + py_accumulate_final_axis_include_initial; + return py_accumulate_final_axis_include_initial( + src, dst, exec_q, depends, + cumprod_include_initial_strided_dispatch_table, + cumprod_1d_include_initial_contig_dispatch_table); + }; + m.def("_cumprod_final_axis_include_initial", cumprod_include_initial_pyapi, + "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + auto cumprod_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + using dpctl::tensor::py_internal::py_accumulate_dtype_supported; + return py_accumulate_dtype_supported(input_dtype, output_dtype, + cumprod_strided_dispatch_table); + }; + m.def("_cumprod_dtype_supported", 
cumprod_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype")); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.hpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.hpp new file mode 100644 index 000000000000..e14bb2c44361 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_cumulative_prod(py::module_ m); + +} // namespace dpctl::tensor::py_internal From 91547cc90a4fb507b5c0a9617ce20df70e322a03 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 02:33:58 -0800 Subject: [PATCH 106/145] Add missing include --- .../tensor/libtensor/source/accumulators/cumulative_prod.cpp | 1 + .../tensor/libtensor/source/accumulators/cumulative_sum.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp index ca92db16cb05..07c802c8cffa 100644 --- a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp @@ -43,6 +43,7 @@ #include "dpnp4pybind11.hpp" #include #include +#include #include "accumulate_over_axis.hpp" #include "kernels/accumulators.hpp" diff --git 
a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp index 0087122ce1ac..3c422bfd0998 100644 --- a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp @@ -43,6 +43,7 @@ #include "dpnp4pybind11.hpp" #include #include +#include #include "accumulate_over_axis.hpp" #include "kernels/accumulators.hpp" From 668f3fb3beec3633261644d7eba8f3de5c4e1f12 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 02:34:41 -0800 Subject: [PATCH 107/145] Move ti.cumulative_prod() and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 3 +- dpctl_ext/tensor/_accumulation.py | 80 +++++++++++++++++++++++++++++++ dpnp/dpnp_iface_mathematical.py | 4 +- 3 files changed, 84 insertions(+), 3 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index ce4a70d202f2..ce489fe8a36d 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -80,7 +80,7 @@ ) from dpctl_ext.tensor._reshape import reshape -from ._accumulation import cumulative_sum +from ._accumulation import cumulative_prod, cumulative_sum from ._clip import clip from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type @@ -95,6 +95,7 @@ "concat", "copy", "clip", + "cumulative_prod", "cumulative_sum", "empty", "empty_like", diff --git a/dpctl_ext/tensor/_accumulation.py b/dpctl_ext/tensor/_accumulation.py index 252b892fe481..6ee5c2cfaa28 100644 --- a/dpctl_ext/tensor/_accumulation.py +++ b/dpctl_ext/tensor/_accumulation.py @@ -309,3 +309,83 @@ def cumulative_sum( tai._cumsum_dtype_supported, _default_accumulation_dtype, ) + + +def cumulative_prod( + x, /, *, axis=None, dtype=None, include_initial=False, out=None +): + """ + cumulative_prod(x, /, *, axis=None, dtype=None, include_initial=False, + out=None) + + Calculates the cumulative product of elements in the input array `x`. 
+ + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which cumulative product must be computed. + If `None`, the product is computed over the entire array. + If `x` is a one-dimensional array, providing an `axis` is optional; + however, if `x` has more than one dimension, providing an `axis` + is required. + Default: `None`. + dtype (Optional[dtype]): + data type of the returned array. If `None`, the default data + type is inferred from the "kind" of the input array data type. + + * If `x` has a real- or complex-valued floating-point data + type, the returned array will have the same data type as + `x`. + * If `x` has signed integral data type, the returned array + will have the default signed integral type for the device + where input array `x` is allocated. + * If `x` has unsigned integral data type, the returned array + will have the default unsigned integral type for the device + where input array `x` is allocated. + * If `x` has a boolean data type, the returned array will + have the default signed integral type for the device + where input array `x` is allocated. + + If the data type (either specified or resolved) differs from the + data type of `x`, the input array elements are cast to the + specified data type before computing the cumulative product. + Default: `None`. + include_initial (bool): + boolean indicating whether to include the initial value (i.e., the + multiplicative identity, one) as the first value along the provided + axis in the output. Default: `False`. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of `out` must match the expected shape and the + expected data type of the result or (if provided) `dtype`. + If `None` then a new array is returned. Default: `None`. + + Returns: + usm_ndarray: + an array containing cumulative products. The returned array has + the data type as described in the `dtype` parameter description + above.
+ + The returned array shape is determined as follows: + + * If `include_initial` is `False`, the returned array will + have the same shape as `x` + * If `include_initial` is `True`, the returned array will + have the same shape as `x` except the axis along which the + cumulative product is calculated, which will have size `N+1` + + where `N` is the size of the axis the cumulative products are + computed along. + """ + return _accumulate_common( + x, + axis, + dtype, + include_initial, + out, + tai._cumprod_over_axis, + tai._cumprod_final_axis_include_initial, + tai._cumprod_dtype_supported, + _default_accumulation_dtype, + ) diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py index a6ff69c08ca7..e5e379f8dc00 100644 --- a/dpnp/dpnp_iface_mathematical.py +++ b/dpnp/dpnp_iface_mathematical.py @@ -1126,7 +1126,7 @@ def cumprod(a, axis=None, dtype=None, out=None): return dpnp_wrap_reduction_call( usm_a, out, - dpt.cumulative_prod, + dpt_ext.cumulative_prod, _get_reduction_res_dt(a, dtype), axis=axis, dtype=dtype, @@ -1307,7 +1307,7 @@ def cumulative_prod( return dpnp_wrap_reduction_call( dpnp.get_usm_ndarray(x), out, - dpt.cumulative_prod, + dpt_ext.cumulative_prod, _get_reduction_res_dt(x, dtype), axis=axis, dtype=dtype, From 72d2109ddb27537973094555210116c0d0069e31 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 02:47:00 -0800 Subject: [PATCH 108/145] Move _cumlogsumexp_over_axis to dpctl_ext.tensor._tensor_accumulation_impl --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../accumulators/accumulators_common.cpp | 4 +- .../accumulators/cumulative_logsumexp.cpp | 351 ++++++++++++++++++ .../accumulators/cumulative_logsumexp.hpp | 46 +++ 4 files changed, 400 insertions(+), 3 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt 
b/dpctl_ext/tensor/CMakeLists.txt index a64a47f8e7e0..eff5e7552648 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -65,7 +65,7 @@ set(_tensor_impl_sources ) set(_accumulator_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/accumulators_common.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_logsumexp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_logsumexp.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_prod.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_sum.cpp ) diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp index 96c1b6d12cbc..5e07e81b7ad5 100644 --- a/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp +++ b/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp @@ -35,7 +35,7 @@ #include -// #include "cumulative_logsumexp.hpp" +#include "cumulative_logsumexp.hpp" #include "cumulative_prod.hpp" #include "cumulative_sum.hpp" @@ -47,7 +47,7 @@ namespace dpctl::tensor::py_internal /*! @brief Add accumulators to Python module */ void init_accumulator_functions(py::module_ m) { - // init_cumulative_logsumexp(m); + init_cumulative_logsumexp(m); init_cumulative_prod(m); init_cumulative_sum(m); } diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp new file mode 100644 index 000000000000..572c93c0066e --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp @@ -0,0 +1,351 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "accumulate_over_axis.hpp" +#include "kernels/accumulators.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +namespace su_ns = dpctl::tensor::sycl_utils; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::accumulators::accumulate_1d_contig_impl_fn_ptr_t; +static accumulate_1d_contig_impl_fn_ptr_t + cumlogsumexp_1d_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; +static accumulate_strided_impl_fn_ptr_t + cumlogsumexp_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static accumulate_1d_contig_impl_fn_ptr_t + cumlogsumexp_1d_include_initial_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static accumulate_strided_impl_fn_ptr_t + cumlogsumexp_include_initial_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportDataForLogSumExpAccumulation +{ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + 
td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct CumLogSumExp1DContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForLogSumExpAccumulation< + srcTy, dstTy>::is_defined) + { + using ScanOpT = su_ns::LogSumExp; + static constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumLogSumExp1DIncludeInitialContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForLogSumExpAccumulation< + srcTy, dstTy>::is_defined) + { + using ScanOpT = su_ns::LogSumExp; + static constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using 
dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumLogSumExpStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForLogSumExpAccumulation< + srcTy, dstTy>::is_defined) + { + using ScanOpT = su_ns::LogSumExp; + static constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumLogSumExpIncludeInitialStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForLogSumExpAccumulation< + srcTy, dstTy>::is_defined) + { + using ScanOpT = su_ns::LogSumExp; + static constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +void populate_cumlogsumexp_dispatch_tables(void) +{ + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(cumlogsumexp_1d_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(cumlogsumexp_strided_dispatch_table); + + td_ns::DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table( + 
cumlogsumexp_1d_include_initial_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table( + cumlogsumexp_include_initial_strided_dispatch_table); + + return; +} + +} // namespace impl + +void init_cumulative_logsumexp(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + + using impl::populate_cumlogsumexp_dispatch_tables; + populate_cumlogsumexp_dispatch_tables(); + + using impl::cumlogsumexp_1d_contig_dispatch_table; + using impl::cumlogsumexp_strided_dispatch_table; + auto cumlogsumexp_pyapi = [&](const arrayT &src, + int trailing_dims_to_accumulate, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_accumulate_over_axis; + return py_accumulate_over_axis(src, trailing_dims_to_accumulate, dst, + exec_q, depends, + cumlogsumexp_strided_dispatch_table, + cumlogsumexp_1d_contig_dispatch_table); + }; + m.def("_cumlogsumexp_over_axis", cumlogsumexp_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_accumulate"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + using impl::cumlogsumexp_1d_include_initial_contig_dispatch_table; + using impl::cumlogsumexp_include_initial_strided_dispatch_table; + auto cumlogsumexp_include_initial_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal:: + py_accumulate_final_axis_include_initial; + return py_accumulate_final_axis_include_initial( + src, dst, exec_q, depends, + cumlogsumexp_include_initial_strided_dispatch_table, + cumlogsumexp_1d_include_initial_contig_dispatch_table); + }; + m.def("_cumlogsumexp_final_axis_include_initial", + cumlogsumexp_include_initial_pyapi, "", py::arg("src"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + auto cumlogsumexp_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype 
&output_dtype) { + using dpctl::tensor::py_internal::py_accumulate_dtype_supported; + return py_accumulate_dtype_supported( + input_dtype, output_dtype, cumlogsumexp_strided_dispatch_table); + }; + m.def("_cumlogsumexp_dtype_supported", cumlogsumexp_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype")); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp new file mode 100644 index 000000000000..f1292320bd0d --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_cumulative_logsumexp(py::module_ m); + +} // namespace dpctl::tensor::py_internal From d8c3680b6e19f15779652f41dee6172970903a19 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 02:47:29 -0800 Subject: [PATCH 109/145] Move ti.cumulative_logsumexp() and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 3 +- dpctl_ext/tensor/_accumulation.py | 87 +++++++++++++++++++++++++++++-- dpnp/dpnp_iface_trigonometric.py | 3 +- 3 files changed, 87 insertions(+), 6 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index ce489fe8a36d..95c5e64c9322 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -80,7 +80,7 @@ ) from dpctl_ext.tensor._reshape import reshape -from ._accumulation import cumulative_prod, cumulative_sum +from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum from ._clip import clip from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type 
@@ -95,6 +95,7 @@ "concat", "copy", "clip", + "cumulative_logsumexp", "cumulative_prod", "cumulative_sum", "empty", diff --git a/dpctl_ext/tensor/_accumulation.py b/dpctl_ext/tensor/_accumulation.py index 6ee5c2cfaa28..17596e5647fc 100644 --- a/dpctl_ext/tensor/_accumulation.py +++ b/dpctl_ext/tensor/_accumulation.py @@ -28,10 +28,6 @@ import dpctl import dpctl.tensor as dpt -from dpctl.tensor._type_utils import ( # _default_accumulation_dtype_fp_types, - _default_accumulation_dtype, - _to_device_supported_dtype, -) from dpctl.utils import ExecutionPlacementError, SequentialOrderManager # TODO: revert to `import dpctl.tensor...` @@ -39,6 +35,11 @@ import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_accumulation_impl as tai import dpctl_ext.tensor._tensor_impl as ti +from dpctl_ext.tensor._type_utils import ( + _default_accumulation_dtype, + _default_accumulation_dtype_fp_types, + _to_device_supported_dtype, +) from ._numpy_helper import normalize_axis_index @@ -389,3 +390,81 @@ def cumulative_prod( tai._cumprod_dtype_supported, _default_accumulation_dtype, ) + + +def cumulative_logsumexp( + x, /, *, axis=None, dtype=None, include_initial=False, out=None +): + """ + cumulative_logsumexp(x, /, *, axis=None, dtype=None, include_initial=False, + out=None) + + Calculates the cumulative logsmumexp of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which cumulative logsumexp must be computed. + If `None`, the logsumexp is computed over the entire array. + If `x` is a one-dimensional array, providing an `axis` is optional; + however, if `x` has more than one dimension, providing an `axis` + is required. + Default: `None`. + dtype (Optional[dtype]): + data type of the returned array. If `None`, the default data + type is inferred from the "kind" of the input array data type. 
+ + * If `x` has a real- or complex-valued floating-point data + type, the returned array will have the same data type as + `x`. + * If `x` has signed integral data type, the returned array + will have the default signed integral type for the device + where input array `x` is allocated. + * If `x` has unsigned integral data type, the returned array + will have the default unsigned integral type for the device + where input array `x` is allocated. + * If `x` has a boolean data type, the returned array will + have the default signed integral type for the device + where input array `x` is allocated. + + If the data type (either specified or resolved) differs from the + data type of `x`, the input array elements are cast to the + specified data type before computing the cumulative logsumexp. + Default: `None`. + include_initial (bool): + boolean indicating whether to include the initial value (i.e., the + additive identity, zero) as the first value along the provided axis + in the output. Default: `False`. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of `out` must match the expected shape and the + expected data type of the result or (if provided) `dtype`. + If `None` then a new array is returned. Default: `None`. + + Returns: + usm_ndarray: + an array containing cumulative logsumexp results. The returned + array has the data type as described in the `dtype` parameter + description above. 
+ + The returned array shape is determined as follows: + + * If `include_initial` is `False`, the returned array will + have the same shape as `x` + * If `include_initial` is `True`, the returned array will + have the same shape as `x` except the axis along which the + cumulative logsumexp is calculated, which will have size + `N+1` + """ + return _accumulate_common( + x, + axis, + dtype, + include_initial, + out, + tai._cumlogsumexp_over_axis, + tai._cumlogsumexp_final_axis_include_initial, + tai._cumlogsumexp_dtype_supported, + _default_accumulation_dtype_fp_types, + ) diff --git a/dpnp/dpnp_iface_trigonometric.py b/dpnp/dpnp_iface_trigonometric.py index 9894bd304701..460a0dc80f0f 100644 --- a/dpnp/dpnp_iface_trigonometric.py +++ b/dpnp/dpnp_iface_trigonometric.py @@ -48,6 +48,7 @@ # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._type_utils as dtu import dpnp import dpnp.backend.extensions.ufunc._ufunc_impl as ufi @@ -934,7 +935,7 @@ def cumlogsumexp( return dpnp_wrap_reduction_call( usm_x, out, - dpt.cumulative_logsumexp, + dpt_ext.cumulative_logsumexp, _get_accumulation_res_dt(x, dtype), axis=axis, dtype=dtype, From a1d9b0a737a7ef1048de4f5c6723eeff5a5ccc3a Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 05:06:49 -0800 Subject: [PATCH 110/145] Extend ignore-words-list codespell --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 67fb75cb5f54..73844d1b0c17 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -108,7 +108,7 @@ target-version = ['py310', 'py311', 'py312', 'py313', 'py314'] [tool.codespell] builtin = "clear,rare,informal,names" check-filenames = true -ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT,AccessorT,IndexT" +ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT,AccessorT,IndexT,fpT" 
quiet-level = 3 [tool.coverage.report] From 8a87fea7458a803be34f81139b3ac27f741eed1d Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 05:07:00 -0800 Subject: [PATCH 111/145] Move utils rich_comparisons.hpp file --- .../include/utils/rich_comparisons.hpp | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 dpctl_ext/tensor/libtensor/include/utils/rich_comparisons.hpp diff --git a/dpctl_ext/tensor/libtensor/include/utils/rich_comparisons.hpp b/dpctl_ext/tensor/libtensor/include/utils/rich_comparisons.hpp new file mode 100644 index 000000000000..87cdfbfbd54f --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/utils/rich_comparisons.hpp @@ -0,0 +1,149 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include "sycl/sycl.hpp" + +namespace dpctl::tensor::rich_comparisons +{ + +namespace detail +{ +template +struct ExtendedRealFPLess +{ + /* [R, nan] */ + bool operator()(const fpT v1, const fpT v2) const + { + return (!std::isnan(v1) && (std::isnan(v2) || (v1 < v2))); + } +}; + +template +struct ExtendedRealFPGreater +{ + bool operator()(const fpT v1, const fpT v2) const + { + return (!std::isnan(v2) && (std::isnan(v1) || (v2 < v1))); + } +}; + +template +struct ExtendedComplexFPLess +{ + /* [(R, R), (R, nan), (nan, R), (nan, nan)] */ + + bool operator()(const cT &v1, const cT &v2) const + { + using realT = typename cT::value_type; + + const realT real1 = std::real(v1); + const realT real2 = std::real(v2); + + const bool r1_nan = std::isnan(real1); + const bool r2_nan = std::isnan(real2); + + const realT imag1 = std::imag(v1); + const realT imag2 = std::imag(v2); + + const bool i1_nan = std::isnan(imag1); + const bool i2_nan = std::isnan(imag2); + + const int idx1 = ((r1_nan) ? 2 : 0) + ((i1_nan) ? 1 : 0); + const int idx2 = ((r2_nan) ? 2 : 0) + ((i2_nan) ? 
1 : 0); + + const bool res = + !(r1_nan && i1_nan) && + ((idx1 < idx2) || + ((idx1 == idx2) && + ((r1_nan && !i1_nan && (imag1 < imag2)) || + (!r1_nan && i1_nan && (real1 < real2)) || + (!r1_nan && !i1_nan && + ((real1 < real2) || (!(real2 < real1) && (imag1 < imag2))))))); + + return res; + } +}; + +template +struct ExtendedComplexFPGreater +{ + bool operator()(const cT &v1, const cT &v2) const + { + auto less_ = ExtendedComplexFPLess{}; + return less_(v2, v1); + } +}; + +template +inline constexpr bool is_fp_v = (std::is_same_v || + std::is_same_v || + std::is_same_v); + +} // namespace detail + +template +struct AscendingSorter +{ + using type = std::conditional_t, + detail::ExtendedRealFPLess, + std::less>; +}; + +template +struct AscendingSorter> +{ + using type = detail::ExtendedComplexFPLess>; +}; + +template +struct DescendingSorter +{ + using type = std::conditional_t, + detail::ExtendedRealFPGreater, + std::greater>; +}; + +template +struct DescendingSorter> +{ + using type = detail::ExtendedComplexFPGreater>; +}; + +} // namespace dpctl::tensor::rich_comparisons From 43a7a0474168c29c55a9c07a572cb138e3117c4e Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 05:49:51 -0800 Subject: [PATCH 112/145] Initialize _tensor_sorting_impl extension and move _radix_sort and py_sort --- dpctl_ext/tensor/CMakeLists.txt | 21 +- .../include/kernels/sorting/radix_sort.hpp | 1920 +++++++++++++++++ .../kernels/sorting/sort_impl_fn_ptr_t.hpp | 61 + .../include/kernels/sorting/sort_utils.hpp | 148 ++ .../source/sorting/py_sort_common.hpp | 179 ++ .../libtensor/source/sorting/radix_sort.cpp | 192 ++ .../libtensor/source/sorting/radix_sort.hpp | 47 + .../source/sorting/radix_sort_support.hpp | 78 + .../libtensor/source/tensor_sorting.cpp | 57 + pyproject.toml | 2 +- 10 files changed, 2703 insertions(+), 2 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/sorting/radix_sort.hpp create mode 100644 
dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/py_sort_common.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/radix_sort.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/radix_sort.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/radix_sort_support.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index eff5e7552648..cf0a78efdd59 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -69,10 +69,23 @@ set(_accumulator_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_prod.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_sum.cpp ) +set(_sorting_sources + #{CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/isin.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_sort.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_argsort.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_sort.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_argsort.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/searchsorted.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/topk.cpp +) set(_tensor_accumulation_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_accumulation.cpp ${_accumulator_sources} ) +set(_tensor_sorting_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_sorting.cpp + ${_sorting_sources} +) set(_static_lib_trgt simplify_iteration_space) @@ -101,6 +114,12 @@ add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_accumulation_i target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) list(APPEND _py_trgts ${python_module_name}) 
+set(python_module_name _tensor_sorting_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_sorting_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_sorting_impl_sources}) +target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) +list(APPEND _py_trgts ${python_module_name}) + set(_clang_prefix "") if(WIN32) set(_clang_prefix "/clang:") @@ -117,7 +136,7 @@ list( APPEND _no_fast_math_sources # ${_elementwise_sources} # ${_reduction_sources} - # ${_sorting_sources} + ${_sorting_sources} # ${_linalg_sources} ${_accumulator_sources} ) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/radix_sort.hpp new file mode 100644 index 000000000000..545444101fc6 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/radix_sort.hpp @@ -0,0 +1,1920 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/sorting/sort_utils.hpp" +#include "utils/sycl_alloc_utils.hpp" + +namespace dpctl::tensor::kernels +{ + +namespace radix_sort_details +{ + +template +class radix_sort_count_kernel; + +template +class radix_sort_scan_kernel; + +template +class radix_sort_reorder_peer_kernel; + +template +class radix_sort_reorder_kernel; + +/*! 
@brief Computes smallest exponent such that `n <= (1 << exponent)` */ +template && + sizeof(SizeT) == sizeof(std::uint64_t), + int> = 0> +std::uint32_t ceil_log2(SizeT n) +{ + // if n > 2^b, n = q * 2^b + r for q > 0 and 0 <= r < 2^b + // floor_log2(q * 2^b + r) == floor_log2(q * 2^b) == q + floor_log2(n1) + // ceil_log2(n) == 1 + floor_log2(n-1) + if (n <= 1) + return std::uint32_t{1}; + + std::uint32_t exp{1}; + --n; + if (n >= (SizeT{1} << 32)) { + n >>= 32; + exp += 32; + } + if (n >= (SizeT{1} << 16)) { + n >>= 16; + exp += 16; + } + if (n >= (SizeT{1} << 8)) { + n >>= 8; + exp += 8; + } + if (n >= (SizeT{1} << 4)) { + n >>= 4; + exp += 4; + } + if (n >= (SizeT{1} << 2)) { + n >>= 2; + exp += 2; + } + if (n >= (SizeT{1} << 1)) { + n >>= 1; + ++exp; + } + return exp; +} + +//---------------------------------------------------------- +// bitwise order-preserving conversions to unsigned integers +//---------------------------------------------------------- + +template +bool order_preserving_cast(bool val) +{ + if constexpr (is_ascending) + return val; + else + return !val; +} + +template , int> = 0> +UIntT order_preserving_cast(UIntT val) +{ + if constexpr (is_ascending) { + return val; + } + else { + // bitwise invert + return (~val); + } +} + +template && std::is_signed_v, + int> = 0> +std::make_unsigned_t order_preserving_cast(IntT val) +{ + using UIntT = std::make_unsigned_t; + const UIntT uint_val = sycl::bit_cast(val); + + if constexpr (is_ascending) { + // ascending_mask: 100..0 + static constexpr UIntT ascending_mask = + (UIntT(1) << std::numeric_limits::digits); + return (uint_val ^ ascending_mask); + } + else { + // descending_mask: 011..1 + static constexpr UIntT descending_mask = + (std::numeric_limits::max() >> 1); + return (uint_val ^ descending_mask); + } +} + +template +std::uint16_t order_preserving_cast(sycl::half val) +{ + using UIntT = std::uint16_t; + + const UIntT uint_val = sycl::bit_cast( + (sycl::isnan(val)) ? 
std::numeric_limits::quiet_NaN() + : val); + UIntT mask; + + // test the sign bit of the original value + const bool zero_fp_sign_bit = (UIntT(0) == (uint_val >> 15)); + + static constexpr UIntT zero_mask = UIntT(0x8000u); + static constexpr UIntT nonzero_mask = UIntT(0xFFFFu); + + static constexpr UIntT inv_zero_mask = static_cast(~zero_mask); + static constexpr UIntT inv_nonzero_mask = static_cast(~nonzero_mask); + + if constexpr (is_ascending) { + mask = (zero_fp_sign_bit) ? zero_mask : nonzero_mask; + } + else { + mask = (zero_fp_sign_bit) ? (inv_zero_mask) : (inv_nonzero_mask); + } + + return (uint_val ^ mask); +} + +template && + sizeof(FloatT) == sizeof(std::uint32_t), + int> = 0> +std::uint32_t order_preserving_cast(FloatT val) +{ + using UIntT = std::uint32_t; + + UIntT uint_val = sycl::bit_cast( + (sycl::isnan(val)) ? std::numeric_limits::quiet_NaN() : val); + + UIntT mask; + + // test the sign bit of the original value + const bool zero_fp_sign_bit = (UIntT(0) == (uint_val >> 31)); + + static constexpr UIntT zero_mask = UIntT(0x80000000u); + static constexpr UIntT nonzero_mask = UIntT(0xFFFFFFFFu); + + if constexpr (is_ascending) + mask = (zero_fp_sign_bit) ? zero_mask : nonzero_mask; + else + mask = (zero_fp_sign_bit) ? (~zero_mask) : (~nonzero_mask); + + return (uint_val ^ mask); +} + +template && + sizeof(FloatT) == sizeof(std::uint64_t), + int> = 0> +std::uint64_t order_preserving_cast(FloatT val) +{ + using UIntT = std::uint64_t; + + UIntT uint_val = sycl::bit_cast( + (sycl::isnan(val)) ? std::numeric_limits::quiet_NaN() : val); + UIntT mask; + + // test the sign bit of the original value + const bool zero_fp_sign_bit = (UIntT(0) == (uint_val >> 63)); + + static constexpr UIntT zero_mask = UIntT(0x8000000000000000u); + static constexpr UIntT nonzero_mask = UIntT(0xFFFFFFFFFFFFFFFFu); + + if constexpr (is_ascending) + mask = (zero_fp_sign_bit) ? zero_mask : nonzero_mask; + else + mask = (zero_fp_sign_bit) ? 
(~zero_mask) : (~nonzero_mask); + + return (uint_val ^ mask); +} + +//----------------- +// bucket functions +//----------------- + +template +constexpr std::size_t number_of_bits_in_type() +{ + constexpr std::size_t type_bits = + (sizeof(T) * std::numeric_limits::digits); + return type_bits; +} + +// the number of buckets (size of radix bits) in T +template +constexpr std::uint32_t number_of_buckets_in_type(std::uint32_t radix_bits) +{ + constexpr std::size_t type_bits = number_of_bits_in_type(); + return (type_bits + radix_bits - 1) / radix_bits; +} + +// get bits value (bucket) in a certain radix position +template +std::uint32_t get_bucket_id(T val, std::uint32_t radix_offset) +{ + static_assert(std::is_unsigned_v); + + return (val >> radix_offset) & T(radix_mask); +} + +//-------------------------------- +// count kernel (single iteration) +//-------------------------------- + +template +sycl::event + radix_sort_count_submit(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_segments, + std::size_t wg_size, + std::uint32_t radix_offset, + std::size_t n_values, + ValueT *vals_ptr, + std::size_t n_counts, + CountT *counts_ptr, + const Proj &proj_op, + const bool is_ascending, + const std::vector &dependency_events) +{ + // bin_count = radix_states used for an array storing bucket state counters + static constexpr std::uint32_t radix_states = + (std::uint32_t(1) << radix_bits); + static constexpr std::uint32_t radix_mask = radix_states - 1; + + // iteration space info + const std::size_t n = n_values; + // each segment is processed by a work-group + const std::size_t elems_per_segment = (n + n_segments - 1) / n_segments; + const std::size_t no_op_flag_id = n_counts - 1; + + assert(n_counts == (n_segments + 1) * radix_states + 1); + + sycl::event local_count_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependency_events); + + sycl::local_accessor counts_lacc(wg_size * radix_states, + cgh); + + sycl::nd_range<1> ndRange(n_iters * 
n_segments * wg_size, wg_size); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> ndit) { + // 0 <= lid < wg_size + const std::size_t lid = ndit.get_local_id(0); + // 0 <= group_id < n_segments * n_iters + const std::size_t group_id = ndit.get_group(0); + const std::size_t iter_id = group_id / n_segments; + const std::size_t val_iter_offset = iter_id * n; + // 0 <= wgr_id < n_segments + const std::size_t wgr_id = group_id - iter_id * n_segments; + + const std::size_t seg_start = elems_per_segment * wgr_id; + + // count per work-item: create a private array for storing count + // values here bin_count = radix_states + std::array counts_arr = {CountT{0}}; + + // count per work-item: count values and write result to private + // count array + const std::size_t seg_end = + sycl::min(seg_start + elems_per_segment, n); + if (is_ascending) { + for (std::size_t val_id = seg_start + lid; val_id < seg_end; + val_id += wg_size) { + // get the bucket for the bit-ordered input value, + // applying the offset and mask for radix bits + const auto val = + order_preserving_cast( + proj_op(vals_ptr[val_iter_offset + val_id])); + const std::uint32_t bucket_id = + get_bucket_id(val, radix_offset); + + // increment counter for this bit bucket + ++counts_arr[bucket_id]; + } + } + else { + for (std::size_t val_id = seg_start + lid; val_id < seg_end; + val_id += wg_size) { + // get the bucket for the bit-ordered input value, + // applying the offset and mask for radix bits + const auto val = + order_preserving_cast( + proj_op(vals_ptr[val_iter_offset + val_id])); + const std::uint32_t bucket_id = + get_bucket_id(val, radix_offset); + + // increment counter for this bit bucket + ++counts_arr[bucket_id]; + } + } + + // count per work-item: write private count array to local count + // array counts_lacc is concatenation of private count arrays from + // each work-item in the order of their local ids + const std::uint32_t count_start_id = radix_states * lid; + for (std::uint32_t 
radix_state_id = 0; + radix_state_id < radix_states; ++radix_state_id) + { + counts_lacc[count_start_id + radix_state_id] = + counts_arr[radix_state_id]; + } + + sycl::group_barrier(ndit.get_group()); + + // count per work-group: reduce till count_lacc[] size > wg_size + // all work-items in the work-group do the work. + for (std::uint32_t i = 1; i < radix_states; ++i) { + // Since we interested in computing total count over work-group + // for each radix state, the correct result is only assured if + // wg_size >= radix_states + counts_lacc[lid] += counts_lacc[wg_size * i + lid]; + } + + sycl::group_barrier(ndit.get_group()); + + // count per work-group: reduce until count_lacc[] size > + // radix_states (n_witems /= 2 per iteration) + for (std::uint32_t n_witems = (wg_size >> 1); + n_witems >= radix_states; n_witems >>= 1) + { + if (lid < n_witems) + counts_lacc[lid] += counts_lacc[n_witems + lid]; + + sycl::group_barrier(ndit.get_group()); + } + + const std::size_t iter_counter_offset = iter_id * n_counts; + + // count per work-group: write local count array to global count + // array + if (lid < radix_states) { + // move buckets with the same id to adjacent positions, + // thus splitting count array into radix_states regions + counts_ptr[iter_counter_offset + (n_segments + 1) * lid + + wgr_id] = counts_lacc[lid]; + } + + // side work: reset 'no-operation-flag', signaling to skip re-order + // phase + if (wgr_id == 0 && lid == 0) { + CountT &no_op_flag = + counts_ptr[iter_counter_offset + no_op_flag_id]; + no_op_flag = 0; + } + }); + }); + + return local_count_ev; +} + +//----------------------------------------------------------------------- +// radix sort: scan kernel (single iteration) +//----------------------------------------------------------------------- + +template +sycl::event radix_sort_scan_submit(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_segments, + std::size_t wg_size, + std::size_t n_values, + std::size_t n_counts, + CountT 
*counts_ptr, + const std::vector depends) +{ + const std::size_t no_op_flag_id = n_counts - 1; + + // Scan produces local offsets using count values. + // There are no local offsets for the first segment, but the rest segments + // should be scanned with respect to the count value in the first segment + // what requires n + 1 positions + const std::size_t scan_size = n_segments + 1; + wg_size = std::min(scan_size, wg_size); + + static constexpr std::uint32_t radix_states = std::uint32_t(1) + << radix_bits; + + // compilation of the kernel prevents out of resources issue, which may + // occur due to usage of collective algorithms such as joint_exclusive_scan + // even if local memory is not explicitly requested + sycl::event scan_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::nd_range<1> ndRange(n_iters * radix_states * wg_size, wg_size); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> ndit) { + const std::size_t group_id = ndit.get_group(0); + const std::size_t iter_id = group_id / radix_states; + const std::size_t wgr_id = group_id - iter_id * radix_states; + // find borders of a region with a specific bucket id + auto begin_ptr = + counts_ptr + scan_size * wgr_id + iter_id * n_counts; + + sycl::joint_exclusive_scan(ndit.get_group(), begin_ptr, + begin_ptr + scan_size, begin_ptr, + CountT(0), sycl::plus{}); + + const auto lid = ndit.get_local_linear_id(); + + // NB: No race condition here, because the condition may ever be + // true for only on one WG, one WI. 
+ if ((lid == wg_size - 1) && (begin_ptr[scan_size - 1] == n_values)) + { + // set flag, since all the values got into one + // this is optimization, may happy often for + // higher radix offsets (all zeros) + auto &no_op_flag = + counts_ptr[iter_id * n_counts + no_op_flag_id]; + no_op_flag = 1; + } + }); + }); + + return scan_ev; +} + +//----------------------------------------------------------------------- +// radix sort: group level reorder algorithms +//----------------------------------------------------------------------- + +struct empty_storage +{ + template + empty_storage(T &&...) + { + } +}; + +// Number with `n` least significant bits of uint32_t +inline std::uint32_t n_ls_bits_set(std::uint32_t n) noexcept +{ + static constexpr std::uint32_t zero{}; + static constexpr std::uint32_t all_bits_set = ~zero; + + return ~(all_bits_set << n); +} + +enum class peer_prefix_algo +{ + subgroup_ballot, + atomic_fetch_or, + scan_then_broadcast +}; + +template +struct peer_prefix_helper; + +template +auto get_accessor_pointer(const AccT &acc) +{ + return acc.template get_multi_ptr().get(); +} + +template +struct peer_prefix_helper +{ + using AtomicT = sycl::atomic_ref; + using TempStorageT = sycl::local_accessor; + +private: + sycl::sub_group sgroup; + std::uint32_t lid; + std::uint32_t item_mask; + AtomicT atomic_peer_mask; + +public: + peer_prefix_helper(sycl::nd_item<1> ndit, TempStorageT lacc) + : sgroup(ndit.get_sub_group()), lid(ndit.get_local_linear_id()), + item_mask(n_ls_bits_set(lid)), atomic_peer_mask(lacc[0]) + { + } + + std::uint32_t peer_contribution(OffsetT &new_offset_id, + OffsetT offset_prefix, + bool wi_bit_set) const + { + // reset mask for each radix state + if (lid == 0) + atomic_peer_mask.store(std::uint32_t{0}); + sycl::group_barrier(sgroup); + + const std::uint32_t uint_contrib{wi_bit_set ? 
std::uint32_t{1} + : std::uint32_t{0}}; + + // set local id's bit to 1 if the bucket value matches the radix state + atomic_peer_mask.fetch_or(uint_contrib << lid); + sycl::group_barrier(sgroup); + std::uint32_t peer_mask_bits = atomic_peer_mask.load(); + std::uint32_t sg_total_offset = sycl::popcount(peer_mask_bits); + + // get the local offset index from the bits set in the peer mask with + // index less than the work item ID + peer_mask_bits &= item_mask; + new_offset_id |= wi_bit_set + ? (offset_prefix + sycl::popcount(peer_mask_bits)) + : OffsetT{0}; + return sg_total_offset; + } +}; + +template +struct peer_prefix_helper +{ + using TempStorageT = empty_storage; + using ItemType = sycl::nd_item<1>; + using SubGroupType = sycl::sub_group; + +private: + SubGroupType sgroup; + std::uint32_t sg_size; + +public: + peer_prefix_helper(sycl::nd_item<1> ndit, TempStorageT) + : sgroup(ndit.get_sub_group()), sg_size(sgroup.get_local_range()[0]) + { + } + + std::uint32_t peer_contribution(OffsetT &new_offset_id, + OffsetT offset_prefix, + bool wi_bit_set) const + { + const std::uint32_t contrib{wi_bit_set ? std::uint32_t{1} + : std::uint32_t{0}}; + + std::uint32_t sg_item_offset = sycl::exclusive_scan_over_group( + sgroup, contrib, sycl::plus{}); + + new_offset_id |= + (wi_bit_set ? 
(offset_prefix + sg_item_offset) : OffsetT(0)); + + // the last scanned value does not contain number of all copies, thus + // adding contribution + std::uint32_t sg_total_offset = sycl::group_broadcast( + sgroup, sg_item_offset + contrib, sg_size - 1); + + return sg_total_offset; + } +}; + +template +struct peer_prefix_helper +{ +private: + sycl::sub_group sgroup; + std::uint32_t lid; + sycl::ext::oneapi::sub_group_mask item_sg_mask; + + sycl::ext::oneapi::sub_group_mask mask_builder(std::uint32_t mask, + std::uint32_t sg_size) + { + return sycl::detail::Builder::createSubGroupMask< + sycl::ext::oneapi::sub_group_mask>(mask, sg_size); + } + +public: + using TempStorageT = empty_storage; + + peer_prefix_helper(sycl::nd_item<1> ndit, TempStorageT) + : sgroup(ndit.get_sub_group()), lid(ndit.get_local_linear_id()), + item_sg_mask( + mask_builder(n_ls_bits_set(lid), sgroup.get_local_linear_range())) + { + } + + std::uint32_t peer_contribution(OffsetT &new_offset_id, + OffsetT offset_prefix, + bool wi_bit_set) const + { + // set local id's bit to 1 if the bucket value matches the radix state + auto peer_mask = sycl::ext::oneapi::group_ballot(sgroup, wi_bit_set); + std::uint32_t peer_mask_bits{}; + + peer_mask.extract_bits(peer_mask_bits); + std::uint32_t sg_total_offset = sycl::popcount(peer_mask_bits); + + // get the local offset index from the bits set in the peer mask with + // index less than the work item ID + peer_mask &= item_sg_mask; + peer_mask.extract_bits(peer_mask_bits); + + new_offset_id |= wi_bit_set + ? 
(offset_prefix + sycl::popcount(peer_mask_bits)) + : OffsetT(0); + + return sg_total_offset; + } +}; + +template +void copy_func_for_radix_sort(const std::size_t n_segments, + const std::size_t elems_per_segment, + const std::size_t sg_size, + const std::uint32_t lid, + const std::size_t wgr_id, + const InputT *input_ptr, + const std::size_t n_values, + OutputT *output_ptr) +{ + // item info + const std::size_t seg_start = elems_per_segment * wgr_id; + + std::size_t seg_end = sycl::min(seg_start + elems_per_segment, n_values); + + // ensure that each work item in a subgroup does the same number of loop + // iterations + const std::uint16_t tail_size = (seg_end - seg_start) % sg_size; + seg_end -= tail_size; + + // find offsets for the same values within a segment and fill the resulting + // buffer + for (std::size_t val_id = seg_start + lid; val_id < seg_end; + val_id += sg_size) { + output_ptr[val_id] = std::move(input_ptr[val_id]); + } + + if (tail_size > 0 && lid < tail_size) { + const std::size_t val_id = seg_end + lid; + output_ptr[val_id] = std::move(input_ptr[val_id]); + } +} + +//----------------------------------------------------------------------- +// radix sort: reorder kernel (per iteration) +//----------------------------------------------------------------------- +template +sycl::event + radix_sort_reorder_submit(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_segments, + std::uint32_t radix_offset, + std::size_t n_values, + const InputT *input_ptr, + OutputT *output_ptr, + std::size_t n_offsets, + OffsetT *offset_ptr, + const ProjT &proj_op, + const bool is_ascending, + const std::vector dependency_events) +{ + using ValueT = InputT; + using PeerHelper = peer_prefix_helper; + + static constexpr std::uint32_t radix_states = std::uint32_t{1} + << radix_bits; + static constexpr std::uint32_t radix_mask = radix_states - 1; + const std::size_t elems_per_segment = + (n_values + n_segments - 1) / n_segments; + + const std::size_t no_op_flag_id 
= n_offsets - 1; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + sycl::event reorder_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependency_events); + cgh.use_kernel_bundle(kb); + + using StorageT = typename PeerHelper::TempStorageT; + + StorageT peer_temp(1, cgh); + + sycl::range<1> lRange{sg_size}; + sycl::range<1> gRange{n_iters * n_segments * sg_size}; + + sycl::nd_range<1> ndRange{gRange, lRange}; + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> ndit) { + const std::size_t group_id = ndit.get_group(0); + const std::size_t iter_id = group_id / n_segments; + const std::size_t segment_id = group_id - iter_id * n_segments; + + auto b_offset_ptr = offset_ptr + iter_id * n_offsets; + auto b_input_ptr = input_ptr + iter_id * n_values; + auto b_output_ptr = output_ptr + iter_id * n_values; + + const std::uint32_t lid = ndit.get_local_id(0); + + auto &no_op_flag = b_offset_ptr[no_op_flag_id]; + if (no_op_flag) { + // no reordering necessary, simply copy + copy_func_for_radix_sort( + n_segments, elems_per_segment, sg_size, lid, segment_id, + b_input_ptr, n_values, b_output_ptr); + return; + } + + // create a private array for storing offset values + // and add total offset and offset for compute unit + // for a certain radix state + std::array offset_arr{}; + const std::size_t scan_size = n_segments + 1; + + OffsetT scanned_bin = 0; + + /* find cumulative offset */ + static constexpr std::uint32_t zero_radix_state_id = 0; + offset_arr[zero_radix_state_id] = b_offset_ptr[segment_id]; + + for (std::uint32_t radix_state_id = 1; + radix_state_id < radix_states; ++radix_state_id) + { + const std::uint32_t local_offset_id = + segment_id + 
scan_size * radix_state_id; + + // scan bins serially + const std::size_t last_segment_bucket_id = + radix_state_id * scan_size - 1; + scanned_bin += b_offset_ptr[last_segment_bucket_id]; + + offset_arr[radix_state_id] = + scanned_bin + b_offset_ptr[local_offset_id]; + } + + const std::size_t seg_start = elems_per_segment * segment_id; + std::size_t seg_end = + sycl::min(seg_start + elems_per_segment, n_values); + // ensure that each work item in a subgroup does the same number of + // loop iterations + const std::uint32_t tail_size = (seg_end - seg_start) % sg_size; + seg_end -= tail_size; + + const PeerHelper peer_prefix_hlp(ndit, peer_temp); + + // find offsets for the same values within a segment and fill the + // resulting buffer + if (is_ascending) { + for (std::size_t val_id = seg_start + lid; val_id < seg_end; + val_id += sg_size) { + ValueT in_val = std::move(b_input_ptr[val_id]); + + // get the bucket for the bit-ordered input value, applying + // the offset and mask for radix bits + const auto mapped_val = + order_preserving_cast( + proj_op(in_val)); + std::uint32_t bucket_id = + get_bucket_id(mapped_val, radix_offset); + + OffsetT new_offset_id = 0; + for (std::uint32_t radix_state_id = 0; + radix_state_id < radix_states; ++radix_state_id) + { + bool is_current_bucket = (bucket_id == radix_state_id); + std::uint32_t sg_total_offset = + peer_prefix_hlp.peer_contribution( + /* modified by reference */ new_offset_id, + offset_arr[radix_state_id], + /* bit contribution from this work-item */ + is_current_bucket); + offset_arr[radix_state_id] += sg_total_offset; + } + b_output_ptr[new_offset_id] = std::move(in_val); + } + } + else { + for (std::size_t val_id = seg_start + lid; val_id < seg_end; + val_id += sg_size) { + ValueT in_val = std::move(b_input_ptr[val_id]); + + // get the bucket for the bit-ordered input value, applying + // the offset and mask for radix bits + const auto mapped_val = + order_preserving_cast( + proj_op(in_val)); + std::uint32_t 
bucket_id = + get_bucket_id(mapped_val, radix_offset); + + OffsetT new_offset_id = 0; + for (std::uint32_t radix_state_id = 0; + radix_state_id < radix_states; ++radix_state_id) + { + bool is_current_bucket = (bucket_id == radix_state_id); + std::uint32_t sg_total_offset = + peer_prefix_hlp.peer_contribution( + /* modified by reference */ new_offset_id, + offset_arr[radix_state_id], + /* bit contribution from this work-item */ + is_current_bucket); + offset_arr[radix_state_id] += sg_total_offset; + } + b_output_ptr[new_offset_id] = std::move(in_val); + } + } + if (tail_size > 0) { + ValueT in_val; + + // default: is greater than any actual radix state + std::uint32_t bucket_id = radix_states; + if (lid < tail_size) { + in_val = std::move(b_input_ptr[seg_end + lid]); + + const auto proj_val = proj_op(in_val); + const auto mapped_val = + (is_ascending) + ? order_preserving_cast( + proj_val) + : order_preserving_cast( + proj_val); + bucket_id = + get_bucket_id(mapped_val, radix_offset); + } + + OffsetT new_offset_id = 0; + for (std::uint32_t radix_state_id = 0; + radix_state_id < radix_states; ++radix_state_id) + { + bool is_current_bucket = (bucket_id == radix_state_id); + std::uint32_t sg_total_offset = + peer_prefix_hlp.peer_contribution( + new_offset_id, offset_arr[radix_state_id], + is_current_bucket); + + offset_arr[radix_state_id] += sg_total_offset; + } + + if (lid < tail_size) { + b_output_ptr[new_offset_id] = std::move(in_val); + } + } + }); + }); + + return reorder_ev; +} + +template +sizeT _slm_adjusted_work_group_size(sycl::queue &exec_q, + sizeT required_slm_bytes_per_wg, + sizeT wg_size) +{ + const auto &dev = exec_q.get_device(); + + if (wg_size == 0) + wg_size = + dev.template get_info(); + + const auto local_mem_sz = + dev.template get_info(); + + return sycl::min(local_mem_sz / required_slm_bytes_per_wg, wg_size); +} + +//----------------------------------------------------------------------- +// radix sort: one iteration 
+//----------------------------------------------------------------------- + +template +struct parallel_radix_sort_iteration_step +{ + template + using count_phase = radix_sort_count_kernel; + template + using local_scan_phase = radix_sort_scan_kernel; + template + using reorder_peer_phase = + radix_sort_reorder_peer_kernel; + template + using reorder_phase = radix_sort_reorder_kernel; + + template + static sycl::event submit(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_segments, + std::uint32_t radix_iter, + std::size_t n_values, + const InputT *in_ptr, + OutputT *out_ptr, + std::size_t n_counts, + CountT *counts_ptr, + const ProjT &proj_op, + const bool is_ascending, + const std::vector &dependency_events) + { + using _RadixCountKernel = count_phase; + using _RadixLocalScanKernel = + local_scan_phase; + using _RadixReorderPeerKernel = + reorder_peer_phase; + using _RadixReorderKernel = + reorder_phase; + + const auto &supported_sub_group_sizes = + exec_q.get_device() + .template get_info(); + const std::size_t max_sg_size = + (supported_sub_group_sizes.empty() + ? 0 + : supported_sub_group_sizes.back()); + const std::size_t reorder_sg_size = max_sg_size; + const std::size_t scan_wg_size = + exec_q.get_device() + .template get_info(); + + static constexpr std::size_t two_mils = (std::size_t(1) << 21); + std::size_t count_wg_size = + ((max_sg_size > 0) && (n_values > two_mils) ? 
128 : max_sg_size); + + static constexpr std::uint32_t radix_states = std::uint32_t(1) + << radix_bits; + + // correct count_wg_size according to local memory limit in count phase + const auto max_count_wg_size = _slm_adjusted_work_group_size( + exec_q, sizeof(CountT) * radix_states, count_wg_size); + count_wg_size = + static_cast<::std::size_t>((max_count_wg_size / radix_states)) * + radix_states; + + // work-group size must be a power of 2 and not less than the number of + // states, for scanning to work correctly + + const std::size_t rounded_down_count_wg_size = + std::size_t{1} << (number_of_bits_in_type() - + sycl::clz(count_wg_size) - 1); + count_wg_size = + sycl::max(rounded_down_count_wg_size, std::size_t(radix_states)); + + // Compute the radix position for the given iteration + std::uint32_t radix_offset = radix_iter * radix_bits; + + // 1. Count Phase + sycl::event count_ev = + radix_sort_count_submit<_RadixCountKernel, radix_bits>( + exec_q, n_iters, n_segments, count_wg_size, radix_offset, + n_values, in_ptr, n_counts, counts_ptr, proj_op, is_ascending, + dependency_events); + + // 2. Scan Phase + sycl::event scan_ev = + radix_sort_scan_submit<_RadixLocalScanKernel, radix_bits>( + exec_q, n_iters, n_segments, scan_wg_size, n_values, n_counts, + counts_ptr, {count_ev}); + + // 3. 
Reorder Phase + sycl::event reorder_ev{}; + // subgroup_ballot-based peer algo uses extract_bits to populate + // uint32_t mask and hence relies on sub-group to be 32 or narrower + static constexpr std::size_t sg32_v = 32u; + static constexpr std::size_t sg16_v = 16u; + static constexpr std::size_t sg08_v = 8u; + if (sg32_v == reorder_sg_size || sg16_v == reorder_sg_size || + sg08_v == reorder_sg_size) + { + static constexpr auto peer_algorithm = + peer_prefix_algo::subgroup_ballot; + + reorder_ev = radix_sort_reorder_submit<_RadixReorderPeerKernel, + radix_bits, peer_algorithm>( + exec_q, n_iters, n_segments, radix_offset, n_values, in_ptr, + out_ptr, n_counts, counts_ptr, proj_op, is_ascending, + {scan_ev}); + } + else { + static constexpr auto peer_algorithm = + peer_prefix_algo::scan_then_broadcast; + + reorder_ev = radix_sort_reorder_submit<_RadixReorderKernel, + radix_bits, peer_algorithm>( + exec_q, n_iters, n_segments, radix_offset, n_values, in_ptr, + out_ptr, n_counts, counts_ptr, proj_op, is_ascending, + {scan_ev}); + } + + return reorder_ev; + } +}; // struct parallel_radix_sort_iteration + +template +class radix_sort_one_wg_krn; + +template +struct subgroup_radix_sort +{ +private: + class use_slm_tag + { + }; + class use_global_mem_tag + { + }; + +public: + template + sycl::event operator()(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_to_sort, + ValueT *input_ptr, + OutputT *output_ptr, + ProjT proj_op, + const bool is_ascending, + const std::vector &depends) + { + static_assert(std::is_same_v, OutputT>); + + using _SortKernelLoc = + radix_sort_one_wg_krn; + using _SortKernelPartGlob = + radix_sort_one_wg_krn; + using _SortKernelGlob = + radix_sort_one_wg_krn; + + static constexpr std::size_t max_concurrent_work_groups = 128U; + + // Choose this to occupy the entire accelerator + const std::size_t n_work_groups = + std::min(n_iters, max_concurrent_work_groups); + + // determine which temporary allocation can be accommodated in SLM + 
const auto &SLM_availability = + check_slm_size(exec_q, n_to_sort); + + const std::size_t n_batch_size = n_work_groups; + + switch (SLM_availability) { + case temp_allocations::both_in_slm: + { + static constexpr auto storage_for_values = use_slm_tag{}; + static constexpr auto storage_for_counters = use_slm_tag{}; + + return one_group_submitter<_SortKernelLoc>()( + exec_q, n_iters, n_iters, n_to_sort, input_ptr, output_ptr, + proj_op, is_ascending, storage_for_values, storage_for_counters, + depends); + } + case temp_allocations::counters_in_slm: + { + static constexpr auto storage_for_values = use_global_mem_tag{}; + static constexpr auto storage_for_counters = use_slm_tag{}; + + return one_group_submitter<_SortKernelPartGlob>()( + exec_q, n_iters, n_batch_size, n_to_sort, input_ptr, output_ptr, + proj_op, is_ascending, storage_for_values, storage_for_counters, + depends); + } + default: + { + static constexpr auto storage_for_values = use_global_mem_tag{}; + static constexpr auto storage_for_counters = use_global_mem_tag{}; + + return one_group_submitter<_SortKernelGlob>()( + exec_q, n_iters, n_batch_size, n_to_sort, input_ptr, output_ptr, + proj_op, is_ascending, storage_for_values, storage_for_counters, + depends); + } + } + } + +private: + template + class TempBuf; + + template + class TempBuf + { + std::size_t buf_size; + + public: + TempBuf(std::size_t, std::size_t n) : buf_size(n) {} + auto get_acc(sycl::handler &cgh) + { + return sycl::local_accessor(buf_size, cgh); + } + + std::size_t get_iter_stride() const + { + return std::size_t{0}; + } + }; + + template + class TempBuf + { + sycl::buffer buf; + std::size_t iter_stride; + + public: + TempBuf(std::size_t n_iters, std::size_t n) + : buf(n_iters * n), iter_stride(n) + { + } + auto get_acc(sycl::handler &cgh) + { + return sycl::accessor(buf, cgh, sycl::read_write, sycl::no_init); + } + std::size_t get_iter_stride() const + { + return iter_stride; + } + }; + + static_assert(wg_size <= 1024); + static 
constexpr std::uint16_t bin_count = (1 << radix); + static constexpr std::uint16_t counter_buf_sz = wg_size * bin_count + 1; + + enum class temp_allocations + { + both_in_slm, + counters_in_slm, + both_in_global_mem + }; + + template + temp_allocations check_slm_size(const sycl::queue &exec_q, SizeT n) + { + // the kernel is designed for data size <= 64K + assert(n <= (SizeT(1) << 16)); + + static constexpr auto req_slm_size_counters = + counter_buf_sz * sizeof(std::uint16_t); + + const auto &dev = exec_q.get_device(); + + // Pessimistically only use half of the memory to take into account + // a SYCL group algorithm might use a portion of SLM + const std::size_t max_slm_size = + dev.template get_info() / 2; + + const auto n_uniform = 1 << ceil_log2(n); + const auto req_slm_size_val = sizeof(T) * n_uniform; + + return ((req_slm_size_val + req_slm_size_counters) <= max_slm_size) + ? + // the values and the counters are placed in SLM + temp_allocations::both_in_slm + : (req_slm_size_counters <= max_slm_size) + ? 
+ // the counters are placed in SLM, the values - in the + // global memory + temp_allocations::counters_in_slm + : + // the values and the counters are placed in the global + // memory + temp_allocations::both_in_global_mem; + } + + template + struct one_group_submitter + { + template + sycl::event operator()(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_batch_size, + std::size_t n_values, + InputT *input_arr, + OutputT *output_arr, + const ProjT &proj_op, + const bool is_ascending, + SLM_value_tag, + SLM_counter_tag, + const std::vector &depends) + { + assert(!(n_values >> 16)); + + assert(n_values <= static_cast(block_size) * + static_cast(wg_size)); + + const std::uint16_t n = static_cast(n_values); + static_assert(std::is_same_v, OutputT>); + + using ValueT = OutputT; + + using KeyT = std::invoke_result_t; + + TempBuf buf_val( + n_batch_size, static_cast(block_size * wg_size)); + TempBuf buf_count( + n_batch_size, static_cast(counter_buf_sz)); + + sycl::range<1> lRange{wg_size}; + + sycl::event sort_ev; + std::vector deps{depends}; + + const std::size_t n_batches = + (n_iters + n_batch_size - 1) / n_batch_size; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + const auto &krn = kb.get_kernel(kernel_id); + + const std::uint32_t krn_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + // due to a bug in CPU device implementation, an additional + // synchronization is necessary for short sub-group sizes + const bool work_around_needed = + exec_q.get_device().has(sycl::aspect::cpu) && + (krn_sg_size < 16); + + for (std::size_t batch_id = 0; batch_id < n_batches; ++batch_id) { + + const std::size_t block_start = batch_id * n_batch_size; + + // input_arr/output_arr each has shape (n_iters, n) + InputT *this_input_arr = input_arr + block_start * 
n_values; + OutputT *this_output_arr = output_arr + block_start * n_values; + + const std::size_t block_end = + std::min(block_start + n_batch_size, n_iters); + + sycl::range<1> gRange{(block_end - block_start) * wg_size}; + sycl::nd_range ndRange{gRange, lRange}; + + sort_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(deps); + cgh.use_kernel_bundle(kb); + + // allocation to use for value exchanges + auto exchange_acc = buf_val.get_acc(cgh); + const std::size_t exchange_acc_iter_stride = + buf_val.get_iter_stride(); + + // allocation for counters + auto counter_acc = buf_count.get_acc(cgh); + const std::size_t counter_acc_iter_stride = + buf_count.get_iter_stride(); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> + ndit) { + ValueT values[block_size]; + + const std::size_t iter_id = ndit.get_group(0); + const std::size_t iter_val_offset = + iter_id * static_cast(n); + const std::size_t iter_counter_offset = + iter_id * counter_acc_iter_stride; + const std::size_t iter_exchange_offset = + iter_id * exchange_acc_iter_stride; + + std::uint16_t wi = ndit.get_local_linear_id(); + std::uint16_t begin_bit = 0; + + static constexpr std::uint16_t end_bit = + number_of_bits_in_type(); + + // copy from input array into values +#pragma unroll + for (std::uint16_t i = 0; i < block_size; ++i) { + const std::uint16_t id = wi * block_size + i; + values[i] = + (id < n) ? 
this_input_arr[iter_val_offset + id] + : ValueT{}; + } + + while (true) { + // indices for indirect access in the "re-order" + // phase + std::uint16_t indices[block_size]; + { + // pointers to bucket's counters + std::uint16_t *counters[block_size]; + + // counting phase + auto pcounter = + get_accessor_pointer(counter_acc) + + (wi + iter_counter_offset); + + // initialize counters +#pragma unroll + for (std::uint16_t i = 0; i < bin_count; ++i) + pcounter[i * wg_size] = std::uint16_t{0}; + + sycl::group_barrier(ndit.get_group()); + + if (is_ascending) { +#pragma unroll + for (std::uint16_t i = 0; i < block_size; + ++i) { + const std::uint16_t id = + wi * block_size + i; + static constexpr std::uint16_t + bin_mask = bin_count - 1; + + // points to the padded element, i.e. id + // is in-range + static constexpr std::uint16_t + default_out_of_range_bin_id = + bin_mask; + + const std::uint16_t bin = + (id < n) + ? get_bucket_id( + order_preserving_cast< + /* is_ascending */ + true>( + proj_op(values[i])), + begin_bit) + : default_out_of_range_bin_id; + + // counting and local offset calculation + counters[i] = &pcounter[bin * wg_size]; + indices[i] = *counters[i]; + *counters[i] = indices[i] + 1; + + if (work_around_needed) { + sycl::group_barrier( + ndit.get_group()); + } + } + } + else { +#pragma unroll + for (std::uint16_t i = 0; i < block_size; + ++i) { + const std::uint16_t id = + wi * block_size + i; + static constexpr std::uint16_t + bin_mask = bin_count - 1; + + // points to the padded element, i.e. id + // is in-range + static constexpr std::uint16_t + default_out_of_range_bin_id = + bin_mask; + + const std::uint16_t bin = + (id < n) + ? 
get_bucket_id( + order_preserving_cast< + /* is_ascending */ + false>( + proj_op(values[i])), + begin_bit) + : default_out_of_range_bin_id; + + // counting and local offset calculation + counters[i] = &pcounter[bin * wg_size]; + indices[i] = *counters[i]; + *counters[i] = indices[i] + 1; + + if (work_around_needed) { + sycl::group_barrier( + ndit.get_group()); + } + } + } + + sycl::group_barrier(ndit.get_group()); + + // exclusive scan phase + { + + // scan contiguous numbers + std::uint16_t bin_sum[bin_count]; + const std::size_t counter_offset0 = + iter_counter_offset + wi * bin_count; + bin_sum[0] = counter_acc[counter_offset0]; + +#pragma unroll + for (std::uint16_t i = 1; i < bin_count; + ++i) + bin_sum[i] = + bin_sum[i - 1] + + counter_acc[counter_offset0 + i]; + + sycl::group_barrier(ndit.get_group()); + + // exclusive scan local sum + std::uint16_t sum_scan = + sycl::exclusive_scan_over_group( + ndit.get_group(), + bin_sum[bin_count - 1], + sycl::plus()); + +// add to local sum, generate exclusive scan result +#pragma unroll + for (std::uint16_t i = 0; i < bin_count; + ++i) + counter_acc[counter_offset0 + i + 1] = + sum_scan + bin_sum[i]; + + if (wi == 0) + counter_acc[iter_counter_offset + 0] = + std::uint32_t{0}; + + sycl::group_barrier(ndit.get_group()); + } + +#pragma unroll + for (std::uint16_t i = 0; i < block_size; ++i) { + // a global index is a local offset plus a + // global base index + indices[i] += *counters[i]; + } + + sycl::group_barrier(ndit.get_group()); + } + + begin_bit += radix; + + // "re-order" phase + sycl::group_barrier(ndit.get_group()); + if (begin_bit >= end_bit) { + // the last iteration - writing out the result +#pragma unroll + for (std::uint16_t i = 0; i < block_size; ++i) { + const std::uint16_t r = indices[i]; + if (r < n) { + this_output_arr[iter_val_offset + r] = + values[i]; + } + } + + return; + } + + // data exchange +#pragma unroll + for (std::uint16_t i = 0; i < block_size; ++i) { + const std::uint16_t r = indices[i]; 
+ if (r < n) + exchange_acc[iter_exchange_offset + r] = + values[i]; + } + + sycl::group_barrier(ndit.get_group()); + +#pragma unroll + for (std::uint16_t i = 0; i < block_size; ++i) { + const std::uint16_t id = wi * block_size + i; + if (id < n) + values[i] = + exchange_acc[iter_exchange_offset + id]; + } + + sycl::group_barrier(ndit.get_group()); + } + }); + }); + + deps = {sort_ev}; + } + + return sort_ev; + } + }; +}; + +template +struct OneWorkGroupRadixSortKernel; + +//----------------------------------------------------------------------- +// radix sort: main function +//----------------------------------------------------------------------- +template +sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_to_sort, + const ValueT *input_arr, + ValueT *output_arr, + const ProjT &proj_op, + const bool is_ascending, + const std::vector &depends) +{ + assert(n_to_sort > 1); + + using KeyT = std::remove_cv_t< + std::remove_reference_t>>; + + // radix bits represent number of processed bits in each value during one + // iteration + static constexpr std::uint32_t radix_bits = 4; + + sycl::event sort_ev{}; + + const auto &dev = exec_q.get_device(); + const auto max_wg_size = + dev.template get_info(); + + static constexpr std::uint16_t ref_wg_size = 64; + if (n_to_sort <= 16384 && ref_wg_size * 8 <= max_wg_size) { + using _RadixSortKernel = OneWorkGroupRadixSortKernel; + + if (n_to_sort <= 64 && ref_wg_size <= max_wg_size) { + // wg_size * block_size == 64 * 1 * 1 == 64 + static constexpr std::uint16_t wg_size = ref_wg_size; + static constexpr std::uint16_t block_size = 1; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 128 && ref_wg_size * 2 <= max_wg_size) { + // wg_size * block_size == 64 * 2 * 1 == 128 + static constexpr std::uint16_t wg_size = ref_wg_size * 2; + 
static constexpr std::uint16_t block_size = 1; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 256 && ref_wg_size * 2 <= max_wg_size) { + // wg_size * block_size == 64 * 2 * 2 == 256 + static constexpr std::uint16_t wg_size = ref_wg_size * 2; + static constexpr std::uint16_t block_size = 2; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 512 && ref_wg_size * 2 <= max_wg_size) { + // wg_size * block_size == 64 * 2 * 4 == 512 + static constexpr std::uint16_t wg_size = ref_wg_size * 2; + static constexpr std::uint16_t block_size = 4; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 1024 && ref_wg_size * 2 <= max_wg_size) { + // wg_size * block_size == 64 * 2 * 8 == 1024 + static constexpr std::uint16_t wg_size = ref_wg_size * 2; + static constexpr std::uint16_t block_size = 8; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 2048 && ref_wg_size * 4 <= max_wg_size) { + // wg_size * block_size == 64 * 4 * 8 == 2048 + static constexpr std::uint16_t wg_size = ref_wg_size * 4; + static constexpr std::uint16_t block_size = 8; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 4096 && ref_wg_size * 4 <= max_wg_size) { + // wg_size * block_size == 64 * 4 * 16 == 4096 + static constexpr std::uint16_t 
wg_size = ref_wg_size * 4; + static constexpr std::uint16_t block_size = 16; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 8192 && ref_wg_size * 8 <= max_wg_size) { + // wg_size * block_size == 64 * 8 * 16 == 8192 + static constexpr std::uint16_t wg_size = ref_wg_size * 8; + static constexpr std::uint16_t block_size = 16; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else { + // wg_size * block_size == 64 * 8 * 32 == 16384 + static constexpr std::uint16_t wg_size = ref_wg_size * 8; + static constexpr std::uint16_t block_size = 32; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + } + else { + static constexpr std::uint32_t radix_iters = + number_of_buckets_in_type(radix_bits); + static constexpr std::uint32_t radix_states = std::uint32_t(1) + << radix_bits; + + static constexpr std::size_t bound_512k = (std::size_t(1) << 19); + static constexpr std::size_t bound_2m = (std::size_t(1) << 21); + + const auto wg_sz_k = (n_to_sort < bound_512k) ? 8 + : (n_to_sort <= bound_2m) ? 
4 + : 1; + const std::size_t wg_size = max_wg_size / wg_sz_k; + + const std::size_t n_segments = (n_to_sort + wg_size - 1) / wg_size; + + // Additional radix_states elements are used for getting local offsets + // from count values + no_op flag; 'No operation' flag specifies whether + // to skip re-order phase if the all keys are the same (lie in one bin) + const std::size_t n_counts = + (n_segments + 1) * radix_states + 1 /*no_op flag*/; + + using CountT = std::uint32_t; + + // memory for storing count and offset values + auto count_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + n_iters * n_counts, exec_q); + + CountT *count_ptr = count_owner.get(); + + static constexpr std::uint32_t zero_radix_iter{0}; + + if constexpr (std::is_same_v) { + + sort_ev = parallel_radix_sort_iteration_step< + radix_bits, /*even=*/true>::submit(exec_q, n_iters, n_segments, + zero_radix_iter, n_to_sort, + input_arr, output_arr, + n_counts, count_ptr, proj_op, + is_ascending, depends); + + sort_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {sort_ev}, count_owner); + + return sort_ev; + } + + auto tmp_arr_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + n_iters * n_to_sort, exec_q); + + ValueT *tmp_arr = tmp_arr_owner.get(); + + // iterations per each bucket + assert("Number of iterations must be even" && radix_iters % 2 == 0); + assert(radix_iters > 0); + + sort_ev = parallel_radix_sort_iteration_step< + radix_bits, /*even=*/true>::submit(exec_q, n_iters, n_segments, + zero_radix_iter, n_to_sort, + input_arr, tmp_arr, n_counts, + count_ptr, proj_op, is_ascending, + depends); + + for (std::uint32_t radix_iter = 1; radix_iter < radix_iters; + ++radix_iter) { + if (radix_iter % 2 == 0) { + sort_ev = parallel_radix_sort_iteration_step< + radix_bits, + /*even=*/true>::submit(exec_q, n_iters, n_segments, + radix_iter, n_to_sort, output_arr, + tmp_arr, n_counts, count_ptr, + proj_op, is_ascending, {sort_ev}); + } + else { + sort_ev = 
parallel_radix_sort_iteration_step< + radix_bits, + /*even=*/false>::submit(exec_q, n_iters, n_segments, + radix_iter, n_to_sort, tmp_arr, + output_arr, n_counts, count_ptr, + proj_op, is_ascending, {sort_ev}); + } + } + + sort_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {sort_ev}, tmp_arr_owner, count_owner); + } + + return sort_ev; +} + +struct IdentityProj +{ + constexpr IdentityProj() {} + + template + constexpr T operator()(T val) const + { + return val; + } +}; + +template +struct ValueProj +{ + constexpr ValueProj() {} + + constexpr ValueT operator()(const std::pair &pair) const + { + return pair.first; + } +}; + +template +struct IndexedProj +{ + IndexedProj(const ValueT *arg_ptr) : ptr(arg_ptr), value_projector{} {} + + IndexedProj(const ValueT *arg_ptr, const ProjT &proj_op) + : ptr(arg_ptr), value_projector(proj_op) + { + } + + auto operator()(IndexT i) const + { + return value_projector(ptr[i]); + } + +private: + const ValueT *ptr; + ProjT value_projector; +}; + +} // namespace radix_sort_details + +using dpctl::tensor::ssize_t; + +template +sycl::event + radix_sort_axis1_contig_impl(sycl::queue &exec_q, + const bool sort_ascending, + // number of sub-arrays to sort (num. of rows + // in a matrix when sorting over rows) + std::size_t iter_nelems, + // size of each array to sort (length of rows, + // i.e. 
number of columns) + std::size_t sort_nelems, + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t sort_arg_offset, + ssize_t sort_res_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + sort_arg_offset; + argTy *res_tp = + reinterpret_cast(res_cp) + iter_res_offset + sort_res_offset; + + using Proj = radix_sort_details::IdentityProj; + static constexpr Proj proj_op{}; + + sycl::event radix_sort_ev = + radix_sort_details::parallel_radix_sort_impl( + exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, proj_op, + sort_ascending, depends); + + return radix_sort_ev; +} + +template +class radix_argsort_index_write_out_krn; + +template +class radix_argsort_iota_krn; + +template +sycl::event + radix_argsort_axis1_contig_impl(sycl::queue &exec_q, + const bool sort_ascending, + // number of sub-arrays to sort (num. of + // rows in a matrix when sorting over rows) + std::size_t iter_nelems, + // size of each array to sort (length of + // rows, i.e. 
number of columns) + std::size_t sort_nelems, + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t sort_arg_offset, + ssize_t sort_res_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + sort_arg_offset; + IndexTy *res_tp = + reinterpret_cast(res_cp) + iter_res_offset + sort_res_offset; + + const std::size_t total_nelems = iter_nelems * sort_nelems; + auto workspace_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(total_nelems, + exec_q); + + // get raw USM pointer + IndexTy *workspace = workspace_owner.get(); + + using IdentityProjT = radix_sort_details::IdentityProj; + using IndexedProjT = + radix_sort_details::IndexedProj; + const IndexedProjT proj_op{arg_tp}; + + using IotaKernelName = radix_argsort_iota_krn; + + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + sycl::event iota_ev = iota_impl( + exec_q, workspace, total_nelems, depends); + + sycl::event radix_sort_ev = + radix_sort_details::parallel_radix_sort_impl( + exec_q, iter_nelems, sort_nelems, workspace, res_tp, proj_op, + sort_ascending, {iota_ev}); + + using MapBackKernelName = radix_argsort_index_write_out_krn; + using dpctl::tensor::kernels::sort_utils_detail::map_back_impl; + + sycl::event dep = radix_sort_ev; + + // no need to perform map_back ( id % sort_nelems) + // if total_nelems == sort_nelems + if (iter_nelems > 1u) { + dep = map_back_impl( + exec_q, total_nelems, res_tp, res_tp, sort_nelems, {dep}); + } + + sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {dep}, workspace_owner); + + return cleanup_ev; +} + +} // namespace dpctl::tensor::kernels diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp new file mode 100644 index 000000000000..7b48f310a445 --- /dev/null +++ 
b/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp @@ -0,0 +1,61 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +#include + +#include "kernels/dpctl_tensor_types.hpp" + +namespace dpctl::tensor::kernels +{ + +using dpctl::tensor::ssize_t; + +typedef sycl::event (*sort_contig_fn_ptr_t)(sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +} // namespace dpctl::tensor::kernels diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp new file mode 100644 index 000000000000..8a6b1a8131ca --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp @@ -0,0 +1,148 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace dpctl::tensor::kernels::sort_utils_detail +{ + +namespace syclexp = sycl::ext::oneapi::experimental; + +template +sycl::event iota_impl(sycl::queue &exec_q, + T *data, + std::size_t nelems, + const std::vector &dependent_events) +{ + static constexpr std::uint32_t lws = 256; + static constexpr std::uint32_t n_wi = 4; + const std::size_t n_groups = (nelems + n_wi * lws - 1) / (n_wi * lws); + + sycl::range<1> gRange{n_groups * lws}; + sycl::range<1> lRange{lws}; + sycl::nd_range<1> ndRange{gRange, lRange}; + + sycl::event e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_events); + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { + const std::size_t gid = it.get_global_linear_id(); + const auto &sg = it.get_sub_group(); + const std::uint32_t lane_id = sg.get_local_id()[0]; + + const std::size_t offset = (gid - lane_id) * n_wi; + const std::uint32_t max_sgSize = sg.get_max_local_range()[0]; + + std::array stripe{}; +#pragma unroll + for (std::uint32_t i = 0; i < n_wi; ++i) { + stripe[i] = T(offset + lane_id + i * max_sgSize); + } + + if (offset + n_wi * max_sgSize < nelems) { + static constexpr auto group_ls_props = syclexp::properties{ + syclexp::data_placement_striped + // , syclexp::full_group + }; + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&data[offset]); + + syclexp::group_store(sg, sycl::span{&stripe[0], n_wi}, + out_multi_ptr, group_ls_props); + } + else { + for (std::size_t idx = offset + lane_id; idx < nelems; + idx += max_sgSize) { + data[idx] = T(idx); + } + } + }); + }); + + return e; +} + +template +sycl::event map_back_impl(sycl::queue &exec_q, + std::size_t nelems, + const IndexTy *flat_index_data, + IndexTy *reduced_index_data, + std::size_t row_size, + const std::vector 
&dependent_events) +{ + static constexpr std::uint32_t lws = 64; + static constexpr std::uint32_t n_wi = 4; + const std::size_t n_groups = (nelems + lws * n_wi - 1) / (n_wi * lws); + + sycl::range<1> lRange{lws}; + sycl::range<1> gRange{n_groups * lws}; + sycl::nd_range<1> ndRange{gRange, lRange}; + + sycl::event map_back_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_events); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { + const std::size_t gid = it.get_global_linear_id(); + const auto &sg = it.get_sub_group(); + const std::uint32_t lane_id = sg.get_local_id()[0]; + const std::uint32_t sg_size = sg.get_max_local_range()[0]; + + const std::size_t start_id = (gid - lane_id) * n_wi + lane_id; + +#pragma unroll + for (std::uint32_t i = 0; i < n_wi; ++i) { + const std::size_t data_id = start_id + i * sg_size; + + if (data_id < nelems) { + const IndexTy linear_index = flat_index_data[data_id]; + reduced_index_data[data_id] = (linear_index % row_size); + } + } + }); + }); + + return map_back_ev; +} + +} // namespace dpctl::tensor::kernels::sort_utils_detail diff --git a/dpctl_ext/tensor/libtensor/source/sorting/py_sort_common.hpp b/dpctl_ext/tensor/libtensor/source/sorting/py_sort_common.hpp new file mode 100644 index 000000000000..0fdf7bec80fd --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/py_sort_common.hpp @@ -0,0 +1,179 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "utils/memory_overlap.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl::tensor::py_internal +{ + +template +std::pair + py_sort(const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const sorting_contig_impl_fnT &sort_contig_fns) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } + int iteration_nd = src_nd - trailing_dims_to_sort; + if (trailing_dims_to_sort <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_sort must be positive, but no " + "greater than rank of the array being sorted"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + std::size_t iter_nelems(1); + + for (int i = 0; same_shapes && (i < iteration_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + std::size_t sort_nelems(1); + for (int i = iteration_nd; same_shapes && (i < src_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + sort_nelems *= static_cast(src_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape does not match the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with 
allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if ((iter_nelems == 0) || (sort_nelems == 0)) { + // Nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, sort_nelems * iter_nelems); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error("Both input arrays must have " + "the same value data type"); + } + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + + if (is_src_c_contig && is_dst_c_contig) { + if (sort_nelems > 1) { + static constexpr py::ssize_t zero_offset = py::ssize_t(0); + + auto fn = sort_contig_fns[src_typeid]; + + if (nullptr == fn) { + throw py::value_error( + "Not implemented for the dtype of input arrays"); + } + + sycl::event comp_ev = + fn(exec_q, iter_nelems, sort_nelems, src.get_data(), + dst.get_data(), zero_offset, zero_offset, zero_offset, + zero_offset, depends); + + sycl::event keep_args_alive_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {comp_ev}); + + return std::make_pair(keep_args_alive_ev, comp_ev); + } + else { + assert(dst.get_size() == iter_nelems); + int src_elemsize = src.get_elemsize(); + + sycl::event copy_ev = + exec_q.copy(src.get_data(), dst.get_data(), + src_elemsize * iter_nelems, depends); + + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + } + + throw py::value_error( + "Both source and 
destination arrays must be C-contiguous"); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.cpp b/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.cpp new file mode 100644 index 000000000000..fb3cab487df8 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.cpp @@ -0,0 +1,192 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "utils/type_dispatch.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/sorting/radix_sort.hpp" +#include "kernels/sorting/sort_impl_fn_ptr_t.hpp" + +#include "py_sort_common.hpp" +#include "radix_sort.hpp" +#include "radix_sort_support.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace impl_ns = dpctl::tensor::kernels::radix_sort_details; + +using dpctl::tensor::kernels::sort_contig_fn_ptr_t; +static sort_contig_fn_ptr_t + ascending_radix_sort_contig_dispatch_vector[td_ns::num_types]; +static sort_contig_fn_ptr_t + descending_radix_sort_contig_dispatch_vector[td_ns::num_types]; + +namespace +{ + +template +sycl::event sort_axis1_contig_caller(sycl::queue &q, + std::size_t iter_nelems, + std::size_t sort_nelems, + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t sort_arg_offset, + ssize_t sort_res_offset, + 
const std::vector &depends) +{ + using dpctl::tensor::kernels::radix_sort_axis1_contig_impl; + + return radix_sort_axis1_contig_impl( + q, is_ascending, iter_nelems, sort_nelems, arg_cp, res_cp, + iter_arg_offset, iter_res_offset, sort_arg_offset, sort_res_offset, + depends); +} + +} // end of anonymous namespace + +template +struct AscendingRadixSortContigFactory +{ + fnT get() + { + if constexpr (RadixSortSupportVector::is_defined) { + return sort_axis1_contig_caller; + } + else { + return nullptr; + } + } +}; + +template +struct DescendingRadixSortContigFactory +{ + fnT get() + { + if constexpr (RadixSortSupportVector::is_defined) { + return sort_axis1_contig_caller; + } + else { + return nullptr; + } + } +}; + +void init_radix_sort_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::sort_contig_fn_ptr_t; + + td_ns::DispatchVectorBuilder< + sort_contig_fn_ptr_t, AscendingRadixSortContigFactory, td_ns::num_types> + dtv1; + dtv1.populate_dispatch_vector(ascending_radix_sort_contig_dispatch_vector); + + td_ns::DispatchVectorBuilder + dtv2; + dtv2.populate_dispatch_vector(descending_radix_sort_contig_dispatch_vector); +} + +bool py_radix_sort_defined(int typenum) +{ + const auto &array_types = td_ns::usm_ndarray_types(); + + try { + int type_id = array_types.typenum_to_lookup_id(typenum); + return (nullptr != + ascending_radix_sort_contig_dispatch_vector[type_id]); + } catch (const std::exception &e) { + return false; + } +} + +void init_radix_sort_functions(py::module_ m) +{ + dpctl::tensor::py_internal::init_radix_sort_dispatch_vectors(); + + auto py_radix_sort_ascending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_sort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal:: + ascending_radix_sort_contig_dispatch_vector); + }; + 
m.def("_radix_sort_ascending", py_radix_sort_ascending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto py_radix_sort_descending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_sort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal:: + descending_radix_sort_contig_dispatch_vector); + }; + m.def("_radix_sort_descending", py_radix_sort_descending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + m.def("_radix_sort_dtype_supported", py_radix_sort_defined); + + return; +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.hpp b/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.hpp new file mode 100644 index 000000000000..5f3c771b464b --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.hpp @@ -0,0 +1,47 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_radix_sort_functions(py::module_); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/sorting/radix_sort_support.hpp b/dpctl_ext/tensor/libtensor/source/sorting/radix_sort_support.hpp new file mode 100644 index 000000000000..8d7e55a5cd28 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/radix_sort_support.hpp @@ -0,0 +1,78 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +#include + +namespace dpctl::tensor::py_internal +{ + +template +struct TypeDefinedEntry : std::bool_constant> +{ + static constexpr bool is_defined = true; +}; + +struct NotDefinedEntry : std::true_type +{ + static constexpr bool is_defined = false; +}; + +template +struct RadixSortSupportVector +{ + using resolver_t = + typename std::disjunction, + TypeDefinedEntry, + TypeDefinedEntry, + TypeDefinedEntry, + TypeDefinedEntry, + TypeDefinedEntry, + TypeDefinedEntry, + TypeDefinedEntry, + TypeDefinedEntry, + TypeDefinedEntry, + TypeDefinedEntry, + TypeDefinedEntry, + NotDefinedEntry>; + + static constexpr bool is_defined = resolver_t::is_defined; +}; + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp new file mode 100644 index 000000000000..d9ab1feca46a --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp @@ -0,0 +1,57 @@ +//***************************************************************************** +// Copyright 
(c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===----------------------------------------------------------------------===// + +#include + +// #include "sorting/isin.hpp" +// #include "sorting/merge_argsort.hpp" +#include "sorting/merge_sort.hpp" +// #include "sorting/radix_argsort.hpp" +#include "sorting/radix_sort.hpp" +// #include "sorting/searchsorted.hpp" +// #include "sorting/topk.hpp" + +namespace py = pybind11; + +PYBIND11_MODULE(_tensor_sorting_impl, m) +{ + // dpctl::tensor::py_internal::init_isin_functions(m); + dpctl::tensor::py_internal::init_merge_sort_functions(m); + // dpctl::tensor::py_internal::init_merge_argsort_functions(m); + // dpctl::tensor::py_internal::init_searchsorted_functions(m); + dpctl::tensor::py_internal::init_radix_sort_functions(m); + // dpctl::tensor::py_internal::init_radix_argsort_functions(m); + // dpctl::tensor::py_internal::init_topk_functions(m); +} diff --git a/pyproject.toml b/pyproject.toml index 73844d1b0c17..6f5c83e542ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -108,7 +108,7 @@ target-version = ['py310', 'py311', 'py312', 'py313', 'py314'] [tool.codespell] builtin = "clear,rare,informal,names" check-filenames = true -ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT,AccessorT,IndexT,fpT" +ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT,AccessorT,IndexT,fpT,OffsetT" quiet-level = 3 [tool.coverage.report] From 1f018eca273573d66bf35f64497b815682642b0b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 05:52:48 -0800 Subject: [PATCH 113/145] Move _sort_ascending/descending to dpctl_ext.tensor._tensor_sorting_impl --- .../include/kernels/sorting/merge_sort.hpp | 857 ++++++++++++++++++ .../kernels/sorting/search_sorted_detail.hpp | 125 +++ .../libtensor/source/sorting/merge_sort.cpp | 138 +++ .../libtensor/source/sorting/merge_sort.hpp | 47 + pyproject.toml | 2 +- 5 files changed, 1168 insertions(+), 1 deletion(-) create mode 100644 
dpctl_ext/tensor/libtensor/include/kernels/sorting/merge_sort.hpp create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/merge_sort.hpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/merge_sort.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/merge_sort.hpp new file mode 100644 index 000000000000..e14425605fe2 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/merge_sort.hpp @@ -0,0 +1,857 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/sorting/search_sorted_detail.hpp" +#include "kernels/sorting/sort_utils.hpp" + +namespace dpctl::tensor::kernels +{ + +namespace merge_sort_detail +{ + +using dpctl::tensor::ssize_t; +using namespace dpctl::tensor::kernels::search_sorted_detail; + +/*! 
@brief Merge two contiguous sorted segments */ +template +void merge_impl(const std::size_t offset, + const InAcc in_acc, + OutAcc out_acc, + const std::size_t start_1, + const std::size_t end_1, + const std::size_t end_2, + const std::size_t start_out, + Compare comp, + const std::size_t chunk) +{ + const std::size_t start_2 = end_1; + // Borders of the sequences to merge within this call + const std::size_t local_start_1 = sycl::min(offset + start_1, end_1); + const std::size_t local_end_1 = sycl::min(local_start_1 + chunk, end_1); + const std::size_t local_start_2 = sycl::min(offset + start_2, end_2); + const std::size_t local_end_2 = sycl::min(local_start_2 + chunk, end_2); + + const std::size_t local_size_1 = local_end_1 - local_start_1; + const std::size_t local_size_2 = local_end_2 - local_start_2; + + const auto r_item_1 = in_acc[end_1 - 1]; + const auto l_item_2 = (start_2 < end_2) ? in_acc[start_2] : r_item_1; + + // Copy if the sequences are sorted with respect to each other or merge + // otherwise + if (!comp(l_item_2, r_item_1)) { + const std::size_t out_shift_1 = start_out + local_start_1 - start_1; + const std::size_t out_shift_2 = + start_out + end_1 - start_1 + local_start_2 - start_2; + + for (std::size_t i = 0; i < local_size_1; ++i) { + out_acc[out_shift_1 + i] = in_acc[local_start_1 + i]; + } + for (std::size_t i = 0; i < local_size_2; ++i) { + out_acc[out_shift_2 + i] = in_acc[local_start_2 + i]; + } + } + else if (comp(r_item_1, l_item_2)) { + const std::size_t out_shift_1 = + start_out + end_2 - start_2 + local_start_1 - start_1; + const std::size_t out_shift_2 = start_out + local_start_2 - start_2; + for (std::size_t i = 0; i < local_size_1; ++i) { + out_acc[out_shift_1 + i] = in_acc[local_start_1 + i]; + } + for (std::size_t i = 0; i < local_size_2; ++i) { + out_acc[out_shift_2 + i] = in_acc[local_start_2 + i]; + } + } + // Perform merging + else { + + // Process 1st sequence + if (local_start_1 < local_end_1) { + // Reduce the range for 
searching within the 2nd sequence and handle + // bound items find left border in 2nd sequence + const auto local_l_item_1 = in_acc[local_start_1]; + std::size_t l_search_bound_2 = + lower_bound_impl(in_acc, start_2, end_2, local_l_item_1, comp); + const std::size_t l_shift_1 = local_start_1 - start_1; + const std::size_t l_shift_2 = l_search_bound_2 - start_2; + + out_acc[start_out + l_shift_1 + l_shift_2] = local_l_item_1; + + std::size_t r_search_bound_2{}; + // find right border in 2nd sequence + if (local_size_1 > 1) { + const auto local_r_item_1 = in_acc[local_end_1 - 1]; + r_search_bound_2 = lower_bound_impl( + in_acc, l_search_bound_2, end_2, local_r_item_1, comp); + const auto r_shift_1 = local_end_1 - 1 - start_1; + const auto r_shift_2 = r_search_bound_2 - start_2; + + out_acc[start_out + r_shift_1 + r_shift_2] = local_r_item_1; + } + + // Handle intermediate items + if (r_search_bound_2 == l_search_bound_2) { + const std::size_t shift_2 = l_search_bound_2 - start_2; + for (std::size_t idx = local_start_1 + 1; idx < local_end_1 - 1; + ++idx) { + const auto intermediate_item_1 = in_acc[idx]; + const std::size_t shift_1 = idx - start_1; + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_1; + } + } + else { + for (std::size_t idx = local_start_1 + 1; idx < local_end_1 - 1; + ++idx) { + const auto intermediate_item_1 = in_acc[idx]; + // we shouldn't seek in whole 2nd sequence. 
Just for the + // part where the 1st sequence should be + l_search_bound_2 = lower_bound_impl( + in_acc, l_search_bound_2, r_search_bound_2, + intermediate_item_1, comp); + const std::size_t shift_1 = idx - start_1; + const std::size_t shift_2 = l_search_bound_2 - start_2; + + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_1; + } + } + } + // Process 2nd sequence + if (local_start_2 < local_end_2) { + // Reduce the range for searching within the 1st sequence and handle + // bound items find left border in 1st sequence + const auto local_l_item_2 = in_acc[local_start_2]; + std::size_t l_search_bound_1 = + upper_bound_impl(in_acc, start_1, end_1, local_l_item_2, comp); + const std::size_t l_shift_1 = l_search_bound_1 - start_1; + const std::size_t l_shift_2 = local_start_2 - start_2; + + out_acc[start_out + l_shift_1 + l_shift_2] = local_l_item_2; + + std::size_t r_search_bound_1{}; + // find right border in 1st sequence + if (local_size_2 > 1) { + const auto local_r_item_2 = in_acc[local_end_2 - 1]; + r_search_bound_1 = upper_bound_impl( + in_acc, l_search_bound_1, end_1, local_r_item_2, comp); + const std::size_t r_shift_1 = r_search_bound_1 - start_1; + const std::size_t r_shift_2 = local_end_2 - 1 - start_2; + + out_acc[start_out + r_shift_1 + r_shift_2] = local_r_item_2; + } + + // Handle intermediate items + if (l_search_bound_1 == r_search_bound_1) { + const std::size_t shift_1 = l_search_bound_1 - start_1; + for (auto idx = local_start_2 + 1; idx < local_end_2 - 1; ++idx) + { + const auto intermediate_item_2 = in_acc[idx]; + const std::size_t shift_2 = idx - start_2; + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_2; + } + } + else { + for (auto idx = local_start_2 + 1; idx < local_end_2 - 1; ++idx) + { + const auto intermediate_item_2 = in_acc[idx]; + // we shouldn't seek in whole 1st sequence. 
Just for the + // part where the 2nd sequence should be + l_search_bound_1 = upper_bound_impl( + in_acc, l_search_bound_1, r_search_bound_1, + intermediate_item_2, comp); + const std::size_t shift_1 = l_search_bound_1 - start_1; + const std::size_t shift_2 = idx - start_2; + + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_2; + } + } + } + } +} + +template +void insertion_sort_impl(Iter &&first, + std::size_t begin, + std::size_t end, + Compare &&comp) +{ + for (std::size_t i = begin + 1; i < end; ++i) { + const auto val_i = first[i]; + std::size_t j = i - 1; + while ((j + 1 > begin) && (comp(val_i, first[j]))) { + first[j + 1] = first[j]; + --j; + } + if (j + 1 < i) { + first[j + 1] = val_i; + } + } +} + +template +void leaf_sort_impl(Iter &&first, + std::size_t begin, + std::size_t end, + Compare &&comp) +{ + return insertion_sort_impl(std::forward(first), + std::move(begin), std::move(end), + std::forward(comp)); +} + +template +struct GetValueType +{ + using value_type = typename std::iterator_traits::value_type; +}; + +template +struct GetValueType> +{ + using value_type = ElementType; +}; + +template +struct GetValueType< + sycl::accessor> +{ + using value_type = ElementType; +}; + +template +struct GetValueType> +{ + using value_type = ElementType; +}; + +template +struct GetReadOnlyAccess +{ + Iter operator()(const Iter &it, sycl::handler &) + { + return it; + } +}; + +template +struct GetReadOnlyAccess> +{ + auto operator()(const sycl::buffer &buf, + sycl::handler &cgh) + { + sycl::accessor acc(buf, cgh, sycl::read_only); + return acc; + } +}; + +template +struct GetWriteDiscardAccess +{ + Iter operator()(Iter it, sycl::handler &) + { + return it; + } +}; + +template +struct GetWriteDiscardAccess> +{ + auto operator()(sycl::buffer &buf, + sycl::handler &cgh) + { + sycl::accessor acc(buf, cgh, sycl::write_only, sycl::no_init); + return acc; + } +}; + +template +struct GetReadWriteAccess +{ + Iter operator()(Iter &it, sycl::handler &) + { + 
return it; + } +}; + +template +struct GetReadWriteAccess> +{ + auto operator()(sycl::buffer &buf, + sycl::handler &cgh) + { + sycl::accessor acc(buf, cgh, sycl::read_write); + return acc; + } +}; + +template +class sort_base_step_contig_krn; + +template +sycl::event + sort_base_step_contig_impl(sycl::queue &q, + const std::size_t iter_nelems, + const std::size_t sort_nelems, + const InpAcc input, + OutAcc output, + const Comp &comp, + const std::size_t conseq_nelems_sorted, + const std::vector &depends = {}) +{ + + using inpT = typename GetValueType::value_type; + using outT = typename GetValueType::value_type; + using KernelName = sort_base_step_contig_krn; + + const std::size_t n_segments = + quotient_ceil(sort_nelems, conseq_nelems_sorted); + + sycl::event base_sort = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const sycl::range<1> gRange{iter_nelems * n_segments}; + + auto input_acc = GetReadOnlyAccess{}(input, cgh); + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + + cgh.parallel_for(gRange, [=](sycl::id<1> id) { + const std::size_t iter_id = id[0] / n_segments; + const std::size_t segment_id = id[0] - iter_id * n_segments; + + const std::size_t iter_offset = iter_id * sort_nelems; + const std::size_t beg_id = + iter_offset + segment_id * conseq_nelems_sorted; + const std::size_t end_id = + iter_offset + + std::min((segment_id + 1) * conseq_nelems_sorted, sort_nelems); + for (std::size_t i = beg_id; i < end_id; ++i) { + output_acc[i] = input_acc[i]; + } + + leaf_sort_impl(output_acc, beg_id, end_id, comp); + }); + }); + + return base_sort; +} + +template +class sort_over_work_group_contig_krn; + +template +sycl::event sort_over_work_group_contig_impl( + sycl::queue &q, + std::size_t iter_nelems, + std::size_t sort_nelems, + const InpAcc input, + OutAcc output, + const Comp &comp, + std::size_t &nelems_wg_sorts, + const std::vector &depends = {}) +{ + using inpT = typename GetValueType::value_type; + using T = typename 
GetValueType::value_type; + using KernelName = sort_over_work_group_contig_krn; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = q.get_context(); + auto const &dev = q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t max_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + const std::uint64_t device_local_memory_size = + dev.get_info(); + + // leave 512 bytes of local memory for RT + const std::uint64_t safety_margin = 512; + + const std::uint64_t nelems_per_slm = + (device_local_memory_size - safety_margin) / (2 * sizeof(T)); + + static constexpr std::uint32_t sub_groups_per_work_group = 4; + const std::uint32_t elems_per_wi = dev.has(sycl::aspect::cpu) ? 8 : 2; + + const std::size_t lws = sub_groups_per_work_group * max_sg_size; + + nelems_wg_sorts = elems_per_wi * lws; + + if (nelems_wg_sorts > nelems_per_slm) { + nelems_wg_sorts = (q.get_device().has(sycl::aspect::cpu) ? 
16 : 4); + + return sort_base_step_contig_impl( + q, iter_nelems, sort_nelems, input, output, comp, nelems_wg_sorts, + depends); + } + + // This assumption permits doing away with using a loop + assert(nelems_wg_sorts % lws == 0); + + const std::size_t n_segments = quotient_ceil(sort_nelems, nelems_wg_sorts); + + sycl::event base_sort_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.use_kernel_bundle(kb); + + sycl::range<1> global_range{iter_nelems * n_segments * lws}; + sycl::range<1> local_range{lws}; + + sycl::range<1> slm_range{nelems_wg_sorts}; + sycl::local_accessor work_space(slm_range, cgh); + sycl::local_accessor scratch_space(slm_range, cgh); + + auto input_acc = GetReadOnlyAccess{}(input, cgh); + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + + sycl::nd_range<1> ndRange(global_range, local_range); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { + const std::size_t group_id = it.get_group_linear_id(); + const std::size_t iter_id = group_id / n_segments; + const std::size_t segment_id = group_id - iter_id * n_segments; + const std::size_t lid = it.get_local_linear_id(); + + const std::size_t segment_start_idx = segment_id * nelems_wg_sorts; + const std::size_t segment_end_idx = + std::min(segment_start_idx + nelems_wg_sorts, sort_nelems); + const std::size_t wg_chunk_size = + segment_end_idx - segment_start_idx; + + // load input into SLM + for (std::size_t array_id = segment_start_idx + lid; + array_id < segment_end_idx; array_id += lws) + { + T v = (array_id < sort_nelems) + ? 
input_acc[iter_id * sort_nelems + array_id] + : T{}; + work_space[array_id - segment_start_idx] = v; + } + sycl::group_barrier(it.get_group()); + + const std::size_t chunk = quotient_ceil(nelems_wg_sorts, lws); + + const std::size_t chunk_start_idx = lid * chunk; + const std::size_t chunk_end_idx = + sycl::min(chunk_start_idx + chunk, wg_chunk_size); + + leaf_sort_impl(work_space, chunk_start_idx, chunk_end_idx, comp); + + sycl::group_barrier(it.get_group()); + + bool data_in_temp = false; + std::size_t n_chunks_merged = 1; + + // merge chunk while n_chunks_merged * chunk < wg_chunk_size + const std::size_t max_chunks_merged = + 1 + ((wg_chunk_size - 1) / chunk); + for (; n_chunks_merged < max_chunks_merged; + data_in_temp = !data_in_temp, n_chunks_merged *= 2) + { + const std::size_t nelems_sorted_so_far = + n_chunks_merged * chunk; + const std::size_t q = (lid / n_chunks_merged); + const std::size_t start_1 = + sycl::min(2 * nelems_sorted_so_far * q, wg_chunk_size); + const std::size_t end_1 = + sycl::min(start_1 + nelems_sorted_so_far, wg_chunk_size); + const std::size_t end_2 = + sycl::min(end_1 + nelems_sorted_so_far, wg_chunk_size); + const std::size_t offset = chunk * (lid - q * n_chunks_merged); + + if (data_in_temp) { + merge_impl(offset, scratch_space, work_space, start_1, + end_1, end_2, start_1, comp, chunk); + } + else { + merge_impl(offset, work_space, scratch_space, start_1, + end_1, end_2, start_1, comp, chunk); + } + sycl::group_barrier(it.get_group()); + } + + const auto &out_src = (data_in_temp) ? 
scratch_space : work_space; + for (std::size_t array_id = segment_start_idx + lid; + array_id < segment_end_idx; array_id += lws) + { + if (array_id < sort_nelems) { + output_acc[iter_id * sort_nelems + array_id] = + out_src[array_id - segment_start_idx]; + } + } + }); + }); + + return base_sort_ev; +} + +class vacuous_krn; + +inline sycl::event tie_events(sycl::queue &q, + const std::vector depends) +{ + if (depends.empty()) + return sycl::event(); + if (depends.size() == 1) + return depends[0]; + + sycl::event e = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + using KernelName = vacuous_krn; + cgh.single_task([]() {}); + }); + + return e; +} + +template +class merge_adjacent_blocks_to_temp_krn; + +template +class merge_adjacent_blocks_from_temp_krn; + +template +sycl::event + merge_sorted_block_contig_impl(sycl::queue &q, + std::size_t iter_nelems, + std::size_t sort_nelems, + Acc output, + const Comp comp, + std::size_t sorted_block_size, + const std::vector &depends = {}) +{ + + if (sorted_block_size >= sort_nelems) + return tie_events(q, depends); + + // experimentally determined value + // size of segments worked upon by each work-item during merging + const sycl::device &dev = q.get_device(); + const std::size_t segment_size = (dev.has(sycl::aspect::cpu)) ? 32 : 4; + + const std::size_t chunk_size = + (sorted_block_size < segment_size) ? 
sorted_block_size : segment_size; + + assert(sorted_block_size % chunk_size == 0); + + using T = typename GetValueType::value_type; + + sycl::buffer temp_buf(sycl::range<1>{iter_nelems * sort_nelems}); + // T *allocated_mem = sycl::malloc_device(iter_nelems * sort_nelems, q); + + bool needs_copy = true; + bool used_depends = false; + + sycl::event dep_ev; + std::size_t chunks_merged = sorted_block_size / chunk_size; + + assert(!(chunks_merged & (chunks_merged - 1))); + + using ToTempKernelName = class merge_adjacent_blocks_to_temp_krn; + using FromTempKernelName = + class merge_adjacent_blocks_from_temp_krn; + + while (chunks_merged * chunk_size < sort_nelems) { + sycl::event local_dep = dep_ev; + + sycl::event merge_ev = q.submit([&](sycl::handler &cgh) { + if (used_depends) { + cgh.depends_on(local_dep); + } + else { + cgh.depends_on(depends); + used_depends = true; + } + + const std::size_t n_chunks = quotient_ceil(sort_nelems, chunk_size); + + if (needs_copy) { + sycl::accessor temp_acc{temp_buf, cgh, sycl::write_only, + sycl::no_init}; + auto output_acc = GetReadOnlyAccess{}(output, cgh); + cgh.parallel_for( + {iter_nelems * n_chunks}, [=](sycl::id<1> wid) { + auto flat_idx = wid[0]; + auto iter_idx = flat_idx / n_chunks; + auto idx = flat_idx - n_chunks * iter_idx; + + const std::size_t idx_mult = + (idx / chunks_merged) * chunks_merged; + const std::size_t idx_rem = (idx - idx_mult); + const std::size_t start_1 = + sycl::min(2 * idx_mult * chunk_size, sort_nelems); + const std::size_t end_1 = sycl::min( + start_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t end_2 = sycl::min( + end_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t offset = chunk_size * idx_rem; + + const std::size_t iter_offset = iter_idx * sort_nelems; + + merge_impl(offset, output_acc, temp_acc, + iter_offset + start_1, iter_offset + end_1, + iter_offset + end_2, iter_offset + start_1, + comp, chunk_size); + }); + } + else { + sycl::accessor 
temp_acc{temp_buf, cgh, sycl::read_only}; + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + cgh.parallel_for( + {iter_nelems * n_chunks}, [=](sycl::id<1> wid) { + auto flat_idx = wid[0]; + auto iter_idx = flat_idx / n_chunks; + auto idx = flat_idx - n_chunks * iter_idx; + + const std::size_t idx_mult = + (idx / chunks_merged) * chunks_merged; + const std::size_t idx_rem = (idx - idx_mult); + const std::size_t start_1 = + sycl::min(2 * idx_mult * chunk_size, sort_nelems); + const std::size_t end_1 = sycl::min( + start_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t end_2 = sycl::min( + end_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t offset = chunk_size * idx_rem; + + const std::size_t iter_offset = iter_idx * sort_nelems; + + merge_impl(offset, temp_acc, output_acc, + iter_offset + start_1, iter_offset + end_1, + iter_offset + end_2, iter_offset + start_1, + comp, chunk_size); + }); + } + }); + + chunks_merged *= 2; + dep_ev = merge_ev; + + if (chunks_merged * chunk_size < sort_nelems) { + needs_copy = !needs_copy; + } + } + + if (needs_copy) { + sycl::event copy_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dep_ev); + + sycl::accessor temp_acc{temp_buf, cgh, sycl::read_only}; + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + + cgh.copy(temp_acc, output_acc); + }); + dep_ev = copy_ev; + } + + return dep_ev; +} + +} // namespace merge_sort_detail + +template > +sycl::event stable_sort_axis1_contig_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows in a + // matrix when sorting over rows) + std::size_t sort_nelems, // size of each array to sort (length of rows, + // i.e. 
number of columns) + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t sort_arg_offset, + ssize_t sort_res_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + sort_arg_offset; + argTy *res_tp = + reinterpret_cast(res_cp) + iter_res_offset + sort_res_offset; + + auto comp = Comp{}; + + // constant chosen experimentally to ensure monotonicity of + // sorting performance, as measured on GPU Max, and Iris Xe + constexpr std::size_t sequential_sorting_threshold = 16; + + if (sort_nelems < sequential_sorting_threshold) { + // equal work-item sorts entire row + sycl::event sequential_sorting_ev = + merge_sort_detail::sort_base_step_contig_impl( + exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp, + sort_nelems, depends); + + return sequential_sorting_ev; + } + else { + std::size_t sorted_block_size{}; + + // Sort segments of the array + sycl::event base_sort_ev = + merge_sort_detail::sort_over_work_group_contig_impl( + exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp, + sorted_block_size, // modified in place with size of sorted + // block size + depends); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = + merge_sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, sort_nelems, res_tp, comp, + sorted_block_size, {base_sort_ev}); + + return merges_ev; + } +} + +template +class populate_index_data_krn; + +template +class index_map_to_rows_krn; + +template +struct IndexComp +{ + IndexComp(const ValueT *data, const ValueComp &comp_op) + : ptr(data), value_comp(comp_op) + { + } + + bool operator()(const IndexT &i1, const IndexT &i2) const + { + return value_comp(ptr[i1], ptr[i2]); + } + +private: + const ValueT *ptr; + ValueComp value_comp; +}; + +template > +sycl::event stable_argsort_axis1_contig_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays to sort (num. 
of rows in a + // matrix when sorting over rows) + std::size_t sort_nelems, // size of each array to sort (length of rows, + // i.e. number of columns) + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t sort_arg_offset, + ssize_t sort_res_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + sort_arg_offset; + IndexTy *res_tp = + reinterpret_cast(res_cp) + iter_res_offset + sort_res_offset; + + const IndexComp index_comp{arg_tp, ValueComp{}}; + + static constexpr std::size_t determine_automatically = 0; + std::size_t sorted_block_size = determine_automatically; + + const std::size_t total_nelems = iter_nelems * sort_nelems; + + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + using IotaKernelName = populate_index_data_krn; + + sycl::event populate_indexed_data_ev = iota_impl( + exec_q, res_tp, total_nelems, depends); + + // Sort segments of the array + sycl::event base_sort_ev = + merge_sort_detail::sort_over_work_group_contig_impl( + exec_q, iter_nelems, sort_nelems, res_tp, res_tp, index_comp, + sorted_block_size, // modified in place with size of sorted block + // size + {populate_indexed_data_ev}); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = merge_sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, sort_nelems, res_tp, index_comp, sorted_block_size, + {base_sort_ev}); + + // no need to map back if iter_nelems == 1 + if (iter_nelems == 1u) { + return merges_ev; + } + + using MapBackKernelName = index_map_to_rows_krn; + using dpctl::tensor::kernels::sort_utils_detail::map_back_impl; + + sycl::event write_out_ev = map_back_impl( + exec_q, total_nelems, res_tp, res_tp, sort_nelems, {merges_ev}); + + return write_out_ev; +} + +} // namespace dpctl::tensor::kernels diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp 
b/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp new file mode 100644 index 000000000000..c5b2e28411df --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace dpctl::tensor::kernels +{ + +namespace search_sorted_detail +{ + +template +T quotient_ceil(T n, T m) +{ + return (n + m - 1) / m; +} + +template +std::size_t lower_bound_impl(const Acc acc, + const std::size_t first, + const std::size_t last, + const Value &value, + const Compare &comp) +{ + std::size_t n = last - first; + std::size_t cur = n, start = first; + std::size_t it; + while (n > 0) { + it = start; + cur = n / 2; + it += cur; + if (comp(acc[it], value)) { + n -= cur + 1, start = ++it; + } + else + n = cur; + } + return start; +} + +template +std::size_t upper_bound_impl(const Acc acc, + const std::size_t first, + const std::size_t last, + const Value &value, + const Compare &comp) +{ + const auto &op_comp = [comp](auto x, auto y) { return !comp(y, x); }; + return lower_bound_impl(acc, first, last, value, op_comp); +} + +template +std::size_t lower_bound_indexed_impl(const Acc acc, + std::size_t first, + std::size_t last, + const Value &value, + const Compare &comp, + const IndexerT &acc_indexer) +{ + std::size_t n = last - first; + std::size_t cur = n, start = first; + std::size_t it; + while (n > 0) { + it = start; + cur = n / 2; + it += cur; + if (comp(acc[acc_indexer(it)], value)) { + n -= cur + 1, start = ++it; + } + else + n = cur; + } + return start; +} + +template +std::size_t upper_bound_indexed_impl(const Acc acc, + const std::size_t first, + const std::size_t last, + const Value &value, + const Compare &comp, + const IndexerT &acc_indexer) +{ + const auto &op_comp = [comp](auto x, auto y) { return !comp(y, x); }; + return lower_bound_indexed_impl(acc, first, 
last, value, op_comp, + acc_indexer); +} + +} // namespace search_sorted_detail + +} // namespace dpctl::tensor::kernels diff --git a/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp b/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp new file mode 100644 index 000000000000..7bb0e37c6aed --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp @@ -0,0 +1,138 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "utils/rich_comparisons.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/sorting/merge_sort.hpp" +#include "kernels/sorting/sort_impl_fn_ptr_t.hpp" + +#include "merge_sort.hpp" +#include "py_sort_common.hpp" + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl::tensor::py_internal +{ + +using dpctl::tensor::kernels::sort_contig_fn_ptr_t; +static sort_contig_fn_ptr_t + ascending_sort_contig_dispatch_vector[td_ns::num_types]; +static sort_contig_fn_ptr_t + descending_sort_contig_dispatch_vector[td_ns::num_types]; + +template +struct AscendingSortContigFactory +{ + fnT get() + { + using dpctl::tensor::rich_comparisons::AscendingSorter; + using Comp = typename AscendingSorter::type; + + using dpctl::tensor::kernels::stable_sort_axis1_contig_impl; + return stable_sort_axis1_contig_impl; + } +}; + +template +struct DescendingSortContigFactory +{ + fnT get() + { + using dpctl::tensor::rich_comparisons::DescendingSorter; + using Comp = typename DescendingSorter::type; + + using 
dpctl::tensor::kernels::stable_sort_axis1_contig_impl; + return stable_sort_axis1_contig_impl; + } +}; + +void init_merge_sort_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::sort_contig_fn_ptr_t; + + td_ns::DispatchVectorBuilder + dtv1; + dtv1.populate_dispatch_vector(ascending_sort_contig_dispatch_vector); + + td_ns::DispatchVectorBuilder + dtv2; + dtv2.populate_dispatch_vector(descending_sort_contig_dispatch_vector); +} + +void init_merge_sort_functions(py::module_ m) +{ + dpctl::tensor::py_internal::init_merge_sort_dispatch_vectors(); + + auto py_sort_ascending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_sort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal::ascending_sort_contig_dispatch_vector); + }; + m.def("_sort_ascending", py_sort_ascending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto py_sort_descending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_sort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal::descending_sort_contig_dispatch_vector); + }; + m.def("_sort_descending", py_sort_descending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + return; +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.hpp b/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.hpp new file mode 100644 index 000000000000..a6bdd0a4efe9 --- /dev/null +++ 
b/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.hpp @@ -0,0 +1,47 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_merge_sort_functions(py::module_); + +} // namespace dpctl::tensor::py_internal diff --git a/pyproject.toml b/pyproject.toml index 6f5c83e542ba..09253467b8dc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -108,7 +108,7 @@ target-version = ['py310', 'py311', 'py312', 'py313', 'py314'] [tool.codespell] builtin = "clear,rare,informal,names" check-filenames = true -ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT,AccessorT,IndexT,fpT,OffsetT" +ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT,AccessorT,IndexT,fpT,OffsetT,inpT" quiet-level = 3 [tool.coverage.report] From b62d836496299b18391bd336522c5c77b5f20327 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 05:55:25 -0800 Subject: [PATCH 114/145] Move ti.sort() and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_sorting.py | 163 +++++++++++++++++++++++++++++++++++ dpnp/dpnp_iface_sorting.py | 2 +- 3 files changed, 166 insertions(+), 1 deletion(-) create mode 100644 dpctl_ext/tensor/_sorting.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 95c5e64c9322..8fa57c6d5028 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -82,6 +82,7 @@ from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum from ._clip import clip +from ._sorting import sort from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type __all__ = [ @@ -124,6 +125,7 @@ "reshape", 
"result_type", "roll", + "sort", "squeeze", "stack", "swapaxes", diff --git a/dpctl_ext/tensor/_sorting.py b/dpctl_ext/tensor/_sorting.py new file mode 100644 index 000000000000..5c16cbf9f17a --- /dev/null +++ b/dpctl_ext/tensor/_sorting.py @@ -0,0 +1,163 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +# import operator +# from typing import NamedTuple + +import dpctl.tensor as dpt +import dpctl.utils as du + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor._tensor_impl as ti + +from ._numpy_helper import normalize_axis_index +from ._tensor_sorting_impl import ( # _argsort_ascending,; _argsort_descending,; _radix_argsort_ascending,; _radix_argsort_descending,; _topk, + _radix_sort_ascending, + _radix_sort_descending, + _radix_sort_dtype_supported, + _sort_ascending, + _sort_descending, +) + +__all__ = ["sort"] + + +def _get_mergesort_impl_fn(descending): + return _sort_descending if descending else _sort_ascending + + +def _get_radixsort_impl_fn(descending): + return _radix_sort_descending if descending else _radix_sort_ascending + + +def sort(x, /, *, axis=-1, descending=False, stable=True, kind=None): + """sort(x, axis=-1, descending=False, stable=True) + + Returns a sorted copy of an input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to sort. If set to `-1`, the function + must sort along the last axis. Default: `-1`. + descending (Optional[bool]): + sort order. If `True`, the array must be sorted in descending + order (by value). If `False`, the array must be sorted in + ascending order (by value). Default: `False`. + stable (Optional[bool]): + sort stability. If `True`, the returned array must maintain the + relative order of `x` values which compare as equal. If `False`, + the returned array may or may not maintain the relative order of + `x` values which compare as equal. Default: `True`. + kind (Optional[Literal["stable", "mergesort", "radixsort"]]): + Sorting algorithm. The default is `"stable"`, which uses parallel + merge-sort or parallel radix-sort algorithms depending on the + array data type. 
+ Returns: + usm_ndarray: + a sorted array. The returned array has the same data type and + the same shape as the input array `x`. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" + ) + nd = x.ndim + if nd == 0: + axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis") + return dpt_ext.copy(x, order="C") + else: + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt_ext.permute_dims(x, perm) + if kind is None: + kind = "stable" + if not isinstance(kind, str) or kind not in [ + "stable", + "radixsort", + "mergesort", + ]: + raise ValueError( + "Unsupported kind value. Expected 'stable', 'mergesort', " + f"or 'radixsort', but got '{kind}'" + ) + if kind == "mergesort": + impl_fn = _get_mergesort_impl_fn(descending) + elif kind == "radixsort": + if _radix_sort_dtype_supported(x.dtype.num): + impl_fn = _get_radixsort_impl_fn(descending) + else: + raise ValueError(f"Radix sort is not supported for {x.dtype}") + else: + dt = x.dtype + if dt in [dpt.bool, dpt.uint8, dpt.int8, dpt.int16, dpt.uint16]: + impl_fn = _get_radixsort_impl_fn(descending) + else: + impl_fn = _get_mergesort_impl_fn(descending) + exec_q = x.sycl_queue + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if arr.flags.c_contiguous: + res = dpt_ext.empty_like(arr, order="C") + ht_ev, impl_ev = impl_fn( + src=arr, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, impl_ev) + else: + tmp = dpt_ext.empty_like(arr, order="C") + ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + res = dpt_ext.empty_like(arr, order="C") + ht_ev, impl_ev = impl_fn( + src=tmp, + 
trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, impl_ev) + if a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt_ext.permute_dims(res, inv_perm) + return res diff --git a/dpnp/dpnp_iface_sorting.py b/dpnp/dpnp_iface_sorting.py index be6c52ae9d80..31a3e9cf0017 100644 --- a/dpnp/dpnp_iface_sorting.py +++ b/dpnp/dpnp_iface_sorting.py @@ -404,7 +404,7 @@ def sort(a, axis=-1, kind=None, order=None, *, descending=False, stable=None): return _wrap_sort_argsort( a, - dpt.sort, + dpt_ext.sort, axis=axis, kind=kind, order=order, From 82d202cceebcc6a8a01411c976886c1cf7792160 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 06:09:57 -0800 Subject: [PATCH 115/145] Move ti.unique_counts() and ti.unique_values() to dpctl_ext.tensor --- dpctl_ext/tensor/__init__.py | 6 + dpctl_ext/tensor/_set_functions.py | 274 +++++++++++++++++++++++++++++ dpnp/dpnp_iface_manipulation.py | 4 +- 3 files changed, 282 insertions(+), 2 deletions(-) create mode 100644 dpctl_ext/tensor/_set_functions.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 8fa57c6d5028..aa373a9dd5c0 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -82,6 +82,10 @@ from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum from ._clip import clip +from ._set_functions import ( # isin,; unique_all,; unique_inverse, + unique_counts, + unique_values, +) from ._sorting import sort from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type @@ -135,6 +139,8 @@ "to_numpy", "tril", "triu", + "unique_counts", + "unique_values", "unstack", "where", "zeros", diff --git a/dpctl_ext/tensor/_set_functions.py b/dpctl_ext/tensor/_set_functions.py new file mode 100644 index 000000000000..d4608a5a2a0a --- /dev/null +++ b/dpctl_ext/tensor/_set_functions.py @@ -0,0 +1,274 @@ +# 
***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from typing import NamedTuple + +import dpctl.tensor as dpt +import dpctl.utils as du +from dpctl.tensor._tensor_elementwise_impl import _not_equal, _subtract + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext + +from ._tensor_impl import ( + _copy_usm_ndarray_into_usm_ndarray, + _extract, + _full_usm_ndarray, + _linspace_step, + default_device_index_type, + mask_positions, +) +from ._tensor_sorting_impl import ( + _sort_ascending, +) + + +class UniqueCountsResult(NamedTuple): + values: dpt.usm_ndarray + counts: dpt.usm_ndarray + + +def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: + """unique_values(x) + + Returns the unique elements of an input array `x`. + + Args: + x (usm_ndarray): + input array. Inputs with more than one dimension are flattened. + Returns: + usm_ndarray + an array containing the set of unique elements in `x`. The + returned array has the same data type as `x`. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + if x.ndim == 1: + fx = x + else: + fx = dpt_ext.reshape(x, (x.size,), order="C") + if fx.size == 0: + return fx + s = dpt_ext.empty_like(fx, order="C") + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if fx.flags.c_contiguous: + ht_ev, sort_ev = _sort_ascending( + src=fx, + trailing_dims_to_sort=1, + dst=s, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, sort_ev) + else: + tmp = dpt_ext.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + ht_ev, sort_ev = _sort_ascending( + src=tmp, + trailing_dims_to_sort=1, + dst=s, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, sort_ev) + unique_mask = dpt_ext.empty(fx.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[sort_ev], + ) + _manager.add_event_pair(ht_ev, uneq_ev) + # writing into new allocation, no dependencies + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, one_ev) + cumsum = dpt_ext.empty(s.shape, dtype=dpt.int64, sycl_queue=exec_q) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev] + ) + if n_uniques == fx.size: + return s + unique_vals = dpt_ext.empty( + n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + ht_ev, ex_e = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + _manager.add_event_pair(ht_ev, ex_e) + return unique_vals + + +def unique_counts(x: dpt.usm_ndarray) -> 
UniqueCountsResult: + """unique_counts(x) + + Returns the unique elements of an input array `x` and the corresponding + counts for each unique element in `x`. + + Args: + x (usm_ndarray): + input array. Inputs with more than one dimension are flattened. + Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, counts)` whose + + * first element is the field name `values` and is an array + containing the unique elements of `x`. This array has the + same data type as `x`. + * second element has the field name `counts` and is an array + containing the number of times each unique element occurs in `x`. + This array has the same shape as `values` and has the default + array index data type. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + x_usm_type = x.usm_type + if x.ndim == 1: + fx = x + else: + fx = dpt_ext.reshape(x, (x.size,), order="C") + ind_dt = default_device_index_type(exec_q) + if fx.size == 0: + return UniqueCountsResult(fx, dpt_ext.empty_like(fx, dtype=ind_dt)) + s = dpt_ext.empty_like(fx, order="C") + + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if fx.flags.c_contiguous: + ht_ev, sort_ev = _sort_ascending( + src=fx, + trailing_dims_to_sort=1, + dst=s, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, sort_ev) + else: + tmp = dpt_ext.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + ht_ev, sort_ev = _sort_ascending( + src=tmp, + dst=s, + trailing_dims_to_sort=1, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, sort_ev) + unique_mask = dpt_ext.empty(s.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + 
sycl_queue=exec_q, + depends=[sort_ev], + ) + _manager.add_event_pair(ht_ev, uneq_ev) + # no dependency, since we write into new allocation + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, one_ev) + cumsum = dpt_ext.empty( + unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q + ) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev] + ) + if n_uniques == fx.size: + return UniqueCountsResult( + s, + dpt_ext.ones( + n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ), + ) + unique_vals = dpt_ext.empty( + n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q + ) + # populate unique values + ht_ev, ex_e = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + _manager.add_event_pair(ht_ev, ex_e) + unique_counts = dpt_ext.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + idx = dpt_ext.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + # writing into new allocation, no dependency + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + _manager.add_event_pair(ht_ev, id_ev) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_counts[:-1], + sycl_queue=exec_q, + depends=[id_ev], + ) + _manager.add_event_pair(ht_ev, extr_ev) + # no dependency, writing into disjoint segmenent of new allocation + ht_ev, set_ev = _full_usm_ndarray( + x.size, dst=unique_counts[-1], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, set_ev) + _counts = dpt_ext.empty_like(unique_counts[1:]) + ht_ev, sub_ev = _subtract( + src1=unique_counts[1:], + src2=unique_counts[:-1], + dst=_counts, + sycl_queue=exec_q, + depends=[set_ev, extr_ev], + ) + _manager.add_event_pair(ht_ev, sub_ev) + return UniqueCountsResult(unique_vals, _counts) diff --git 
a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index c034d1c43793..c11538321e51 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -386,12 +386,12 @@ def _get_first_nan_index(usm_a): num_of_flags = (return_index, return_inverse, return_counts).count(True) if num_of_flags == 0: - usm_res = dpt.unique_values(usm_ar) + usm_res = dpt_ext.unique_values(usm_ar) usm_res = (usm_res,) # cast to a tuple to align with other cases elif num_of_flags == 1 and return_inverse: usm_res = dpt.unique_inverse(usm_ar) elif num_of_flags == 1 and return_counts: - usm_res = dpt.unique_counts(usm_ar) + usm_res = dpt_ext.unique_counts(usm_ar) else: usm_res = dpt.unique_all(usm_ar) From 893cdc3658f4220db7df940138047fb3bb4d45e2 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 10:35:26 -0800 Subject: [PATCH 116/145] Move _radix_argsort_ascending/descending to _tensor_sorting_impl --- dpctl_ext/tensor/CMakeLists.txt | 4 +- .../source/sorting/py_argsort_common.hpp | 185 +++++++++++++++++ .../source/sorting/radix_argsort.cpp | 190 ++++++++++++++++++ .../source/sorting/radix_argsort.hpp | 47 +++++ .../libtensor/source/tensor_sorting.cpp | 8 +- 5 files changed, 428 insertions(+), 6 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/py_argsort_common.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index cf0a78efdd59..ed2a9014338c 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -72,9 +72,9 @@ set(_accumulator_sources set(_sorting_sources #{CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/isin.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_sort.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_argsort.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_argsort.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_sort.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_argsort.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_argsort.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/searchsorted.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/topk.cpp ) diff --git a/dpctl_ext/tensor/libtensor/source/sorting/py_argsort_common.hpp b/dpctl_ext/tensor/libtensor/source/sorting/py_argsort_common.hpp new file mode 100644 index 000000000000..d204b492b2ff --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/py_argsort_common.hpp @@ -0,0 +1,185 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "utils/memory_overlap.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl::tensor::py_internal +{ + +template +std::pair + py_argsort(const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const sorting_contig_impl_fnT &sort_contig_fns) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } + int iteration_nd = src_nd - trailing_dims_to_sort; + if (trailing_dims_to_sort <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_sort must be positive, but no " + "greater than rank of the array being sorted"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = 
dst.get_shape_raw(); + + bool same_shapes = true; + std::size_t iter_nelems(1); + + for (int i = 0; same_shapes && (i < iteration_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + std::size_t sort_nelems(1); + for (int i = iteration_nd; same_shapes && (i < src_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + sort_nelems *= static_cast(src_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape does not match the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if ((iter_nelems == 0) || (sort_nelems == 0)) { + // Nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, sort_nelems * iter_nelems); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if ((dst_typeid != static_cast(td_ns::typenum_t::INT64)) && + (dst_typeid != static_cast(td_ns::typenum_t::INT32))) + { + throw py::value_error( + "Output index array must have data type int32 or int64"); + } + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + + if (is_src_c_contig && is_dst_c_contig) { + if (sort_nelems > 1) { + static constexpr 
py::ssize_t zero_offset = py::ssize_t(0); + + auto fn = sort_contig_fns[src_typeid][dst_typeid]; + + if (fn == nullptr) { + throw py::value_error( + "Not implemented for dtypes of input arrays"); + } + + sycl::event comp_ev = + fn(exec_q, iter_nelems, sort_nelems, src.get_data(), + dst.get_data(), zero_offset, zero_offset, zero_offset, + zero_offset, depends); + + sycl::event keep_args_alive_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {comp_ev}); + + return std::make_pair(keep_args_alive_ev, comp_ev); + } + else { + assert(dst.get_size() == iter_nelems); + int dst_elemsize = dst.get_elemsize(); + static constexpr int memset_val(0); + + sycl::event fill_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.memset(reinterpret_cast(dst.get_data()), memset_val, + iter_nelems * dst_elemsize); + }); + + sycl::event keep_args_alive_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {fill_ev}); + + return std::make_pair(keep_args_alive_ev, fill_ev); + } + } + + throw py::value_error( + "Both source and destination arrays must be C-contiguous"); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.cpp b/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.cpp new file mode 100644 index 000000000000..ba9bbfd61b7c --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.cpp @@ -0,0 +1,190 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "utils/type_dispatch.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/sorting/radix_sort.hpp" +#include "kernels/sorting/sort_impl_fn_ptr_t.hpp" + +#include "py_argsort_common.hpp" +#include "radix_argsort.hpp" +#include "radix_sort_support.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace impl_ns = dpctl::tensor::kernels::radix_sort_details; + +using dpctl::tensor::kernels::sort_contig_fn_ptr_t; + +static sort_contig_fn_ptr_t + ascending_radix_argsort_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static sort_contig_fn_ptr_t + descending_radix_argsort_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +namespace +{ + +template +sycl::event argsort_axis1_contig_caller(sycl::queue &q, + std::size_t iter_nelems, + std::size_t sort_nelems, + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t sort_arg_offset, + ssize_t sort_res_offset, + const std::vector &depends) +{ + using dpctl::tensor::kernels::radix_argsort_axis1_contig_impl; + + return radix_argsort_axis1_contig_impl( + q, is_ascending, iter_nelems, sort_nelems, arg_cp, res_cp, + iter_arg_offset, iter_res_offset, sort_arg_offset, sort_res_offset, + depends); +} + +} // end of anonymous namespace + +template +struct AscendingRadixArgSortContigFactory +{ + fnT get() + { + if constexpr (RadixSortSupportVector::is_defined && + (std::is_same_v || + std::is_same_v)) + { + return argsort_axis1_contig_caller< + /*ascending*/ true, argTy, IndexTy>; + } + else { + return nullptr; + } + } +}; + +template +struct DescendingRadixArgSortContigFactory +{ + fnT get() + { + if constexpr (RadixSortSupportVector::is_defined && + (std::is_same_v 
|| + std::is_same_v)) + { + return argsort_axis1_contig_caller< + /*ascending*/ false, argTy, IndexTy>; + } + else { + return nullptr; + } + } +}; + +void init_radix_argsort_dispatch_tables(void) +{ + using dpctl::tensor::kernels::sort_contig_fn_ptr_t; + + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(ascending_radix_argsort_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table( + descending_radix_argsort_contig_dispatch_table); +} + +void init_radix_argsort_functions(py::module_ m) +{ + dpctl::tensor::py_internal::init_radix_argsort_dispatch_tables(); + + auto py_radix_argsort_ascending = + [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_argsort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal:: + ascending_radix_argsort_contig_dispatch_table); + }; + m.def("_radix_argsort_ascending", py_radix_argsort_ascending, + py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto py_radix_argsort_descending = + [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_argsort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal:: + descending_radix_argsort_contig_dispatch_table); + }; + m.def("_radix_argsort_descending", py_radix_argsort_descending, + py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + return; +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.hpp 
b/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.hpp new file mode 100644 index 000000000000..89013fbb1bdc --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.hpp @@ -0,0 +1,47 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_radix_argsort_functions(py::module_); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp index d9ab1feca46a..8aacb8d08256 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp @@ -36,9 +36,9 @@ #include // #include "sorting/isin.hpp" -// #include "sorting/merge_argsort.hpp" +#include "sorting/merge_argsort.hpp" #include "sorting/merge_sort.hpp" -// #include "sorting/radix_argsort.hpp" +#include "sorting/radix_argsort.hpp" #include "sorting/radix_sort.hpp" // #include "sorting/searchsorted.hpp" // #include "sorting/topk.hpp" @@ -49,9 +49,9 @@ PYBIND11_MODULE(_tensor_sorting_impl, m) { // dpctl::tensor::py_internal::init_isin_functions(m); dpctl::tensor::py_internal::init_merge_sort_functions(m); - // dpctl::tensor::py_internal::init_merge_argsort_functions(m); + dpctl::tensor::py_internal::init_merge_argsort_functions(m); // dpctl::tensor::py_internal::init_searchsorted_functions(m); dpctl::tensor::py_internal::init_radix_sort_functions(m); - // dpctl::tensor::py_internal::init_radix_argsort_functions(m); + dpctl::tensor::py_internal::init_radix_argsort_functions(m); // dpctl::tensor::py_internal::init_topk_functions(m); } From 40c2b8467df776299f1e99e9a0aa7a70d1eab0b0 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 10:36:33 -0800 Subject: [PATCH 117/145] Move _argsort_ascending/descending to 
_tensor_sorting_impl --- .../source/sorting/merge_argsort.cpp | 159 ++++++++++++++++++ .../source/sorting/merge_argsort.hpp | 47 ++++++ 2 files changed, 206 insertions(+) create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.hpp diff --git a/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.cpp b/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.cpp new file mode 100644 index 000000000000..f76a74e1f8bf --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.cpp @@ -0,0 +1,159 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "utils/rich_comparisons.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/sorting/merge_sort.hpp" +#include "kernels/sorting/sort_impl_fn_ptr_t.hpp" + +#include "merge_argsort.hpp" +#include "py_argsort_common.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::sort_contig_fn_ptr_t; +static sort_contig_fn_ptr_t + ascending_argsort_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static sort_contig_fn_ptr_t + descending_argsort_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct AscendingArgSortContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v || + std::is_same_v) + { + using dpctl::tensor::rich_comparisons::AscendingSorter; + using Comp = typename AscendingSorter::type; + + using dpctl::tensor::kernels::stable_argsort_axis1_contig_impl; + return stable_argsort_axis1_contig_impl; + } + else { + return nullptr; + } + } +}; + 
+template +struct DescendingArgSortContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v || + std::is_same_v) + { + using dpctl::tensor::rich_comparisons::DescendingSorter; + using Comp = typename DescendingSorter::type; + + using dpctl::tensor::kernels::stable_argsort_axis1_contig_impl; + return stable_argsort_axis1_contig_impl; + } + else { + return nullptr; + } + } +}; + +void init_merge_argsort_dispatch_tables(void) +{ + using dpctl::tensor::kernels::sort_contig_fn_ptr_t; + + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(ascending_argsort_contig_dispatch_table); + + td_ns::DispatchTableBuilder< + sort_contig_fn_ptr_t, DescendingArgSortContigFactory, td_ns::num_types> + dtb2; + dtb2.populate_dispatch_table(descending_argsort_contig_dispatch_table); +} + +void init_merge_argsort_functions(py::module_ m) +{ + dpctl::tensor::py_internal::init_merge_argsort_dispatch_tables(); + + auto py_argsort_ascending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_argsort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal:: + ascending_argsort_contig_dispatch_table); + }; + m.def("_argsort_ascending", py_argsort_ascending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto py_argsort_descending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_argsort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal:: + descending_argsort_contig_dispatch_table); + }; + m.def("_argsort_descending", py_argsort_descending, py::arg("src"), + 
py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + return; +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.hpp b/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.hpp new file mode 100644 index 000000000000..10777b4bc2fd --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.hpp @@ -0,0 +1,47 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_merge_argsort_functions(py::module_); + +} // namespace dpctl::tensor::py_internal From 6912311de96b88402c210fa7231e334ccd9b91e6 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 10:38:59 -0800 Subject: [PATCH 118/145] Move ti.argsort() to dpctl_ext.tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 3 +- dpctl_ext/tensor/_sorting.py | 127 ++++++++++++++++++++++++++++++++++- dpnp/dpnp_iface_sorting.py | 10 ++- 3 files changed, 131 insertions(+), 9 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index aa373a9dd5c0..4e0155ff5e6e 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -86,11 +86,12 @@ unique_counts, unique_values, ) -from ._sorting import sort +from ._sorting import argsort, sort from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type __all__ = [ "arange", + "argsort", "asarray", "asnumpy", "astype", diff --git a/dpctl_ext/tensor/_sorting.py 
b/dpctl_ext/tensor/_sorting.py index 5c16cbf9f17a..a5fd1d57bcdf 100644 --- a/dpctl_ext/tensor/_sorting.py +++ b/dpctl_ext/tensor/_sorting.py @@ -38,7 +38,11 @@ import dpctl_ext.tensor._tensor_impl as ti from ._numpy_helper import normalize_axis_index -from ._tensor_sorting_impl import ( # _argsort_ascending,; _argsort_descending,; _radix_argsort_ascending,; _radix_argsort_descending,; _topk, +from ._tensor_sorting_impl import ( + _argsort_ascending, + _argsort_descending, + _radix_argsort_ascending, + _radix_argsort_descending, _radix_sort_ascending, _radix_sort_descending, _radix_sort_dtype_supported, @@ -46,7 +50,7 @@ _sort_descending, ) -__all__ = ["sort"] +__all__ = ["sort", "argsort"] def _get_mergesort_impl_fn(descending): @@ -161,3 +165,122 @@ def sort(x, /, *, axis=-1, descending=False, stable=True, kind=None): inv_perm = sorted(range(nd), key=lambda d: perm[d]) res = dpt_ext.permute_dims(res, inv_perm) return res + + +def _get_mergeargsort_impl_fn(descending): + return _argsort_descending if descending else _argsort_ascending + + +def _get_radixargsort_impl_fn(descending): + return _radix_argsort_descending if descending else _radix_argsort_ascending + + +def argsort(x, axis=-1, descending=False, stable=True, kind=None): + """argsort(x, axis=-1, descending=False, stable=True) + + Returns the indices that sort an array `x` along a specified axis. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to sort. If set to `-1`, the function + must sort along the last axis. Default: `-1`. + descending (Optional[bool]): + sort order. If `True`, the array must be sorted in descending + order (by value). If `False`, the array must be sorted in + ascending order (by value). Default: `False`. + stable (Optional[bool]): + sort stability. If `True`, the returned array must maintain the + relative order of `x` values which compare as equal. 
If `False`, + the returned array may or may not maintain the relative order of + `x` values which compare as equal. Default: `True`. + kind (Optional[Literal["stable", "mergesort", "radixsort"]]): + Sorting algorithm. The default is `"stable"`, which uses parallel + merge-sort or parallel radix-sort algorithms depending on the + array data type. + + Returns: + usm_ndarray: + an array of indices. The returned array has the same shape as + the input array `x`. The return array has default array index + data type. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" + ) + nd = x.ndim + if nd == 0: + axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis") + return dpt_ext.zeros_like( + x, dtype=ti.default_device_index_type(x.sycl_queue), order="C" + ) + else: + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt_ext.permute_dims(x, perm) + if kind is None: + kind = "stable" + if not isinstance(kind, str) or kind not in [ + "stable", + "radixsort", + "mergesort", + ]: + raise ValueError( + "Unsupported kind value. 
Expected 'stable', 'mergesort', " + f"or 'radixsort', but got '{kind}'" + ) + if kind == "mergesort": + impl_fn = _get_mergeargsort_impl_fn(descending) + elif kind == "radixsort": + if _radix_sort_dtype_supported(x.dtype.num): + impl_fn = _get_radixargsort_impl_fn(descending) + else: + raise ValueError(f"Radix sort is not supported for {x.dtype}") + else: + dt = x.dtype + if dt in [dpt.bool, dpt.uint8, dpt.int8, dpt.int16, dpt.uint16]: + impl_fn = _get_radixargsort_impl_fn(descending) + else: + impl_fn = _get_mergeargsort_impl_fn(descending) + exec_q = x.sycl_queue + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + index_dt = ti.default_device_index_type(exec_q) + if arr.flags.c_contiguous: + res = dpt_ext.empty_like(arr, dtype=index_dt, order="C") + ht_ev, impl_ev = impl_fn( + src=arr, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, impl_ev) + else: + tmp = dpt_ext.empty_like(arr, order="C") + ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + res = dpt_ext.empty_like(arr, dtype=index_dt, order="C") + ht_ev, impl_ev = impl_fn( + src=tmp, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, impl_ev) + if a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt_ext.permute_dims(res, inv_perm) + return res diff --git a/dpnp/dpnp_iface_sorting.py b/dpnp/dpnp_iface_sorting.py index 31a3e9cf0017..e7abef1f4338 100644 --- a/dpnp/dpnp_iface_sorting.py +++ b/dpnp/dpnp_iface_sorting.py @@ -41,13 +41,11 @@ from collections.abc import Sequence -import dpctl.tensor as dpt -from dpctl.tensor._numpy_helper import normalize_axis_index - # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor -import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor as dpt import 
dpnp +from dpctl_ext.tensor._numpy_helper import normalize_axis_index # pylint: disable=no-name-in-module from .dpnp_algo import ( @@ -87,7 +85,7 @@ def _wrap_sort_argsort( usm_a = dpnp.get_usm_ndarray(a) if axis is None: - usm_a = dpt_ext.reshape(usm_a, -1) + usm_a = dpt.reshape(usm_a, -1) axis = -1 axis = normalize_axis_index(axis, ndim=usm_a.ndim) @@ -404,7 +402,7 @@ def sort(a, axis=-1, kind=None, order=None, *, descending=False, stable=None): return _wrap_sort_argsort( a, - dpt_ext.sort, + dpt.sort, axis=axis, kind=kind, order=order, From 88a23a20a32681c9589ce893ebd5911f9c5ed816 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 11:12:09 -0800 Subject: [PATCH 119/145] Move _searchsorted_left/right to _tensor_sorting_impl --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../include/kernels/sorting/searchsorted.hpp | 260 ++++++++++ .../libtensor/source/sorting/searchsorted.cpp | 478 ++++++++++++++++++ .../libtensor/source/sorting/searchsorted.hpp | 47 ++ .../libtensor/source/tensor_sorting.cpp | 4 +- 5 files changed, 788 insertions(+), 3 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/searchsorted.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index ed2a9014338c..1ad403aad573 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -75,7 +75,7 @@ set(_sorting_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_argsort.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_sort.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_argsort.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/searchsorted.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/searchsorted.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/topk.cpp ) 
set(_tensor_accumulation_impl_sources diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp new file mode 100644 index 000000000000..a38522d7e4f7 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp @@ -0,0 +1,260 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#include + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/sorting/search_sorted_detail.hpp" +#include "utils/offset_utils.hpp" + +namespace dpctl::tensor::kernels +{ + +using dpctl::tensor::ssize_t; + +template +struct SearchSortedFunctor +{ +private: + const argTy *hay_tp; + const argTy *needles_tp; + indTy *positions_tp; + std::size_t hay_nelems; + HayIndexerT hay_indexer; + NeedlesIndexerT needles_indexer; + PositionsIndexerT positions_indexer; + +public: + SearchSortedFunctor(const argTy *hay_, + const argTy *needles_, + indTy *positions_, + const std::size_t hay_nelems_, + const HayIndexerT &hay_indexer_, + const NeedlesIndexerT &needles_indexer_, + const PositionsIndexerT &positions_indexer_) + : hay_tp(hay_), needles_tp(needles_), positions_tp(positions_), + hay_nelems(hay_nelems_), hay_indexer(hay_indexer_), + needles_indexer(needles_indexer_), + positions_indexer(positions_indexer_) + { + } + + void operator()(sycl::id<1> id) const + { + const Compare comp{}; + + const std::size_t i = id[0]; + const argTy needle_v 
= needles_tp[needles_indexer(i)]; + + // position of the needle_v in the hay array + indTy pos{}; + + static constexpr std::size_t zero(0); + if constexpr (left_side) { + // search in hay in left-closed interval, give `pos` such that + // hay[pos - 1] < needle_v <= hay[pos] + + // lower_bound returns the first pos such that bool(hay[pos] < + // needle_v) is false, i.e. needle_v <= hay[pos] + pos = search_sorted_detail::lower_bound_indexed_impl( + hay_tp, zero, hay_nelems, needle_v, comp, hay_indexer); + } + else { + // search in hay in right-closed interval: hay[pos - 1] <= needle_v + // < hay[pos] + + // upper_bound returns the first pos such that bool(needle_v < + // hay[pos]) is true, i.e. needle_v < hay[pos] + pos = search_sorted_detail::upper_bound_indexed_impl( + hay_tp, zero, hay_nelems, needle_v, comp, hay_indexer); + } + + positions_tp[positions_indexer(i)] = pos; + } +}; + +typedef sycl::event (*searchsorted_contig_impl_fp_ptr_t)( + sycl::queue &, + const std::size_t, + const std::size_t, + const char *, + const ssize_t, + const char *, + const ssize_t, + char *, + const ssize_t, + const std::vector &); + +template +class searchsorted_contig_impl_krn; + +template +sycl::event searchsorted_contig_impl(sycl::queue &exec_q, + const std::size_t hay_nelems, + const std::size_t needles_nelems, + const char *hay_cp, + const ssize_t hay_offset, + const char *needles_cp, + const ssize_t needles_offset, + char *positions_cp, + const ssize_t positions_offset, + const std::vector &depends) +{ + const argTy *hay_tp = reinterpret_cast(hay_cp) + hay_offset; + const argTy *needles_tp = + reinterpret_cast(needles_cp) + needles_offset; + + indTy *positions_tp = + reinterpret_cast(positions_cp) + positions_offset; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = + class searchsorted_contig_impl_krn; + + sycl::range<1> gRange(needles_nelems); + + using TrivialIndexerT = 
dpctl::tensor::offset_utils::NoOpIndexer; + + static constexpr TrivialIndexerT hay_indexer{}; + static constexpr TrivialIndexerT needles_indexer{}; + static constexpr TrivialIndexerT positions_indexer{}; + + const auto fnctr = + SearchSortedFunctor( + hay_tp, needles_tp, positions_tp, hay_nelems, hay_indexer, + needles_indexer, positions_indexer); + + cgh.parallel_for(gRange, fnctr); + }); + + return comp_ev; +} + +typedef sycl::event (*searchsorted_strided_impl_fp_ptr_t)( + sycl::queue &, + const std::size_t, + const std::size_t, + const char *, + const ssize_t, + const ssize_t, + const char *, + const ssize_t, + char *, + const ssize_t, + int, + const ssize_t *, + const std::vector &); + +template +class searchsorted_strided_impl_krn; + +template +sycl::event searchsorted_strided_impl( + sycl::queue &exec_q, + const std::size_t hay_nelems, + const std::size_t needles_nelems, + const char *hay_cp, + const ssize_t hay_offset, + // hay is 1D, so hay_nelems, hay_offset, hay_stride describe strided array + const ssize_t hay_stride, + const char *needles_cp, + const ssize_t needles_offset, + char *positions_cp, + const ssize_t positions_offset, + const int needles_nd, + // packed_shape_strides is [needles_shape, needles_strides, + // positions_strides] has length of 3*needles_nd + const ssize_t *packed_shape_strides, + const std::vector &depends) +{ + const argTy *hay_tp = reinterpret_cast(hay_cp); + const argTy *needles_tp = reinterpret_cast(needles_cp); + + indTy *positions_tp = reinterpret_cast(positions_cp); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::range<1> gRange(needles_nelems); + + using HayIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + const HayIndexerT hay_indexer( + /* offset */ hay_offset, + /* size */ hay_nelems, + /* step */ hay_stride); + + using NeedlesIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const ssize_t *needles_shape_strides = packed_shape_strides; + const 
NeedlesIndexerT needles_indexer(needles_nd, needles_offset, + needles_shape_strides); + using PositionsIndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const ssize_t *positions_shape = packed_shape_strides; + const ssize_t *positions_strides = + packed_shape_strides + 2 * needles_nd; + const PositionsIndexerT positions_indexer( + needles_nd, positions_offset, positions_shape, positions_strides); + + const auto fnctr = + SearchSortedFunctor( + hay_tp, needles_tp, positions_tp, hay_nelems, hay_indexer, + needles_indexer, positions_indexer); + using KernelName = + class searchsorted_strided_impl_krn; + + cgh.parallel_for(gRange, fnctr); + }); + + return comp_ev; +} + +} // namespace dpctl::tensor::kernels diff --git a/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp b/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp new file mode 100644 index 000000000000..1f2a45d217cb --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp @@ -0,0 +1,478 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/sorting/searchsorted.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/output_validation.hpp" +#include "utils/rich_comparisons.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "simplify_iteration_space.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl::tensor::py_internal +{ + +namespace detail +{ + +using dpctl::tensor::kernels::searchsorted_contig_impl_fp_ptr_t; + +static searchsorted_contig_impl_fp_ptr_t + left_side_searchsorted_contig_impl[td_ns::num_types][td_ns::num_types]; + +static searchsorted_contig_impl_fp_ptr_t + right_side_searchsorted_contig_impl[td_ns::num_types][td_ns::num_types]; + +template +struct LeftSideSearchSortedContigFactory +{ + constexpr LeftSideSearchSortedContigFactory() {} + + fnT get() const + { + if constexpr (std::is_same_v || + std::is_same_v) + { + static constexpr bool left_side_search(true); + using dpctl::tensor::kernels::searchsorted_contig_impl; + using dpctl::tensor::rich_comparisons::AscendingSorter; + + using Compare = typename AscendingSorter::type; + + return searchsorted_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct RightSideSearchSortedContigFactory +{ + constexpr RightSideSearchSortedContigFactory() {} + + fnT get() const + { + if constexpr (std::is_same_v || + std::is_same_v) + { + static constexpr bool right_side_search(false); + + using dpctl::tensor::kernels::searchsorted_contig_impl; + using dpctl::tensor::rich_comparisons::AscendingSorter; + + using Compare = typename AscendingSorter::type; + + return searchsorted_contig_impl; + } + else { + return nullptr; + } + } +}; + +using dpctl::tensor::kernels::searchsorted_strided_impl_fp_ptr_t; + +static 
searchsorted_strided_impl_fp_ptr_t + left_side_searchsorted_strided_impl[td_ns::num_types][td_ns::num_types]; + +static searchsorted_strided_impl_fp_ptr_t + right_side_searchsorted_strided_impl[td_ns::num_types][td_ns::num_types]; + +template +struct LeftSideSearchSortedStridedFactory +{ + constexpr LeftSideSearchSortedStridedFactory() {} + + fnT get() const + { + if constexpr (std::is_same_v || + std::is_same_v) + { + static constexpr bool left_side_search(true); + using dpctl::tensor::kernels::searchsorted_strided_impl; + using dpctl::tensor::rich_comparisons::AscendingSorter; + + using Compare = typename AscendingSorter::type; + + return searchsorted_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct RightSideSearchSortedStridedFactory +{ + constexpr RightSideSearchSortedStridedFactory() {} + + fnT get() const + { + if constexpr (std::is_same_v || + std::is_same_v) + { + static constexpr bool right_side_search(false); + using dpctl::tensor::kernels::searchsorted_strided_impl; + using dpctl::tensor::rich_comparisons::AscendingSorter; + + using Compare = typename AscendingSorter::type; + + return searchsorted_strided_impl; + } + else { + return nullptr; + } + } +}; + +void init_searchsorted_dispatch_table(void) +{ + + // Contiguous input function dispatch + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(left_side_searchsorted_contig_impl); + + td_ns::DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(right_side_searchsorted_contig_impl); + + // Strided input function dispatch + td_ns::DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(left_side_searchsorted_strided_impl); + + td_ns::DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(right_side_searchsorted_strided_impl); +} + +} // namespace detail + +/*! 
@brief search for needle from needles in sorted hay */ +std::pair + py_searchsorted(const dpctl::tensor::usm_ndarray &hay, + const dpctl::tensor::usm_ndarray &needles, + const dpctl::tensor::usm_ndarray &positions, + sycl::queue &exec_q, + const bool search_left_side, + const std::vector &depends) +{ + const int hay_nd = hay.get_ndim(); + const int needles_nd = needles.get_ndim(); + const int positions_nd = positions.get_ndim(); + + if (hay_nd != 1 || needles_nd != positions_nd) { + throw py::value_error("Array dimensions mismatch"); + } + + // check that needle and positions have the same shape + std::size_t needles_nelems(1); + bool same_shape(true); + + const std::size_t hay_nelems = static_cast(hay.get_shape(0)); + + const py::ssize_t *needles_shape_ptr = needles.get_shape_raw(); + const py::ssize_t *positions_shape_ptr = needles.get_shape_raw(); + + for (int i = 0; (i < needles_nd) && same_shape; ++i) { + const auto needles_sh_i = needles_shape_ptr[i]; + const auto positions_sh_i = positions_shape_ptr[i]; + + same_shape = same_shape && (needles_sh_i == positions_sh_i); + needles_nelems *= static_cast(needles_sh_i); + } + + if (!same_shape) { + throw py::value_error( + "Array of values to search for and array of their " + "positions do not have the same shape"); + } + + // check that positions is ample enough + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(positions, + needles_nelems); + + // check that positions is writable + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(positions); + + // check that queues are compatible + if (!dpctl::utils::queues_are_compatible(exec_q, {hay, needles, positions})) + { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // if output array overlaps with input arrays, race condition results + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(positions, hay) || overlap(positions, needles)) { + throw py::value_error("Destination 
array overlaps with input."); + } + + const int hay_typenum = hay.get_typenum(); + const int needles_typenum = needles.get_typenum(); + const int positions_typenum = positions.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + const int hay_typeid = array_types.typenum_to_lookup_id(hay_typenum); + const int needles_typeid = + array_types.typenum_to_lookup_id(needles_typenum); + const int positions_typeid = + array_types.typenum_to_lookup_id(positions_typenum); + + // check hay and needle have the same data-type + if (needles_typeid != hay_typeid) { + throw py::value_error( + "Hay array and needles array must have the same data types"); + } + // check that positions has indexing data-type (int32, or int64) + const auto positions_typenum_t_v = + static_cast(positions_typeid); + if (positions_typenum_t_v != td_ns::typenum_t::INT32 && + positions_typenum_t_v != td_ns::typenum_t::INT64) + { + throw py::value_error( + "Positions array must have data-type int32, or int64"); + } + + if (needles_nelems == 0) { + // Nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + // if all inputs are contiguous call contiguous implementations + // otherwise call strided implementation + const bool hay_is_c_contig = hay.is_c_contiguous(); + const bool hay_is_f_contig = hay.is_f_contiguous(); + + const bool needles_is_c_contig = needles.is_c_contiguous(); + const bool needles_is_f_contig = needles.is_f_contiguous(); + + const bool positions_is_c_contig = positions.is_c_contiguous(); + const bool positions_is_f_contig = positions.is_f_contiguous(); + + const bool all_c_contig = + (hay_is_c_contig && needles_is_c_contig && positions_is_c_contig); + const bool all_f_contig = + (hay_is_f_contig && needles_is_f_contig && positions_is_f_contig); + + const char *hay_data = hay.get_data(); + const char *needles_data = needles.get_data(); + + char *positions_data = positions.get_data(); + + if (all_c_contig || all_f_contig) { + auto fn = + 
(search_left_side) + ? detail::left_side_searchsorted_contig_impl[hay_typeid] + [positions_typeid] + : detail::right_side_searchsorted_contig_impl[hay_typeid] + [positions_typeid]; + + if (fn) { + static constexpr py::ssize_t zero_offset(0); + + sycl::event comp_ev = + fn(exec_q, hay_nelems, needles_nelems, hay_data, zero_offset, + needles_data, zero_offset, positions_data, zero_offset, + depends); + + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {hay, needles, positions}, + {comp_ev}), + comp_ev); + } + } + + // strided case + + const auto &needles_strides = needles.get_strides_vector(); + const auto &positions_strides = positions.get_strides_vector(); + + int simplified_nd = needles_nd; + + using shT = std::vector; + shT simplified_common_shape; + shT simplified_needles_strides; + shT simplified_positions_strides; + py::ssize_t needles_offset(0); + py::ssize_t positions_offset(0); + + if (simplified_nd == 0) { + // needles and positions have same nd + simplified_nd = 1; + simplified_common_shape.push_back(1); + simplified_needles_strides.push_back(0); + simplified_positions_strides.push_back(0); + } + else { + dpctl::tensor::py_internal::simplify_iteration_space( + // modified by reference + simplified_nd, + // read-only inputs + needles_shape_ptr, needles_strides, positions_strides, + // output, modified by reference + simplified_common_shape, simplified_needles_strides, + simplified_positions_strides, needles_offset, positions_offset); + } + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, + // vectors being packed + simplified_common_shape, simplified_needles_strides, + simplified_positions_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_strides_ev = + std::get<2>(ptr_size_event_tuple); + const py::ssize_t 
*packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shape_strides_ev); + + auto strided_fn = + (search_left_side) + ? detail::left_side_searchsorted_strided_impl[hay_typeid] + [positions_typeid] + : detail::right_side_searchsorted_strided_impl[hay_typeid] + [positions_typeid]; + + if (!strided_fn) { + throw std::runtime_error( + "No implementation for data types of input arrays"); + } + + static constexpr py::ssize_t zero_offset(0); + py::ssize_t hay_step = hay.get_strides_vector()[0]; + + const sycl::event &comp_ev = strided_fn( + exec_q, hay_nelems, needles_nelems, hay_data, zero_offset, hay_step, + needles_data, needles_offset, positions_data, positions_offset, + simplified_nd, packed_shape_strides, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {comp_ev}, packed_shape_strides_owner); + + host_task_events.push_back(temporaries_cleanup_ev); + const sycl::event &ht_ev = dpctl::utils::keep_args_alive( + exec_q, {hay, needles, positions}, host_task_events); + + return std::make_pair(ht_ev, comp_ev); +} + +/*! @brief search for needle from needles in sorted hay, + * hay[pos] <= needle < hay[pos + 1] + */ +std::pair + py_searchsorted_left(const dpctl::tensor::usm_ndarray &hay, + const dpctl::tensor::usm_ndarray &needles, + const dpctl::tensor::usm_ndarray &positions, + sycl::queue &exec_q, + const std::vector &depends) +{ + static constexpr bool side_left(true); + return py_searchsorted(hay, needles, positions, exec_q, side_left, depends); +} + +/*! 
@brief search for needle from needles in sorted hay, + * hay[pos] < needle <= hay[pos + 1] + */ +std::pair + py_searchsorted_right(const dpctl::tensor::usm_ndarray &hay, + const dpctl::tensor::usm_ndarray &needles, + const dpctl::tensor::usm_ndarray &positions, + sycl::queue &exec_q, + const std::vector &depends) +{ + static constexpr bool side_right(false); + return py_searchsorted(hay, needles, positions, exec_q, side_right, + depends); +} + +void init_searchsorted_functions(py::module_ m) +{ + dpctl::tensor::py_internal::detail::init_searchsorted_dispatch_table(); + + using dpctl::tensor::py_internal::py_searchsorted_left; + using dpctl::tensor::py_internal::py_searchsorted_right; + + m.def("_searchsorted_left", &py_searchsorted_left, py::arg("hay"), + py::arg("needles"), py::arg("positions"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_searchsorted_right", &py_searchsorted_right, py::arg("hay"), + py::arg("needles"), py::arg("positions"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.hpp b/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.hpp new file mode 100644 index 000000000000..b60dae1e0ec9 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.hpp @@ -0,0 +1,47 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_searchsorted_functions(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp index 8aacb8d08256..b569317e5d68 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp @@ -40,7 +40,7 @@ #include "sorting/merge_sort.hpp" #include "sorting/radix_argsort.hpp" #include "sorting/radix_sort.hpp" -// #include "sorting/searchsorted.hpp" +#include "sorting/searchsorted.hpp" // #include "sorting/topk.hpp" namespace py = pybind11; @@ -50,7 +50,7 @@ PYBIND11_MODULE(_tensor_sorting_impl, m) // dpctl::tensor::py_internal::init_isin_functions(m); dpctl::tensor::py_internal::init_merge_sort_functions(m); dpctl::tensor::py_internal::init_merge_argsort_functions(m); - // dpctl::tensor::py_internal::init_searchsorted_functions(m); + dpctl::tensor::py_internal::init_searchsorted_functions(m); dpctl::tensor::py_internal::init_radix_sort_functions(m); dpctl::tensor::py_internal::init_radix_argsort_functions(m); // dpctl::tensor::py_internal::init_topk_functions(m); From 148b6e54a190d8278ca4a368c4515c8da867bf26 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 11:18:16 -0800 Subject: [PATCH 120/145] Fix bug: wrong shape pointer for positions array --- dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp b/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp index 1f2a45d217cb..5cebb74be2d0 100644 --- a/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp +++ b/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp @@ -236,7 
+236,7 @@ std::pair const std::size_t hay_nelems = static_cast(hay.get_shape(0)); const py::ssize_t *needles_shape_ptr = needles.get_shape_raw(); - const py::ssize_t *positions_shape_ptr = needles.get_shape_raw(); + const py::ssize_t *positions_shape_ptr = positions.get_shape_raw(); for (int i = 0; (i < needles_nd) && same_shape; ++i) { const auto needles_sh_i = needles_shape_ptr[i]; From 8b010b0e4f39ff0aeed7cf5a1f8dc656e99fad17 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 11:20:54 -0800 Subject: [PATCH 121/145] Move ti.searchsorted() and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_searchsorted.py | 189 ++++++++++++++++++++++++++++++ dpnp/dpnp_iface_manipulation.py | 6 +- dpnp/dpnp_iface_searching.py | 2 +- 4 files changed, 196 insertions(+), 3 deletions(-) create mode 100644 dpctl_ext/tensor/_searchsorted.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 4e0155ff5e6e..63de04d1cdb8 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -82,6 +82,7 @@ from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum from ._clip import clip +from ._searchsorted import searchsorted from ._set_functions import ( # isin,; unique_all,; unique_inverse, unique_counts, unique_values, @@ -130,6 +131,7 @@ "reshape", "result_type", "roll", + "searchsorted", "sort", "squeeze", "stack", diff --git a/dpctl_ext/tensor/_searchsorted.py b/dpctl_ext/tensor/_searchsorted.py new file mode 100644 index 000000000000..2d4807fb0d0c --- /dev/null +++ b/dpctl_ext/tensor/_searchsorted.py @@ -0,0 +1,189 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + + +from typing import Literal, Union + +import dpctl +import dpctl.utils as du + +# TODO: revert to `from ._usmarray import...` +# when dpnp fully migrates dpctl/tensor +from dpctl.tensor._usmarray import usm_ndarray + +from ._copy_utils import _empty_like_orderK +from ._ctors import empty +from ._tensor_impl import _copy_usm_ndarray_into_usm_ndarray as ti_copy +from ._tensor_impl import _take as ti_take +from ._tensor_impl import ( + default_device_index_type as ti_default_device_index_type, +) +from ._tensor_sorting_impl import _searchsorted_left, _searchsorted_right +from ._type_utils import isdtype, result_type + + +def searchsorted( + x1: usm_ndarray, + x2: usm_ndarray, + /, + *, + side: Literal["left", "right"] = "left", + sorter: Union[usm_ndarray, None] = None, +) -> usm_ndarray: + """searchsorted(x1, x2, side='left', sorter=None) + + Finds the indices into `x1` such that, if the corresponding elements + in `x2` were inserted before the indices, the order of `x1`, when sorted + in ascending order, would be preserved. + + Args: + x1 (usm_ndarray): + input array. Must be a one-dimensional array. If `sorter` is + `None`, must be sorted in ascending order; otherwise, `sorter` must + be an array of indices that sort `x1` in ascending order. + x2 (usm_ndarray): + array containing search values. + side (Literal["left", "right]): + argument controlling which index is returned if a value lands + exactly on an edge. If `x2` is an array of rank `N` where + `v = x2[n, m, ..., j]`, the element `ret[n, m, ..., j]` in the + return array `ret` contains the position `i` such that + if `side="left"`, it is the first index such that + `x1[i-1] < v <= x1[i]`, `0` if `v <= x1[0]`, and `x1.size` + if `v > x1[-1]`; + and if `side="right"`, it is the first position `i` such that + `x1[i-1] <= v < x1[i]`, `0` if `v < x1[0]`, and `x1.size` + if `v >= x1[-1]`. Default: `"left"`. 
+ sorter (Optional[usm_ndarray]): + array of indices that sort `x1` in ascending order. The array must + have the same shape as `x1` and have an integral data type. + Out of bound index values of `sorter` array are treated using + `"wrap"` mode documented in :py:func:`dpctl.tensor.take`. + Default: `None`. + """ + if not isinstance(x1, usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x1)}") + if not isinstance(x2, usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x2)}") + if sorter is not None and not isinstance(sorter, usm_ndarray): + raise TypeError( + f"Expected dpctl.tensor.usm_ndarray, got {type(sorter)}" + ) + + if side not in ["left", "right"]: + raise ValueError( + "Unrecognized value of 'side' keyword argument. " + "Expected either 'left' or 'right'" + ) + + if sorter is None: + q = du.get_execution_queue([x1.sycl_queue, x2.sycl_queue]) + else: + q = du.get_execution_queue( + [x1.sycl_queue, x2.sycl_queue, sorter.sycl_queue] + ) + if q is None: + raise du.ExecutionPlacementError( + "Execution placement can not be unambiguously " + "inferred from input arguments." 
+ ) + + if x1.ndim != 1: + raise ValueError("First argument array must be one-dimensional") + + x1_dt = x1.dtype + x2_dt = x2.dtype + + _manager = du.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + ev = dpctl.SyclEvent() + if sorter is not None: + if not isdtype(sorter.dtype, "integral"): + raise ValueError( + f"Sorter array must have integral data type, got {sorter.dtype}" + ) + if x1.shape != sorter.shape: + raise ValueError( + "Sorter array must be one-dimension with the same " + "shape as the first argument array" + ) + res = empty(x1.shape, dtype=x1_dt, usm_type=x1.usm_type, sycl_queue=q) + ind = (sorter,) + axis = 0 + wrap_out_of_bound_indices_mode = 0 + ht_ev, ev = ti_take( + x1, + ind, + res, + axis, + wrap_out_of_bound_indices_mode, + sycl_queue=q, + depends=dep_evs, + ) + x1 = res + _manager.add_event_pair(ht_ev, ev) + + if x1_dt != x2_dt: + dt = result_type(x1, x2) + if x1_dt != dt: + x1_buf = _empty_like_orderK(x1, dt) + dep_evs = _manager.submitted_events + ht_ev, ev = ti_copy( + src=x1, dst=x1_buf, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, ev) + x1 = x1_buf + if x2_dt != dt: + x2_buf = _empty_like_orderK(x2, dt) + dep_evs = _manager.submitted_events + ht_ev, ev = ti_copy( + src=x2, dst=x2_buf, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, ev) + x2 = x2_buf + + dst_usm_type = du.get_coerced_usm_type([x1.usm_type, x2.usm_type]) + index_dt = ti_default_device_index_type(q) + + dst = _empty_like_orderK(x2, index_dt, usm_type=dst_usm_type) + + dep_evs = _manager.submitted_events + if side == "left": + ht_ev, s_ev = _searchsorted_left( + hay=x1, + needles=x2, + positions=dst, + sycl_queue=q, + depends=dep_evs, + ) + else: + ht_ev, s_ev = _searchsorted_right( + hay=x1, needles=x2, positions=dst, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, s_ev) + return dst diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index c11538321e51..6d6244d2534f 
100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -378,8 +378,10 @@ def _get_first_nan_index(usm_a): true_val = dpt_ext.asarray( True, sycl_queue=usm_a.sycl_queue, usm_type=usm_a.usm_type ) - return dpt.searchsorted(dpt.isnan(usm_a), true_val, side="left") - return dpt.searchsorted(usm_a, usm_a[-1], side="left") + return dpt_ext.searchsorted( + dpt.isnan(usm_a), true_val, side="left" + ) + return dpt_ext.searchsorted(usm_a, usm_a[-1], side="left") return None usm_ar = dpnp.get_usm_ndarray(ar) diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py index 15f52338ec7e..055aaa999c3a 100644 --- a/dpnp/dpnp_iface_searching.py +++ b/dpnp/dpnp_iface_searching.py @@ -382,7 +382,7 @@ def searchsorted(a, v, side="left", sorter=None): usm_sorter = None if sorter is None else dpnp.get_usm_ndarray(sorter) return dpnp_array._create_from_usm_ndarray( - dpt.searchsorted(usm_a, usm_v, side=side, sorter=usm_sorter) + dpt_ext.searchsorted(usm_a, usm_v, side=side, sorter=usm_sorter) ) From 9766d345c0874965a9a96d9d18405e05dd043de5 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 11:27:16 -0800 Subject: [PATCH 122/145] Move dpt.unique_all() and dpt.unique_inverse() and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 6 +- dpctl_ext/tensor/_set_functions.py | 364 +++++++++++++++++++++++++++++ dpnp/dpnp_iface_manipulation.py | 4 +- 3 files changed, 371 insertions(+), 3 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 63de04d1cdb8..5eca7e61098a 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -83,8 +83,10 @@ from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum from ._clip import clip from ._searchsorted import searchsorted -from ._set_functions import ( # isin,; unique_all,; unique_inverse, +from ._set_functions import ( + unique_all, unique_counts, + unique_inverse, unique_values, ) from ._sorting import 
argsort, sort @@ -142,7 +144,9 @@ "to_numpy", "tril", "triu", + "unique_all", "unique_counts", + "unique_inverse", "unique_values", "unstack", "where", diff --git a/dpctl_ext/tensor/_set_functions.py b/dpctl_ext/tensor/_set_functions.py index d4608a5a2a0a..0ca0cd9d3db0 100644 --- a/dpctl_ext/tensor/_set_functions.py +++ b/dpctl_ext/tensor/_set_functions.py @@ -41,19 +41,34 @@ _extract, _full_usm_ndarray, _linspace_step, + _take, default_device_index_type, mask_positions, ) from ._tensor_sorting_impl import ( + _argsort_ascending, + _searchsorted_left, _sort_ascending, ) +class UniqueAllResult(NamedTuple): + values: dpt.usm_ndarray + indices: dpt.usm_ndarray + inverse_indices: dpt.usm_ndarray + counts: dpt.usm_ndarray + + class UniqueCountsResult(NamedTuple): values: dpt.usm_ndarray counts: dpt.usm_ndarray +class UniqueInverseResult(NamedTuple): + values: dpt.usm_ndarray + inverse_indices: dpt.usm_ndarray + + def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: """unique_values(x) @@ -272,3 +287,352 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: ) _manager.add_event_pair(ht_ev, sub_ev) return UniqueCountsResult(unique_vals, _counts) + + +def unique_inverse(x): + """unique_inverse + + Returns the unique elements of an input array x and the indices from the + set of unique elements that reconstruct `x`. + + Args: + x (usm_ndarray): + input array. Inputs with more than one dimension are flattened. + Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, inverse_indices)` whose + + * first element has the field name `values` and is an array + containing the unique elements of `x`. The array has the same + data type as `x`. + * second element has the field name `inverse_indices` and is an + array containing the indices of values that reconstruct `x`. + The array has the same shape as `x` and has the default array + index data type. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + x_usm_type = x.usm_type + ind_dt = default_device_index_type(exec_q) + if x.ndim == 1: + fx = x + else: + fx = dpt_ext.reshape(x, (x.size,), order="C") + sorting_ids = dpt_ext.empty_like(fx, dtype=ind_dt, order="C") + unsorting_ids = dpt_ext.empty_like(sorting_ids, dtype=ind_dt, order="C") + if fx.size == 0: + return UniqueInverseResult(fx, dpt_ext.reshape(unsorting_ids, x.shape)) + + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if fx.flags.c_contiguous: + ht_ev, sort_ev = _argsort_ascending( + src=fx, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, sort_ev) + else: + tmp = dpt_ext.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + ht_ev, sort_ev = _argsort_ascending( + src=tmp, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, sort_ev) + ht_ev, argsort_ev = _argsort_ascending( + src=sorting_ids, + trailing_dims_to_sort=1, + dst=unsorting_ids, + sycl_queue=exec_q, + depends=[sort_ev], + ) + _manager.add_event_pair(ht_ev, argsort_ev) + s = dpt_ext.empty_like(fx) + # s = fx[sorting_ids] + ht_ev, take_ev = _take( + src=fx, + ind=(sorting_ids,), + dst=s, + axis_start=0, + mode=0, + sycl_queue=exec_q, + depends=[sort_ev], + ) + _manager.add_event_pair(ht_ev, take_ev) + unique_mask = dpt_ext.empty(fx.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[take_ev], + ) + _manager.add_event_pair(ht_ev, uneq_ev) + # no dependency + ht_ev, one_ev = _full_usm_ndarray( + 
fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, one_ev) + cumsum = dpt_ext.empty( + unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q + ) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev] + ) + if n_uniques == fx.size: + return UniqueInverseResult(s, dpt_ext.reshape(unsorting_ids, x.shape)) + unique_vals = dpt_ext.empty( + n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q + ) + ht_ev, uv_ev = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + _manager.add_event_pair(ht_ev, uv_ev) + cum_unique_counts = dpt_ext.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + idx = dpt_ext.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + _manager.add_event_pair(ht_ev, id_ev) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=cum_unique_counts[:-1], + sycl_queue=exec_q, + depends=[id_ev], + ) + _manager.add_event_pair(ht_ev, extr_ev) + ht_ev, set_ev = _full_usm_ndarray( + x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, set_ev) + _counts = dpt_ext.empty_like(cum_unique_counts[1:]) + ht_ev, sub_ev = _subtract( + src1=cum_unique_counts[1:], + src2=cum_unique_counts[:-1], + dst=_counts, + sycl_queue=exec_q, + depends=[set_ev, extr_ev], + ) + _manager.add_event_pair(ht_ev, sub_ev) + + inv = dpt_ext.empty_like(x, dtype=ind_dt, order="C") + ht_ev, ssl_ev = _searchsorted_left( + hay=unique_vals, + needles=x, + positions=inv, + sycl_queue=exec_q, + depends=[ + uv_ev, + ], + ) + _manager.add_event_pair(ht_ev, ssl_ev) + + return UniqueInverseResult(unique_vals, inv) + + +def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: + """unique_all(x) + + Returns the unique elements of an input array `x`, the first 
occurring + indices for each unique element in `x`, the indices from the set of unique + elements that reconstruct `x`, and the corresponding counts for each + unique element in `x`. + + Args: + x (usm_ndarray): + input array. Inputs with more than one dimension are flattened. + Returns: + tuple[usm_ndarray, usm_ndarray, usm_ndarray, usm_ndarray] + a namedtuple `(values, indices, inverse_indices, counts)` whose + + * first element has the field name `values` and is an array + containing the unique elements of `x`. The array has the same + data type as `x`. + * second element has the field name `indices` and is an array + the indices (of first occurrences) of `x` that result in + `values`. The array has the same shape as `values` and has the + default array index data type. + * third element has the field name `inverse_indices` and is an + array containing the indices of values that reconstruct `x`. + The array has the same shape as `x` and has the default array + index data type. + * fourth element has the field name `counts` and is an array + containing the number of times each unique element occurs in `x`. + This array has the same shape as `values` and has the default + array index data type. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + x_usm_type = x.usm_type + ind_dt = default_device_index_type(exec_q) + if x.ndim == 1: + fx = x + else: + fx = dpt_ext.reshape(x, (x.size,), order="C") + sorting_ids = dpt_ext.empty_like(fx, dtype=ind_dt, order="C") + unsorting_ids = dpt_ext.empty_like(sorting_ids, dtype=ind_dt, order="C") + if fx.size == 0: + # original array contains no data + # so it can be safely returned as values + return UniqueAllResult( + fx, + sorting_ids, + dpt_ext.reshape(unsorting_ids, x.shape), + dpt_ext.empty_like(fx, dtype=ind_dt), + ) + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if fx.flags.c_contiguous: + ht_ev, sort_ev = _argsort_ascending( + src=fx, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, sort_ev) + else: + tmp = dpt_ext.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + ht_ev, sort_ev = _argsort_ascending( + src=tmp, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, sort_ev) + ht_ev, args_ev = _argsort_ascending( + src=sorting_ids, + trailing_dims_to_sort=1, + dst=unsorting_ids, + sycl_queue=exec_q, + depends=[sort_ev], + ) + _manager.add_event_pair(ht_ev, args_ev) + s = dpt_ext.empty_like(fx) + # s = fx[sorting_ids] + ht_ev, take_ev = _take( + src=fx, + ind=(sorting_ids,), + dst=s, + axis_start=0, + mode=0, + sycl_queue=exec_q, + depends=[sort_ev], + ) + _manager.add_event_pair(ht_ev, take_ev) + unique_mask = dpt_ext.empty(fx.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + 
depends=[take_ev], + ) + _manager.add_event_pair(ht_ev, uneq_ev) + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, one_ev) + cumsum = dpt_ext.empty( + unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q + ) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev] + ) + if n_uniques == fx.size: + _counts = dpt_ext.ones( + n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + return UniqueAllResult( + s, + sorting_ids, + dpt_ext.reshape(unsorting_ids, x.shape), + _counts, + ) + unique_vals = dpt_ext.empty( + n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q + ) + ht_ev, uv_ev = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + _manager.add_event_pair(ht_ev, uv_ev) + cum_unique_counts = dpt_ext.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + idx = dpt_ext.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + _manager.add_event_pair(ht_ev, id_ev) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=cum_unique_counts[:-1], + sycl_queue=exec_q, + depends=[id_ev], + ) + _manager.add_event_pair(ht_ev, extr_ev) + ht_ev, set_ev = _full_usm_ndarray( + x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, set_ev) + _counts = dpt_ext.empty_like(cum_unique_counts[1:]) + ht_ev, sub_ev = _subtract( + src1=cum_unique_counts[1:], + src2=cum_unique_counts[:-1], + dst=_counts, + sycl_queue=exec_q, + depends=[set_ev, extr_ev], + ) + _manager.add_event_pair(ht_ev, sub_ev) + + inv = dpt_ext.empty_like(x, dtype=ind_dt, order="C") + ht_ev, ssl_ev = _searchsorted_left( + hay=unique_vals, + needles=x, + positions=inv, + sycl_queue=exec_q, + depends=[ + uv_ev, + ], + ) + 
_manager.add_event_pair(ht_ev, ssl_ev) + return UniqueAllResult( + unique_vals, + sorting_ids[cum_unique_counts[:-1]], + inv, + _counts, + ) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 6d6244d2534f..cb0d0d35b0d7 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -391,11 +391,11 @@ def _get_first_nan_index(usm_a): usm_res = dpt_ext.unique_values(usm_ar) usm_res = (usm_res,) # cast to a tuple to align with other cases elif num_of_flags == 1 and return_inverse: - usm_res = dpt.unique_inverse(usm_ar) + usm_res = dpt_ext.unique_inverse(usm_ar) elif num_of_flags == 1 and return_counts: usm_res = dpt_ext.unique_counts(usm_ar) else: - usm_res = dpt.unique_all(usm_ar) + usm_res = dpt_ext.unique_all(usm_ar) first_nan = None if equal_nan: From b8ad5ec1826f5f0e288fae7a0c20af7f0279faf3 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 11:55:59 -0800 Subject: [PATCH 123/145] Move _isin to _tensor_sorting_impl --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../include/kernels/sorting/isin.hpp | 245 +++++++++++++ .../include/kernels/sorting/merge_sort.hpp | 3 +- .../kernels/sorting/search_sorted_detail.hpp | 3 +- .../include/kernels/sorting/searchsorted.hpp | 3 +- .../include/kernels/sorting/sort_utils.hpp | 3 +- .../tensor/libtensor/source/sorting/isin.cpp | 324 ++++++++++++++++++ .../tensor/libtensor/source/sorting/isin.hpp | 47 +++ .../libtensor/source/tensor_sorting.cpp | 4 +- 9 files changed, 623 insertions(+), 11 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/sorting/isin.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/isin.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/isin.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 1ad403aad573..c03cba55b7e4 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -70,7 +70,7 @@ set(_accumulator_sources 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_sum.cpp ) set(_sorting_sources - #{CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/isin.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/isin.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_sort.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_argsort.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_sort.cpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/isin.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/isin.hpp new file mode 100644 index 000000000000..847fa96ecdff --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/isin.hpp @@ -0,0 +1,245 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor membership operations. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +#include + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/sorting/search_sorted_detail.hpp" +#include "utils/offset_utils.hpp" +#include "utils/rich_comparisons.hpp" + +namespace dpctl::tensor::kernels +{ + +using dpctl::tensor::ssize_t; + +template +struct IsinFunctor +{ +private: + bool invert; + const T *hay_tp; + const T *needles_tp; + bool *out_tp; + std::size_t hay_nelems; + HayIndexerT hay_indexer; + NeedlesIndexerT needles_indexer; + OutIndexerT out_indexer; + +public: + IsinFunctor(const bool invert_, + const T *hay_, + const T *needles_, + bool *out_, + const std::size_t hay_nelems_, + const HayIndexerT &hay_indexer_, + const NeedlesIndexerT &needles_indexer_, + const OutIndexerT &out_indexer_) + : invert(invert_), hay_tp(hay_), needles_tp(needles_), out_tp(out_), + hay_nelems(hay_nelems_), hay_indexer(hay_indexer_), + needles_indexer(needles_indexer_), out_indexer(out_indexer_) + { + } + + void operator()(sycl::id<1> id) const + { + using Compare = + typename dpctl::tensor::rich_comparisons::AscendingSorter::type; + static constexpr Compare 
comp{}; + + const std::size_t i = id[0]; + const T needle_v = needles_tp[needles_indexer(i)]; + + // position of the needle_v in the hay array + std::size_t pos{}; + + static constexpr std::size_t zero(0); + // search in hay in left-closed interval, give `pos` such that + // hay[pos - 1] < needle_v <= hay[pos] + + // lower_bound returns the first pos such that bool(hay[pos] < + // needle_v) is false, i.e. needle_v <= hay[pos] + pos = search_sorted_detail::lower_bound_indexed_impl( + hay_tp, zero, hay_nelems, needle_v, comp, hay_indexer); + bool out = (pos == hay_nelems ? false : hay_tp[pos] == needle_v); + out_tp[out_indexer(i)] = (invert) ? !out : out; + } +}; + +typedef sycl::event (*isin_contig_impl_fp_ptr_t)( + sycl::queue &, + const bool, + const std::size_t, + const std::size_t, + const char *, + const ssize_t, + const char *, + const ssize_t, + char *, + const ssize_t, + const std::vector &); + +template +class isin_contig_impl_krn; + +template +sycl::event isin_contig_impl(sycl::queue &exec_q, + const bool invert, + const std::size_t hay_nelems, + const std::size_t needles_nelems, + const char *hay_cp, + const ssize_t hay_offset, + const char *needles_cp, + const ssize_t needles_offset, + char *out_cp, + const ssize_t out_offset, + const std::vector &depends) +{ + const T *hay_tp = reinterpret_cast(hay_cp) + hay_offset; + const T *needles_tp = + reinterpret_cast(needles_cp) + needles_offset; + + bool *out_tp = reinterpret_cast(out_cp) + out_offset; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = class isin_contig_impl_krn; + + sycl::range<1> gRange(needles_nelems); + + using TrivialIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + static constexpr TrivialIndexerT hay_indexer{}; + static constexpr TrivialIndexerT needles_indexer{}; + static constexpr TrivialIndexerT out_indexer{}; + + const auto fnctr = + IsinFunctor( + invert, hay_tp, needles_tp, out_tp, hay_nelems, hay_indexer, + 
needles_indexer, out_indexer); + + cgh.parallel_for(gRange, fnctr); + }); + + return comp_ev; +} + +typedef sycl::event (*isin_strided_impl_fp_ptr_t)( + sycl::queue &, + const bool, + const std::size_t, + const std::size_t, + const char *, + const ssize_t, + const ssize_t, + const char *, + const ssize_t, + char *, + const ssize_t, + int, + const ssize_t *, + const std::vector &); + +template +class isin_strided_impl_krn; + +template +sycl::event isin_strided_impl( + sycl::queue &exec_q, + const bool invert, + const std::size_t hay_nelems, + const std::size_t needles_nelems, + const char *hay_cp, + const ssize_t hay_offset, + // hay is 1D, so hay_nelems, hay_offset, hay_stride describe strided array + const ssize_t hay_stride, + const char *needles_cp, + const ssize_t needles_offset, + char *out_cp, + const ssize_t out_offset, + const int needles_nd, + // packed_shape_strides is [needles_shape, needles_strides, + // out_strides] has length of 3*needles_nd + const ssize_t *packed_shape_strides, + const std::vector &depends) +{ + const T *hay_tp = reinterpret_cast(hay_cp); + const T *needles_tp = reinterpret_cast(needles_cp); + + bool *out_tp = reinterpret_cast(out_cp); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::range<1> gRange(needles_nelems); + + using HayIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + const HayIndexerT hay_indexer( + /* offset */ hay_offset, + /* size */ hay_nelems, + /* step */ hay_stride); + + using NeedlesIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const ssize_t *needles_shape_strides = packed_shape_strides; + const NeedlesIndexerT needles_indexer(needles_nd, needles_offset, + needles_shape_strides); + using OutIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const ssize_t *out_shape = packed_shape_strides; + const ssize_t *out_strides = packed_shape_strides + 2 * needles_nd; + const OutIndexerT out_indexer(needles_nd, out_offset, out_shape, 
+ out_strides); + + const auto fnctr = + IsinFunctor( + invert, hay_tp, needles_tp, out_tp, hay_nelems, hay_indexer, + needles_indexer, out_indexer); + using KernelName = class isin_strided_impl_krn; + + cgh.parallel_for(gRange, fnctr); + }); + + return comp_ev; +} + +} // namespace dpctl::tensor::kernels diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/merge_sort.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/merge_sort.hpp index e14425605fe2..a047c172f7bc 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/sorting/merge_sort.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/merge_sort.hpp @@ -29,8 +29,7 @@ //===----------------------------------------------------------------------===// /// /// \file -/// This file defines functions of dpctl.tensor._tensor_sorting_impl -/// extension. +/// This file defines kernels for tensor sort/argsort operations. //===----------------------------------------------------------------------===// #pragma once diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp index c5b2e28411df..d184261aeabf 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp @@ -29,8 +29,7 @@ //===----------------------------------------------------------------------===// /// /// \file -/// This file defines functions of dpctl.tensor._tensor_sorting_impl -/// extension. +/// This file defines kernels for tensor sort/argsort operations. 
//===----------------------------------------------------------------------===// #pragma once diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp index a38522d7e4f7..b89ad922e102 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp @@ -29,8 +29,7 @@ //===----------------------------------------------------------------------===// /// /// \file -/// This file defines functions of dpctl.tensor._tensor_sorting_impl -/// extension. +/// This file defines kernels for tensor sort/argsort operations. //===----------------------------------------------------------------------===// #pragma once diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp index 8a6b1a8131ca..4bbcacd56fcc 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp @@ -29,8 +29,7 @@ //===----------------------------------------------------------------------===// /// /// \file -/// This file defines functions of dpctl.tensor._tensor_sorting_impl -/// extension. +/// This file defines kernels for tensor sort/argsort operations. //===----------------------------------------------------------------------===// #pragma once diff --git a/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp b/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp new file mode 100644 index 000000000000..30f65f0d9bfa --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp @@ -0,0 +1,324 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/sorting/isin.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "simplify_iteration_space.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl::tensor::py_internal +{ +namespace detail +{ + +using dpctl::tensor::kernels::isin_contig_impl_fp_ptr_t; + +static isin_contig_impl_fp_ptr_t + isin_contig_impl_dispatch_vector[td_ns::num_types]; + +template +struct IsinContigFactory +{ + constexpr IsinContigFactory() {} + + fnT get() const + { + using dpctl::tensor::kernels::isin_contig_impl; + return isin_contig_impl; + } +}; + +using dpctl::tensor::kernels::isin_strided_impl_fp_ptr_t; + +static isin_strided_impl_fp_ptr_t + isin_strided_impl_dispatch_vector[td_ns::num_types]; + +template +struct IsinStridedFactory +{ + constexpr IsinStridedFactory() {} + + fnT get() const + { + using dpctl::tensor::kernels::isin_strided_impl; + return isin_strided_impl; + } +}; + +void init_isin_dispatch_vector(void) +{ + + // Contiguous input function dispatch + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(isin_contig_impl_dispatch_vector); + + // Strided input function dispatch + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(isin_strided_impl_dispatch_vector); +} + +} // namespace detail + +/*! 
@brief search for needle from needles in sorted hay */ +std::pair + py_isin(const dpctl::tensor::usm_ndarray &needles, + const dpctl::tensor::usm_ndarray &hay, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const bool invert, + const std::vector &depends) +{ + const int hay_nd = hay.get_ndim(); + const int needles_nd = needles.get_ndim(); + const int dst_nd = dst.get_ndim(); + + if (hay_nd != 1 || needles_nd != dst_nd) { + throw py::value_error("Array dimensions mismatch"); + } + + // check that needle and dst have the same shape + std::size_t needles_nelems(1); + bool same_shape(true); + + const std::size_t hay_nelems = static_cast(hay.get_shape(0)); + + const py::ssize_t *needles_shape_ptr = needles.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = needles.get_shape_raw(); + + for (int i = 0; (i < needles_nd) && same_shape; ++i) { + const auto needles_sh_i = needles_shape_ptr[i]; + const auto dst_sh_i = dst_shape_ptr[i]; + + same_shape = same_shape && (needles_sh_i == dst_sh_i); + needles_nelems *= static_cast(needles_sh_i); + } + + if (!same_shape) { + throw py::value_error( + "Array of values to search for and array of their " + "dst do not have the same shape"); + } + + // check that dst is ample enough + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, + needles_nelems); + + // check that dst is writable + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check that queues are compatible + if (!dpctl::utils::queues_are_compatible(exec_q, {hay, needles, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // if output array overlaps with input arrays, race condition results + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(dst, hay) || overlap(dst, needles)) { + throw py::value_error("Destination array overlaps with input."); + } + + const int hay_typenum = hay.get_typenum(); + const int needles_typenum = 
needles.get_typenum(); + const int dst_typenum = dst.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + const int hay_typeid = array_types.typenum_to_lookup_id(hay_typenum); + const int needles_typeid = + array_types.typenum_to_lookup_id(needles_typenum); + const int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + // check hay and needle have the same data-type + if (needles_typeid != hay_typeid) { + throw py::value_error( + "Hay array and needles array must have the same data types"); + } + // check that dst has boolean data type + const auto dst_typenum_t_v = static_cast(dst_typeid); + if (dst_typenum_t_v != td_ns::typenum_t::BOOL) { + throw py::value_error("dst array must have data-type bool"); + } + + if (needles_nelems == 0) { + // Nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + // if all inputs are contiguous call contiguous implementations + // otherwise call strided implementation + const bool hay_is_c_contig = hay.is_c_contiguous(); + const bool hay_is_f_contig = hay.is_f_contiguous(); + + const bool needles_is_c_contig = needles.is_c_contiguous(); + const bool needles_is_f_contig = needles.is_f_contiguous(); + + const bool dst_is_c_contig = dst.is_c_contiguous(); + const bool dst_is_f_contig = dst.is_f_contiguous(); + + const bool all_c_contig = + (hay_is_c_contig && needles_is_c_contig && dst_is_c_contig); + const bool all_f_contig = + (hay_is_f_contig && needles_is_f_contig && dst_is_f_contig); + + const char *hay_data = hay.get_data(); + const char *needles_data = needles.get_data(); + + char *dst_data = dst.get_data(); + + if (all_c_contig || all_f_contig) { + auto fn = detail::isin_contig_impl_dispatch_vector[hay_typeid]; + + static constexpr py::ssize_t zero_offset(0); + + sycl::event comp_ev = fn(exec_q, invert, hay_nelems, needles_nelems, + hay_data, zero_offset, needles_data, + zero_offset, dst_data, zero_offset, depends); + + return std::make_pair(dpctl::utils::keep_args_alive( 
+ exec_q, {hay, needles, dst}, {comp_ev}), + comp_ev); + } + + // strided case + + const auto &needles_strides = needles.get_strides_vector(); + const auto &dst_strides = dst.get_strides_vector(); + + int simplified_nd = needles_nd; + + using shT = std::vector; + shT simplified_common_shape; + shT simplified_needles_strides; + shT simplified_dst_strides; + py::ssize_t needles_offset(0); + py::ssize_t dst_offset(0); + + if (simplified_nd == 0) { + // needles and dst have same nd + simplified_nd = 1; + simplified_common_shape.push_back(1); + simplified_needles_strides.push_back(0); + simplified_dst_strides.push_back(0); + } + else { + dpctl::tensor::py_internal::simplify_iteration_space( + // modified by reference + simplified_nd, + // read-only inputs + needles_shape_ptr, needles_strides, dst_strides, + // output, modified by reference + simplified_common_shape, simplified_needles_strides, + simplified_dst_strides, needles_offset, dst_offset); + } + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, + // vectors being packed + simplified_common_shape, simplified_needles_strides, + simplified_dst_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_strides_ev = + std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shape_strides_ev); + + auto strided_fn = detail::isin_strided_impl_dispatch_vector[hay_typeid]; + + if (!strided_fn) { + throw std::runtime_error( + "No implementation for data types of input arrays"); + } + + static constexpr py::ssize_t zero_offset(0); + py::ssize_t hay_step = hay.get_strides_vector()[0]; + + 
const sycl::event &comp_ev = strided_fn( + exec_q, invert, hay_nelems, needles_nelems, hay_data, zero_offset, + hay_step, needles_data, needles_offset, dst_data, dst_offset, + simplified_nd, packed_shape_strides, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {comp_ev}, packed_shape_strides_owner); + + host_task_events.push_back(temporaries_cleanup_ev); + const sycl::event &ht_ev = dpctl::utils::keep_args_alive( + exec_q, {hay, needles, dst}, host_task_events); + + return std::make_pair(ht_ev, comp_ev); +} + +void init_isin_functions(py::module_ m) +{ + dpctl::tensor::py_internal::detail::init_isin_dispatch_vector(); + + using dpctl::tensor::py_internal::py_isin; + m.def("_isin", &py_isin, py::arg("needles"), py::arg("hay"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("invert"), + py::arg("depends") = py::list()); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/sorting/isin.hpp b/dpctl_ext/tensor/libtensor/source/sorting/isin.hpp new file mode 100644 index 000000000000..236e8b5898c6 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/isin.hpp @@ -0,0 +1,47 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_isin_functions(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp index b569317e5d68..2aa664fc94ff 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp @@ -35,7 +35,7 @@ #include -// #include "sorting/isin.hpp" +#include "sorting/isin.hpp" #include "sorting/merge_argsort.hpp" #include "sorting/merge_sort.hpp" #include "sorting/radix_argsort.hpp" @@ -47,7 +47,7 @@ namespace py = pybind11; PYBIND11_MODULE(_tensor_sorting_impl, m) { - // dpctl::tensor::py_internal::init_isin_functions(m); + dpctl::tensor::py_internal::init_isin_functions(m); dpctl::tensor::py_internal::init_merge_sort_functions(m); dpctl::tensor::py_internal::init_merge_argsort_functions(m); dpctl::tensor::py_internal::init_searchsorted_functions(m); From ce570a54c31f93ce2a542800cc68c616732ce14b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 11:58:42 -0800 Subject: [PATCH 124/145] Fix bug: wrong shape pointer for dst array --- dpctl_ext/tensor/libtensor/source/sorting/isin.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp b/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp index 30f65f0d9bfa..728a7ca3b733 100644 --- a/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp +++ b/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp @@ -136,7 +136,7 @@ std::pair const std::size_t hay_nelems = static_cast(hay.get_shape(0)); const py::ssize_t *needles_shape_ptr = needles.get_shape_raw(); - const py::ssize_t *dst_shape_ptr = needles.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); for (int i 
= 0; (i < needles_nd) && same_shape; ++i) { const auto needles_sh_i = needles_shape_ptr[i]; From a7c644007f5ea6ff874265d8f8c43f3b4fdc4403 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 12:02:02 -0800 Subject: [PATCH 125/145] Move dpt.isin() and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_set_functions.py | 167 ++++++++++++++++++++++++++++- dpnp/dpnp_iface_logic.py | 5 +- 3 files changed, 172 insertions(+), 2 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 5eca7e61098a..0f2585c9b0e9 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -84,6 +84,7 @@ from ._clip import clip from ._searchsorted import searchsorted from ._set_functions import ( + isin, unique_all, unique_counts, unique_inverse, @@ -119,6 +120,7 @@ "full_like", "iinfo", "isdtype", + "isin", "linspace", "meshgrid", "moveaxis", diff --git a/dpctl_ext/tensor/_set_functions.py b/dpctl_ext/tensor/_set_functions.py index 0ca0cd9d3db0..93f81f044fd2 100644 --- a/dpctl_ext/tensor/_set_functions.py +++ b/dpctl_ext/tensor/_set_functions.py @@ -26,7 +26,7 @@ # THE POSSIBILITY OF SUCH DAMAGE. 
# ***************************************************************************** -from typing import NamedTuple +from typing import NamedTuple, Optional, Union import dpctl.tensor as dpt import dpctl.utils as du @@ -36,6 +36,13 @@ # when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor as dpt_ext +from ._copy_utils import _empty_like_orderK +from ._scalar_utils import ( + _get_dtype, + _get_queue_usm_type, + _get_shape, + _validate_dtype, +) from ._tensor_impl import ( _copy_usm_ndarray_into_usm_ndarray, _extract, @@ -47,9 +54,25 @@ ) from ._tensor_sorting_impl import ( _argsort_ascending, + _isin, _searchsorted_left, _sort_ascending, ) +from ._type_utils import ( + _resolve_weak_types_all_py_ints, + _to_device_supported_dtype, +) + +__all__ = [ + "isin", + "unique_values", + "unique_counts", + "unique_inverse", + "unique_all", + "UniqueAllResult", + "UniqueCountsResult", + "UniqueInverseResult", +] class UniqueAllResult(NamedTuple): @@ -636,3 +659,145 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: inv, _counts, ) + + +def isin( + x: Union[dpt.usm_ndarray, int, float, complex, bool], + test_elements: Union[dpt.usm_ndarray, int, float, complex, bool], + /, + *, + invert: Optional[bool] = False, +) -> dpt.usm_ndarray: + """isin(x, test_elements, /, *, invert=False) + + Tests `x in test_elements` for each element of `x`. Returns a boolean array + with the same shape as `x` that is `True` where the element is in + `test_elements`, `False` otherwise. + + Args: + x (Union[usm_ndarray, bool, int, float, complex]): + input element or elements. + test_elements (Union[usm_ndarray, bool, int, float, complex]): + elements against which to test each value of `x`. + invert (Optional[bool]): + if `True`, the output results are inverted, i.e., are equivalent to + testing `x not in test_elements` for each element of `x`. + Default: `False`. + + Returns: + usm_ndarray: + an array of the inclusion test results. 
The returned array has a + boolean data type and the same shape as `x`. + """ + q1, x_usm_type = _get_queue_usm_type(x) + q2, test_usm_type = _get_queue_usm_type(test_elements) + if q1 is None and q2 is None: + raise du.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments. " + "One of the arguments must represent USM allocation and " + "expose `__sycl_usm_array_interface__` property" + ) + if q1 is None: + exec_q = q2 + res_usm_type = test_usm_type + elif q2 is None: + exec_q = q1 + res_usm_type = x_usm_type + else: + exec_q = du.get_execution_queue((q1, q2)) + if exec_q is None: + raise du.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = du.get_coerced_usm_type( + ( + x_usm_type, + test_usm_type, + ) + ) + du.validate_usm_type(res_usm_type, allow_none=False) + sycl_dev = exec_q.sycl_device + + if not isinstance(invert, bool): + raise TypeError( + "`invert` keyword argument must be of boolean type, " + f"got {type(invert)}" + ) + + x_dt = _get_dtype(x, sycl_dev) + test_dt = _get_dtype(test_elements, sycl_dev) + if not all(_validate_dtype(dt) for dt in (x_dt, test_dt)): + raise ValueError("Operands have unsupported data types") + + x_sh = _get_shape(x) + if isinstance(test_elements, dpt.usm_ndarray) and test_elements.size == 0: + if invert: + return dpt_ext.ones( + x_sh, dtype=dpt.bool, usm_type=res_usm_type, sycl_queue=exec_q + ) + else: + return dpt_ext.zeros( + x_sh, dtype=dpt.bool, usm_type=res_usm_type, sycl_queue=exec_q + ) + + dt1, dt2 = _resolve_weak_types_all_py_ints(x_dt, test_dt, sycl_dev) + dt = _to_device_supported_dtype(dpt_ext.result_type(dt1, dt2), sycl_dev) + + if not isinstance(x, dpt.usm_ndarray): + x_arr = dpt_ext.asarray( + x, dtype=dt1, usm_type=res_usm_type, sycl_queue=exec_q + ) + else: + x_arr = x + + if not isinstance(test_elements, dpt.usm_ndarray): + test_arr = dpt_ext.asarray( + test_elements, 
dtype=dt2, usm_type=res_usm_type, sycl_queue=exec_q + ) + else: + test_arr = test_elements + + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + + if x_dt != dt: + x_buf = _empty_like_orderK(x_arr, dt, res_usm_type, exec_q) + ht_ev, ev = _copy_usm_ndarray_into_usm_ndarray( + src=x_arr, dst=x_buf, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, ev) + else: + x_buf = x_arr + + if test_dt != dt: + # copy into C-contiguous memory, because the array will be flattened + test_buf = dpt_ext.empty_like( + test_arr, dtype=dt, order="C", usm_type=res_usm_type + ) + ht_ev, ev = _copy_usm_ndarray_into_usm_ndarray( + src=test_arr, dst=test_buf, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, ev) + else: + test_buf = test_arr + + test_buf = dpt_ext.reshape(test_buf, -1) + test_buf = dpt_ext.sort(test_buf) + + dst = dpt_ext.empty_like( + x_buf, dtype=dpt.bool, usm_type=res_usm_type, order="C" + ) + + dep_evs = _manager.submitted_events + ht_ev, s_ev = _isin( + needles=x_buf, + hay=test_buf, + dst=dst, + sycl_queue=exec_q, + invert=invert, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, s_ev) + return dst diff --git a/dpnp/dpnp_iface_logic.py b/dpnp/dpnp_iface_logic.py index 1834f25a0485..3e3501b14c7c 100644 --- a/dpnp/dpnp_iface_logic.py +++ b/dpnp/dpnp_iface_logic.py @@ -49,6 +49,9 @@ import dpctl.utils as dpu import numpy +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp import dpnp.backend.extensions.ufunc._ufunc_impl as ufi from dpnp.dpnp_algo.dpnp_elementwise_common import DPNPBinaryFunc, DPNPUnaryFunc @@ -1273,7 +1276,7 @@ def isin( usm_element = dpnp.get_usm_ndarray(element) usm_test = dpnp.get_usm_ndarray(test_elements) return dpnp_array._create_from_usm_ndarray( - dpt.isin( + dpt_ext.isin( usm_element, usm_test, invert=invert, From 62d19f1b746ea7b0a4241d74e052915cc59ac712 Mon Sep 17 00:00:00 2001 
From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 12:24:42 -0800 Subject: [PATCH 126/145] Move _topk to _tensor_sorting_impl --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../include/kernels/sorting/topk.hpp | 510 ++++++++++++++++++ .../tensor/libtensor/source/sorting/topk.cpp | 303 +++++++++++ .../tensor/libtensor/source/sorting/topk.hpp | 47 ++ .../libtensor/source/tensor_sorting.cpp | 4 +- 5 files changed, 863 insertions(+), 3 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/topk.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/topk.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index c03cba55b7e4..056b7c425544 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -76,7 +76,7 @@ set(_sorting_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_sort.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_argsort.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/searchsorted.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/topk.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/topk.cpp ) set(_tensor_accumulation_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_accumulation.cpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp new file mode 100644 index 000000000000..90e06a1e9eb8 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -0,0 +1,510 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor topk operation. 
+//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include "kernels/sorting/merge_sort.hpp" +#include "kernels/sorting/radix_sort.hpp" +#include "kernels/sorting/search_sorted_detail.hpp" +#include "kernels/sorting/sort_utils.hpp" +#include "utils/sycl_alloc_utils.hpp" + +namespace dpctl::tensor::kernels +{ + +namespace topk_detail +{ + +void scale_topk_params(const std::uint64_t nelems_per_slm, + const std::size_t sub_groups_per_work_group, + const std::uint32_t elems_per_wi, + const std::vector &sg_sizes, + std::size_t &lws, + std::size_t &nelems_wg_sorts) +{ + for (auto it = sg_sizes.rbegin(); it != sg_sizes.rend(); ++it) { + auto sg_size = *it; + lws = sub_groups_per_work_group * sg_size; + nelems_wg_sorts = elems_per_wi * lws; + if (nelems_wg_sorts < nelems_per_slm) { + return; + } + } + // should never reach + throw std::runtime_error("Could not construct top k kernel parameters"); +} + +template +sycl::event write_out_impl(sycl::queue &exec_q, + std::size_t iter_nelems, + std::size_t k, + const argTy *arg_tp, + const IndexTy *index_data, + std::size_t iter_index_stride, + std::size_t axis_nelems, + argTy *vals_tp, + IndexTy *inds_tp, + const std::vector &depends) +{ + static constexpr std::uint32_t lws = 64; + static constexpr std::uint32_t n_wi = 4; + const std::size_t nelems = iter_nelems * k; + const std::size_t n_groups = (nelems + lws * n_wi - 1) / (n_wi * lws); + + sycl::range<1> lRange{lws}; + sycl::range<1> gRange{n_groups * lws}; + sycl::nd_range<1> ndRange{gRange, lRange}; + + sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { + const std::size_t gid = it.get_global_linear_id(); + const auto &sg = it.get_sub_group(); + const std::uint32_t lane_id = sg.get_local_id()[0]; + const std::uint32_t sg_size = sg.get_max_local_range()[0]; + + 
const std::size_t start_id = (gid - lane_id) * n_wi + lane_id; + +#pragma unroll + for (std::uint32_t i = 0; i < n_wi; ++i) { + const std::size_t data_id = start_id + i * sg_size; + + if (data_id < nelems) { + const std::size_t iter_id = data_id / k; + + /* + const std::size_t axis_gid = data_id - (iter_gid * k); + const std::size_t src_idx = iter_gid * iter_index_stride + + axis_gid; + */ + const std::size_t src_idx = + data_id + iter_id * (iter_index_stride - k); + + const IndexTy res_ind = index_data[src_idx]; + const argTy v = arg_tp[res_ind]; + + const std::size_t dst_idx = data_id; + vals_tp[dst_idx] = v; + inds_tp[dst_idx] = (res_ind % axis_nelems); + } + } + }); + }); + + return write_out_ev; +} + +} // namespace topk_detail + +template +class topk_populate_index_data_krn; + +template +class topk_full_merge_map_back_krn; + +template +sycl::event + topk_full_merge_sort_impl(sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays + std::size_t axis_nelems, // size of each sub-array + std::size_t k, + const argTy *arg_tp, + argTy *vals_tp, + IndexTy *inds_tp, + const CompT &comp, + const std::vector &depends) +{ + auto index_data_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * axis_nelems, exec_q); + // extract USM pointer + IndexTy *index_data = index_data_owner.get(); + + using IotaKernelName = topk_populate_index_data_krn; + + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + sycl::event populate_indexed_data_ev = iota_impl( + exec_q, index_data, iter_nelems * axis_nelems, depends); + + std::size_t sorted_block_size; + // Sort segments of the array + sycl::event base_sort_ev = + merge_sort_detail::sort_over_work_group_contig_impl( + exec_q, iter_nelems, axis_nelems, index_data, index_data, comp, + sorted_block_size, // modified in place with size of sorted block + // size + {populate_indexed_data_ev}); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = 
merge_sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, axis_nelems, index_data, comp, sorted_block_size, + {base_sort_ev}); + + using WriteOutKernelName = topk_full_merge_map_back_krn; + + sycl::event write_out_ev = + topk_detail::write_out_impl( + exec_q, iter_nelems, k, arg_tp, index_data, axis_nelems, + axis_nelems, vals_tp, inds_tp, {merges_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {write_out_ev}, + index_data_owner); + + return cleanup_host_task_event; +}; + +template +class topk_partial_merge_map_back_krn; + +template +class topk_over_work_group_krn; + +template > +sycl::event topk_merge_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows + // in a matrix when sorting over rows) + std::size_t axis_nelems, // size of each array to sort (length of + // rows, i.e. number of columns) + std::size_t k, + const char *arg_cp, + char *vals_cp, + char *inds_cp, + const std::vector &depends) +{ + if (axis_nelems < k) { + throw std::runtime_error("Invalid sort axis size for value of k"); + } + + const argTy *arg_tp = reinterpret_cast(arg_cp); + argTy *vals_tp = reinterpret_cast(vals_cp); + IndexTy *inds_tp = reinterpret_cast(inds_cp); + + using dpctl::tensor::kernels::IndexComp; + const IndexComp index_comp{arg_tp, ValueComp{}}; + + if (axis_nelems <= 512 || k >= 1024 || k > axis_nelems / 2) { + return topk_full_merge_sort_impl(exec_q, iter_nelems, axis_nelems, k, + arg_tp, vals_tp, inds_tp, index_comp, + depends); + } + else { + using PartialKernelName = + topk_over_work_group_krn; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t max_sg_size = krn.template get_info< + 
sycl::info::kernel_device_specific::max_sub_group_size>(dev); + const std::uint64_t device_local_memory_size = + dev.get_info(); + + // leave 512 bytes of local memory for RT + const std::uint64_t safety_margin = 512; + + const std::uint64_t nelems_per_slm = + (device_local_memory_size - safety_margin) / (2 * sizeof(IndexTy)); + + static constexpr std::uint32_t sub_groups_per_work_group = 4; + const std::uint32_t elems_per_wi = dev.has(sycl::aspect::cpu) ? 8 : 2; + + std::size_t lws = sub_groups_per_work_group * max_sg_size; + + std::size_t sorted_block_size = elems_per_wi * lws; + if (sorted_block_size > nelems_per_slm) { + const std::vector sg_sizes = + dev.get_info(); + topk_detail::scale_topk_params( + nelems_per_slm, sub_groups_per_work_group, elems_per_wi, + sg_sizes, + lws, // modified by reference + sorted_block_size // modified by reference + ); + } + + // This assumption permits doing away with using a loop + assert(sorted_block_size % lws == 0); + + using search_sorted_detail::quotient_ceil; + const std::size_t n_segments = + quotient_ceil(axis_nelems, sorted_block_size); + + // round k up for the later merge kernel if necessary + const std::size_t round_k_to = dev.has(sycl::aspect::cpu) ? 32 : 4; + std::size_t k_rounded = + (k < round_k_to) + ? k + : quotient_ceil(k, round_k_to) * round_k_to; + + // get length of tail for alloc size + auto rem = axis_nelems % sorted_block_size; + auto alloc_len = (rem && rem < k_rounded) + ? 
rem + k_rounded * (n_segments - 1) + : k_rounded * n_segments; + + // if allocation would be sufficiently large or k is larger than + // elements processed, use full sort + if (k_rounded >= axis_nelems || k_rounded >= sorted_block_size || + alloc_len >= axis_nelems / 2) + { + return topk_full_merge_sort_impl(exec_q, iter_nelems, axis_nelems, + k, arg_tp, vals_tp, inds_tp, + index_comp, depends); + } + + auto index_data_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * alloc_len, exec_q); + // get raw USM pointer + IndexTy *index_data = index_data_owner.get(); + + // no need to populate index data: SLM will be populated with default + // values + + sycl::event base_sort_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.use_kernel_bundle(kb); + + sycl::range<1> global_range{iter_nelems * n_segments * lws}; + sycl::range<1> local_range{lws}; + + sycl::range<1> slm_range{sorted_block_size}; + sycl::local_accessor work_space(slm_range, cgh); + sycl::local_accessor scratch_space(slm_range, cgh); + + sycl::nd_range<1> ndRange(global_range, local_range); + + cgh.parallel_for( + ndRange, [=](sycl::nd_item<1> it) { + const std::size_t group_id = it.get_group_linear_id(); + const std::size_t iter_id = group_id / n_segments; + const std::size_t segment_id = + group_id - iter_id * n_segments; + const std::size_t lid = it.get_local_linear_id(); + + const std::size_t segment_start_idx = + segment_id * sorted_block_size; + const std::size_t segment_end_idx = std::min( + segment_start_idx + sorted_block_size, axis_nelems); + const std::size_t wg_chunk_size = + segment_end_idx - segment_start_idx; + + // load input into SLM + for (std::size_t array_id = segment_start_idx + lid; + array_id < segment_end_idx; array_id += lws) + { + IndexTy v = (array_id < axis_nelems) + ? 
iter_id * axis_nelems + array_id + : IndexTy{}; + work_space[array_id - segment_start_idx] = v; + } + sycl::group_barrier(it.get_group()); + + const std::size_t chunk = + quotient_ceil(sorted_block_size, lws); + + const std::size_t chunk_start_idx = lid * chunk; + const std::size_t chunk_end_idx = + sycl::min(chunk_start_idx + chunk, wg_chunk_size); + + merge_sort_detail::leaf_sort_impl( + work_space, chunk_start_idx, chunk_end_idx, index_comp); + + sycl::group_barrier(it.get_group()); + + bool data_in_temp = false; + std::size_t n_chunks_merged = 1; + + // merge chunk while n_chunks_merged * chunk < wg_chunk_size + const std::size_t max_chunks_merged = + 1 + ((wg_chunk_size - 1) / chunk); + for (; n_chunks_merged < max_chunks_merged; + data_in_temp = !data_in_temp, n_chunks_merged *= 2) + { + const std::size_t nelems_sorted_so_far = + n_chunks_merged * chunk; + const std::size_t q = (lid / n_chunks_merged); + const std::size_t start_1 = sycl::min( + 2 * nelems_sorted_so_far * q, wg_chunk_size); + const std::size_t end_1 = sycl::min( + start_1 + nelems_sorted_so_far, wg_chunk_size); + const std::size_t end_2 = sycl::min( + end_1 + nelems_sorted_so_far, wg_chunk_size); + const std::size_t offset = + chunk * (lid - q * n_chunks_merged); + + if (data_in_temp) { + merge_sort_detail::merge_impl( + offset, scratch_space, work_space, start_1, + end_1, end_2, start_1, index_comp, chunk); + } + else { + merge_sort_detail::merge_impl( + offset, work_space, scratch_space, start_1, + end_1, end_2, start_1, index_comp, chunk); + } + sycl::group_barrier(it.get_group()); + } + + // output assumed to be structured as (iter_nelems, + // alloc_len) + const std::size_t k_segment_start_idx = + segment_id * k_rounded; + const std::size_t k_segment_end_idx = std::min( + k_segment_start_idx + k_rounded, alloc_len); + const auto &out_src = + (data_in_temp) ? 
scratch_space : work_space; + for (std::size_t array_id = k_segment_start_idx + lid; + array_id < k_segment_end_idx; array_id += lws) + { + if (lid < k_rounded) { + index_data[iter_id * alloc_len + array_id] = + out_src[array_id - k_segment_start_idx]; + } + } + }); + }); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = + merge_sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, alloc_len, index_data, index_comp, + k_rounded, {base_sort_ev}); + + // Write out top k of the merge-sorted memory + using WriteOutKernelName = + topk_partial_merge_map_back_krn; + + sycl::event write_topk_ev = + topk_detail::write_out_impl( + exec_q, iter_nelems, k, arg_tp, index_data, alloc_len, + axis_nelems, vals_tp, inds_tp, {merges_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {write_topk_ev}, index_data_owner); + + return cleanup_host_task_event; + } +} + +template +class topk_iota_krn; + +template +class topk_radix_map_back_krn; + +template +sycl::event topk_radix_impl(sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays + std::size_t axis_nelems, // size of each sub-array + std::size_t k, + bool ascending, + const char *arg_cp, + char *vals_cp, + char *inds_cp, + const std::vector &depends) +{ + if (axis_nelems < k) { + throw std::runtime_error("Invalid sort axis size for value of k"); + } + + const argTy *arg_tp = reinterpret_cast(arg_cp); + argTy *vals_tp = reinterpret_cast(vals_cp); + IndexTy *inds_tp = reinterpret_cast(inds_cp); + + const std::size_t total_nelems = iter_nelems * axis_nelems; + const std::size_t padded_total_nelems = ((total_nelems + 63) / 64) * 64; + auto workspace_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + padded_total_nelems + total_nelems, exec_q); + + // get raw USM pointer + IndexTy *workspace = workspace_owner.get(); + IndexTy *tmp_tp = workspace + padded_total_nelems; + + using IdentityProjT = 
radix_sort_details::IdentityProj; + using IndexedProjT = + radix_sort_details::IndexedProj; + const IndexedProjT proj_op{arg_tp}; + + using IotaKernelName = topk_iota_krn; + + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + sycl::event iota_ev = iota_impl( + exec_q, workspace, total_nelems, depends); + + sycl::event radix_sort_ev = + radix_sort_details::parallel_radix_sort_impl( + exec_q, iter_nelems, axis_nelems, workspace, tmp_tp, proj_op, + ascending, {iota_ev}); + + // Write out top k of the temporary + using WriteOutKernelName = topk_radix_map_back_krn; + + sycl::event write_topk_ev = + topk_detail::write_out_impl( + exec_q, iter_nelems, k, arg_tp, tmp_tp, axis_nelems, axis_nelems, + vals_tp, inds_tp, {radix_sort_ev}); + + sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {write_topk_ev}, workspace_owner); + + return cleanup_ev; +} + +} // namespace dpctl::tensor::kernels diff --git a/dpctl_ext/tensor/libtensor/source/sorting/topk.cpp b/dpctl_ext/tensor/libtensor/source/sorting/topk.cpp new file mode 100644 index 000000000000..23a8d2a4328f --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/topk.cpp @@ -0,0 +1,303 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/sorting/topk.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/output_validation.hpp" +#include "utils/rich_comparisons.hpp" +#include "utils/type_dispatch.hpp" + +#include "topk.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +typedef sycl::event (*topk_impl_fn_ptr_t)(sycl::queue &, + std::size_t, + std::size_t, + std::size_t, + bool, + const char *, + char *, + char *, + const std::vector &); + +static topk_impl_fn_ptr_t topk_dispatch_vector[td_ns::num_types]; + +namespace +{ + +template +struct use_radix_sort : public std::false_type +{ +}; + +template +struct use_radix_sort< + T, + std::enable_if_t, + std::is_same, + std::is_same, + std::is_same, + std::is_same>::value>> + : public std::true_type +{ +}; + +template +sycl::event topk_caller(sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays + std::size_t axis_nelems, // size of each sub-array + std::size_t k, + bool largest, + const char *arg_cp, + char *vals_cp, + char *inds_cp, + const std::vector &depends) +{ + if constexpr (use_radix_sort::value) { + using dpctl::tensor::kernels::topk_radix_impl; + auto ascending = !largest; + return topk_radix_impl(exec_q, iter_nelems, axis_nelems, + k, ascending, arg_cp, vals_cp, + inds_cp, depends); + } + else { + using dpctl::tensor::kernels::topk_merge_impl; + if (largest) { + using CompTy = + typename dpctl::tensor::rich_comparisons::DescendingSorter< + argTy>::type; + return topk_merge_impl( + exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp, + depends); + } + else { + using CompTy = + typename dpctl::tensor::rich_comparisons::AscendingSorter< + argTy>::type; + return topk_merge_impl( + exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp, 
+ depends); + } + } +} + +} // namespace + +std::pair + py_topk(const dpctl::tensor::usm_ndarray &src, + std::optional trailing_dims_to_search, + const std::size_t k, + const bool largest, + const dpctl::tensor::usm_ndarray &vals, + const dpctl::tensor::usm_ndarray &inds, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + int vals_nd = vals.get_ndim(); + int inds_nd = inds.get_ndim(); + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *vals_shape_ptr = vals.get_shape_raw(); + const py::ssize_t *inds_shape_ptr = inds.get_shape_raw(); + + std::size_t axis_nelems(1); + std::size_t iter_nelems(1); + if (trailing_dims_to_search.has_value()) { + if (src_nd != vals_nd || src_nd != inds_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } + + auto trailing_dims = trailing_dims_to_search.value(); + int iter_nd = src_nd - trailing_dims; + if (trailing_dims <= 0 || iter_nd < 0) { + throw py::value_error( + "trailing_dims_to_search must be positive, but no " + "greater than rank of the array being searched"); + } + + bool same_shapes = true; + for (int i = 0; same_shapes && (i < iter_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == vals_shape_ptr[i] && + src_shape_i == inds_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape does not match the input shape"); + } + + std::size_t vals_k(1); + std::size_t inds_k(1); + for (int i = iter_nd; i < src_nd; ++i) { + axis_nelems *= static_cast(src_shape_ptr[i]); + vals_k *= static_cast(vals_shape_ptr[i]); + inds_k *= static_cast(inds_shape_ptr[i]); + } + + bool valid_k = (vals_k == k && inds_k == k && axis_nelems >= k); + if (!valid_k) { + throw py::value_error("The value of k is invalid for the input and " + "destination arrays"); + } + } + else { + if (vals_nd != 1 || inds_nd != 1) { + throw 
py::value_error("Output arrays must be one-dimensional"); + } + + for (int i = 0; i < src_nd; ++i) { + axis_nelems *= static_cast(src_shape_ptr[i]); + } + + bool valid_k = (axis_nelems >= k && + static_cast(vals_shape_ptr[0]) == k && + static_cast(inds_shape_ptr[0]) == k); + if (!valid_k) { + throw py::value_error("The value of k is invalid for the input and " + "destination arrays"); + } + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, vals, inds})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(vals); + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(inds); + + if ((iter_nelems == 0) || (axis_nelems == 0)) { + // Nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, vals) || overlap(src, inds)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(vals, + k * iter_nelems); + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(inds, + k * iter_nelems); + + int src_typenum = src.get_typenum(); + int vals_typenum = vals.get_typenum(); + int inds_typenum = inds.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int vals_typeid = array_types.typenum_to_lookup_id(vals_typenum); + int inds_typeid = array_types.typenum_to_lookup_id(inds_typenum); + + if (src_typeid != vals_typeid) { + throw py::value_error("Input array and vals array must have " + "the same data type"); + } + + if (inds_typeid != static_cast(td_ns::typenum_t::INT64)) { + throw py::value_error("Inds array must have data type int64"); + } + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_vals_c_contig = vals.is_c_contiguous(); + bool is_inds_c_contig = 
inds.is_c_contiguous(); + + if (is_src_c_contig && is_vals_c_contig && is_inds_c_contig) { + auto fn = topk_dispatch_vector[src_typeid]; + + sycl::event comp_ev = + fn(exec_q, iter_nelems, axis_nelems, k, largest, src.get_data(), + vals.get_data(), inds.get_data(), depends); + + sycl::event keep_args_alive_ev = + dpctl::utils::keep_args_alive(exec_q, {src, vals, inds}, {comp_ev}); + + return std::make_pair(keep_args_alive_ev, comp_ev); + } + + return std::make_pair(sycl::event(), sycl::event()); +} + +template +struct TopKFactory +{ + fnT get() + { + using IdxT = std::int64_t; + return topk_caller; + } +}; + +void init_topk_dispatch_vectors(void) +{ + td_ns::DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(topk_dispatch_vector); +} + +void init_topk_functions(py::module_ m) +{ + dpctl::tensor::py_internal::init_topk_dispatch_vectors(); + + m.def("_topk", &py_topk, py::arg("src"), py::arg("trailing_dims_to_search"), + py::arg("k"), py::arg("largest"), py::arg("vals"), py::arg("inds"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/sorting/topk.hpp b/dpctl_ext/tensor/libtensor/source/sorting/topk.hpp new file mode 100644 index 000000000000..d39c0eefdb93 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/topk.hpp @@ -0,0 +1,47 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_topk_functions(py::module_); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp index 2aa664fc94ff..98a2493a7c48 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp @@ -41,7 +41,7 @@ #include "sorting/radix_argsort.hpp" #include "sorting/radix_sort.hpp" #include "sorting/searchsorted.hpp" -// #include "sorting/topk.hpp" +#include "sorting/topk.hpp" namespace py = pybind11; @@ -53,5 +53,5 @@ PYBIND11_MODULE(_tensor_sorting_impl, m) dpctl::tensor::py_internal::init_searchsorted_functions(m); dpctl::tensor::py_internal::init_radix_sort_functions(m); dpctl::tensor::py_internal::init_radix_argsort_functions(m); - // dpctl::tensor::py_internal::init_topk_functions(m); + dpctl::tensor::py_internal::init_topk_functions(m); } From 56d397d89ac0740cba12223935809da12a3f53e5 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 12:26:31 -0800 Subject: [PATCH 127/145] Move dpt.top_k() --- dpctl_ext/tensor/__init__.py | 3 +- dpctl_ext/tensor/_sorting.py | 170 ++++++++++++++++++++++++++++++++++- 2 files changed, 169 insertions(+), 4 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 0f2585c9b0e9..9bfda2446889 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -90,7 +90,7 @@ unique_inverse, unique_values, ) -from ._sorting import argsort, sort +from ._sorting import argsort, sort, top_k from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type __all__ = [ @@ -143,6 +143,7 @@ "take", "take_along_axis", "tile", + "top_k", "to_numpy", "tril", "triu", diff --git a/dpctl_ext/tensor/_sorting.py 
b/dpctl_ext/tensor/_sorting.py index a5fd1d57bcdf..24693a408889 100644 --- a/dpctl_ext/tensor/_sorting.py +++ b/dpctl_ext/tensor/_sorting.py @@ -26,8 +26,8 @@ # THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** -# import operator -# from typing import NamedTuple +import operator +from typing import NamedTuple import dpctl.tensor as dpt import dpctl.utils as du @@ -48,9 +48,10 @@ _radix_sort_dtype_supported, _sort_ascending, _sort_descending, + _topk, ) -__all__ = ["sort", "argsort"] +__all__ = ["sort", "argsort", "top_k"] def _get_mergesort_impl_fn(descending): @@ -284,3 +285,166 @@ def argsort(x, axis=-1, descending=False, stable=True, kind=None): inv_perm = sorted(range(nd), key=lambda d: perm[d]) res = dpt_ext.permute_dims(res, inv_perm) return res + + +def _get_top_k_largest(mode): + modes = {"largest": True, "smallest": False} + try: + return modes[mode] + except KeyError: + raise ValueError( + f"`mode` must be `largest` or `smallest`. Got `{mode}`." + ) + + +class TopKResult(NamedTuple): + values: dpt.usm_ndarray + indices: dpt.usm_ndarray + + +def top_k(x, k, /, *, axis=None, mode="largest"): + """top_k(x, k, axis=None, mode="largest") + + Returns the `k` largest or smallest values and their indices in the input + array `x` along the specified axis `axis`. + + Args: + x (usm_ndarray): + input array. + k (int): + number of elements to find. Must be a positive integer value. + axis (Optional[int]): + axis along which to search. If `None`, the search will be performed + over the flattened array. Default: ``None``. + mode (Literal["largest", "smallest"]): + search mode. Must be one of the following modes: + + - `"largest"`: return the `k` largest elements. + - `"smallest"`: return the `k` smallest elements. + + Default: `"largest"`. 
+ + Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, indices)` whose + + * first element `values` will be an array containing the `k` + largest or smallest elements of `x`. The array has the same data + type as `x`. If `axis` was `None`, `values` will be a + one-dimensional array with shape `(k,)` and otherwise, `values` + will have shape `x.shape[:axis] + (k,) + x.shape[axis+1:]` + * second element `indices` will be an array containing indices of + `x` that result in `values`. The array will have the same shape + as `values` and will have the default array index data type. + """ + largest = _get_top_k_largest(mode) + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" + ) + + k = operator.index(k) + if k < 0: + raise ValueError("`k` must be a positive integer value") + + nd = x.ndim + if axis is None: + sz = x.size + if nd == 0: + if k > 1: + raise ValueError(f"`k`={k} is out of bounds 1") + return TopKResult( + dpt_ext.copy(x, order="C"), + dpt_ext.zeros_like( + x, dtype=ti.default_device_index_type(x.sycl_queue) + ), + ) + arr = x + n_search_dims = None + res_sh = k + else: + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + sz = x.shape[axis] + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt_ext.permute_dims(x, perm) + n_search_dims = 1 + res_sh = arr.shape[: nd - 1] + (k,) + + if k > sz: + raise ValueError(f"`k`={k} is out of bounds {sz}") + + exec_q = x.sycl_queue + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + + res_usm_type = arr.usm_type + if arr.flags.c_contiguous: + vals = dpt_ext.empty( + res_sh, + dtype=arr.dtype, + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + inds = dpt_ext.empty( + res_sh, + dtype=ti.default_device_index_type(exec_q), + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + 
ht_ev, impl_ev = _topk( + src=arr, + trailing_dims_to_search=n_search_dims, + k=k, + largest=largest, + vals=vals, + inds=inds, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, impl_ev) + else: + tmp = dpt_ext.empty_like(arr, order="C") + ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + vals = dpt_ext.empty( + res_sh, + dtype=arr.dtype, + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + inds = dpt_ext.empty( + res_sh, + dtype=ti.default_device_index_type(exec_q), + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + ht_ev, impl_ev = _topk( + src=tmp, + trailing_dims_to_search=n_search_dims, + k=k, + largest=largest, + vals=vals, + inds=inds, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, impl_ev) + if axis is not None and a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + vals = dpt_ext.permute_dims(vals, inv_perm) + inds = dpt_ext.permute_dims(inds, inv_perm) + + return TopKResult(vals, inds) From ad814fb168cfb39a73b97c73b3ca4ee7b1852e38 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 03:44:05 -0800 Subject: [PATCH 128/145] Initialize _tensor_reductions_impl extension and move _all --- dpctl_ext/tensor/CMakeLists.txt | 25 +- .../libtensor/include/kernels/reductions.hpp | 3324 +++++++++++++++++ .../libtensor/source/reductions/all.cpp | 164 + .../libtensor/source/reductions/all.hpp | 46 + .../reductions/reduction_atomic_support.hpp | 148 + .../source/reductions/reduction_common.cpp | 69 + .../source/reductions/reduction_common.hpp | 46 + .../source/reductions/reduction_over_axis.hpp | 1325 +++++++ .../libtensor/source/tensor_reductions.cpp | 45 + 9 files changed, 5191 insertions(+), 1 deletion(-) create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/reductions.hpp create mode 100644 
dpctl_ext/tensor/libtensor/source/reductions/all.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/all.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/reduction_atomic_support.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/reduction_common.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/tensor_reductions.cpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 056b7c425544..872e3f786c65 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -69,6 +69,19 @@ set(_accumulator_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_prod.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_sum.cpp ) +set(_reduction_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduction_common.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/all.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/any.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmax.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmin.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/logsumexp.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/max.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/min.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/prod.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduce_hypot.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/sum.cpp +) set(_sorting_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/isin.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_sort.cpp @@ -82,6 +95,10 @@ set(_tensor_accumulation_impl_sources 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_accumulation.cpp ${_accumulator_sources} ) +set(_tensor_reductions_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_reductions.cpp + ${_reduction_sources} +) set(_tensor_sorting_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_sorting.cpp ${_sorting_sources} @@ -114,6 +131,12 @@ add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_accumulation_i target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) list(APPEND _py_trgts ${python_module_name}) +set(python_module_name _tensor_reductions_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_reductions_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_reductions_impl_sources}) +target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) +list(APPEND _py_trgts ${python_module_name}) + set(python_module_name _tensor_sorting_impl) pybind11_add_module(${python_module_name} MODULE ${_tensor_sorting_impl_sources}) add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_sorting_impl_sources}) @@ -135,7 +158,7 @@ set(_no_fast_math_sources list( APPEND _no_fast_math_sources # ${_elementwise_sources} - # ${_reduction_sources} + ${_reduction_sources} ${_sorting_sources} # ${_linalg_sources} ${_accumulator_sources} diff --git a/dpctl_ext/tensor/libtensor/include/kernels/reductions.hpp b/dpctl_ext/tensor/libtensor/include/kernels/reductions.hpp new file mode 100644 index 000000000000..d77d7df92609 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/reductions.hpp @@ -0,0 +1,3324 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor reduction along axis. 
+//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/math_utils.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels +{ + +using dpctl::tensor::ssize_t; +namespace su_ns = dpctl::tensor::sycl_utils; + +namespace reduction_detail +{ + +inline std::size_t get_work_group_size(const sycl::device &d) +{ + // prevents running out of resources on CPU + return std::min( + 2048, d.get_info() / 2); +} + +} // namespace reduction_detail + +template +struct needs_workaround +{ + static constexpr bool value = + (std::is_same_v> && + (std::is_same_v || + std::is_same_v)) || + (__LIBSYCL_MAJOR_VERSION < 7 && std::is_same_v && + std::is_same_v>); +}; + +template +struct can_use_reduce_over_group +{ + static constexpr bool value = + sycl::has_known_identity::value && + !needs_workaround::value; +}; + +template +struct SequentialReduction +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + std::size_t reduction_max_gid_ = 0; + +public: + SequentialReduction(const argT *inp, + outT *res, + const ReductionOp &reduction_op, + const outT &identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + std::size_t reduction_size) + : inp_(inp), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size) + { + } + + void operator()(sycl::id<1> id) const + { + + auto const &inp_out_iter_offsets_ = 
inp_out_iter_indexer_(id[0]); + const ssize_t &inp_iter_offset = + inp_out_iter_offsets_.get_first_offset(); + const ssize_t &out_iter_offset = + inp_out_iter_offsets_.get_second_offset(); + + outT red_val(identity_); + for (std::size_t m = 0; m < reduction_max_gid_; ++m) { + const ssize_t inp_reduction_offset = inp_reduced_dims_indexer_(m); + const ssize_t inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val; + if constexpr (su_ns::IsLogicalAnd::value || + su_ns::IsLogicalOr::value) + { + val = convert_impl(inp_[inp_offset]); + } + else { + val = convert_impl(inp_[inp_offset]); + } + red_val = reduction_op_(red_val, val); + } + + out_[out_iter_offset] = red_val; + } +}; + +/* === Reduction, using sycl::reduce_over_group, and sycl::atomic_ref === */ + +/* + This kernel only works for outT with sizeof(outT) == 4, or sizeof(outT) == 8 + if the device has aspect atomic64 and only with those supported by + sycl::atomic_ref +*/ +template +struct ReductionOverGroupWithAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; + +public: + ReductionOverGroupWithAtomicFunctor( + const argT *data, + outT *res, + const ReductionOp &reduction_op, + const outT &identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size), iter_gws_(iteration_size), + 
reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_; + + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + // work-items operate over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + std::size_t arg_reduce_gid_max = std::min( + reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); + + for (std::size_t arg_reduce_gid = arg_reduce_gid0; + arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) + { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val; + if constexpr (su_ns::IsLogicalAnd::value || + su_ns::IsLogicalOr::value) + { + // handle nans + val = convert_impl(inp_[inp_offset]); + } + else { + val = convert_impl(inp_[inp_offset]); + } + + local_red_val = reduction_op_(local_red_val, val); + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + outT red_val_over_wg; + if constexpr (su_ns::IsLogicalAnd::value) { + red_val_over_wg = static_cast( + sycl::all_of_group(work_group, local_red_val)); + } + else if constexpr (su_ns::IsLogicalOr::value) { + red_val_over_wg = static_cast( + sycl::any_of_group(work_group, local_red_val)); + } + else { + 
red_val_over_wg = sycl::reduce_over_group(work_group, local_red_val, + identity_, reduction_op_); + } + + if (work_group.leader()) { + sycl::atomic_ref + res_ref(out_[out_iter_offset]); + if constexpr (su_ns::IsPlus::value) { + res_ref += red_val_over_wg; + } + else if constexpr (su_ns::IsMaximum::value) { + res_ref.fetch_max(red_val_over_wg); + } + else if constexpr (su_ns::IsMinimum::value) { + res_ref.fetch_min(red_val_over_wg); + } + else if constexpr (su_ns::IsLogicalAnd::value) { + res_ref.fetch_and(red_val_over_wg); + } + else if constexpr (su_ns::IsLogicalOr::value) { + res_ref.fetch_or(red_val_over_wg); + } + else { + outT read_val = res_ref.load(); + outT new_val{}; + do { + new_val = reduction_op_(read_val, red_val_over_wg); + } while (!res_ref.compare_exchange_strong(read_val, new_val)); + } + } + } +}; + +/* === Reduction, using custom_reduce_over_group, and sycl::atomic_ref === */ + +template +struct CustomReductionOverGroupWithAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; + +public: + CustomReductionOverGroupWithAtomicFunctor( + const argT *data, + outT *res, + const ReductionOp &reduction_op, + const outT &identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + SlmT local_mem, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } 
+ + void operator()(sycl::nd_item<1> it) const + { + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_; + + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + // work-items operate over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + std::size_t arg_reduce_gid_max = std::min( + reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); + + for (std::size_t arg_reduce_gid = arg_reduce_gid0; + arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) + { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val; + if constexpr (su_ns::IsLogicalAnd::value || + su_ns::IsLogicalOr::value) + { + // handle nans + val = convert_impl(inp_[inp_offset]); + } + else { + val = convert_impl(inp_[inp_offset]); + } + + local_red_val = reduction_op_(local_red_val, val); + } + + auto work_group = it.get_group(); + outT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + if (work_group.leader()) { + sycl::atomic_ref + res_ref(out_[out_iter_offset]); + // retain these checks in case a reduce_over_group work-around is + // needed + if constexpr (su_ns::IsSyclPlus::value) { + res_ref += red_val_over_wg; + } + else if constexpr (su_ns::IsSyclMaximum::value) { + 
res_ref.fetch_max(red_val_over_wg); + } + else if constexpr (su_ns::IsSyclMinimum::value) { + res_ref.fetch_min(red_val_over_wg); + } + else if constexpr (su_ns::IsSyclLogicalAnd::value) { + res_ref.fetch_and(red_val_over_wg); + } + else if constexpr (su_ns::IsSyclLogicalOr::value) + { + res_ref.fetch_or(red_val_over_wg); + } + else { + outT read_val = res_ref.load(); + outT new_val{}; + do { + new_val = reduction_op_(read_val, red_val_over_wg); + } while (!res_ref.compare_exchange_strong(read_val, new_val)); + } + } + } +}; + +template +struct ReductionOverGroupNoAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; + +public: + ReductionOverGroupNoAtomicFunctor( + const argT *data, + outT *res, + const ReductionOp &reduction_op, + const outT &identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size), iter_gws_(iteration_size), + reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const std::size_t n_reduction_groups = + it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // 
inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (std::size_t m = 0; m < reductions_per_wi; ++m) { + std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val; + if constexpr (su_ns::IsLogicalAnd::value || + su_ns::IsLogicalOr::value) + { + // handle nans + val = convert_impl(inp_[inp_offset]); + } + else { + val = convert_impl(inp_[inp_offset]); + } + + local_red_val = reduction_op_(local_red_val, val); + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + outT red_val_over_wg; + if constexpr (su_ns::IsLogicalAnd::value) { + red_val_over_wg = sycl::all_of_group(work_group, local_red_val); + } + else if constexpr (su_ns::IsLogicalOr::value) { + red_val_over_wg = sycl::any_of_group(work_group, local_red_val); + } + else { + red_val_over_wg = sycl::reduce_over_group(work_group, local_red_val, + identity_, reduction_op_); + } + + if (work_group.leader()) { + // each group writes to a different memory location + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + red_val_over_wg; + } + } +}; + +/* = Reduction, using custom_reduce_over_group and not using atomic_ref*/ + +template +struct CustomReductionOverGroupNoAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT 
identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; + +public: + CustomReductionOverGroupNoAtomicFunctor( + const argT *data, + outT *res, + const ReductionOp &reduction_op, + const outT &identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + SlmT local_mem, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const std::size_t n_reduction_groups = + it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (std::size_t m = 0; m < reductions_per_wi; ++m) { + std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < 
reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val; + if constexpr (std::is_same_v> || + std::is_same_v>) + { + // handle nans + val = convert_impl(inp_[inp_offset]); + } + else { + val = convert_impl(inp_[inp_offset]); + } + + local_red_val = reduction_op_(local_red_val, val); + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + outT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + if (work_group.leader()) { + // each group writes to a different memory location + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + red_val_over_wg; + } + } +}; + +template < + typename argTy, + typename resTy, + typename ReductionOpT, + typename InputOutputIterIndexerT, + typename ReductionIndexerT, + template + class kernel_name_token> +sycl::event + sequential_reduction(sycl::queue &exec_q, + const argTy *arg, + resTy *res, + resTy identity_val, + std::size_t iter_nelems, + std::size_t reduction_nelems, + const InputOutputIterIndexerT &in_out_iter_indexer, + const ReductionIndexerT &reduction_indexer, + const std::vector &depends) +{ + sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = + class kernel_name_token; + + cgh.parallel_for( + sycl::range<1>(iter_nelems), + SequentialReduction( + arg, res, ReductionOpT(), identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems)); + }); + + return red_ev; +} + +template +class custom_reduction_wrapper; + +template < + typename argTy, + typename resTy, + typename ReductionOpT, + typename InputOutputIterIndexerT, + typename ReductionIndexerT, + template + class kernel_name_token> +sycl::event + submit_atomic_reduction(sycl::queue &exec_q, + const argTy *arg, + 
resTy *res, + resTy identity_val, + std::size_t wg, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reductions_per_wi, + std::size_t reduction_groups, + const InputOutputIterIndexerT &in_out_iter_indexer, + const ReductionIndexerT &reduction_indexer, + const std::vector &depends) +{ + sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + auto ndRange = sycl::nd_range<1>(globalRange, localRange); + + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class kernel_name_token; + + cgh.parallel_for( + ndRange, + ReductionOverGroupWithAtomicFunctor( + arg, res, ReductionOpT(), identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, iter_nelems, + reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + + using KernelName = class custom_reduction_wrapper< + kernel_name_token>; + + cgh.parallel_for( + ndRange, + CustomReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg, res, ReductionOpT(), identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + }); + return red_ev; +} + +template +class reduction_over_group_with_atomics_init_krn; + +template +class reduction_seq_krn; + +template +class reduction_over_group_with_atomics_krn; + +typedef sycl::event (*reduction_strided_impl_fn_ptr)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + int, + const ssize_t *, + ssize_t, + const std::vector &); + +using dpctl::tensor::sycl_utils::choose_workgroup_size; + +template +sycl::event reduction_over_group_with_atomics_strided_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of 
reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. number of columns) + const char *arg_cp, + char *res_cp, + int iter_nd, + const ssize_t *iter_shape_and_strides, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + int red_nd, + const ssize_t *reduction_shape_stride, + ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + static constexpr resTy identity_val = + su_ns::Identity::value; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + + const InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides}; + const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + sycl::event comp_ev = + sequential_reduction( + exec_q, arg_tp, res_tp, identity_val, iter_nelems, + reduction_nelems, in_out_iter_indexer, reduction_indexer, + depends); + + return comp_ev; + } + else { + sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + using IndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const ssize_t *const &res_shape = iter_shape_and_strides; + const ssize_t *const &res_strides = + iter_shape_and_strides + 2 * iter_nd; + const IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, + res_strides); + using InitKernelName = + class reduction_over_group_with_atomics_init_krn; + cgh.depends_on(depends); + + cgh.parallel_for( + sycl::range<1>(iter_nelems), [=](sycl::id<1> id) { + auto res_offset = res_indexer(id[0]); + res_tp[res_offset] = 
identity_val; + }); + }); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + + const InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides}; + const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + static constexpr std::size_t preferred_reductions_per_wi = 8; + std::size_t reductions_per_wi = + (reduction_nelems < preferred_reductions_per_wi * wg) + ? std::max(1, (reduction_nelems + wg - 1) / wg) + : preferred_reductions_per_wi; + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + + sycl::event comp_ev = + submit_atomic_reduction( + exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems, + reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {res_init_ev}); + + return comp_ev; + } +} + +// Contig + +typedef sycl::event (*reduction_contig_impl_fn_ptr)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +/* @brief Reduce rows in a matrix */ +template +sycl::event reduction_axis1_over_group_with_atomics_contig_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. 
number of columns) + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + static constexpr resTy identity_val = + su_ns::Identity::value; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + const InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{/* size */ iter_nelems, + /* step */ reduction_nelems}, + NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event comp_ev = + sequential_reduction( + exec_q, arg_tp, res_tp, identity_val, iter_nelems, + reduction_nelems, in_out_iter_indexer, reduction_indexer, + depends); + + return comp_ev; + } + else { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(identity_val), iter_nelems, depends); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using RowsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + RowsIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + const RowsIndexerT rows_indexer{/* size */ iter_nelems, + /* step */ reduction_nelems}; + static constexpr NoOpIndexerT result_indexer{}; + const InputOutputIterIndexerT in_out_iter_indexer{rows_indexer, + result_indexer}; + static constexpr ReductionIndexerT 
reduction_indexer{}; + + static constexpr std::size_t preferred_reductions_per_wi = 8; + std::size_t reductions_per_wi = + (reduction_nelems < preferred_reductions_per_wi * wg) + ? std::max(1, (reduction_nelems + wg - 1) / wg) + : preferred_reductions_per_wi; + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + + sycl::event comp_ev = + submit_atomic_reduction( + exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems, + reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {res_init_ev}); + + return comp_ev; + } +} + +/* @brief Reduce rows in a matrix */ +template +sycl::event reduction_axis0_over_group_with_atomics_contig_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of cols in a + // matrix when reducing over cols) + std::size_t reduction_nelems, // size of each reduction (length of cols, + // i.e. number of rows) + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + static constexpr resTy identity_val = + su_ns::Identity::value; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + + const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ 
iter_nelems}; + + sycl::event comp_ev = + sequential_reduction( + exec_q, arg_tp, res_tp, identity_val, iter_nelems, + reduction_nelems, in_out_iter_indexer, reduction_indexer, + depends); + + return comp_ev; + } + else { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(identity_val), iter_nelems, depends); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + static constexpr NoOpIndexerT columns_indexer{}; + static constexpr NoOpIndexerT result_indexer{}; + const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; + + static constexpr std::size_t preferred_reductions_per_wi = 8; + std::size_t reductions_per_wi = + (reduction_nelems < preferred_reductions_per_wi * wg) + ? 
std::max(1, (reduction_nelems + wg - 1) / wg) + : preferred_reductions_per_wi; + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + + sycl::event comp_ev = + submit_atomic_reduction( + exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems, + reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {res_init_ev}); + + return comp_ev; + } +} + +/* = Reduction, using sycl::reduce_over_group, but not using atomic_ref = */ + +template < + typename argTy, + typename resTy, + typename ReductionOpT, + typename InputOutputIterIndexerT, + typename ReductionIndexerT, + template + class kernel_name_token> +sycl::event submit_no_atomic_reduction( + sycl::queue &exec_q, + const argTy *arg, + resTy *res, + resTy identity_val, + std::size_t wg, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reductions_per_wi, + std::size_t reduction_groups, + const InputOutputIterIndexerT &in_out_iter_indexer, + const ReductionIndexerT &reduction_indexer, + const std::vector &depends) +{ + sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + auto ndRange = sycl::nd_range<1>(globalRange, localRange); + + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class kernel_name_token; + + cgh.parallel_for( + ndRange, + ReductionOverGroupNoAtomicFunctor( + arg, res, ReductionOpT(), identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, iter_nelems, + reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class custom_reduction_wrapper< + kernel_name_token>; + + cgh.parallel_for( + ndRange, + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, 
SlmT>( + arg, res, ReductionOpT(), identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + }); + return red_ev; +} + +template +class reduction_over_group_temps_krn; + +typedef sycl::event (*reduction_strided_impl_fn_ptr)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + int, + const ssize_t *, + ssize_t, + const std::vector &); + +template +class reduction_over_group_temps_empty_krn; + +template +sycl::event reduction_over_group_temps_strided_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. number of columns) + const char *arg_cp, + char *res_cp, + int iter_nd, + const ssize_t *iter_shape_and_strides, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + int red_nd, + const ssize_t *reduction_shape_stride, + ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + static constexpr resTy identity_val = + su_ns::Identity::value; + + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + using IndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const ssize_t *const &res_shape = iter_shape_and_strides; + const ssize_t *const &res_strides = + iter_shape_and_strides + 2 * iter_nd; + const IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, + res_strides); + using InitKernelName = + class reduction_over_group_temps_empty_krn; + cgh.depends_on(depends); + + cgh.parallel_for( + sycl::range<1>(iter_nelems), [=](sycl::id<1> id) { + auto res_offset = res_indexer(id[0]); + res_tp[res_offset] = identity_val; + }); + }); + + return res_init_ev; + } + + const sycl::device &d = 
exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + + const InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides}; + const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + sycl::event comp_ev = + sequential_reduction( + exec_q, arg_tp, res_tp, identity_val, iter_nelems, + reduction_nelems, in_out_iter_indexer, reduction_indexer, + depends); + + return comp_ev; + } + + static constexpr std::size_t preferred_reductions_per_wi = 8; + // prevents running out of resources on CPU + std::size_t max_wg = reduction_detail::get_work_group_size(d); + + std::size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + + const InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides}; + const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event comp_ev = submit_no_atomic_reduction< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, 
reduction_over_group_temps_krn>( + exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems, + reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, depends); + + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + std::size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + const std::size_t tmp_alloc_size = + iter_nelems * (reduction_groups + second_iter_reduction_groups_); + auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + ; + + sycl::event first_reduction_ev; + { + using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + // Only 2*iter_nd entries describing shape and strides of + // iterated dimensions of input array from + // iter_shape_and_strides are going to be accessed by + // inp_indexer + const InputIndexerT inp_indexer(iter_nd, iter_arg_offset, + iter_shape_and_strides); + static constexpr ResIndexerT noop_tmp_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + noop_tmp_indexer}; + const ReductionIndexerT reduction_indexer{ + red_nd, reduction_arg_offset, reduction_shape_stride}; + + first_reduction_ev = submit_no_atomic_reduction< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( 
+ exec_q, arg_tp, partially_reduced_tmp, identity_val, wg, + iter_nelems, reduction_nelems, preferred_reductions_per_wi, + reduction_groups, in_out_iter_indexer, reduction_indexer, + depends); + } + + std::size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + sycl::event partial_reduction_ev; + { + using InputIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{ + inp_indexer, res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + partial_reduction_ev = submit_no_atomic_reduction< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, temp_arg, temp2_arg, identity_val, wg, iter_nelems, + remaining_reduction_nelems, preferred_reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + } + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = 
dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + const ResIndexerT res_iter_indexer{ + iter_nd, iter_res_offset, + /* shape */ iter_shape_and_strides, + /* strides */ iter_shape_and_strides + 2 * iter_nd}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = submit_no_atomic_reduction< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems, + remaining_reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {dependent_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +template +sycl::event reduction_axis1_over_group_temps_contig_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. 
number of columns) + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + static constexpr resTy identity_val = + su_ns::Identity::value; + + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(identity_val), iter_nelems, depends); + + return res_init_ev; + } + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + const InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{/* size */ iter_nelems, + /* step */ reduction_nelems}, + NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event comp_ev = + sequential_reduction( + exec_q, arg_tp, res_tp, identity_val, iter_nelems, + reduction_nelems, in_out_iter_indexer, reduction_indexer, + depends); + + return comp_ev; + } + + static constexpr std::size_t preferred_reductions_per_wi = 8; + // prevents running out of resources on CPU + std::size_t max_wg = reduction_detail::get_work_group_size(d); + + std::size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res + + using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = 
dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + const InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{/* size */ iter_nelems, + /* step */ reduction_nelems}, + NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event comp_ev = submit_no_atomic_reduction< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems, + reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, depends); + + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + std::size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + const std::size_t tmp_alloc_size = + iter_nelems * (reduction_groups + second_iter_reduction_groups_); + auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + + sycl::event first_reduction_ev; + { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using RowsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using 
InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + RowsIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + const RowsIndexerT rows_indexer{/* size */ iter_nelems, + /* step */ reduction_nelems}; + static constexpr NoOpIndexerT noop_tmp_indexer{}; + const InputOutputIterIndexerT in_out_iter_indexer{rows_indexer, + noop_tmp_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + first_reduction_ev = submit_no_atomic_reduction< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, arg_tp, partially_reduced_tmp, identity_val, wg, + iter_nelems, reduction_nelems, preferred_reductions_per_wi, + reduction_groups, in_out_iter_indexer, reduction_indexer, + depends); + } + + std::size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event partial_reduction_ev = submit_no_atomic_reduction< + resTy, resTy, 
ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, temp_arg, temp2_arg, identity_val, wg, iter_nelems, + remaining_reduction_nelems, preferred_reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = submit_no_atomic_reduction< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems, + remaining_reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {dependent_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +template +sycl::event 
reduction_axis0_over_group_temps_contig_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. number of columns) + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + static constexpr resTy identity_val = + su_ns::Identity::value; + + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(identity_val), iter_nelems, depends); + + return res_init_ev; + } + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + + const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; + + sycl::event comp_ev = + sequential_reduction( + exec_q, arg_tp, res_tp, identity_val, iter_nelems, + reduction_nelems, in_out_iter_indexer, reduction_indexer, + depends); + + return comp_ev; + } + + static constexpr std::size_t preferred_reductions_per_wi = 8; + // prevents running out of resources on CPU + std::size_t max_wg = reduction_detail::get_work_group_size(d); + + std::size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using 
one 1 work-group per iteration, + // can output directly to res + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + static constexpr NoOpIndexerT columns_indexer{}; + static constexpr NoOpIndexerT result_indexer{}; + const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event comp_ev = submit_no_atomic_reduction< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems, + reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, depends); + + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + std::size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + const std::size_t tmp_alloc_size = + iter_nelems * (reduction_groups + second_iter_reduction_groups_); + + auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + 
partially_reduced_tmp + reduction_groups * iter_nelems; + + sycl::event first_reduction_ev; + { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + static constexpr NoOpIndexerT columns_indexer{}; + static constexpr NoOpIndexerT noop_tmp_indexer{}; + const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + noop_tmp_indexer}; + const ReductionIndexerT reduction_indexer{ + /* size */ reduction_nelems, + /* step */ iter_nelems}; + + first_reduction_ev = submit_no_atomic_reduction< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, arg_tp, partially_reduced_tmp, identity_val, wg, + iter_nelems, reduction_nelems, preferred_reductions_per_wi, + reduction_groups, in_out_iter_indexer, reduction_indexer, + depends); + } + + std::size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; + static constexpr ResIndexerT 
res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event partial_reduction_ev = submit_no_atomic_reduction< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, temp_arg, temp2_arg, identity_val, wg, iter_nelems, + remaining_reduction_nelems, preferred_reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = submit_no_atomic_reduction< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems, + remaining_reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {dependent_ev}); + + sycl::event cleanup_host_task_event = + 
dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +// Argmax and Argmin + +/* Sequential search reduction */ + +template +struct SequentialSearchReduction +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + std::size_t reduction_max_gid_ = 0; + +public: + SequentialSearchReduction( + const argT *inp, + outT *res, + const ReductionOp &reduction_op, + const argT &identity_val, + const IdxReductionOp &idx_reduction_op, + const outT &idx_identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + std::size_t reduction_size) + : inp_(inp), out_(res), reduction_op_(reduction_op), + identity_(identity_val), idx_reduction_op_(idx_reduction_op), + idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size) + { + } + + void operator()(sycl::id<1> id) const + { + + auto const &inp_out_iter_offsets_ = inp_out_iter_indexer_(id[0]); + const ssize_t &inp_iter_offset = + inp_out_iter_offsets_.get_first_offset(); + const ssize_t &out_iter_offset = + inp_out_iter_offsets_.get_second_offset(); + + argT red_val(identity_); + outT idx_val(idx_identity_); + for (std::size_t m = 0; m < reduction_max_gid_; ++m) { + const ssize_t inp_reduction_offset = inp_reduced_dims_indexer_(m); + const ssize_t inp_offset = inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if (val == red_val) { + idx_val = idx_reduction_op_(idx_val, static_cast(m)); + } + else { + if constexpr (su_ns::IsMinimum::value) { + using 
dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::less_complex; + // less_complex always returns false for NaNs, so check + if (less_complex(val, red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + red_val = val; + idx_val = static_cast(m); + } + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { + if (val < red_val || std::isnan(val)) { + red_val = val; + idx_val = static_cast(m); + } + } + else { + if (val < red_val) { + red_val = val; + idx_val = static_cast(m); + } + } + } + else if constexpr (su_ns::IsMaximum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::greater_complex; + if (greater_complex(val, red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + red_val = val; + idx_val = static_cast(m); + } + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { + if (val > red_val || std::isnan(val)) { + red_val = val; + idx_val = static_cast(m); + } + } + else { + if (val > red_val) { + red_val = val; + idx_val = static_cast(m); + } + } + } + } + } + out_[out_iter_offset] = idx_val; + } +}; + +/* = Search reduction using reduce_over_group*/ + +template +struct SearchReduction +{ +private: + const argT *inp_ = nullptr; + argT *vals_ = nullptr; + const outT *inds_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; + +public: + SearchReduction(const argT *data, + argT *vals, + const outT *inds, + outT *res, + const ReductionOp &reduction_op, + const argT &identity_val, + const IdxReductionOp &idx_reduction_op, + const outT &idx_identity_val, + const 
InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : inp_(data), vals_(vals), inds_(inds), out_(res), + reduction_op_(reduction_op), identity_(identity_val), + idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size), iter_gws_(iteration_size), + reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const std::size_t n_reduction_groups = + it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + argT local_red_val(identity_); + outT local_idx(idx_identity_); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (std::size_t m = 0; m < reductions_per_wi; ++m) { + std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if (val == local_red_val) { + if constexpr (!First) { + local_idx = + idx_reduction_op_(local_idx, 
inds_[inp_offset]); + } + else { + local_idx = idx_reduction_op_( + local_idx, static_cast(arg_reduce_gid)); + } + } + else { + if constexpr (su_ns::IsMinimum::value) { + if (val < local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + else if constexpr (su_ns::IsMaximum::value) { + if (val > local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + } + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + argT red_val_over_wg = sycl::reduce_over_group( + work_group, local_red_val, identity_, reduction_op_); + + if constexpr (std::is_integral_v) { + local_idx = + (red_val_over_wg == local_red_val) ? local_idx : idx_identity_; + } + else { + local_idx = + (red_val_over_wg == local_red_val || + std::isnan(red_val_over_wg) || std::isnan(local_red_val)) + ? 
local_idx + : idx_identity_; + } + outT idx_over_wg = sycl::reduce_over_group( + work_group, local_idx, idx_identity_, idx_reduction_op_); + + if (work_group.leader()) { + // each group writes to a different memory location + if constexpr (!Last) { + // if not the final reduction, write value corresponding to + // an index to a temporary + vals_[out_iter_offset * n_reduction_groups + + reduction_batch_id] = red_val_over_wg; + } + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + idx_over_wg; + } + } +}; + +/* = Search reduction using custom_reduce_over_group*/ + +template +struct CustomSearchReduction +{ +private: + const argT *inp_ = nullptr; + argT *vals_ = nullptr; + const outT *inds_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; + +public: + CustomSearchReduction(const argT *data, + argT *vals, + outT *inds, + outT *res, + const ReductionOp &reduction_op, + const argT &identity_val, + const IdxReductionOp &idx_reduction_op, + const outT &idx_identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + SlmT local_mem, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : inp_(data), vals_(vals), inds_(inds), out_(res), + reduction_op_(reduction_op), identity_(identity_val), + idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { 
+ const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const std::size_t n_reduction_groups = + it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + argT local_red_val(identity_); + outT local_idx(idx_identity_); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (std::size_t m = 0; m < reductions_per_wi; ++m) { + std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if (val == local_red_val) { + if constexpr (!First) { + local_idx = + idx_reduction_op_(local_idx, inds_[inp_offset]); + } + else { + local_idx = idx_reduction_op_( + local_idx, static_cast(arg_reduce_gid)); + } + } + else { + if constexpr (su_ns::IsMinimum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::less_complex; + // less_complex always returns false for NaNs, so + // check + if (less_complex(val, local_red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + else if constexpr 
(std::is_floating_point_v || + std::is_same_v) + { + if (val < local_red_val || std::isnan(val)) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + else { + if (val < local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + } + else if constexpr (su_ns::IsMaximum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::greater_complex; + if (greater_complex(val, local_red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { + if (val > local_red_val || std::isnan(val)) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + else { + if (val > local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + } + } + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + argT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + // equality does not hold for NaNs, so check here + local_idx = (red_val_over_wg == local_red_val || + std::isnan(std::real(local_red_val)) || + std::isnan(std::imag(local_red_val))) + ? 
local_idx + : idx_identity_; + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { + // equality does not hold for NaNs, so check here + local_idx = + (red_val_over_wg == local_red_val || std::isnan(local_red_val)) + ? local_idx + : idx_identity_; + } + else { + local_idx = + red_val_over_wg == local_red_val ? local_idx : idx_identity_; + } + outT idx_over_wg = sycl::reduce_over_group( + work_group, local_idx, idx_identity_, idx_reduction_op_); + if (work_group.leader()) { + // each group writes to a different memory location + if constexpr (!Last) { + // if not the final reduction, write value corresponding to + // an index to a temporary + vals_[out_iter_offset * n_reduction_groups + + reduction_batch_id] = red_val_over_wg; + } + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + idx_over_wg; + } + } +}; + +typedef sycl::event (*search_strided_impl_fn_ptr)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + int, + const ssize_t *, + ssize_t, + const std::vector &); + +template +class search_seq_strided_krn; + +template +class search_seq_contig_krn; + +template +class search_over_group_krn; + +template +class custom_search_over_group_krn; + +template +class search_empty_krn; + +template +sycl::event + submit_search_reduction(sycl::queue &exec_q, + const argTy *arg, + argTy *arg_tmp, + resTy *res_tmp, + resTy *res, + argTy identity_val, + resTy idx_identity_val, + std::size_t wg, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reductions_per_wi, + std::size_t reduction_groups, + const InputOutputIterIndexerT &in_out_iter_indexer, + const ReductionIndexerT &reduction_indexer, + const std::vector &depends) +{ + sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + auto ndRange = 
sycl::nd_range<1>(globalRange, localRange); + + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class search_over_group_krn; + cgh.parallel_for( + ndRange, SearchReduction( + arg, arg_tmp, res_tmp, res, ReductionOpT(), + identity_val, IndexOpT(), idx_identity_val, + in_out_iter_indexer, reduction_indexer, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class custom_search_over_group_krn< + argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT, First, Last>; + cgh.parallel_for( + ndRange, + CustomSearchReduction( + arg, arg_tmp, res_tmp, res, ReductionOpT(), identity_val, + IndexOpT(), idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + }); + return red_ev; +} + +template +sycl::event search_over_group_temps_strided_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. 
number of columns) + const char *arg_cp, + char *res_cp, + int iter_nd, + const ssize_t *iter_shape_and_strides, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + int red_nd, + const ssize_t *reduction_shape_stride, + ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + static constexpr argTy identity_val = + su_ns::Identity::value; + static constexpr resTy idx_identity_val = + su_ns::Identity::value; + + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + using IndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const ssize_t *const &res_shape = iter_shape_and_strides; + const ssize_t *const &res_strides = + iter_shape_and_strides + 2 * iter_nd; + const IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, + res_strides); + using InitKernelName = + class search_empty_krn; + cgh.depends_on(depends); + + cgh.parallel_for( + sycl::range<1>(iter_nelems), [=](sycl::id<1> id) { + auto res_offset = res_indexer(id[0]); + res_tp[res_offset] = idx_identity_val; + }); + }); + + return res_init_ev; + } + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + + const InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides}; + const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialSearchReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, 
IndexOpT(), + idx_identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems)); + }); + + return comp_ev; + } + + static constexpr std::size_t preferred_reductions_per_wi = 4; + // prevents running out of resources on CPU + std::size_t max_wg = reduction_detail::get_work_group_size(d); + + std::size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + + const InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides}; + const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event comp_ev = + submit_search_reduction( + exec_q, arg_tp, nullptr, nullptr, res_tp, identity_val, + idx_identity_val, wg, iter_nelems, reduction_nelems, + reductions_per_wi, reduction_groups, in_out_iter_indexer, + reduction_indexer, depends); + + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + std::size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + const std::size_t tmp_alloc_size = + iter_nelems * (reduction_groups + second_iter_reduction_groups_); + auto 
tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + + auto val_tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + argTy *partially_reduced_vals_tmp = val_tmp_owner.get(); + argTy *partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; + + sycl::event first_reduction_ev; + { + using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + // Only 2*iter_nd entries describing shape and strides of iterated + // dimensions of input array from iter_shape_and_strides are going + // to be accessed by inp_indexer + const InputIndexerT inp_indexer(iter_nd, iter_arg_offset, + iter_shape_and_strides); + static constexpr ResIndexerT noop_tmp_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + noop_tmp_indexer}; + const ReductionIndexerT reduction_indexer{ + red_nd, reduction_arg_offset, reduction_shape_stride}; + + first_reduction_ev = + submit_search_reduction( + exec_q, arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, identity_val, idx_identity_val, wg, + iter_nelems, reduction_nelems, reductions_per_wi, + reduction_groups, in_out_iter_indexer, reduction_indexer, + depends); + } + + std::size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + + argTy *vals_temp_arg = partially_reduced_vals_tmp; + argTy *vals_temp2_arg = partially_reduced_vals_tmp2; + + sycl::event dependent_ev = first_reduction_ev; + + while 
(remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event partial_reduction_ev = + submit_search_reduction( + exec_q, vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + identity_val, idx_identity_val, wg, iter_nelems, + remaining_reduction_nelems, preferred_reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + std::swap(vals_temp_arg, vals_temp2_arg); + dependent_ev = partial_reduction_ev; + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + const ResIndexerT res_iter_indexer{ + iter_nd, iter_res_offset, + /* shape */ iter_shape_and_strides, + /* strides */ iter_shape_and_strides + 
2 * iter_nd}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = + submit_search_reduction( + exec_q, vals_temp_arg, nullptr, temp_arg, res_tp, identity_val, + idx_identity_val, wg, iter_nelems, remaining_reduction_nelems, + reductions_per_wi, reduction_groups, in_out_iter_indexer, + reduction_indexer, {dependent_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner, val_tmp_owner); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +typedef sycl::event (*search_contig_impl_fn_ptr)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event search_axis1_over_group_temps_contig_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. 
number of columns) + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + static constexpr argTy identity_val = + su_ns::Identity::value; + static constexpr resTy idx_identity_val = + su_ns::Identity::value; + + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(idx_identity_val), iter_nelems, depends); + + return res_init_ev; + } + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + const InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{/* size */ iter_nelems, + /* step */ reduction_nelems}, + NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialSearchReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems)); + }); + + return comp_ev; + } + + static constexpr std::size_t preferred_reductions_per_wi = 8; + // prevents running out of resources on CPU + std::size_t max_wg = reduction_detail::get_work_group_size(d); + + std::size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + 
// Perform reduction using one 1 work-group per iteration, + // can output directly to res + using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + const InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{/* size */ iter_nelems, + /* step */ reduction_nelems}, + NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event comp_ev = + submit_search_reduction( + exec_q, arg_tp, nullptr, nullptr, res_tp, identity_val, + idx_identity_val, wg, iter_nelems, reduction_nelems, + reductions_per_wi, reduction_groups, in_out_iter_indexer, + reduction_indexer, depends); + + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + std::size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + const std::size_t tmp_alloc_size = + iter_nelems * (reduction_groups + second_iter_reduction_groups_); + auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + + auto val_tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 
tmp_alloc_size, exec_q); + argTy *partially_reduced_vals_tmp = val_tmp_owner.get(); + argTy *partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; + + sycl::event first_reduction_ev; + { + using InputIterIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + const InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{/* size */ iter_nelems, + /* step */ reduction_nelems}, + NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{}; + + first_reduction_ev = + submit_search_reduction( + exec_q, arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, identity_val, idx_identity_val, wg, + iter_nelems, reduction_nelems, preferred_reductions_per_wi, + reduction_groups, in_out_iter_indexer, reduction_indexer, + depends); + } + + std::size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + + argTy *vals_temp_arg = partially_reduced_vals_tmp; + argTy *vals_temp2_arg = partially_reduced_vals_tmp2; + + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const 
InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event partial_reduction_ev = + submit_search_reduction( + exec_q, vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + identity_val, idx_identity_val, wg, iter_nelems, + remaining_reduction_nelems, preferred_reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + std::swap(vals_temp_arg, vals_temp2_arg); + dependent_ev = partial_reduction_ev; + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = + submit_search_reduction( + exec_q, vals_temp_arg, nullptr, temp_arg, res_tp, identity_val, + idx_identity_val, wg, iter_nelems, remaining_reduction_nelems, + reductions_per_wi, reduction_groups, in_out_iter_indexer, + reduction_indexer, {dependent_ev}); + + sycl::event 
cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner, val_tmp_owner); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +template +sycl::event search_axis0_over_group_temps_contig_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. number of columns) + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + static constexpr argTy identity_val = + su_ns::Identity::value; + static constexpr resTy idx_identity_val = + su_ns::Identity::value; + + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(idx_identity_val), iter_nelems, depends); + + return res_init_ev; + } + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + + const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = + class search_seq_contig_krn; + + sycl::range<1> 
iter_range{iter_nelems}; + + cgh.parallel_for( + iter_range, + SequentialSearchReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems)); + }); + + return comp_ev; + } + + static constexpr std::size_t preferred_reductions_per_wi = 8; + // prevents running out of resources on CPU + std::size_t max_wg = reduction_detail::get_work_group_size(d); + + std::size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + static constexpr NoOpIndexerT columns_indexer{}; + static constexpr NoOpIndexerT result_indexer{}; + const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event comp_ev = + submit_search_reduction( + exec_q, arg_tp, nullptr, nullptr, res_tp, identity_val, + idx_identity_val, wg, iter_nelems, reduction_nelems, + reductions_per_wi, reduction_groups, in_out_iter_indexer, + reduction_indexer, depends); + + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + 
(preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + std::size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + const std::size_t tmp_alloc_size = + iter_nelems * (reduction_groups + second_iter_reduction_groups_); + auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + + auto vals_tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + argTy *partially_reduced_vals_tmp = vals_tmp_owner.get(); + argTy *partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; + + sycl::event first_reduction_ev; + { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + static constexpr NoOpIndexerT columns_indexer{}; + static constexpr NoOpIndexerT result_indexer{}; + const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + const ReductionIndexerT reduction_indexer{ + /* size */ reduction_nelems, + /* step */ iter_nelems}; + + first_reduction_ev = + submit_search_reduction( + exec_q, arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, identity_val, idx_identity_val, wg, + iter_nelems, reduction_nelems, preferred_reductions_per_wi, + reduction_groups, in_out_iter_indexer, reduction_indexer, + depends); + } + + std::size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + + argTy *vals_temp_arg = partially_reduced_vals_tmp; + argTy 
*vals_temp2_arg = partially_reduced_vals_tmp2; + + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event partial_reduction_ev = + submit_search_reduction( + exec_q, vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + identity_val, idx_identity_val, wg, iter_nelems, + remaining_reduction_nelems, preferred_reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + std::swap(vals_temp_arg, vals_temp2_arg); + dependent_ev = partial_reduction_ev; + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + static constexpr ResIndexerT 
res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = + submit_search_reduction( + exec_q, vals_temp_arg, nullptr, temp_arg, res_tp, identity_val, + idx_identity_val, wg, iter_nelems, remaining_reduction_nelems, + reductions_per_wi, reduction_groups, in_out_iter_indexer, + reduction_indexer, {dependent_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner, vals_tmp_owner); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +} // namespace dpctl::tensor::kernels diff --git a/dpctl_ext/tensor/libtensor/source/reductions/all.cpp b/dpctl_ext/tensor/libtensor/source/reductions/all.cpp new file mode 100644 index 000000000000..a901b9e1d9a3 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/all.cpp @@ -0,0 +1,164 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/reductions.hpp" +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + all_reduction_strided_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + all_reduction_axis1_contig_dispatch_vector[td_ns::num_types]; +static reduction_contig_impl_fn_ptr + all_reduction_axis0_contig_dispatch_vector[td_ns::num_types]; + +template +struct AllStridedFactory +{ + fnT get() const + { + using dstTy = std::int32_t; + using ReductionOpT = sycl::logical_and; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl; + } +}; + +template +struct AllAxis1ContigFactory +{ + fnT get() const + { + using dstTy = std::int32_t; + using ReductionOpT = sycl::logical_and; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl; + } +}; + +template +struct AllAxis0ContigFactory +{ + fnT get() const + { + using dstTy = std::int32_t; + using ReductionOpT = sycl::logical_and; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl; + } +}; + +void populate_all_dispatch_vectors(void) +{ + using td_ns::DispatchVectorBuilder; + + DispatchVectorBuilder + all_dvb1; + all_dvb1.populate_dispatch_vector(all_reduction_strided_dispatch_vector); + + DispatchVectorBuilder + all_dvb2; + all_dvb2.populate_dispatch_vector( + all_reduction_axis1_contig_dispatch_vector); + + DispatchVectorBuilder + all_dvb3; + all_dvb3.populate_dispatch_vector( + 
all_reduction_axis0_contig_dispatch_vector); +}; + +using atomic_support::atomic_support_fn_ptr_t; +using atomic_support::check_atomic_support; +static atomic_support_fn_ptr_t all_atomic_support = + check_atomic_support; + +} // namespace impl + +void init_all(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_all_dispatch_vectors(); + using impl::all_reduction_axis0_contig_dispatch_vector; + using impl::all_reduction_axis1_contig_dispatch_vector; + using impl::all_reduction_strided_dispatch_vector; + + using impl::all_atomic_support; + + auto all_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_boolean_reduction( + src, trailing_dims_to_reduce, dst, exec_q, depends, + all_reduction_axis1_contig_dispatch_vector, + all_reduction_axis0_contig_dispatch_vector, + all_reduction_strided_dispatch_vector, all_atomic_support); + }; + m.def("_all", all_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/all.hpp b/dpctl_ext/tensor/libtensor/source/reductions/all.hpp new file mode 100644 index 000000000000..5fb184e37c66 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/all.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_all(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_atomic_support.hpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_atomic_support.hpp new file mode 100644 index 000000000000..b7aaf9313aa2 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_atomic_support.hpp @@ -0,0 +1,148 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::py_internal::atomic_support +{ + +typedef bool (*atomic_support_fn_ptr_t)(const sycl::queue &, sycl::usm::alloc); + +/*! @brief Function which returns a constant value for atomic support */ +template +bool fixed_decision(const sycl::queue &, sycl::usm::alloc) +{ + return return_value; +} + +/*! 
@brief Template for querying atomic support for a type on a device */ +template +bool check_atomic_support(const sycl::queue &exec_q, + sycl::usm::alloc usm_alloc_type) +{ + static constexpr bool atomic32 = (sizeof(T) == 4); + static constexpr bool atomic64 = (sizeof(T) == 8); + using dpctl::tensor::type_utils::is_complex; + if constexpr ((!atomic32 && !atomic64) || is_complex::value) { + return fixed_decision(exec_q, usm_alloc_type); + } + else { + bool supports_atomics = false; + const sycl::device &dev = exec_q.get_device(); + if constexpr (atomic64) { + if (!dev.has(sycl::aspect::atomic64)) { + return false; + } + } + switch (usm_alloc_type) { + case sycl::usm::alloc::shared: + supports_atomics = + dev.has(sycl::aspect::usm_atomic_shared_allocations); + break; + case sycl::usm::alloc::host: + supports_atomics = + dev.has(sycl::aspect::usm_atomic_host_allocations); + break; + case sycl::usm::alloc::device: + supports_atomics = true; + break; + default: + supports_atomics = false; + } + return supports_atomics; + } +} + +template +struct ArithmeticAtomicSupportFactory +{ + fnT get() + { + using dpctl::tensor::type_utils::is_complex; + if constexpr (std::is_floating_point_v || + std::is_same_v || is_complex::value) + { + // for real- and complex- floating point types, tree reduction has + // better round-off accumulation properties (round-off error is + // proportional to the log2(reduction_size), while naive elementwise + // summation used by atomic implementation has round-off error + // growing proportional to the reduction_size.), hence reduction + // over floating point types should always use tree_reduction + // algorithm, even though atomic implementation may be applicable + return fixed_decision; + } + else { + return check_atomic_support; + } + } +}; + +template +struct MinMaxAtomicSupportFactory +{ + fnT get() + { + return check_atomic_support; + } +}; + +template +struct MaxAtomicSupportFactory : public MinMaxAtomicSupportFactory +{ +}; + +template 
+struct MinAtomicSupportFactory : public MinMaxAtomicSupportFactory +{ +}; + +template +struct SumAtomicSupportFactory : public ArithmeticAtomicSupportFactory +{ +}; + +template +struct ProductAtomicSupportFactory + : public ArithmeticAtomicSupportFactory +{ +}; + +} // namespace dpctl::tensor::py_internal::atomic_support diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp new file mode 100644 index 000000000000..e3858338171f --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp @@ -0,0 +1,69 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#include + +#include "all.hpp" +// #include "any.hpp" +// #include "argmax.hpp" +// #include "argmin.hpp" +// #include "logsumexp.hpp" +// #include "max.hpp" +// #include "min.hpp" +// #include "prod.hpp" +// #include "reduce_hypot.hpp" +// #include "sum.hpp" + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +/*! 
@brief Add reduction functions to Python module */ +void init_reduction_functions(py::module_ m) +{ + init_all(m); + // init_any(m); + // init_argmax(m); + // init_argmin(m); + // init_logsumexp(m); + // init_max(m); + // init_min(m); + // init_prod(m); + // init_reduce_hypot(m); + // init_sum(m); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.hpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.hpp new file mode 100644 index 000000000000..4df67c16bc4e --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_reduction_functions(py::module_); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp new file mode 100644 index 000000000000..5397fe24693d --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp @@ -0,0 +1,1325 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension, specifically functions for reductions. 
+//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +/* ====================== dtype supported ======================== */ + +/*! @brief Template implementing Python API for querying type support by + * reduction which may support atomics */ +template +bool py_reduction_dtype_supported( + const py::dtype &input_dtype, + const py::dtype &output_dtype, + const std::string &dst_usm_type, + sycl::queue &q, + const fnT &atomic_dispatch_table, + const fnT &temps_dispatch_table, + const CheckAtomicSupportFnT &check_atomic_support) +{ + int arg_tn = + input_dtype.num(); // NumPy type numbers are the same as in dpctl + int out_tn = + output_dtype.num(); // NumPy type numbers are the same as in dpctl + int arg_typeid = -1; + int out_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + arg_typeid = array_types.typenum_to_lookup_id(arg_tn); + out_typeid = array_types.typenum_to_lookup_id(out_tn); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 || + out_typeid >= td_ns::num_types) + { + throw std::runtime_error("Reduction type support check: lookup failed"); + } + + // remove_all_extents gets underlying type of table + using fn_ptrT = typename std::remove_all_extents::type; + fn_ptrT fn = nullptr; + + sycl::usm::alloc kind = sycl::usm::alloc::unknown; + + if (dst_usm_type == 
"device") { + kind = sycl::usm::alloc::device; + } + else if (dst_usm_type == "shared") { + kind = sycl::usm::alloc::shared; + } + else if (dst_usm_type == "host") { + kind = sycl::usm::alloc::host; + } + else { + throw py::value_error("Unrecognized `dst_usm_type` argument."); + } + + bool supports_atomics = check_atomic_support[out_typeid](q, kind); + + if (supports_atomics) { + fn = atomic_dispatch_table[arg_typeid][out_typeid]; + } + + if (fn == nullptr) { + // use slower reduction implementation using temporaries + fn = temps_dispatch_table[arg_typeid][out_typeid]; + } + + return (fn != nullptr); +} + +/*! @brief Template implementing Python API for querying type support by tree + * reduction */ +template +bool py_tree_reduction_dtype_supported(const py::dtype &input_dtype, + const py::dtype &output_dtype, + const fnT &temps_dispatch_table) +{ + int arg_tn = + input_dtype.num(); // NumPy type numbers are the same as in dpctl + int out_tn = + output_dtype.num(); // NumPy type numbers are the same as in dpctl + int arg_typeid = -1; + int out_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + arg_typeid = array_types.typenum_to_lookup_id(arg_tn); + out_typeid = array_types.typenum_to_lookup_id(out_tn); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 || + out_typeid >= td_ns::num_types) + { + throw std::runtime_error("Reduction type support check: lookup failed"); + } + + auto fn = temps_dispatch_table[arg_typeid][out_typeid]; + + return (fn != nullptr); +} + +/* ==================== Generic reductions ====================== */ + +/*! 
@brief Template implementing Python API for reduction over axis which may + * support atomics */ +template +std::pair py_reduction_over_axis( + const dpctl::tensor::usm_ndarray &src, + int trailing_dims_to_reduce, // comp over this many trailing indexes + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const strided_fnT &atomic_dispatch_table, + const contig_fnT &axis0_atomic_dispatch_table, + const contig_fnT &axis1_atomic_dispatch_table, + const strided_fnT &temps_dispatch_table, + const contig_fnT &axis0_temps_dispatch_table, + const contig_fnT &axis1_temps_dispatch_table, + const SupportAtomicFnT &check_atomic_support) +{ + int src_nd = src.get_ndim(); + int iteration_nd = src_nd - trailing_dims_to_reduce; + if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_reduce must be positive, but no " + "greater than rank of the array being reduced"); + } + + int dst_nd = dst.get_ndim(); + if (dst_nd != iteration_nd) { + throw py::value_error("Destination array rank does not match input " + "array rank and number of reduced dimensions"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + for (int i = 0; same_shapes && (i < dst_nd); ++i) { + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); + } + + if (!same_shapes) { + throw py::value_error("Destination shape does not match unreduced " + "dimensions of the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + std::size_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + std::size_t reduction_nelems(1); + for (int i = dst_nd; i < src_nd; ++i) { + 
reduction_nelems *= static_cast(src_shape_ptr[i]); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + namespace td_ns = dpctl::tensor::type_dispatch; + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + void *data_ptr = dst.get_data(); + const auto &ctx = exec_q.get_context(); + auto usm_type = sycl::get_pointer_type(data_ptr, ctx); + + bool supports_atomics = check_atomic_support[dst_typeid](exec_q, usm_type); + + // handle special case when both reduction and iteration are 1D contiguous + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + if ((is_src_c_contig && is_dst_c_contig) || + (is_src_f_contig && dst_nelems == 1)) + { + // remove_all_extents gets underlying type of table + using contig_fn_ptr_T = + typename std::remove_all_extents::type; + contig_fn_ptr_T fn; + if (supports_atomics) { + fn = axis1_atomic_dispatch_table[src_typeid][dst_typeid]; + } + else { + fn = axis1_temps_dispatch_table[src_typeid][dst_typeid]; + } + if (fn != nullptr) { + std::size_t iter_nelems = dst_nelems; + + static constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + 
return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + else if (is_src_f_contig && + ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) + { + // remove_all_extents gets underlying type of table + using contig_fn_ptr_T = + typename std::remove_all_extents::type; + contig_fn_ptr_T fn; + if (supports_atomics) { + fn = axis0_atomic_dispatch_table[src_typeid][dst_typeid]; + } + else { + fn = axis0_temps_dispatch_table[src_typeid][dst_typeid]; + } + if (fn != nullptr) { + std::size_t iter_nelems = dst_nelems; + + static constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + + using dpctl::tensor::py_internal::simplify_iteration_space; + using dpctl::tensor::py_internal::simplify_iteration_space_1; + + auto const &src_shape_vecs = src.get_shape_vector(); + auto const &src_strides_vecs = src.get_strides_vector(); + auto const &dst_strides_vecs = dst.get_strides_vector(); + + int reduction_nd = trailing_dims_to_reduce; + const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd; + using shT = std::vector; + shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd, + std::end(src_strides_vecs)); + + shT simplified_reduction_shape; + shT simplified_reduction_src_strides; + py::ssize_t reduction_src_offset(0); + + simplify_iteration_space_1( + reduction_nd, reduction_shape_ptr, reduction_src_strides, + // output + simplified_reduction_shape, simplified_reduction_src_strides, + reduction_src_offset); + + const py::ssize_t *iteration_shape_ptr = src_shape_ptr; + + shT 
iteration_src_strides(std::begin(src_strides_vecs), + std::begin(src_strides_vecs) + iteration_nd); + shT const &iteration_dst_strides = dst_strides_vecs; + + shT simplified_iteration_shape; + shT simplified_iteration_src_strides; + shT simplified_iteration_dst_strides; + py::ssize_t iteration_src_offset(0); + py::ssize_t iteration_dst_offset(0); + + if (iteration_nd == 0) { + if (dst_nelems != 1) { + throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); + } + iteration_nd = 1; + simplified_iteration_shape.push_back(1); + simplified_iteration_src_strides.push_back(0); + simplified_iteration_dst_strides.push_back(0); + } + else { + simplify_iteration_space(iteration_nd, iteration_shape_ptr, + iteration_src_strides, iteration_dst_strides, + // output + simplified_iteration_shape, + simplified_iteration_src_strides, + simplified_iteration_dst_strides, + iteration_src_offset, iteration_dst_offset); + } + + if ((reduction_nd == 1) && (iteration_nd == 1)) { + bool mat_reduce_over_axis1 = false; + bool mat_reduce_over_axis0 = false; + bool array_reduce_all_elems = false; + std::size_t iter_nelems = dst_nelems; + + if (simplified_reduction_src_strides[0] == 1) { + array_reduce_all_elems = (simplified_iteration_shape[0] == 1); + mat_reduce_over_axis1 = + (simplified_iteration_dst_strides[0] == 1) && + (static_cast( + simplified_iteration_src_strides[0]) == reduction_nelems); + } + else if (static_cast( + simplified_reduction_src_strides[0]) == iter_nelems) + { + mat_reduce_over_axis0 = + (simplified_iteration_dst_strides[0] == 1) && + (simplified_iteration_src_strides[0] == 1); + } + + if (mat_reduce_over_axis1 || array_reduce_all_elems) { + using contig_fn_ptr_T = + typename std::remove_all_extents::type; + contig_fn_ptr_T fn; + if (supports_atomics) { + fn = axis1_atomic_dispatch_table[src_typeid][dst_typeid]; + } + else { + fn = axis1_temps_dispatch_table[src_typeid][dst_typeid]; + } + if (fn != nullptr) { + sycl::event reduction_over_axis1_contig_ev = + 
fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis1_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis1_contig_ev); + } + } + else if (mat_reduce_over_axis0) { + using contig_fn_ptr_T = + typename std::remove_all_extents::type; + contig_fn_ptr_T fn; + if (supports_atomics) { + fn = axis0_atomic_dispatch_table[src_typeid][dst_typeid]; + } + else { + fn = axis0_temps_dispatch_table[src_typeid][dst_typeid]; + } + if (fn != nullptr) { + sycl::event reduction_over_axis0_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis0_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis0_contig_ev); + } + } + } + + // remove_all_extents gets underlying type of table + using strided_fn_ptr_T = + typename std::remove_all_extents::type; + strided_fn_ptr_T fn = nullptr; + + if (supports_atomics) { + fn = atomic_dispatch_table[src_typeid][dst_typeid]; + } + + if (fn == nullptr) { + // use slower reduction implementation using temporaries + fn = temps_dispatch_table[src_typeid][dst_typeid]; + if (fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + } + + std::vector host_task_events{}; + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto arrays_metainfo_packing_triple_ = + device_allocate_and_pack( + exec_q, host_task_events, + // iteration metadata + simplified_iteration_shape, simplified_iteration_src_strides, + simplified_iteration_dst_strides, + // reduction metadata + simplified_reduction_shape, simplified_reduction_src_strides); + auto tmp_alloc_owner = + 
std::move(std::get<0>(arrays_metainfo_packing_triple_)); + const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + const py::ssize_t *temp_allocation_ptr = tmp_alloc_owner.get(); + + const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + const py::ssize_t *reduction_shape_stride = + temp_allocation_ptr + 3 * simplified_iteration_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.resize(depends.size()); + std::copy(depends.begin(), depends.end(), all_deps.begin()); + all_deps.push_back(copy_metadata_ev); + + auto reduction_ev = + fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), dst.get_data(), + iteration_nd, iter_shape_and_strides, iteration_src_offset, + iteration_dst_offset, + reduction_nd, // number dimensions being reduced + reduction_shape_stride, reduction_src_offset, all_deps); + + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {reduction_ev}, tmp_alloc_owner); + host_task_events.push_back(temp_cleanup_ev); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(keep_args_event, reduction_ev); +} + +/* ================= No atomic reductions ====================== */ + +/*! 
@brief Template implementing Python API for reduction over axis without + * atomics */ +template +std::pair py_tree_reduction_over_axis( + const dpctl::tensor::usm_ndarray &src, + int trailing_dims_to_reduce, // comp over this many trailing indexes + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const strided_fnT &temps_dispatch_table, + const contig_fnT &axis0_temps_dispatch_table, + const contig_fnT &axis1_temps_dispatch_table) +{ + int src_nd = src.get_ndim(); + int iteration_nd = src_nd - trailing_dims_to_reduce; + if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_reduce must be positive, but no " + "greater than rank of the array being reduced"); + } + + int dst_nd = dst.get_ndim(); + if (dst_nd != iteration_nd) { + throw py::value_error("Destination array rank does not match input " + "array rank and number of reduced dimensions"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + for (int i = 0; same_shapes && (i < dst_nd); ++i) { + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); + } + + if (!same_shapes) { + throw py::value_error("Destination shape does not match unreduced " + "dimensions of the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + std::size_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + std::size_t reduction_nelems(1); + for (int i = dst_nd; i < src_nd; ++i) { + reduction_nelems *= static_cast(src_shape_ptr[i]); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + 
throw py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + namespace td_ns = dpctl::tensor::type_dispatch; + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + // handle special case when both reduction and iteration are 1D contiguous + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + if ((is_src_c_contig && is_dst_c_contig) || + (is_src_f_contig && dst_nelems == 1)) + { + auto fn = axis1_temps_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + std::size_t iter_nelems = dst_nelems; + + static constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + else if (is_src_f_contig && + ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) + { + auto fn = axis0_temps_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + std::size_t iter_nelems = dst_nelems; + + static constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = 
dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + + using dpctl::tensor::py_internal::simplify_iteration_space; + using dpctl::tensor::py_internal::simplify_iteration_space_1; + + auto const &src_shape_vecs = src.get_shape_vector(); + auto const &src_strides_vecs = src.get_strides_vector(); + auto const &dst_strides_vecs = dst.get_strides_vector(); + + int reduction_nd = trailing_dims_to_reduce; + const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd; + using shT = std::vector; + shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd, + std::end(src_strides_vecs)); + + shT simplified_reduction_shape; + shT simplified_reduction_src_strides; + py::ssize_t reduction_src_offset(0); + + simplify_iteration_space_1( + reduction_nd, reduction_shape_ptr, reduction_src_strides, + // output + simplified_reduction_shape, simplified_reduction_src_strides, + reduction_src_offset); + + const py::ssize_t *iteration_shape_ptr = src_shape_ptr; + + shT iteration_src_strides(std::begin(src_strides_vecs), + std::begin(src_strides_vecs) + iteration_nd); + shT const &iteration_dst_strides = dst_strides_vecs; + + shT simplified_iteration_shape; + shT simplified_iteration_src_strides; + shT simplified_iteration_dst_strides; + py::ssize_t iteration_src_offset(0); + py::ssize_t iteration_dst_offset(0); + + if (iteration_nd == 0) { + if (dst_nelems != 1) { + throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); + } + iteration_nd = 1; + simplified_iteration_shape.push_back(1); + simplified_iteration_src_strides.push_back(0); + simplified_iteration_dst_strides.push_back(0); + } + else { + simplify_iteration_space(iteration_nd, iteration_shape_ptr, + iteration_src_strides, iteration_dst_strides, + // output + simplified_iteration_shape, + simplified_iteration_src_strides, + simplified_iteration_dst_strides, + iteration_src_offset, 
iteration_dst_offset); + } + + if ((reduction_nd == 1) && (iteration_nd == 1)) { + bool mat_reduce_over_axis1 = false; + bool mat_reduce_over_axis0 = false; + bool array_reduce_all_elems = false; + std::size_t iter_nelems = dst_nelems; + + if (simplified_reduction_src_strides[0] == 1) { + array_reduce_all_elems = (simplified_iteration_shape[0] == 1); + mat_reduce_over_axis1 = + (simplified_iteration_dst_strides[0] == 1) && + (static_cast( + simplified_iteration_src_strides[0]) == reduction_nelems); + } + else if (static_cast( + simplified_reduction_src_strides[0]) == iter_nelems) + { + mat_reduce_over_axis0 = + (simplified_iteration_dst_strides[0] == 1) && + (simplified_iteration_src_strides[0] == 1); + } + + if (mat_reduce_over_axis1 || array_reduce_all_elems) { + auto fn = axis1_temps_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + sycl::event reduction_over_axis1_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis1_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis1_contig_ev); + } + } + else if (mat_reduce_over_axis0) { + auto fn = axis0_temps_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + sycl::event reduction_over_axis0_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis0_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis0_contig_ev); + } + } + } + + auto fn = temps_dispatch_table[src_typeid][dst_typeid]; + if (fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + + std::vector host_task_events{}; + using 
dpctl::tensor::offset_utils::device_allocate_and_pack; + auto arrays_metainfo_packing_triple_ = + device_allocate_and_pack( + exec_q, host_task_events, + // iteration metadata + simplified_iteration_shape, simplified_iteration_src_strides, + simplified_iteration_dst_strides, + // reduction metadata + simplified_reduction_shape, simplified_reduction_src_strides); + auto tmp_owner = std::move(std::get<0>(arrays_metainfo_packing_triple_)); + const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + const py::ssize_t *temp_allocation_ptr = tmp_owner.get(); + + const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + const py::ssize_t *reduction_shape_stride = + temp_allocation_ptr + 3 * simplified_iteration_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.resize(depends.size()); + std::copy(depends.begin(), depends.end(), all_deps.begin()); + all_deps.push_back(copy_metadata_ev); + + auto reduction_ev = + fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), dst.get_data(), + iteration_nd, iter_shape_and_strides, iteration_src_offset, + iteration_dst_offset, + reduction_nd, // number dimensions being reduced + reduction_shape_stride, reduction_src_offset, all_deps); + + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {reduction_ev}, tmp_owner); + host_task_events.push_back(temp_cleanup_ev); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(keep_args_event, reduction_ev); +} + +/*! 
@brief Template implementing Python API for searching over an axis */ +template +std::pair py_search_over_axis( + const dpctl::tensor::usm_ndarray &src, + int trailing_dims_to_reduce, // comp over this many trailing indexes + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const strided_fnT &strided_dispatch_table, + const contig_fnT &axis0_contig_dispatch_table, + const contig_fnT &axis1_contig_dispatch_table) +{ + int src_nd = src.get_ndim(); + int iteration_nd = src_nd - trailing_dims_to_reduce; + if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_reduce must be positive, but no " + "greater than rank of the array being reduced"); + } + + int dst_nd = dst.get_ndim(); + if (dst_nd != iteration_nd) { + throw py::value_error("Destination array rank does not match input " + "array rank and number of reduced dimensions"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + for (int i = 0; same_shapes && (i < dst_nd); ++i) { + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); + } + + if (!same_shapes) { + throw py::value_error("Destination shape does not match unreduced " + "dimensions of the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + std::size_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + std::size_t reduction_nelems(1); + for (int i = dst_nd; i < src_nd; ++i) { + reduction_nelems *= static_cast(src_shape_ptr[i]); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw 
py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + namespace td_ns = dpctl::tensor::type_dispatch; + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + // handle special case when both reduction and iteration are 1D contiguous + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + if (is_src_c_contig && is_dst_c_contig) { + auto fn = axis1_contig_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + std::size_t iter_nelems = dst_nelems; + + static constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + else if (is_src_f_contig && dst_nd == 1) { + auto fn = axis0_contig_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + std::size_t iter_nelems = dst_nelems; + + static constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return 
std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + + using dpctl::tensor::py_internal::simplify_iteration_space; + + auto const &src_shape_vecs = src.get_shape_vector(); + auto const &src_strides_vecs = src.get_strides_vector(); + auto const &dst_strides_vecs = dst.get_strides_vector(); + + int reduction_nd = trailing_dims_to_reduce; + const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd; + using shT = std::vector; + shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd, + std::end(src_strides_vecs)); + + shT compact_reduction_shape; + shT compact_reduction_src_strides; + py::ssize_t reduction_src_offset(0); + + compact_iteration_space( + reduction_nd, reduction_shape_ptr, reduction_src_strides, + // output + compact_reduction_shape, compact_reduction_src_strides); + + const py::ssize_t *iteration_shape_ptr = src_shape_ptr; + + shT iteration_src_strides(std::begin(src_strides_vecs), + std::begin(src_strides_vecs) + iteration_nd); + shT const &iteration_dst_strides = dst_strides_vecs; + + shT simplified_iteration_shape; + shT simplified_iteration_src_strides; + shT simplified_iteration_dst_strides; + py::ssize_t iteration_src_offset(0); + py::ssize_t iteration_dst_offset(0); + + if (iteration_nd == 0) { + if (dst_nelems != 1) { + throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); + } + iteration_nd = 1; + simplified_iteration_shape.push_back(1); + simplified_iteration_src_strides.push_back(0); + simplified_iteration_dst_strides.push_back(0); + } + else { + simplify_iteration_space(iteration_nd, iteration_shape_ptr, + iteration_src_strides, iteration_dst_strides, + // output + simplified_iteration_shape, + simplified_iteration_src_strides, + simplified_iteration_dst_strides, + iteration_src_offset, iteration_dst_offset); + } + + if ((reduction_nd == 1) && (iteration_nd == 1)) { + bool mat_reduce_over_axis1 = false; + bool mat_reduce_over_axis0 = false; + std::size_t iter_nelems = dst_nelems; + + if 
(compact_reduction_src_strides[0] == 1) { + mat_reduce_over_axis1 = + (simplified_iteration_dst_strides[0] == 1) && + (static_cast( + simplified_iteration_src_strides[0]) == reduction_nelems); + } + else if (static_cast(compact_reduction_src_strides[0]) == + iter_nelems) + { + mat_reduce_over_axis0 = + (simplified_iteration_dst_strides[0] == 1) && + (simplified_iteration_src_strides[0] == 1); + } + + if (mat_reduce_over_axis1) { + auto fn = axis1_contig_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + sycl::event reduction_over_axis1_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis1_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis1_contig_ev); + } + } + else if (mat_reduce_over_axis0) { + auto fn = axis0_contig_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + sycl::event reduction_over_axis0_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis0_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis0_contig_ev); + } + } + } + + auto fn = strided_dispatch_table[src_typeid][dst_typeid]; + if (fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + + std::vector host_task_events{}; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + + auto arrays_metainfo_packing_triple_ = + device_allocate_and_pack( + exec_q, host_task_events, + // iteration metadata + simplified_iteration_shape, simplified_iteration_src_strides, + simplified_iteration_dst_strides, + // reduction metadata + compact_reduction_shape, 
compact_reduction_src_strides); + auto tmp_owner = std::move(std::get<0>(arrays_metainfo_packing_triple_)); + const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + const py::ssize_t *temp_allocation_ptr = tmp_owner.get(); + + const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + const py::ssize_t *reduction_shape_stride = + temp_allocation_ptr + 3 * simplified_iteration_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.resize(depends.size()); + std::copy(depends.begin(), depends.end(), all_deps.begin()); + all_deps.push_back(copy_metadata_ev); + + auto comp_ev = fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_nd, iter_shape_and_strides, + iteration_src_offset, iteration_dst_offset, + reduction_nd, // number dimensions being reduced + reduction_shape_stride, reduction_src_offset, all_deps); + + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {comp_ev}, tmp_owner); + host_task_events.push_back(temp_cleanup_ev); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(keep_args_event, comp_ev); +} + +/* ================= Atomic only reductions ====================== */ + +/*! 
@brief Template implementing Python API for boolean reductions over an axis + */ +template +std::pair + py_boolean_reduction(const dpctl::tensor::usm_ndarray &src, + int trailing_dims_to_reduce, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const contig_dispatchT &axis1_contig_dispatch_vector, + const contig_dispatchT &axis0_contig_dispatch_vector, + const strided_dispatchT &strided_dispatch_vector, + const atomic_support_fnT check_atomic_support) +{ + int src_nd = src.get_ndim(); + int iter_nd = src_nd - trailing_dims_to_reduce; + if (trailing_dims_to_reduce <= 0 || iter_nd < 0) { + throw py::value_error("Trailing_dim_to_reduce must be positive, but no " + "greater than rank of the array being reduced"); + } + + int dst_nd = dst.get_ndim(); + if (dst_nd != iter_nd) { + throw py::value_error("Destination array rank does not match input " + "array rank and number of reduced dimensions"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + for (int i = 0; same_shapes && (i < dst_nd); ++i) { + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); + } + + if (!same_shapes) { + throw py::value_error("Destination shape does not match unreduced " + "dimensions of the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + std::size_t dst_nelems = dst.get_size(); + + std::size_t red_nelems(1); + for (int i = dst_nd; i < src_nd; ++i) { + red_nelems *= static_cast(src_shape_ptr[i]); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(dst, src)) { + throw py::value_error("Arrays are expected to have no memory overlap"); + } + + 
dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems); + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + static constexpr int int32_typeid = + static_cast(td_ns::typenum_t::INT32); + if (dst_typeid != int32_typeid) { + throw py::value_error( + "Unexpected data type of destination array, expecting 'int32'"); + } + + void *data_ptr = dst.get_data(); + const auto &ctx = exec_q.get_context(); + auto usm_type = sycl::get_pointer_type(data_ptr, ctx); + + bool supports_atomics = check_atomic_support(exec_q, usm_type); + if (!supports_atomics) { + throw py::value_error( + "This reduction is not supported for this device and usm_type."); + } + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + + if ((is_src_c_contig && is_dst_c_contig) || + (is_src_f_contig && dst_nelems == 0)) + { + auto fn = axis1_contig_dispatch_vector[src_typeid]; + static constexpr py::ssize_t zero_offset = 0; + + sycl::event red_ev = + fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, zero_offset, + zero_offset, zero_offset, depends); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev}); + + return std::make_pair(keep_args_event, red_ev); + } + else if (is_src_f_contig && + ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) + { + auto fn = axis0_contig_dispatch_vector[src_typeid]; + static constexpr py::ssize_t zero_offset = 0; + + sycl::event red_ev = + fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, zero_offset, + zero_offset, zero_offset, depends); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, 
{src, dst}, {red_ev}); + + return std::make_pair(keep_args_event, red_ev); + } + + auto src_shape_vecs = src.get_shape_vector(); + auto src_strides_vecs = src.get_strides_vector(); + auto dst_strides_vecs = dst.get_strides_vector(); + + int simplified_red_nd = trailing_dims_to_reduce; + + using shT = std::vector; + shT red_src_strides(std::begin(src_strides_vecs) + dst_nd, + std::end(src_strides_vecs)); + + shT simplified_red_shape; + shT simplified_red_src_strides; + py::ssize_t red_src_offset(0); + + using dpctl::tensor::py_internal::simplify_iteration_space_1; + simplify_iteration_space_1( + simplified_red_nd, src_shape_ptr + dst_nd, red_src_strides, + // output + simplified_red_shape, simplified_red_src_strides, red_src_offset); + + shT iter_src_strides(std::begin(src_strides_vecs), + std::begin(src_strides_vecs) + iter_nd); + shT const &iter_dst_strides = dst_strides_vecs; + + shT simplified_iter_shape; + shT simplified_iter_src_strides; + shT simplified_iter_dst_strides; + py::ssize_t iter_src_offset(0); + py::ssize_t iter_dst_offset(0); + + if (iter_nd == 0) { + if (dst_nelems != 1) { + throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); + } + iter_nd = 1; + simplified_iter_shape.push_back(1); + simplified_iter_src_strides.push_back(0); + simplified_iter_dst_strides.push_back(0); + } + else { + using dpctl::tensor::py_internal::simplify_iteration_space; + simplify_iteration_space( + iter_nd, src_shape_ptr, iter_src_strides, iter_dst_strides, + // output + simplified_iter_shape, simplified_iter_src_strides, + simplified_iter_dst_strides, iter_src_offset, iter_dst_offset); + } + + if (simplified_red_nd == 1 && iter_nd == 1) { + bool mat_reduce_over_axis1 = false; + bool mat_reduce_over_axis0 = false; + bool array_reduce_all_elems = false; + std::size_t iter_nelems = dst_nelems; + + if (simplified_red_src_strides[0] == 1) { + array_reduce_all_elems = (simplified_iter_shape[0] == 1); + mat_reduce_over_axis1 = + (simplified_iter_dst_strides[0] == 
1) && + (static_cast(simplified_iter_src_strides[0]) == + red_nelems); + } + else if (static_cast(simplified_red_src_strides[0]) == + iter_nelems) { + mat_reduce_over_axis0 = (simplified_iter_dst_strides[0] == 1) && + (simplified_iter_src_strides[0] == 1); + } + if (mat_reduce_over_axis1 || array_reduce_all_elems) { + auto fn = axis1_contig_dispatch_vector[src_typeid]; + + sycl::event red_ev = + fn(exec_q, iter_nelems, red_nelems, src_data, dst_data, + iter_src_offset, iter_dst_offset, red_src_offset, depends); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev}); + + return std::make_pair(keep_args_event, red_ev); + } + else if (mat_reduce_over_axis0) { + auto fn = axis0_contig_dispatch_vector[src_typeid]; + + sycl::event red_ev = + fn(exec_q, iter_nelems, red_nelems, src_data, dst_data, + iter_src_offset, iter_dst_offset, red_src_offset, depends); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev}); + + return std::make_pair(keep_args_event, red_ev); + } + } + + auto fn = strided_dispatch_vector[src_typeid]; + + std::vector host_task_events{}; + auto iter_red_metadata_packing_triple_ = + dpctl::tensor::offset_utils::device_allocate_and_pack( + exec_q, host_task_events, simplified_iter_shape, + simplified_iter_src_strides, simplified_iter_dst_strides, + simplified_red_shape, simplified_red_src_strides); + auto packed_shapes_strides_owner = + std::move(std::get<0>(iter_red_metadata_packing_triple_)); + const auto ©_metadata_ev = + std::get<2>(iter_red_metadata_packing_triple_); + const py::ssize_t *packed_shapes_and_strides = + packed_shapes_strides_owner.get(); + + const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides; + const py::ssize_t *red_shape_stride = + packed_shapes_and_strides + 3 * simplified_iter_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.resize(depends.size()); + std::copy(depends.begin(), depends.end(), 
all_deps.begin()); + all_deps.push_back(copy_metadata_ev); + + auto red_ev = + fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, iter_nd, + iter_shape_and_strides, iter_src_offset, iter_dst_offset, + simplified_red_nd, red_shape_stride, red_src_offset, all_deps); + + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {red_ev}, packed_shapes_strides_owner); + host_task_events.push_back(temp_cleanup_ev); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(keep_args_event, red_ev); +} + +extern void init_reduction_functions(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_reductions.cpp b/dpctl_ext/tensor/libtensor/source/tensor_reductions.cpp new file mode 100644 index 000000000000..f0901bbf7ab3 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/tensor_reductions.cpp @@ -0,0 +1,45 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#include + +#include "reductions/reduction_common.hpp" + +namespace py = pybind11; + +PYBIND11_MODULE(_tensor_reductions_impl, m) +{ + dpctl::tensor::py_internal::init_reduction_functions(m); +} From c4f249687952ea426a7ba190cc1b848ae9fcd6e5 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 03:49:40 -0800 Subject: [PATCH 129/145] Add TODO with incorrect logic in reductions_over_axis.hpp --- .../tensor/libtensor/source/reductions/reduction_over_axis.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp index 5397fe24693d..e16a80f129fc 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp @@ -328,6 +328,7 @@ std::pair py_reduction_over_axis( using dpctl::tensor::py_internal::simplify_iteration_space; using dpctl::tensor::py_internal::simplify_iteration_space_1; + // TODO: not used anywhere auto const &src_shape_vecs = src.get_shape_vector(); auto const &src_strides_vecs = src.get_strides_vector(); auto const &dst_strides_vecs = dst.get_strides_vector(); @@ -927,6 +928,7 @@ std::pair py_search_over_axis( shT compact_reduction_src_strides; py::ssize_t reduction_src_offset(0); + // TODO: not used anywhere compact_iteration_space( reduction_nd, reduction_shape_ptr, reduction_src_strides, // output @@ -1157,6 +1159,7 @@ std::pair bool is_src_f_contig = src.is_f_contiguous(); bool is_dst_c_contig = dst.is_c_contiguous(); + // TODO(review): the `dst_nelems == 0` check in the f-contig branch below looks suspicious — should it be `dst_nelems == 1`? Verify and fix.
if ((is_src_c_contig && is_dst_c_contig) || (is_src_f_contig && dst_nelems == 0)) { From bd3add0adc32a44f9a10d268bcdd774ef8be2d66 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 03:52:14 -0800 Subject: [PATCH 130/145] Move ti.all() to dpctl_ext.tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_manipulation_functions.py | 4 +- dpctl_ext/tensor/_utility_functions.py | 136 ++++++++++++++++++++ dpnp/dpnp_iface_logic.py | 2 +- 4 files changed, 141 insertions(+), 3 deletions(-) create mode 100644 dpctl_ext/tensor/_utility_functions.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 9bfda2446889..5e4bbe91ea52 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -79,6 +79,7 @@ unstack, ) from dpctl_ext.tensor._reshape import reshape +from dpctl_ext.tensor._utility_functions import all from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum from ._clip import clip @@ -94,6 +95,7 @@ from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type __all__ = [ + "all", "arange", "argsort", "asarray", diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 08459dcaea76..e2d55c533bc0 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -624,7 +624,7 @@ def repeat(x, repeats, /, *, axis=None): "'repeats' array must be broadcastable to the size of " "the repeated axis" ) - if not dpt.all(repeats >= 0): + if not dpt_ext.all(repeats >= 0): raise ValueError("'repeats' elements must be positive") elif isinstance(repeats, (tuple, list, range)): @@ -646,7 +646,7 @@ def repeat(x, repeats, /, *, axis=None): repeats = dpt_ext.asarray( repeats, dtype=dpt.int64, usm_type=usm_type, sycl_queue=exec_q ) - if not dpt.all(repeats >= 0): + if not dpt_ext.all(repeats >= 0): raise ValueError("`repeats` elements must be positive") else: raise 
TypeError( diff --git a/dpctl_ext/tensor/_utility_functions.py b/dpctl_ext/tensor/_utility_functions.py new file mode 100644 index 000000000000..678b93cdfeec --- /dev/null +++ b/dpctl_ext/tensor/_utility_functions.py @@ -0,0 +1,136 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + + +import dpctl.tensor as dpt +import dpctl.utils as du + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor._tensor_impl as ti +import dpctl_ext.tensor._tensor_reductions_impl as tri + +from ._numpy_helper import normalize_axis_tuple + + +def _boolean_reduction(x, axis, keepdims, func): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + + nd = x.ndim + if axis is None: + red_nd = nd + # case of a scalar + if red_nd == 0: + return dpt_ext.astype(x, dpt.bool) + x_tmp = x + res_shape = () + perm = list(range(nd)) + else: + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + + red_nd = len(axis) + # check for axis=() + if red_nd == 0: + return dpt_ext.astype(x, dpt.bool) + perm = [i for i in range(nd) if i not in axis] + list(axis) + x_tmp = dpt_ext.permute_dims(x, perm) + res_shape = x_tmp.shape[: nd - red_nd] + + exec_q = x.sycl_queue + res_usm_type = x.usm_type + + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + # always allocate the temporary as + # int32 and usm-device to ensure that atomic updates + # are supported + res_tmp = dpt_ext.empty( + res_shape, + dtype=dpt.int32, + usm_type="device", + sycl_queue=exec_q, + ) + hev0, ev0 = func( + src=x_tmp, + trailing_dims_to_reduce=red_nd, + dst=res_tmp, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(hev0, ev0) + + # copy to boolean result array + res = dpt_ext.empty( + res_shape, + dtype=dpt.bool, + usm_type=res_usm_type, + sycl_queue=exec_q, + ) + hev1, ev1 = ti._copy_usm_ndarray_into_usm_ndarray( + src=res_tmp, dst=res, sycl_queue=exec_q, depends=[ev0] + ) + _manager.add_event_pair(hev1, ev1) + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = 
sorted(range(nd), key=lambda d: perm[d]) + res = dpt_ext.permute_dims(dpt_ext.reshape(res, res_shape), inv_perm) + return res + + +def all(x, /, *, axis=None, keepdims=False): + """ + all(x, axis=None, keepdims=False) + + Tests whether all input array elements evaluate to True along a given axis. + + Args: + x (usm_ndarray): Input array. + axis (Optional[Union[int, Tuple[int,...]]]): Axis (or axes) + along which to perform a logical AND reduction. + When `axis` is `None`, a logical AND reduction + is performed over all dimensions of `x`. + If `axis` is negative, the axis is counted from + the last dimension to the first. + Default: `None`. + keepdims (bool, optional): If `True`, the reduced axes are included + in the result as singleton dimensions, and the result is + broadcastable to the input array shape. + If `False`, the reduced axes are not included in the result. + Default: `False`. + + Returns: + usm_ndarray: + An array with a data type of `bool` + containing the results of the logical AND reduction. 
+ """ + return _boolean_reduction(x, axis, keepdims, tri._all) diff --git a/dpnp/dpnp_iface_logic.py b/dpnp/dpnp_iface_logic.py index 3e3501b14c7c..9713071476f4 100644 --- a/dpnp/dpnp_iface_logic.py +++ b/dpnp/dpnp_iface_logic.py @@ -214,7 +214,7 @@ def all(a, /, axis=None, out=None, keepdims=False, *, where=True): dpnp.check_limitations(where=where) usm_a = dpnp.get_usm_ndarray(a) - usm_res = dpt.all(usm_a, axis=axis, keepdims=keepdims) + usm_res = dpt_ext.all(usm_a, axis=axis, keepdims=keepdims) # TODO: temporary solution until dpt.all supports out parameter return dpnp.get_result_array(usm_res, out) From b1953df0ab74d61d7282bf3a6925babe8114e21c Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 04:06:06 -0800 Subject: [PATCH 131/145] Move _any to _tensor_reductions_impl --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../libtensor/source/reductions/any.cpp | 164 ++++++++++++++++++ .../libtensor/source/reductions/any.hpp | 46 +++++ .../source/reductions/reduction_common.cpp | 4 +- 4 files changed, 213 insertions(+), 3 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/any.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/any.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 872e3f786c65..e8bc17e39f55 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -72,7 +72,7 @@ set(_accumulator_sources set(_reduction_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduction_common.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/all.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/any.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/any.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmax.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmin.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/logsumexp.cpp diff --git 
a/dpctl_ext/tensor/libtensor/source/reductions/any.cpp b/dpctl_ext/tensor/libtensor/source/reductions/any.cpp new file mode 100644 index 000000000000..6859e46cbc4a --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/any.cpp @@ -0,0 +1,164 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/reductions.hpp" +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + any_reduction_strided_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + any_reduction_axis1_contig_dispatch_vector[td_ns::num_types]; +static reduction_contig_impl_fn_ptr + any_reduction_axis0_contig_dispatch_vector[td_ns::num_types]; + +template +struct AnyStridedFactory +{ + fnT get() const + { + using dstTy = std::int32_t; + using ReductionOpT = sycl::logical_or; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl; + } +}; + +template +struct AnyAxis1ContigFactory +{ + fnT get() const + { + using dstTy = std::int32_t; + using ReductionOpT = sycl::logical_or; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl; + } +}; + +template +struct AnyAxis0ContigFactory +{ + fnT get() const + { + using dstTy = std::int32_t; + using ReductionOpT = sycl::logical_or; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl; + } +}; + +void populate_any_dispatch_vectors(void) +{ + using td_ns::DispatchVectorBuilder; + + DispatchVectorBuilder + any_dvb1; + 
any_dvb1.populate_dispatch_vector(any_reduction_strided_dispatch_vector); + + DispatchVectorBuilder + any_dvb2; + any_dvb2.populate_dispatch_vector( + any_reduction_axis1_contig_dispatch_vector); + + DispatchVectorBuilder + any_dvb3; + any_dvb3.populate_dispatch_vector( + any_reduction_axis0_contig_dispatch_vector); +}; + +using atomic_support::atomic_support_fn_ptr_t; +using atomic_support::check_atomic_support; +static atomic_support_fn_ptr_t any_atomic_support = + check_atomic_support; + +} // namespace impl + +void init_any(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_any_dispatch_vectors(); + using impl::any_reduction_axis0_contig_dispatch_vector; + using impl::any_reduction_axis1_contig_dispatch_vector; + using impl::any_reduction_strided_dispatch_vector; + + using impl::any_atomic_support; + + auto any_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_boolean_reduction( + src, trailing_dims_to_reduce, dst, exec_q, depends, + any_reduction_axis1_contig_dispatch_vector, + any_reduction_axis0_contig_dispatch_vector, + any_reduction_strided_dispatch_vector, any_atomic_support); + }; + m.def("_any", any_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/any.hpp b/dpctl_ext/tensor/libtensor/source/reductions/any.hpp new file mode 100644 index 000000000000..4e368a674615 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/any.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_any(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp index e3858338171f..027c80b27cc2 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp @@ -36,7 +36,7 @@ #include #include "all.hpp" -// #include "any.hpp" +#include "any.hpp" // #include "argmax.hpp" // #include "argmin.hpp" // #include "logsumexp.hpp" @@ -55,7 +55,7 @@ namespace dpctl::tensor::py_internal void init_reduction_functions(py::module_ m) { init_all(m); - // init_any(m); + init_any(m); // init_argmax(m); // init_argmin(m); // init_logsumexp(m); From 32802d682a8b52dd4254f7c3599314f1f84553e7 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 04:08:40 -0800 Subject: [PATCH 132/145] Move ti.any()/ti.diff() and reuse them in dpnp --- dpctl_ext/tensor/__init__.py | 4 +- dpctl_ext/tensor/_utility_functions.py | 375 ++++++++++++++++++++++++- dpnp/dpnp_iface_logic.py | 7 +- dpnp/dpnp_iface_mathematical.py | 4 +- 4 files changed, 383 insertions(+), 7 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 5e4bbe91ea52..87bc9f544ab6 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -79,7 +79,7 @@ unstack, ) from dpctl_ext.tensor._reshape import reshape -from dpctl_ext.tensor._utility_functions import all +from dpctl_ext.tensor._utility_functions import all, any, diff from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum from ._clip import clip @@ -96,6 +96,7 @@ __all__ = [ "all", + "any", "arange", "argsort", "asarray", @@ -110,6 +111,7 @@ 
"cumulative_logsumexp", "cumulative_prod", "cumulative_sum", + "diff", "empty", "empty_like", "extract", diff --git a/dpctl_ext/tensor/_utility_functions.py b/dpctl_ext/tensor/_utility_functions.py index 678b93cdfeec..a122ac3d6cea 100644 --- a/dpctl_ext/tensor/_utility_functions.py +++ b/dpctl_ext/tensor/_utility_functions.py @@ -26,6 +26,8 @@ # THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** +import builtins +import operator import dpctl.tensor as dpt import dpctl.utils as du @@ -36,7 +38,17 @@ import dpctl_ext.tensor._tensor_impl as ti import dpctl_ext.tensor._tensor_reductions_impl as tri -from ._numpy_helper import normalize_axis_tuple +from ._numpy_helper import normalize_axis_index, normalize_axis_tuple +from ._scalar_utils import ( + _get_dtype, + _get_queue_usm_type, + _get_shape, + _validate_dtype, +) +from ._type_utils import ( + _resolve_one_strong_one_weak_types, + _resolve_one_strong_two_weak_types, +) def _boolean_reduction(x, axis, keepdims, func): @@ -134,3 +146,364 @@ def all(x, /, *, axis=None, keepdims=False): containing the results of the logical AND reduction. """ return _boolean_reduction(x, axis, keepdims, tri._all) + + +def any(x, /, *, axis=None, keepdims=False): + """ + any(x, axis=None, keepdims=False) + + Tests whether any input array elements evaluate to True along a given axis. + + Args: + x (usm_ndarray): Input array. + axis (Optional[Union[int, Tuple[int,...]]]): Axis (or axes) + along which to perform a logical OR reduction. + When `axis` is `None`, a logical OR reduction + is performed over all dimensions of `x`. + If `axis` is negative, the axis is counted from + the last dimension to the first. + Default: `None`. + keepdims (bool, optional): If `True`, the reduced axes are included + in the result as singleton dimensions, and the result is + broadcastable to the input array shape. + If `False`, the reduced axes are not included in the result. + Default: `False`. 
+ + Returns: + usm_ndarray: + An array with a data type of `bool` + containing the results of the logical OR reduction. + """ + return _boolean_reduction(x, axis, keepdims, tri._any) + + +def _validate_diff_shape(sh1, sh2, axis): + """ + Utility for validating that two shapes `sh1` and `sh2` + are possible to concatenate along `axis`. + """ + if not sh2: + # scalars will always be accepted + return True + else: + sh1_ndim = len(sh1) + if sh1_ndim == len(sh2) and builtins.all( + sh1[i] == sh2[i] for i in range(sh1_ndim) if i != axis + ): + return True + else: + return False + + +def _concat_diff_input(arr, axis, prepend, append): + """ + Concatenates `arr`, `prepend` and, `append` along `axis`, + where `arr` is an array and `prepend` and `append` are + any mixture of arrays and scalars. + """ + if prepend is not None and append is not None: + q1, x_usm_type = arr.sycl_queue, arr.usm_type + q2, prepend_usm_type = _get_queue_usm_type(prepend) + q3, append_usm_type = _get_queue_usm_type(append) + if q2 is None and q3 is None: + exec_q = q1 + coerced_usm_type = x_usm_type + elif q3 is None: + exec_q = du.get_execution_queue((q1, q2)) + if exec_q is None: + raise du.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + coerced_usm_type = du.get_coerced_usm_type( + ( + x_usm_type, + prepend_usm_type, + ) + ) + elif q2 is None: + exec_q = du.get_execution_queue((q1, q3)) + if exec_q is None: + raise du.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + coerced_usm_type = du.get_coerced_usm_type( + ( + x_usm_type, + append_usm_type, + ) + ) + else: + exec_q = du.get_execution_queue((q1, q2, q3)) + if exec_q is None: + raise du.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." 
+ ) + coerced_usm_type = du.get_coerced_usm_type( + ( + x_usm_type, + prepend_usm_type, + append_usm_type, + ) + ) + du.validate_usm_type(coerced_usm_type, allow_none=False) + arr_shape = arr.shape + prepend_shape = _get_shape(prepend) + append_shape = _get_shape(append) + if not builtins.all( + isinstance(s, (tuple, list)) + for s in ( + prepend_shape, + append_shape, + ) + ): + raise TypeError( + "Shape of arguments can not be inferred. " + "Arguments are expected to be " + "lists, tuples, or both" + ) + valid_prepend_shape = _validate_diff_shape( + arr_shape, prepend_shape, axis + ) + if not valid_prepend_shape: + raise ValueError( + f"`diff` argument `prepend` with shape {prepend_shape} is " + f"invalid for first input with shape {arr_shape}" + ) + valid_append_shape = _validate_diff_shape(arr_shape, append_shape, axis) + if not valid_append_shape: + raise ValueError( + f"`diff` argument `append` with shape {append_shape} is invalid" + f" for first input with shape {arr_shape}" + ) + sycl_dev = exec_q.sycl_device + arr_dtype = arr.dtype + prepend_dtype = _get_dtype(prepend, sycl_dev) + append_dtype = _get_dtype(append, sycl_dev) + if not builtins.all( + _validate_dtype(o) for o in (prepend_dtype, append_dtype) + ): + raise ValueError("Operands have unsupported data types") + prepend_dtype, append_dtype = _resolve_one_strong_two_weak_types( + arr_dtype, prepend_dtype, append_dtype, sycl_dev + ) + if isinstance(prepend, dpt.usm_ndarray): + a_prepend = prepend + else: + a_prepend = dpt_ext.asarray( + prepend, + dtype=prepend_dtype, + usm_type=coerced_usm_type, + sycl_queue=exec_q, + ) + if isinstance(append, dpt.usm_ndarray): + a_append = append + else: + a_append = dpt_ext.asarray( + append, + dtype=append_dtype, + usm_type=coerced_usm_type, + sycl_queue=exec_q, + ) + if not prepend_shape: + prepend_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :] + a_prepend = dpt_ext.broadcast_to(a_prepend, prepend_shape) + if not append_shape: + append_shape = 
arr_shape[:axis] + (1,) + arr_shape[axis + 1 :] + a_append = dpt_ext.broadcast_to(a_append, append_shape) + return dpt_ext.concat((a_prepend, arr, a_append), axis=axis) + elif prepend is not None: + q1, x_usm_type = arr.sycl_queue, arr.usm_type + q2, prepend_usm_type = _get_queue_usm_type(prepend) + if q2 is None: + exec_q = q1 + coerced_usm_type = x_usm_type + else: + exec_q = du.get_execution_queue((q1, q2)) + if exec_q is None: + raise du.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + coerced_usm_type = du.get_coerced_usm_type( + ( + x_usm_type, + prepend_usm_type, + ) + ) + du.validate_usm_type(coerced_usm_type, allow_none=False) + arr_shape = arr.shape + prepend_shape = _get_shape(prepend) + if not isinstance(prepend_shape, (tuple, list)): + raise TypeError( + "Shape of argument can not be inferred. " + "Argument is expected to be a " + "list or tuple" + ) + valid_prepend_shape = _validate_diff_shape( + arr_shape, prepend_shape, axis + ) + if not valid_prepend_shape: + raise ValueError( + f"`diff` argument `prepend` with shape {prepend_shape} is " + f"invalid for first input with shape {arr_shape}" + ) + sycl_dev = exec_q.sycl_device + arr_dtype = arr.dtype + prepend_dtype = _get_dtype(prepend, sycl_dev) + if not _validate_dtype(prepend_dtype): + raise ValueError("Operand has unsupported data type") + prepend_dtype = _resolve_one_strong_one_weak_types( + arr_dtype, prepend_dtype, sycl_dev + ) + if isinstance(prepend, dpt.usm_ndarray): + a_prepend = prepend + else: + a_prepend = dpt_ext.asarray( + prepend, + dtype=prepend_dtype, + usm_type=coerced_usm_type, + sycl_queue=exec_q, + ) + if not prepend_shape: + prepend_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :] + a_prepend = dpt_ext.broadcast_to(a_prepend, prepend_shape) + return dpt_ext.concat((a_prepend, arr), axis=axis) + elif append is not None: + q1, x_usm_type = arr.sycl_queue, arr.usm_type + q2, append_usm_type = 
_get_queue_usm_type(append) + if q2 is None: + exec_q = q1 + coerced_usm_type = x_usm_type + else: + exec_q = du.get_execution_queue((q1, q2)) + if exec_q is None: + raise du.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + coerced_usm_type = du.get_coerced_usm_type( + ( + x_usm_type, + append_usm_type, + ) + ) + du.validate_usm_type(coerced_usm_type, allow_none=False) + arr_shape = arr.shape + append_shape = _get_shape(append) + if not isinstance(append_shape, (tuple, list)): + raise TypeError( + "Shape of argument can not be inferred. " + "Argument is expected to be a " + "list or tuple" + ) + valid_append_shape = _validate_diff_shape(arr_shape, append_shape, axis) + if not valid_append_shape: + raise ValueError( + f"`diff` argument `append` with shape {append_shape} is invalid" + f" for first input with shape {arr_shape}" + ) + sycl_dev = exec_q.sycl_device + arr_dtype = arr.dtype + append_dtype = _get_dtype(append, sycl_dev) + if not _validate_dtype(append_dtype): + raise ValueError("Operand has unsupported data type") + append_dtype = _resolve_one_strong_one_weak_types( + arr_dtype, append_dtype, sycl_dev + ) + if isinstance(append, dpt.usm_ndarray): + a_append = append + else: + a_append = dpt_ext.asarray( + append, + dtype=append_dtype, + usm_type=coerced_usm_type, + sycl_queue=exec_q, + ) + if not append_shape: + append_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :] + a_append = dpt_ext.broadcast_to(a_append, append_shape) + return dpt_ext.concat((arr, a_append), axis=axis) + else: + arr1 = arr + return arr1 + + +def diff(x, /, *, axis=-1, n=1, prepend=None, append=None): + """ + Calculates the `n`-th discrete forward difference of `x` along `axis`. + + Args: + x (usm_ndarray): + input array. + axis (int): + axis along which to compute the difference. A valid axis must be on + the interval `[-N, N)`, where `N` is the rank (number of + dimensions) of `x`. 
+ Default: `-1` + n (int): + number of times to recursively compute the difference. + Default: `1`. + prepend (Union[usm_ndarray, bool, int, float, complex]): + value or values to prepend to the specified axis before taking the + difference. + Must have the same shape as `x` except along `axis`, which can have + any shape. + Default: `None`. + append (Union[usm_ndarray, bool, int, float, complex]): + value or values to append to the specified axis before taking the + difference. + Must have the same shape as `x` except along `axis`, which can have + any shape. + Default: `None`. + + Returns: + usm_ndarray: + an array containing the `n`-th differences. The array will have the + same shape as `x`, except along `axis`, which will have shape: + ``prepend.shape[axis] + x.shape[axis] + append.shape[axis] - n`` + + The data type of the returned array is determined by the Type + Promotion Rules. + """ + + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(x)}" + ) + x_nd = x.ndim + axis = normalize_axis_index(operator.index(axis), x_nd) + n = operator.index(n) + if n < 0: + raise ValueError(f"`n` must be positive, got {n}") + arr = _concat_diff_input(x, axis, prepend, append) + if n == 0: + return arr + # form slices and recurse + sl0 = tuple( + slice(None) if i != axis else slice(1, None) for i in range(x_nd) + ) + sl1 = tuple( + slice(None) if i != axis else slice(None, -1) for i in range(x_nd) + ) + + diff_op = dpt.not_equal if x.dtype == dpt.bool else dpt.subtract + if n > 1: + arr_tmp0 = diff_op(arr[sl0], arr[sl1]) + arr_tmp1 = diff_op(arr_tmp0[sl0], arr_tmp0[sl1]) + n = n - 2 + if n > 0: + sl3 = tuple( + slice(None) if i != axis else slice(None, -2) + for i in range(x_nd) + ) + for _ in range(n): + arr_tmp0_sliced = arr_tmp0[sl3] + diff_op(arr_tmp1[sl0], arr_tmp1[sl1], out=arr_tmp0_sliced) + arr_tmp0, arr_tmp1 = arr_tmp1, arr_tmp0_sliced + arr = arr_tmp1 + else: + arr = diff_op(arr[sl0], arr[sl1]) + 
return arr diff --git a/dpnp/dpnp_iface_logic.py b/dpnp/dpnp_iface_logic.py index 9713071476f4..a81416a28e43 100644 --- a/dpnp/dpnp_iface_logic.py +++ b/dpnp/dpnp_iface_logic.py @@ -44,14 +44,13 @@ # pylint: disable=no-name-in-module -import dpctl.tensor as dpt import dpctl.tensor._tensor_elementwise_impl as ti import dpctl.utils as dpu import numpy # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor -import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor as dpt import dpnp import dpnp.backend.extensions.ufunc._ufunc_impl as ufi from dpnp.dpnp_algo.dpnp_elementwise_common import DPNPBinaryFunc, DPNPUnaryFunc @@ -214,7 +213,7 @@ def all(a, /, axis=None, out=None, keepdims=False, *, where=True): dpnp.check_limitations(where=where) usm_a = dpnp.get_usm_ndarray(a) - usm_res = dpt_ext.all(usm_a, axis=axis, keepdims=keepdims) + usm_res = dpt.all(usm_a, axis=axis, keepdims=keepdims) # TODO: temporary solution until dpt.all supports out parameter return dpnp.get_result_array(usm_res, out) @@ -1276,7 +1275,7 @@ def isin( usm_element = dpnp.get_usm_ndarray(element) usm_test = dpnp.get_usm_ndarray(test_elements) return dpnp_array._create_from_usm_ndarray( - dpt_ext.isin( + dpt.isin( usm_element, usm_test, invert=invert, diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py index e5e379f8dc00..2c3975625c9e 100644 --- a/dpnp/dpnp_iface_mathematical.py +++ b/dpnp/dpnp_iface_mathematical.py @@ -1480,7 +1480,9 @@ def diff(a, n=1, axis=-1, prepend=None, append=None): ) usm_app = None if append is None else dpnp.get_usm_ndarray_or_scalar(append) - usm_res = dpt.diff(usm_a, axis=axis, n=n, prepend=usm_pre, append=usm_app) + usm_res = dpt_ext.diff( + usm_a, axis=axis, n=n, prepend=usm_pre, append=usm_app + ) return dpnp_array._create_from_usm_ndarray(usm_res) From 1b4e49825c053455986ff852f0b6e042252f6794 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 04:27:45 -0800 Subject: [PATCH 133/145] Move 
_min_over_axis/_max_over_axis to _tensor_reductions_impl --- dpctl_ext/tensor/CMakeLists.txt | 4 +- .../libtensor/source/reductions/max.cpp | 411 +++++++++++++++++ .../libtensor/source/reductions/max.hpp | 46 ++ .../libtensor/source/reductions/min.cpp | 413 ++++++++++++++++++ .../libtensor/source/reductions/min.hpp | 46 ++ .../source/reductions/reduction_common.cpp | 8 +- 6 files changed, 922 insertions(+), 6 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/max.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/max.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/min.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/min.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index e8bc17e39f55..13c1bd4a6d7f 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -76,8 +76,8 @@ set(_reduction_sources #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmax.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmin.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/logsumexp.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/max.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/min.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/max.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/min.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/prod.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduce_hypot.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/sum.cpp diff --git a/dpctl_ext/tensor/libtensor/source/reductions/max.cpp b/dpctl_ext/tensor/libtensor/source/reductions/max.cpp new file mode 100644 index 000000000000..dfcdc6a7b2a5 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/max.cpp @@ -0,0 +1,411 @@ +//***************************************************************************** +// Copyright (c) 2026, 
Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace su_ns = dpctl::tensor::sycl_utils; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + max_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + max_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + max_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + max_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + max_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + max_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +/* @brief Types supported by max reduction code based on atomic_ref */ +template +struct TypePairSupportDataForMaxReductionAtomic +{ + /* value is true if a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< + // input int32 + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + // input float + td_ns::TypePairDefinedEntry, + // input double + td_ns::TypePairDefinedEntry, + // 
fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForMaxReductionTemps +{ + static constexpr bool is_defined = std::disjunction< + // input bool + td_ns::TypePairDefinedEntry, + // input int8_t + td_ns::TypePairDefinedEntry, + // input uint8_t + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + // input uint16_t + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct MaxOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForMaxReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForMaxReductionTemps< + srcTy, dstTy>::is_defined) { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + 
reduction_over_group_temps_strided_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForMaxReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForMaxReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForMaxReductionTemps< + srcTy, dstTy>::is_defined) { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForMaxReductionTemps< + srcTy, dstTy>::is_defined) { + if constexpr (std::is_integral_v 
&& + !std::is_same_v) { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + } + else { + return nullptr; + } + } +}; + +void populate_max_over_axis_dispatch_tables(void) +{ + using td_ns::DispatchTableBuilder; + + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(max_over_axis_strided_atomic_dispatch_table); + + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(max_over_axis_strided_temps_dispatch_table); + + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(max_over_axis1_contig_atomic_dispatch_table); + + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(max_over_axis0_contig_atomic_dispatch_table); + + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(max_over_axis1_contig_temps_dispatch_table); + + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(max_over_axis0_contig_temps_dispatch_table); +} + +using atomic_support::atomic_support_fn_ptr_t; +static atomic_support_fn_ptr_t max_atomic_support_vector[td_ns::num_types]; + +void populate_max_atomic_support_dispatch_vector(void) +{ + using td_ns::DispatchVectorBuilder; + + using atomic_support::MaxAtomicSupportFactory; + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(max_atomic_support_vector); +} + +} // namespace impl + +void init_max(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_max_over_axis_dispatch_tables; + populate_max_over_axis_dispatch_tables(); + using impl::max_over_axis0_contig_atomic_dispatch_table; + using impl::max_over_axis0_contig_temps_dispatch_table; + using impl::max_over_axis1_contig_atomic_dispatch_table; + using impl::max_over_axis1_contig_temps_dispatch_table; + using impl::max_over_axis_strided_atomic_dispatch_table; + using 
impl::max_over_axis_strided_temps_dispatch_table; + + using impl::populate_max_atomic_support_dispatch_vector; + populate_max_atomic_support_dispatch_vector(); + using impl::max_atomic_support_vector; + + auto max_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + max_over_axis_strided_atomic_dispatch_table, + max_over_axis0_contig_atomic_dispatch_table, + max_over_axis1_contig_atomic_dispatch_table, + max_over_axis_strided_temps_dispatch_table, + max_over_axis0_contig_temps_dispatch_table, + max_over_axis1_contig_temps_dispatch_table, + max_atomic_support_vector); + }; + m.def("_max_over_axis", max_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/max.hpp b/dpctl_ext/tensor/libtensor/source/reductions/max.hpp new file mode 100644 index 000000000000..bc242dc8d74b --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/max.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_max(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/min.cpp b/dpctl_ext/tensor/libtensor/source/reductions/min.cpp new file mode 100644 index 000000000000..b57c35736adc --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/min.cpp @@ -0,0 +1,413 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace su_ns = dpctl::tensor::sycl_utils; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + min_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + min_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + min_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + min_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + min_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + min_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +/* @brief Types supported by min reduction code based on atomic_ref */ +template +struct TypePairSupportDataForMinReductionAtomic +{ + /* value is true if a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< + // input int32 + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + // input float + td_ns::TypePairDefinedEntry, + // input double + td_ns::TypePairDefinedEntry, + // 
fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForMinReductionTemps +{ + static constexpr bool is_defined = std::disjunction< + // input bool + td_ns::TypePairDefinedEntry, + // input int8_t + td_ns::TypePairDefinedEntry, + // input uint8_t + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + // input uint16_t + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct MinOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForMinReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForMinReductionTemps< + srcTy, dstTy>::is_defined) { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + 
reduction_over_group_temps_strided_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForMinReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForMinReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForMinReductionTemps< + srcTy, dstTy>::is_defined) { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForMinReductionTemps< + srcTy, dstTy>::is_defined) { + if constexpr (std::is_integral_v 
&& + !std::is_same_v) { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + } + else { + return nullptr; + } + } +}; + +void populate_min_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using td_ns::DispatchTableBuilder; + + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(min_over_axis_strided_atomic_dispatch_table); + + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(min_over_axis_strided_temps_dispatch_table); + + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(min_over_axis1_contig_atomic_dispatch_table); + + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(min_over_axis0_contig_atomic_dispatch_table); + + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(min_over_axis1_contig_temps_dispatch_table); + + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(min_over_axis0_contig_temps_dispatch_table); +} + +using atomic_support::atomic_support_fn_ptr_t; +static atomic_support_fn_ptr_t min_atomic_support_vector[td_ns::num_types]; + +void populate_min_atomic_support_dispatch_vector(void) +{ + using td_ns::DispatchVectorBuilder; + + using atomic_support::MinAtomicSupportFactory; + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(min_atomic_support_vector); +} + +} // namespace impl + +void init_min(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_min_over_axis_dispatch_tables; + populate_min_over_axis_dispatch_tables(); + using impl::min_over_axis0_contig_atomic_dispatch_table; + using impl::min_over_axis0_contig_temps_dispatch_table; + using impl::min_over_axis1_contig_atomic_dispatch_table; + using 
impl::min_over_axis1_contig_temps_dispatch_table; + using impl::min_over_axis_strided_atomic_dispatch_table; + using impl::min_over_axis_strided_temps_dispatch_table; + + using impl::populate_min_atomic_support_dispatch_vector; + populate_min_atomic_support_dispatch_vector(); + using impl::min_atomic_support_vector; + + auto min_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + min_over_axis_strided_atomic_dispatch_table, + min_over_axis0_contig_atomic_dispatch_table, + min_over_axis1_contig_atomic_dispatch_table, + min_over_axis_strided_temps_dispatch_table, + min_over_axis0_contig_temps_dispatch_table, + min_over_axis1_contig_temps_dispatch_table, + min_atomic_support_vector); + }; + m.def("_min_over_axis", min_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/min.hpp b/dpctl_ext/tensor/libtensor/source/reductions/min.hpp new file mode 100644 index 000000000000..e054f44539f3 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/min.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_min(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp index 027c80b27cc2..e7b00b504406 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp @@ -40,8 +40,8 @@ // #include "argmax.hpp" // #include "argmin.hpp" // #include "logsumexp.hpp" -// #include "max.hpp" -// #include "min.hpp" +#include "max.hpp" +#include "min.hpp" // #include "prod.hpp" // #include "reduce_hypot.hpp" // #include "sum.hpp" @@ -59,8 +59,8 @@ void init_reduction_functions(py::module_ m) // init_argmax(m); // init_argmin(m); // init_logsumexp(m); - // init_max(m); - // init_min(m); + init_max(m); + init_min(m); // init_prod(m); // init_reduce_hypot(m); // init_sum(m); From aa313ff8f6496d247bc85424af81188d6e7c5c53 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 04:28:22 -0800 Subject: [PATCH 134/145] Move ti.min()/max() and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 6 + dpctl_ext/tensor/_reduction.py | 419 +++++++++++++++++++++++++++++++++ dpnp/dpnp_iface_statistics.py | 4 +- 3 files changed, 427 insertions(+), 2 deletions(-) create mode 100644 dpctl_ext/tensor/_reduction.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 87bc9f544ab6..62dca856a8b4 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -83,6 +83,10 @@ from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum from ._clip import clip +from ._reduction import ( + max, + min, +) from ._searchsorted import searchsorted from ._set_functions import ( isin, @@ -126,7 +130,9 @@ 
"isdtype", "isin", "linspace", + "max", "meshgrid", + "min", "moveaxis", "permute_dims", "nonzero", diff --git a/dpctl_ext/tensor/_reduction.py b/dpctl_ext/tensor/_reduction.py new file mode 100644 index 000000000000..7759aebb0b71 --- /dev/null +++ b/dpctl_ext/tensor/_reduction.py @@ -0,0 +1,419 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import dpctl +import dpctl.tensor as dpt +from dpctl.utils import ExecutionPlacementError, SequentialOrderManager + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor._tensor_impl as ti +import dpctl_ext.tensor._tensor_reductions_impl as tri + +from ._numpy_helper import normalize_axis_tuple + + +def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + perm = list(axis) + x_tmp = x + else: + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + perm = [i for i in range(nd) if i not in axis] + list(axis) + x_tmp = dpt_ext.permute_dims(x, perm) + red_nd = len(axis) + if any([x_tmp.shape[i] == 0 for i in range(-red_nd, 0)]): + raise ValueError("reduction cannot be performed over zero-size axes") + res_shape = x_tmp.shape[: nd - red_nd] + exec_q = x.sycl_queue + res_dt = x.dtype + res_usm_type = x.usm_type + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + if not keepdims: + final_res_shape = res_shape + else: + inp_shape = x.shape + final_res_shape = tuple( + inp_shape[i] if i not in axis else 1 for i in range(nd) + ) + if not out.shape == final_res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. 
" + f"Expected output shape is {final_res_shape}, got {out.shape}" + ) + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, got {out.dtype}" + ) + if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + if keepdims: + out = dpt_ext.squeeze(out, axis=axis) + orig_out = out + if ti._array_overlap(x, out): + out = dpt_ext.empty_like(out) + else: + out = dpt_ext.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q + ) + + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if red_nd == 0: + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=x_tmp, dst=out, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + if not (orig_out is None or orig_out is out): + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=exec_q, depends=[cpy_e] + ) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) + out = orig_out + return out + + hev, red_ev = _reduction_fn( + src=x_tmp, + trailing_dims_to_reduce=red_nd, + dst=out, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(hev, red_ev) + if not (orig_out is None or orig_out is out): + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=exec_q, depends=[red_ev] + ) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) + out = orig_out + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + out = dpt_ext.permute_dims(dpt_ext.reshape(out, res_shape), inv_perm) + return out + + +def max(x, /, *, axis=None, keepdims=False, out=None): + """ + Calculates the maximum value of the input array ``x``. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which maxima must be computed. 
If a tuple + of unique integers, the maxima are computed over multiple axes. + If ``None``, the max is computed over the entire array. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the maxima. If the max was computed over the + entire array, a zero-dimensional array is returned. The returned + array has the same data type as ``x``. + """ + return _comparison_over_axis(x, axis, keepdims, out, tri._max_over_axis) + + +def min(x, /, *, axis=None, keepdims=False, out=None): + """ + Calculates the minimum value of the input array ``x``. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which minima must be computed. If a tuple + of unique integers, the minima are computed over multiple axes. + If ``None``, the min is computed over the entire array. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result. 
+ If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the minima. If the min was computed over the + entire array, a zero-dimensional array is returned. The returned + array has the same data type as ``x``. + """ + return _comparison_over_axis(x, axis, keepdims, out, tri._min_over_axis) + + +def _search_over_axis(x, axis, keepdims, out, _reduction_fn): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + perm = list(axis) + x_tmp = x + else: + if isinstance(axis, int): + axis = (axis,) + else: + raise TypeError( + f"'axis' argument expected to have type 'int' " + r"or be `None`, " + f"got type {type(axis)}" + ) + axis = normalize_axis_tuple(axis, nd, "axis") + perm = [i for i in range(nd) if i not in axis] + list(axis) + x_tmp = dpt_ext.permute_dims(x, perm) + axis = normalize_axis_tuple(axis, nd, "axis") + red_nd = len(axis) + if any([x_tmp.shape[i] == 0 for i in range(-red_nd, 0)]): + raise ValueError("reduction cannot be performed over zero-size axes") + res_shape = x_tmp.shape[: nd - red_nd] + exec_q = x.sycl_queue + res_dt = ti.default_device_index_type(exec_q.sycl_device) + res_usm_type = x.usm_type + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + if not keepdims: + final_res_shape = res_shape + else: + inp_shape = x.shape + final_res_shape = tuple( + inp_shape[i] if i not in axis else 1 for i in range(nd) + ) + if not out.shape == final_res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. 
" + f"Expected output shape is {final_res_shape}, got {out.shape}" + ) + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, got {out.dtype}" + ) + if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + if keepdims: + out = dpt_ext.squeeze(out, axis=axis) + orig_out = out + if ti._array_overlap(x, out) and red_nd > 0: + out = dpt_ext.empty_like(out) + else: + out = dpt_ext.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q + ) + + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if red_nd == 0: + ht_e_fill, fill_ev = ti._full_usm_ndarray( + fill_value=0, dst=out, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_e_fill, fill_ev) + return out + + hev, red_ev = _reduction_fn( + src=x_tmp, + trailing_dims_to_reduce=red_nd, + dst=out, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(hev, red_ev) + if not (orig_out is None or orig_out is out): + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=exec_q, depends=[red_ev] + ) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) + out = orig_out + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + out = dpt_ext.permute_dims(dpt_ext.reshape(out, res_shape), inv_perm) + return out + + +def argmax(x, /, *, axis=None, keepdims=False, out=None): + """ + Returns the indices of the maximum values of the input array ``x`` along a + specified axis. + + When the maximum value occurs multiple times, the indices corresponding to + the first occurrence are returned. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to search. If ``None``, returns the index of the + maximum value of the flattened array. + Default: ``None``. 
+ keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the indices of the first occurrence of the + maximum values. If the entire array was searched, a + zero-dimensional array is returned. The returned array has the + default array index data type for the device of ``x``. + """ + return _search_over_axis(x, axis, keepdims, out, tri._argmax_over_axis) + + +def argmin(x, /, *, axis=None, keepdims=False, out=None): + """ + Returns the indices of the minimum values of the input array ``x`` along a + specified axis. + + When the minimum value occurs multiple times, the indices corresponding to + the first occurrence are returned. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to search. If ``None``, returns the index of the + minimum value of the flattened array. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result. + If ``None`` then a new array is returned. Default: ``None``. 
+ + Returns: + usm_ndarray: + an array containing the indices of the first occurrence of the + minimum values. If the entire array was searched, a + zero-dimensional array is returned. The returned array has the + default array index data type for the device of ``x``. + """ + return _search_over_axis(x, axis, keepdims, out, tri._argmin_over_axis) + + +def count_nonzero(x, /, *, axis=None, keepdims=False, out=None): + """ + Counts the number of elements in the input array ``x`` which are non-zero. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which to count. If a tuple of unique integers, + the number of non-zero values are computed over multiple axes. + If ``None``, the number of non-zero values is computed over the + entire array. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and data + type. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the count of non-zero values. If the sum was + computed over the entire array, a zero-dimensional array is + returned. The returned array will have the default array index data + type. 
+ """ + if x.dtype != dpt.bool: + x = dpt_ext.astype(x, dpt.bool, copy=False) + return sum( + x, + axis=axis, + dtype=ti.default_device_index_type(x.sycl_device), + keepdims=keepdims, + out=out, + ) diff --git a/dpnp/dpnp_iface_statistics.py b/dpnp/dpnp_iface_statistics.py index daff981d5cc4..97c6aab14058 100644 --- a/dpnp/dpnp_iface_statistics.py +++ b/dpnp/dpnp_iface_statistics.py @@ -1118,7 +1118,7 @@ def max(a, axis=None, out=None, keepdims=False, initial=None, where=True): return dpnp_wrap_reduction_call( usm_a, out, - dpt.max, + dpt_ext.max, a.dtype, axis=axis, keepdims=keepdims, @@ -1395,7 +1395,7 @@ def min(a, axis=None, out=None, keepdims=False, initial=None, where=True): return dpnp_wrap_reduction_call( usm_a, out, - dpt.min, + dpt_ext.min, a.dtype, axis=axis, keepdims=keepdims, From c6d600a6be411a1e3181a2c31ad0b34a2c49467e Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 04:41:38 -0800 Subject: [PATCH 135/145] Move _argmax/argmin_over_axis to _tensor_reductions_impl --- dpctl_ext/tensor/CMakeLists.txt | 4 +- .../libtensor/source/reductions/argmax.cpp | 280 ++++++++++++++++++ .../libtensor/source/reductions/argmax.hpp | 46 +++ .../libtensor/source/reductions/argmin.cpp | 280 ++++++++++++++++++ .../libtensor/source/reductions/argmin.hpp | 46 +++ .../source/reductions/reduction_common.cpp | 8 +- 6 files changed, 658 insertions(+), 6 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/argmax.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/argmax.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/argmin.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/argmin.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 13c1bd4a6d7f..7425ab2bc6b3 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -73,8 +73,8 @@ set(_reduction_sources 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduction_common.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/all.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/any.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmax.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmin.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmax.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmin.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/logsumexp.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/max.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/min.cpp diff --git a/dpctl_ext/tensor/libtensor/source/reductions/argmax.cpp b/dpctl_ext/tensor/libtensor/source/reductions/argmax.cpp new file mode 100644 index 000000000000..bf6592798b4f --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/argmax.cpp @@ -0,0 +1,280 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/reductions.hpp" +#include "reduction_over_axis.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace su_ns = dpctl::tensor::sycl_utils; + +namespace impl +{ + +using dpctl::tensor::kernels::search_strided_impl_fn_ptr; +static search_strided_impl_fn_ptr + argmax_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::search_contig_impl_fn_ptr; +static search_contig_impl_fn_ptr + argmax_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +using dpctl::tensor::kernels::search_contig_impl_fn_ptr; +static search_contig_impl_fn_ptr + argmax_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportForArgmaxReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + // input int8_t + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::int64_t>, + + td_ns::TypePairDefinedEntry, + outTy, + std::int64_t>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + 
+template +struct ArgmaxOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportForArgmaxReductionTemps::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct ArgmaxOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportForArgmaxReductionTemps::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis1_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis1_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct ArgmaxOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportForArgmaxReductionTemps::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis0_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Maximum; + // op for indices + using 
IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis0_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + } + else { + return nullptr; + } + } +}; + +void populate_argmax_over_axis_dispatch_tables(void) +{ + using td_ns::DispatchTableBuilder; + + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(argmax_over_axis_strided_temps_dispatch_table); + + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(argmax_over_axis1_contig_temps_dispatch_table); + + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(argmax_over_axis0_contig_temps_dispatch_table); +} + +} // namespace impl + +void init_argmax(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_argmax_over_axis_dispatch_tables; + populate_argmax_over_axis_dispatch_tables(); + using impl::argmax_over_axis0_contig_temps_dispatch_table; + using impl::argmax_over_axis1_contig_temps_dispatch_table; + using impl::argmax_over_axis_strided_temps_dispatch_table; + + auto argmax_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_search_over_axis; + return py_search_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + argmax_over_axis_strided_temps_dispatch_table, + argmax_over_axis0_contig_temps_dispatch_table, + argmax_over_axis1_contig_temps_dispatch_table); + }; + m.def("_argmax_over_axis", argmax_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/argmax.hpp b/dpctl_ext/tensor/libtensor/source/reductions/argmax.hpp new file mode 100644 index 000000000000..3274f8c7d0cb --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/argmax.hpp @@ -0,0 
+1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_argmax(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/argmin.cpp b/dpctl_ext/tensor/libtensor/source/reductions/argmin.cpp new file mode 100644 index 000000000000..acd685c59802 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/argmin.cpp @@ -0,0 +1,280 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/reductions.hpp" +#include "reduction_over_axis.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace su_ns = dpctl::tensor::sycl_utils; + +namespace impl +{ + +using dpctl::tensor::kernels::search_strided_impl_fn_ptr; +static search_strided_impl_fn_ptr + argmin_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::search_contig_impl_fn_ptr; +static search_contig_impl_fn_ptr + argmin_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +using dpctl::tensor::kernels::search_contig_impl_fn_ptr; +static search_contig_impl_fn_ptr + argmin_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportForArgminReductionTemps +{ + + static constexpr bool is_defined = 
std::disjunction< + td_ns::TypePairDefinedEntry, + // input int8_t + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::int64_t>, + + td_ns::TypePairDefinedEntry, + outTy, + std::int64_t>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct ArgminOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportForArgminReductionTemps::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct ArgminOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportForArgminReductionTemps::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis1_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // 
op for values + using ReductionOpT = su_ns::Minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis1_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct ArgminOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportForArgminReductionTemps::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis0_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis0_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + } + else { + return nullptr; + } + } +}; + +void populate_argmin_over_axis_dispatch_tables(void) +{ + using td_ns::DispatchTableBuilder; + + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(argmin_over_axis_strided_temps_dispatch_table); + + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(argmin_over_axis1_contig_temps_dispatch_table); + + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(argmin_over_axis0_contig_temps_dispatch_table); +} + +} // namespace impl + +void init_argmin(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_argmin_over_axis_dispatch_tables; + populate_argmin_over_axis_dispatch_tables(); + using impl::argmin_over_axis0_contig_temps_dispatch_table; + using impl::argmin_over_axis1_contig_temps_dispatch_table; + using impl::argmin_over_axis_strided_temps_dispatch_table; + + auto argmin_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const 
event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_search_over_axis; + return py_search_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + argmin_over_axis_strided_temps_dispatch_table, + argmin_over_axis0_contig_temps_dispatch_table, + argmin_over_axis1_contig_temps_dispatch_table); + }; + m.def("_argmin_over_axis", argmin_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/argmin.hpp b/dpctl_ext/tensor/libtensor/source/reductions/argmin.hpp new file mode 100644 index 000000000000..1865c258a527 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/argmin.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_argmin(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp index e7b00b504406..ff6b5d994f33 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp @@ -37,8 +37,8 @@ #include "all.hpp" #include "any.hpp" -// #include "argmax.hpp" -// #include "argmin.hpp" +#include "argmax.hpp" +#include "argmin.hpp" // #include "logsumexp.hpp" #include "max.hpp" #include "min.hpp" @@ -56,8 +56,8 @@ void init_reduction_functions(py::module_ m) { init_all(m); init_any(m); - // init_argmax(m); - // init_argmin(m); + init_argmax(m); + init_argmin(m); // init_logsumexp(m); init_max(m); init_min(m); From 8ae933d85d3d11b552d86c1e44c520b1bbedefc8 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 04:45:22 -0800 Subject: 
[PATCH 136/145] Move ti.argmax()/argmin() to dpctl_ext.tensor and reuse them in dpnp --- dpctl_ext/tensor/__init__.py | 4 + dpctl_ext/tensor/_reduction.py | 131 +++++++++++---------------------- dpnp/dpnp_iface_searching.py | 12 +-- 3 files changed, 52 insertions(+), 95 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 62dca856a8b4..a3b6a209a47e 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -84,6 +84,8 @@ from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum from ._clip import clip from ._reduction import ( + argmax, + argmin, max, min, ) @@ -102,6 +104,8 @@ "all", "any", "arange", + "argmax", + "argmin", "argsort", "asarray", "asnumpy", diff --git a/dpctl_ext/tensor/_reduction.py b/dpctl_ext/tensor/_reduction.py index 7759aebb0b71..e39c853284d9 100644 --- a/dpctl_ext/tensor/_reduction.py +++ b/dpctl_ext/tensor/_reduction.py @@ -137,72 +137,6 @@ def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn): return out -def max(x, /, *, axis=None, keepdims=False, out=None): - """ - Calculates the maximum value of the input array ``x``. - - Args: - x (usm_ndarray): - input array. - axis (Optional[int, Tuple[int, ...]]): - axis or axes along which maxima must be computed. If a tuple - of unique integers, the maxima are computed over multiple axes. - If ``None``, the max is computed over the entire array. - Default: ``None``. - keepdims (Optional[bool]): - if ``True``, the reduced axes (dimensions) are included in the - result as singleton dimensions, so that the returned array remains - compatible with the input arrays according to Array Broadcasting - rules. Otherwise, if ``False``, the reduced axes are not included - in the returned array. Default: ``False``. - out (Optional[usm_ndarray]): - the array into which the result is written. - The data type of ``out`` must match the expected shape and the - expected data type of the result. 
- If ``None`` then a new array is returned. Default: ``None``. - - Returns: - usm_ndarray: - an array containing the maxima. If the max was computed over the - entire array, a zero-dimensional array is returned. The returned - array has the same data type as ``x``. - """ - return _comparison_over_axis(x, axis, keepdims, out, tri._max_over_axis) - - -def min(x, /, *, axis=None, keepdims=False, out=None): - """ - Calculates the minimum value of the input array ``x``. - - Args: - x (usm_ndarray): - input array. - axis (Optional[int, Tuple[int, ...]]): - axis or axes along which minima must be computed. If a tuple - of unique integers, the minima are computed over multiple axes. - If ``None``, the min is computed over the entire array. - Default: ``None``. - keepdims (Optional[bool]): - if ``True``, the reduced axes (dimensions) are included in the - result as singleton dimensions, so that the returned array remains - compatible with the input arrays according to Array Broadcasting - rules. Otherwise, if ``False``, the reduced axes are not included - in the returned array. Default: ``False``. - out (Optional[usm_ndarray]): - the array into which the result is written. - The data type of ``out`` must match the expected shape and the - expected data type of the result. - If ``None`` then a new array is returned. Default: ``None``. - - Returns: - usm_ndarray: - an array containing the minima. If the min was computed over the - entire array, a zero-dimensional array is returned. The returned - array has the same data type as ``x``. 
- """ - return _comparison_over_axis(x, axis, keepdims, out, tri._min_over_axis) - - def _search_over_axis(x, axis, keepdims, out, _reduction_fn): if not isinstance(x, dpt.usm_ndarray): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") @@ -376,18 +310,17 @@ def argmin(x, /, *, axis=None, keepdims=False, out=None): return _search_over_axis(x, axis, keepdims, out, tri._argmin_over_axis) -def count_nonzero(x, /, *, axis=None, keepdims=False, out=None): +def max(x, /, *, axis=None, keepdims=False, out=None): """ - Counts the number of elements in the input array ``x`` which are non-zero. + Calculates the maximum value of the input array ``x``. Args: x (usm_ndarray): input array. axis (Optional[int, Tuple[int, ...]]): - axis or axes along which to count. If a tuple of unique integers, - the number of non-zero values are computed over multiple axes. - If ``None``, the number of non-zero values is computed over the - entire array. + axis or axes along which maxima must be computed. If a tuple + of unique integers, the maxima are computed over multiple axes. + If ``None``, the max is computed over the entire array. Default: ``None``. keepdims (Optional[bool]): if ``True``, the reduced axes (dimensions) are included in the @@ -397,23 +330,47 @@ def count_nonzero(x, /, *, axis=None, keepdims=False, out=None): in the returned array. Default: ``False``. out (Optional[usm_ndarray]): the array into which the result is written. - The data type of ``out`` must match the expected shape and data - type. + The data type of ``out`` must match the expected shape and the + expected data type of the result. If ``None`` then a new array is returned. Default: ``None``. Returns: usm_ndarray: - an array containing the count of non-zero values. If the sum was - computed over the entire array, a zero-dimensional array is - returned. The returned array will have the default array index data - type. + an array containing the maxima. 
If the max was computed over the + entire array, a zero-dimensional array is returned. The returned + array has the same data type as ``x``. """ - if x.dtype != dpt.bool: - x = dpt_ext.astype(x, dpt.bool, copy=False) - return sum( - x, - axis=axis, - dtype=ti.default_device_index_type(x.sycl_device), - keepdims=keepdims, - out=out, - ) + return _comparison_over_axis(x, axis, keepdims, out, tri._max_over_axis) + + +def min(x, /, *, axis=None, keepdims=False, out=None): + """ + Calculates the minimum value of the input array ``x``. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which minima must be computed. If a tuple + of unique integers, the minima are computed over multiple axes. + If ``None``, the min is computed over the entire array. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the minima. If the min was computed over the + entire array, a zero-dimensional array is returned. The returned + array has the same data type as ``x``. 
+ """ + return _comparison_over_axis(x, axis, keepdims, out, tri._min_over_axis) diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py index 055aaa999c3a..19279f81286a 100644 --- a/dpnp/dpnp_iface_searching.py +++ b/dpnp/dpnp_iface_searching.py @@ -39,12 +39,10 @@ """ -import dpctl.tensor as dpt - # pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor -import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor as dpt import dpctl_ext.tensor._tensor_impl as dti import dpnp @@ -376,13 +374,13 @@ def searchsorted(a, v, side="left", sorter=None): usm_a = dpnp.get_usm_ndarray(a) if dpnp.isscalar(v): - usm_v = dpt_ext.asarray(v, sycl_queue=a.sycl_queue, usm_type=a.usm_type) + usm_v = dpt.asarray(v, sycl_queue=a.sycl_queue, usm_type=a.usm_type) else: usm_v = dpnp.get_usm_ndarray(v) usm_sorter = None if sorter is None else dpnp.get_usm_ndarray(sorter) return dpnp_array._create_from_usm_ndarray( - dpt_ext.searchsorted(usm_a, usm_v, side=side, sorter=usm_sorter) + dpt.searchsorted(usm_a, usm_v, side=side, sorter=usm_sorter) ) @@ -474,7 +472,5 @@ def where(condition, x=None, y=None, /, *, order="K", out=None): usm_condition = dpnp.get_usm_ndarray(condition) usm_out = None if out is None else dpnp.get_usm_ndarray(out) - usm_res = dpt_ext.where( - usm_condition, usm_x, usm_y, order=order, out=usm_out - ) + usm_res = dpt.where(usm_condition, usm_x, usm_y, order=order, out=usm_out) return dpnp.get_result_array(usm_res, out) From 2ec3cc7ae429171a7c529c05633f7fe9e55328e7 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 05:11:55 -0800 Subject: [PATCH 137/145] Move _prod/sum_over_axis to _tensor_reductions_impl --- dpctl_ext/tensor/CMakeLists.txt | 4 +- .../libtensor/source/reductions/prod.cpp | 466 ++++++++++++++++++ .../libtensor/source/reductions/prod.hpp | 46 ++ .../source/reductions/reduction_common.cpp | 8 +- .../libtensor/source/reductions/sum.cpp | 463 
+++++++++++++++++ .../libtensor/source/reductions/sum.hpp | 46 ++ 6 files changed, 1027 insertions(+), 6 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/prod.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/prod.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/sum.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/sum.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 7425ab2bc6b3..350a0b2518ba 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -78,9 +78,9 @@ set(_reduction_sources #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/logsumexp.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/max.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/min.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/prod.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/prod.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduce_hypot.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/sum.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/sum.cpp ) set(_sorting_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/isin.cpp diff --git a/dpctl_ext/tensor/libtensor/source/reductions/prod.cpp b/dpctl_ext/tensor/libtensor/source/reductions/prod.cpp new file mode 100644 index 000000000000..dc9112a7b39a --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/prod.cpp @@ -0,0 +1,466 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + prod_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + prod_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + prod_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + prod_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + prod_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + prod_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +/* @brief Types supported by plus-reduction code based on atomic_ref */ +template +struct TypePairSupportDataForProductReductionAtomic +{ + + /* value if true a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int16 + 
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForProductReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + 
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns:: + TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input double + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct ProductOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = std::conditional_t, + sycl::logical_and, + sycl::multiplies>; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< 
+ srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = std::conditional_t, + sycl::logical_and, + sycl::multiplies>; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = std::conditional_t, + sycl::logical_and, + sycl::multiplies>; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +void populate_prod_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(prod_over_axis_strided_atomic_dispatch_table); + + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(prod_over_axis_strided_temps_dispatch_table); + + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(prod_over_axis1_contig_atomic_dispatch_table); + + DispatchTableBuilder + dtb4; + 
dtb4.populate_dispatch_table(prod_over_axis0_contig_atomic_dispatch_table); + + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(prod_over_axis1_contig_temps_dispatch_table); + + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(prod_over_axis0_contig_temps_dispatch_table); +} + +using atomic_support::atomic_support_fn_ptr_t; +static atomic_support_fn_ptr_t prod_atomic_support_vector[td_ns::num_types]; + +void populate_prod_atomic_support_dispatch_vector(void) +{ + using td_ns::DispatchVectorBuilder; + + using atomic_support::ProductAtomicSupportFactory; + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(prod_atomic_support_vector); +} + +} // namespace impl + +void init_prod(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_prod_over_axis_dispatch_tables; + populate_prod_over_axis_dispatch_tables(); + using impl::prod_over_axis0_contig_atomic_dispatch_table; + using impl::prod_over_axis0_contig_temps_dispatch_table; + using impl::prod_over_axis1_contig_atomic_dispatch_table; + using impl::prod_over_axis1_contig_temps_dispatch_table; + using impl::prod_over_axis_strided_atomic_dispatch_table; + using impl::prod_over_axis_strided_temps_dispatch_table; + + using impl::populate_prod_atomic_support_dispatch_vector; + populate_prod_atomic_support_dispatch_vector(); + using impl::prod_atomic_support_vector; + + auto prod_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + prod_over_axis_strided_atomic_dispatch_table, + prod_over_axis0_contig_atomic_dispatch_table, + prod_over_axis1_contig_atomic_dispatch_table, + prod_over_axis_strided_temps_dispatch_table, + prod_over_axis0_contig_temps_dispatch_table, + 
prod_over_axis1_contig_temps_dispatch_table, + prod_atomic_support_vector); + }; + m.def("_prod_over_axis", prod_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto prod_dtype_supported = + [&](const py::dtype &input_dtype, const py::dtype &output_dtype, + const std::string &dst_usm_type, sycl::queue &q) { + using dpctl::tensor::py_internal::py_reduction_dtype_supported; + return py_reduction_dtype_supported( + input_dtype, output_dtype, dst_usm_type, q, + prod_over_axis_strided_atomic_dispatch_table, + prod_over_axis_strided_temps_dispatch_table, + prod_atomic_support_vector); + }; + m.def("_prod_over_axis_dtype_supported", prod_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype"), + py::arg("dst_usm_type"), py::arg("sycl_queue")); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/prod.hpp b/dpctl_ext/tensor/libtensor/source/reductions/prod.hpp new file mode 100644 index 000000000000..15b1c07e5ddd --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/prod.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_prod(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp index ff6b5d994f33..40c93fdc26fc 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp @@ -42,9 +42,9 @@ // #include "logsumexp.hpp" #include "max.hpp" #include "min.hpp" -// #include "prod.hpp" +#include "prod.hpp" // #include "reduce_hypot.hpp" -// #include "sum.hpp" +#include "sum.hpp" namespace py = pybind11; @@ -61,9 +61,9 @@ void init_reduction_functions(py::module_ m) // init_logsumexp(m); init_max(m); init_min(m); - // init_prod(m); + init_prod(m); // init_reduce_hypot(m); - // init_sum(m); + init_sum(m); } } // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/sum.cpp b/dpctl_ext/tensor/libtensor/source/reductions/sum.cpp new file mode 100644 index 000000000000..32e31e9707ed --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/sum.cpp @@ -0,0 +1,463 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + sum_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + sum_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + sum_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + sum_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + sum_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + sum_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +/* @brief Types supported by plus-reduction code based on atomic_ref */ +template +struct TypePairSupportDataForSumReductionAtomic +{ + + /* value if true a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int16 + 
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForSumReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + 
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns:: + TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input double + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct SumOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = + std::conditional_t, + sycl::logical_or, sycl::plus>; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + 
using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = + std::conditional_t, + sycl::logical_or, sycl::plus>; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = + std::conditional_t, + sycl::logical_or, sycl::plus>; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +void populate_sum_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(sum_over_axis_strided_atomic_dispatch_table); + + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(sum_over_axis_strided_temps_dispatch_table); + + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(sum_over_axis1_contig_atomic_dispatch_table); + + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(sum_over_axis0_contig_atomic_dispatch_table); + + DispatchTableBuilder + dtb5; + 
dtb5.populate_dispatch_table(sum_over_axis1_contig_temps_dispatch_table); + + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(sum_over_axis0_contig_temps_dispatch_table); +} + +using atomic_support::atomic_support_fn_ptr_t; +static atomic_support_fn_ptr_t sum_atomic_support_vector[td_ns::num_types]; + +void populate_sum_atomic_support_dispatch_vector(void) +{ + using td_ns::DispatchVectorBuilder; + + using atomic_support::SumAtomicSupportFactory; + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(sum_atomic_support_vector); +} + +} // namespace impl + +void init_sum(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_sum_over_axis_dispatch_tables; + populate_sum_over_axis_dispatch_tables(); + using impl::sum_over_axis0_contig_atomic_dispatch_table; + using impl::sum_over_axis0_contig_temps_dispatch_table; + using impl::sum_over_axis1_contig_atomic_dispatch_table; + using impl::sum_over_axis1_contig_temps_dispatch_table; + using impl::sum_over_axis_strided_atomic_dispatch_table; + using impl::sum_over_axis_strided_temps_dispatch_table; + + using impl::populate_sum_atomic_support_dispatch_vector; + populate_sum_atomic_support_dispatch_vector(); + using impl::sum_atomic_support_vector; + + auto sum_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + sum_over_axis_strided_atomic_dispatch_table, + sum_over_axis0_contig_atomic_dispatch_table, + sum_over_axis1_contig_atomic_dispatch_table, + sum_over_axis_strided_temps_dispatch_table, + sum_over_axis0_contig_temps_dispatch_table, + sum_over_axis1_contig_temps_dispatch_table, + sum_atomic_support_vector); + }; + m.def("_sum_over_axis", sum_pyapi, "", py::arg("src"), + 
py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto sum_dtype_supported = + [&](const py::dtype &input_dtype, const py::dtype &output_dtype, + const std::string &dst_usm_type, sycl::queue &q) { + using dpctl::tensor::py_internal::py_reduction_dtype_supported; + return py_reduction_dtype_supported( + input_dtype, output_dtype, dst_usm_type, q, + sum_over_axis_strided_atomic_dispatch_table, + sum_over_axis_strided_temps_dispatch_table, + sum_atomic_support_vector); + }; + m.def("_sum_over_axis_dtype_supported", sum_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype"), + py::arg("dst_usm_type"), py::arg("sycl_queue")); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/sum.hpp b/dpctl_ext/tensor/libtensor/source/reductions/sum.hpp new file mode 100644 index 000000000000..08add902a049 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/sum.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_sum(py::module_ m); + +} // namespace dpctl::tensor::py_internal From 6b27b1bd1ca53d7dda9070051fb475a1610525f1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 05:13:30 -0800 Subject: [PATCH 138/145] Move ti.sum()/prod() to dpctl_ext.tensor and reuse them in dpnp --- dpctl_ext/tensor/__init__.py | 4 + dpctl_ext/tensor/_reduction.py | 291 ++++++++++++++++++++++++++++++++ dpnp/dpnp_iface_manipulation.py | 4 +- dpnp/dpnp_iface_mathematical.py | 17 +- 4 files changed, 305 insertions(+), 11 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index a3b6a209a47e..1ef8d8eb24e5 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -88,6 +88,8 @@ argmin, max, min, + prod, + sum, ) from ._searchsorted import searchsorted from ._set_functions import ( @@ -143,6 +145,7 @@ "ones", "ones_like", "place", + "prod", "put", "put_along_axis", "repeat", @@ -153,6 +156,7 @@ "sort", "squeeze", "stack", + "sum", "swapaxes", "take", "take_along_axis", diff --git a/dpctl_ext/tensor/_reduction.py b/dpctl_ext/tensor/_reduction.py index e39c853284d9..7e1113591451 100644 --- a/dpctl_ext/tensor/_reduction.py +++ b/dpctl_ext/tensor/_reduction.py @@ -37,6 +37,10 @@ import dpctl_ext.tensor._tensor_reductions_impl as tri from ._numpy_helper import normalize_axis_tuple +from ._type_utils import ( + _default_accumulation_dtype, + _to_device_supported_dtype, +) def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn): @@ -137,6 +141,164 @@ def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn): return out +def _reduction_over_axis( + x, + axis, + dtype, + keepdims, + out, + _reduction_fn, + _dtype_supported, + _default_reduction_type_fn, +): + if not isinstance(x, dpt.usm_ndarray): + raise 
TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + perm = list(axis) + arr = x + else: + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + perm = [i for i in range(nd) if i not in axis] + list(axis) + arr = dpt_ext.permute_dims(x, perm) + red_nd = len(axis) + res_shape = arr.shape[: nd - red_nd] + q = x.sycl_queue + inp_dt = x.dtype + if dtype is None: + res_dt = _default_reduction_type_fn(inp_dt, q) + else: + res_dt = dpt.dtype(dtype) + res_dt = _to_device_supported_dtype(res_dt, q.sycl_device) + + res_usm_type = x.usm_type + + implemented_types = _dtype_supported(inp_dt, res_dt, res_usm_type, q) + if dtype is None and not implemented_types: + raise RuntimeError( + "Automatically determined reduction data type does not " + "have direct implementation" + ) + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + if not keepdims: + final_res_shape = res_shape + else: + inp_shape = x.shape + final_res_shape = tuple( + inp_shape[i] if i not in axis else 1 for i in range(nd) + ) + if not out.shape == final_res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. 
" + f"Expected output shape is {final_res_shape}, got {out.shape}" + ) + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, got {out.dtype}" + ) + if dpctl.utils.get_execution_queue((q, out.sycl_queue)) is None: + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + if keepdims: + out = dpt_ext.squeeze(out, axis=axis) + orig_out = out + if ti._array_overlap(x, out) and implemented_types: + out = dpt_ext.empty_like(out) + else: + out = dpt_ext.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + + _manager = SequentialOrderManager[q] + dep_evs = _manager.submitted_events + if red_nd == 0: + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=out, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + if not (orig_out is None or orig_out is out): + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=q, depends=[cpy_e] + ) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) + out = orig_out + return out + + if implemented_types: + ht_e, red_e = _reduction_fn( + src=arr, + trailing_dims_to_reduce=red_nd, + dst=out, + sycl_queue=q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_e, red_e) + if not (orig_out is None or orig_out is out): + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=q, depends=[red_e] + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + out = orig_out + else: + if _dtype_supported(res_dt, res_dt, res_usm_type, q): + tmp = dpt_ext.empty( + arr.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + ht_e_red, red_ev = _reduction_fn( + src=tmp, + trailing_dims_to_reduce=red_nd, + dst=out, + sycl_queue=q, + depends=[cpy_e], + ) + _manager.add_event_pair(ht_e_red, red_ev) 
+ else: + buf_dt = _default_reduction_type_fn(inp_dt, q) + tmp = dpt_ext.empty( + arr.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + tmp_res = dpt_ext.empty( + res_shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_red, r_e = _reduction_fn( + src=tmp, + trailing_dims_to_reduce=red_nd, + dst=tmp_res, + sycl_queue=q, + depends=[cpy_e], + ) + _manager.add_event_pair(ht_e_red, r_e) + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=tmp_res, dst=out, sycl_queue=q, depends=[r_e] + ) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + out = dpt_ext.permute_dims(dpt_ext.reshape(out, res_shape), inv_perm) + return out + + def _search_over_axis(x, axis, keepdims, out, _reduction_fn): if not isinstance(x, dpt.usm_ndarray): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") @@ -374,3 +536,132 @@ def min(x, /, *, axis=None, keepdims=False, out=None): array has the same data type as ``x``. """ return _comparison_over_axis(x, axis, keepdims, out, tri._min_over_axis) + + +def prod(x, /, *, axis=None, dtype=None, keepdims=False, out=None): + """ + Calculates the product of elements in the input array ``x``. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which products must be computed. If a tuple + of unique integers, products are computed over multiple axes. + If ``None``, the product is computed over the entire array. + Default: ``None``. + dtype (Optional[dtype]): + data type of the returned array. If ``None``, the default data + type is inferred from the "kind" of the input array data type. 
+ + * If ``x`` has a real- or complex-valued floating-point data + type, the returned array will have the same data type as + ``x``. + * If ``x`` has signed integral data type, the returned array + will have the default signed integral type for the device + where input array ``x`` is allocated. + * If ``x`` has unsigned integral data type, the returned array + will have the default unsigned integral type for the device + where input array ``x`` is allocated. + * If ``x`` has a boolean data type, the returned array will + have the default signed integral type for the device + where input array ``x`` is allocated. + + If the data type (either specified or resolved) differs from the + data type of ``x``, the input array elements are cast to the + specified data type before computing the product. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result or (if provided) ``dtype``. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the products. If the product was computed over + the entire array, a zero-dimensional array is returned. The + returned array has the data type as described in the ``dtype`` + parameter description above. 
+ """ + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + out, + tri._prod_over_axis, + tri._prod_over_axis_dtype_supported, + _default_accumulation_dtype, + ) + + +def sum(x, /, *, axis=None, dtype=None, keepdims=False, out=None): + """ + Calculates the sum of elements in the input array ``x``. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which sums must be computed. If a tuple + of unique integers, sums are computed over multiple axes. + If ``None``, the sum is computed over the entire array. + Default: ``None``. + dtype (Optional[dtype]): + data type of the returned array. If ``None``, the default data + type is inferred from the "kind" of the input array data type. + + * If ``x`` has a real- or complex-valued floating-point data + type, the returned array will have the same data type as + ``x``. + * If ``x`` has signed integral data type, the returned array + will have the default signed integral type for the device + where input array ``x`` is allocated. + * If ``x`` has unsigned integral data type, the returned array + will have the default unsigned integral type for the device + where input array ``x`` is allocated. + array ``x`` is allocated. + * If ``x`` has a boolean data type, the returned array will + have the default signed integral type for the device + where input array ``x`` is allocated. + + If the data type (either specified or resolved) differs from the + data type of ``x``, the input array elements are cast to the + specified data type before computing the sum. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. 
+ out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result or (if provided) ``dtype``. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the sums. If the sum was computed over the + entire array, a zero-dimensional array is returned. The returned + array has the data type as described in the ``dtype`` parameter + description above. + """ + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + out, + tri._sum_over_axis, + tri._sum_over_axis_dtype_supported, + _default_accumulation_dtype, + ) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index cb0d0d35b0d7..7c9e4957c32b 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -428,7 +428,9 @@ def _get_first_nan_index(usm_a): if first_nan is not None: # all NaNs are collapsed, so need to put a count of all NaNs # at the last index - dpt.sum(usm_res.counts[first_nan:], out=usm_res.counts[first_nan]) + dpt_ext.sum( + usm_res.counts[first_nan:], out=usm_res.counts[first_nan] + ) result += (usm_res.counts[: first_nan + 1],) else: result += (usm_res.counts,) diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py index 2c3975625c9e..7aa0a850cedb 100644 --- a/dpnp/dpnp_iface_mathematical.py +++ b/dpnp/dpnp_iface_mathematical.py @@ -47,7 +47,6 @@ import builtins import warnings -import dpctl.tensor as dpt import dpctl.tensor._tensor_elementwise_impl as ti import dpctl.utils as dpu import numpy @@ -58,7 +57,7 @@ # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor -import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor as dpt import dpctl_ext.tensor._type_utils as dtu import dpnp import dpnp.backend.extensions.ufunc._ufunc_impl as ufi @@ -730,7 +729,7 @@ def clip(a, /, min=None, max=None, *, out=None, order="K", 
**kwargs): usm_max = None if max is None else dpnp.get_usm_ndarray_or_scalar(max) usm_out = None if out is None else dpnp.get_usm_ndarray(out) - usm_res = dpt_ext.clip(usm_arr, usm_min, usm_max, out=usm_out, order=order) + usm_res = dpt.clip(usm_arr, usm_min, usm_max, out=usm_out, order=order) if out is not None and isinstance(out, dpnp_array): return out return dpnp_array._create_from_usm_ndarray(usm_res) @@ -1126,7 +1125,7 @@ def cumprod(a, axis=None, dtype=None, out=None): return dpnp_wrap_reduction_call( usm_a, out, - dpt_ext.cumulative_prod, + dpt.cumulative_prod, _get_reduction_res_dt(a, dtype), axis=axis, dtype=dtype, @@ -1218,7 +1217,7 @@ def cumsum(a, axis=None, dtype=None, out=None): return dpnp_wrap_reduction_call( usm_a, out, - dpt_ext.cumulative_sum, + dpt.cumulative_sum, _get_reduction_res_dt(a, dtype), axis=axis, dtype=dtype, @@ -1307,7 +1306,7 @@ def cumulative_prod( return dpnp_wrap_reduction_call( dpnp.get_usm_ndarray(x), out, - dpt_ext.cumulative_prod, + dpt.cumulative_prod, _get_reduction_res_dt(x, dtype), axis=axis, dtype=dtype, @@ -1403,7 +1402,7 @@ def cumulative_sum( return dpnp_wrap_reduction_call( dpnp.get_usm_ndarray(x), out, - dpt_ext.cumulative_sum, + dpt.cumulative_sum, _get_reduction_res_dt(x, dtype), axis=axis, dtype=dtype, @@ -1480,9 +1479,7 @@ def diff(a, n=1, axis=-1, prepend=None, append=None): ) usm_app = None if append is None else dpnp.get_usm_ndarray_or_scalar(append) - usm_res = dpt_ext.diff( - usm_a, axis=axis, n=n, prepend=usm_pre, append=usm_app - ) + usm_res = dpt.diff(usm_a, axis=axis, n=n, prepend=usm_pre, append=usm_app) return dpnp_array._create_from_usm_ndarray(usm_res) From 3041d7d6bb9930b1e100a88e960e444d741d8d28 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 05:29:21 -0800 Subject: [PATCH 139/145] Move _logsumexp/hypot_over_axis to _tensor_reductions_impl --- dpctl_ext/tensor/CMakeLists.txt | 4 +- .../libtensor/source/reductions/logsumexp.cpp | 259 ++++++++++++++++++ 
.../libtensor/source/reductions/logsumexp.hpp | 46 ++++ .../source/reductions/reduce_hypot.cpp | 255 +++++++++++++++++ .../source/reductions/reduce_hypot.hpp | 46 ++++ .../source/reductions/reduction_common.cpp | 8 +- 6 files changed, 612 insertions(+), 6 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/logsumexp.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/logsumexp.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 350a0b2518ba..cf55035c23d9 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -75,11 +75,11 @@ set(_reduction_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/any.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmax.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmin.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/logsumexp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/logsumexp.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/max.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/min.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/prod.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduce_hypot.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduce_hypot.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/sum.cpp ) set(_sorting_sources diff --git a/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.cpp b/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.cpp new file mode 100644 index 000000000000..8be813203598 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.cpp @@ -0,0 +1,259 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel 
Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/reductions.hpp" +#include "reduction_over_axis.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace su_ns = dpctl::tensor::sycl_utils; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + logsumexp_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + logsumexp_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + logsumexp_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportDataForLogSumExpReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< +#if 1 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input half + 
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, +#endif + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct LogSumExpOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForLogSumExpReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::LogSumExp; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct LogSumExpOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForLogSumExpReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::LogSumExp; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct LogSumExpOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForLogSumExpReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::LogSumExp; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +void populate_logsumexp_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table( + logsumexp_over_axis_strided_temps_dispatch_table); + + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table( + logsumexp_over_axis1_contig_temps_dispatch_table); + + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table( + logsumexp_over_axis0_contig_temps_dispatch_table); +} + +} // namespace impl + +void init_logsumexp(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; 
+ using event_vecT = std::vector; + { + using impl::populate_logsumexp_over_axis_dispatch_tables; + populate_logsumexp_over_axis_dispatch_tables(); + using impl::logsumexp_over_axis0_contig_temps_dispatch_table; + using impl::logsumexp_over_axis1_contig_temps_dispatch_table; + using impl::logsumexp_over_axis_strided_temps_dispatch_table; + + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + + auto logsumexp_pyapi = [&](const arrayT &src, + int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_tree_reduction_over_axis; + return py_tree_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + logsumexp_over_axis_strided_temps_dispatch_table, + logsumexp_over_axis0_contig_temps_dispatch_table, + logsumexp_over_axis1_contig_temps_dispatch_table); + }; + m.def("_logsumexp_over_axis", logsumexp_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto logsumexp_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + using dpctl::tensor::py_internal::py_tree_reduction_dtype_supported; + return py_tree_reduction_dtype_supported( + input_dtype, output_dtype, + logsumexp_over_axis_strided_temps_dispatch_table); + }; + m.def("_logsumexp_over_axis_dtype_supported", logsumexp_dtype_supported, + "", py::arg("arg_dtype"), py::arg("out_dtype")); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.hpp b/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.hpp new file mode 100644 index 000000000000..2e2c19877db6 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 
2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_logsumexp(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.cpp b/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.cpp new file mode 100644 index 000000000000..d7903d12b0bc --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.cpp @@ -0,0 +1,255 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace su_ns = dpctl::tensor::sycl_utils; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + hypot_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + hypot_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + hypot_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportDataForHypotReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + 
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct HypotOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForHypotReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::Hypot; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct HypotOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForHypotReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::Hypot; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct HypotOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForHypotReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::Hypot; + return dpctl::tensor::kernels:: + 
reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +void populate_hypot_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(hypot_over_axis_strided_temps_dispatch_table); + + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(hypot_over_axis1_contig_temps_dispatch_table); + + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(hypot_over_axis0_contig_temps_dispatch_table); +} + +} // namespace impl + +void init_reduce_hypot(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_hypot_over_axis_dispatch_tables; + populate_hypot_over_axis_dispatch_tables(); + using impl::hypot_over_axis0_contig_temps_dispatch_table; + using impl::hypot_over_axis1_contig_temps_dispatch_table; + using impl::hypot_over_axis_strided_temps_dispatch_table; + + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + + auto hypot_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_tree_reduction_over_axis; + return py_tree_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + hypot_over_axis_strided_temps_dispatch_table, + hypot_over_axis0_contig_temps_dispatch_table, + hypot_over_axis1_contig_temps_dispatch_table); + }; + m.def("_hypot_over_axis", hypot_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto hypot_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + using dpctl::tensor::py_internal::py_tree_reduction_dtype_supported; 
+ return py_tree_reduction_dtype_supported( + input_dtype, output_dtype, + hypot_over_axis_strided_temps_dispatch_table); + }; + m.def("_hypot_over_axis_dtype_supported", hypot_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype")); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.hpp b/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.hpp new file mode 100644 index 000000000000..c0a16345af75 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_reduce_hypot(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp index 40c93fdc26fc..fca5e09e2fe5 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp @@ -39,11 +39,11 @@ #include "any.hpp" #include "argmax.hpp" #include "argmin.hpp" -// #include "logsumexp.hpp" +#include "logsumexp.hpp" #include "max.hpp" #include "min.hpp" #include "prod.hpp" -// #include "reduce_hypot.hpp" +#include "reduce_hypot.hpp" #include "sum.hpp" namespace py = pybind11; @@ -58,11 +58,11 @@ void init_reduction_functions(py::module_ m) init_any(m); init_argmax(m); init_argmin(m); - // init_logsumexp(m); + init_logsumexp(m); init_max(m); init_min(m); init_prod(m); - // init_reduce_hypot(m); + init_reduce_hypot(m); init_sum(m); } From 
4872bb6bbd2c839af5825064805de40758144d32 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 05:31:03 -0800 Subject: [PATCH 140/145] Move ti.count_nonzero()/logsumexp()/reduce_hypot() to dpctl_ext.tensor and reuse them --- dpctl_ext/tensor/__init__.py | 6 ++ dpctl_ext/tensor/_reduction.py | 167 +++++++++++++++++++++++++++++++ dpnp/dpnp_iface_counting.py | 5 +- dpnp/dpnp_iface_trigonometric.py | 6 +- 4 files changed, 178 insertions(+), 6 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 1ef8d8eb24e5..c40d9dacdadd 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -86,9 +86,12 @@ from ._reduction import ( argmax, argmin, + count_nonzero, + logsumexp, max, min, prod, + reduce_hypot, sum, ) from ._searchsorted import searchsorted @@ -117,6 +120,7 @@ "can_cast", "concat", "copy", + "count_nonzero", "clip", "cumulative_logsumexp", "cumulative_prod", @@ -136,6 +140,7 @@ "isdtype", "isin", "linspace", + "logsumexp", "max", "meshgrid", "min", @@ -148,6 +153,7 @@ "prod", "put", "put_along_axis", + "reduce_hypot", "repeat", "reshape", "result_type", diff --git a/dpctl_ext/tensor/_reduction.py b/dpctl_ext/tensor/_reduction.py index 7e1113591451..b8fdcf4f37e6 100644 --- a/dpctl_ext/tensor/_reduction.py +++ b/dpctl_ext/tensor/_reduction.py @@ -39,6 +39,7 @@ from ._numpy_helper import normalize_axis_tuple from ._type_utils import ( _default_accumulation_dtype, + _default_accumulation_dtype_fp_types, _to_device_supported_dtype, ) @@ -472,6 +473,111 @@ def argmin(x, /, *, axis=None, keepdims=False, out=None): return _search_over_axis(x, axis, keepdims, out, tri._argmin_over_axis) +def count_nonzero(x, /, *, axis=None, keepdims=False, out=None): + """ + Counts the number of elements in the input array ``x`` which are non-zero. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which to count. 
If a tuple of unique integers, + the number of non-zero values are computed over multiple axes. + If ``None``, the number of non-zero values is computed over the + entire array. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and data + type. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the count of non-zero values. If the sum was + computed over the entire array, a zero-dimensional array is + returned. The returned array will have the default array index data + type. + """ + if x.dtype != dpt.bool: + x = dpt.astype(x, dpt.bool, copy=False) + return sum( + x, + axis=axis, + dtype=ti.default_device_index_type(x.sycl_device), + keepdims=keepdims, + out=out, + ) + + +def logsumexp(x, /, *, axis=None, dtype=None, keepdims=False, out=None): + """ + Calculates the logarithm of the sum of exponentials of elements in the + input array ``x``. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which values must be computed. If a tuple + of unique integers, values are computed over multiple axes. + If ``None``, the result is computed over the entire array. + Default: ``None``. + dtype (Optional[dtype]): + data type of the returned array. If ``None``, the default data + type is inferred from the "kind" of the input array data type. + + * If ``x`` has a real-valued floating-point data type, the + returned array will have the same data type as ``x``. 
+ * If ``x`` has a boolean or integral data type, the returned array + will have the default floating point data type for the device + where input array ``x`` is allocated. + * If ``x`` has a complex-valued floating-point data type, + an error is raised. + + If the data type (either specified or resolved) differs from the + data type of ``x``, the input array elements are cast to the + specified data type before computing the result. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result or (if provided) ``dtype``. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the results. If the result was computed over + the entire array, a zero-dimensional array is returned. + The returned array has the data type as described in the + ``dtype`` parameter description above. + """ + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + out, + tri._logsumexp_over_axis, + lambda inp_dt, res_dt, *_: tri._logsumexp_over_axis_dtype_supported( + inp_dt, res_dt + ), + _default_accumulation_dtype_fp_types, + ) + + def max(x, /, *, axis=None, keepdims=False, out=None): """ Calculates the maximum value of the input array ``x``. @@ -602,6 +708,67 @@ def prod(x, /, *, axis=None, dtype=None, keepdims=False, out=None): ) +def reduce_hypot(x, /, *, axis=None, dtype=None, keepdims=False, out=None): + """ + Calculates the square root of the sum of squares of elements in the input + array ``x``. + + Args: + x (usm_ndarray): + input array. 
+ axis (Optional[int, Tuple[int, ...]]): + axis or axes along which values must be computed. If a tuple + of unique integers, values are computed over multiple axes. + If ``None``, the result is computed over the entire array. + Default: ``None``. + dtype (Optional[dtype]): + data type of the returned array. If ``None``, the default data + type is inferred from the "kind" of the input array data type. + + * If ``x`` has a real-valued floating-point data type, the + returned array will have the same data type as ``x``. + * If ``x`` has a boolean or integral data type, the returned array + will have the default floating point data type for the device + where input array ``x`` is allocated. + * If ``x`` has a complex-valued floating-point data type, + an error is raised. + + If the data type (either specified or resolved) differs from the + data type of ``x``, the input array elements are cast to the + specified data type before computing the result. Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result or (if provided) ``dtype``. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the results. If the result was computed over + the entire array, a zero-dimensional array is returned. The + returned array has the data type as described in the ``dtype`` + parameter description above. 
+ """ + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + out, + tri._hypot_over_axis, + lambda inp_dt, res_dt, *_: tri._hypot_over_axis_dtype_supported( + inp_dt, res_dt + ), + _default_accumulation_dtype_fp_types, + ) + + def sum(x, /, *, axis=None, dtype=None, keepdims=False, out=None): """ Calculates the sum of elements in the input array ``x``. diff --git a/dpnp/dpnp_iface_counting.py b/dpnp/dpnp_iface_counting.py index a4b85aa85294..a8ebafbcead7 100644 --- a/dpnp/dpnp_iface_counting.py +++ b/dpnp/dpnp_iface_counting.py @@ -39,8 +39,9 @@ """ -import dpctl.tensor as dpt - +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt import dpnp diff --git a/dpnp/dpnp_iface_trigonometric.py b/dpnp/dpnp_iface_trigonometric.py index 460a0dc80f0f..a17c7dfd9d9a 100644 --- a/dpnp/dpnp_iface_trigonometric.py +++ b/dpnp/dpnp_iface_trigonometric.py @@ -42,13 +42,11 @@ # pylint: disable=protected-access # pylint: disable=no-name-in-module - -import dpctl.tensor as dpt import dpctl.tensor._tensor_elementwise_impl as ti # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor -import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor as dpt import dpctl_ext.tensor._type_utils as dtu import dpnp import dpnp.backend.extensions.ufunc._ufunc_impl as ufi @@ -935,7 +933,7 @@ def cumlogsumexp( return dpnp_wrap_reduction_call( usm_x, out, - dpt_ext.cumulative_logsumexp, + dpt.cumulative_logsumexp, _get_accumulation_res_dt(x, dtype), axis=axis, dtype=dtype, From 85ef3e0ef0c3248fe39d55c5c428e9a884297afa Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Mar 2026 10:45:57 -0800 Subject: [PATCH 141/145] Apply remarks --- dpctl_ext/tensor/_accumulation.py | 6 +++--- .../libtensor/source/accumulators/accumulate_over_axis.hpp | 5 ++++- .../libtensor/source/accumulators/cumulative_logsumexp.cpp | 3 --- .../libtensor/source/accumulators/cumulative_prod.cpp | 3 --- 
.../tensor/libtensor/source/accumulators/cumulative_sum.cpp | 3 --- 5 files changed, 7 insertions(+), 13 deletions(-) diff --git a/dpctl_ext/tensor/_accumulation.py b/dpctl_ext/tensor/_accumulation.py index 17596e5647fc..2dfe9656e198 100644 --- a/dpctl_ext/tensor/_accumulation.py +++ b/dpctl_ext/tensor/_accumulation.py @@ -35,14 +35,14 @@ import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_accumulation_impl as tai import dpctl_ext.tensor._tensor_impl as ti -from dpctl_ext.tensor._type_utils import ( + +from ._numpy_helper import normalize_axis_index +from ._type_utils import ( _default_accumulation_dtype, _default_accumulation_dtype_fp_types, _to_device_supported_dtype, ) -from ._numpy_helper import normalize_axis_index - def _accumulate_common( x, diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp b/dpctl_ext/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp index 712ab180ef37..4dd00620a260 100644 --- a/dpctl_ext/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp +++ b/dpctl_ext/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp @@ -35,9 +35,12 @@ #pragma once +#include #include -#include +#include +#include #include +#include #include #include #include diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp index 572c93c0066e..f1ad170caa58 100644 --- a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp @@ -311,7 +311,6 @@ void init_cumulative_logsumexp(py::module_ m) int trailing_dims_to_accumulate, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_accumulate_over_axis; return py_accumulate_over_axis(src, trailing_dims_to_accumulate, dst, exec_q, depends, cumlogsumexp_strided_dispatch_table, @@ -326,8 +325,6 @@ void 
init_cumulative_logsumexp(py::module_ m) auto cumlogsumexp_include_initial_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal:: - py_accumulate_final_axis_include_initial; return py_accumulate_final_axis_include_initial( src, dst, exec_q, depends, cumlogsumexp_include_initial_strided_dispatch_table, diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp index 07c802c8cffa..9a9961441d35 100644 --- a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp @@ -322,7 +322,6 @@ void init_cumulative_prod(py::module_ m) auto cumprod_pyapi = [&](const arrayT &src, int trailing_dims_to_accumulate, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_accumulate_over_axis; return py_accumulate_over_axis( src, trailing_dims_to_accumulate, dst, exec_q, depends, cumprod_strided_dispatch_table, cumprod_1d_contig_dispatch_table); @@ -336,8 +335,6 @@ void init_cumulative_prod(py::module_ m) auto cumprod_include_initial_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal:: - py_accumulate_final_axis_include_initial; return py_accumulate_final_axis_include_initial( src, dst, exec_q, depends, cumprod_include_initial_strided_dispatch_table, diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp index 3c422bfd0998..3a0ed6cf3ab5 100644 --- a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp @@ -320,7 +320,6 @@ void init_cumulative_sum(py::module_ m) auto cumsum_pyapi = [&](const arrayT &src, int 
trailing_dims_to_accumulate, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_accumulate_over_axis; return py_accumulate_over_axis( src, trailing_dims_to_accumulate, dst, exec_q, depends, cumsum_strided_dispatch_table, cumsum_1d_contig_dispatch_table); @@ -334,8 +333,6 @@ void init_cumulative_sum(py::module_ m) auto cumsum_include_initial_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal:: - py_accumulate_final_axis_include_initial; return py_accumulate_final_axis_include_initial( src, dst, exec_q, depends, cumsum_include_initial_strided_dispatch_table, From e335b3d5352b5816de78930107619ad45c2f33fa Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 9 Mar 2026 02:21:08 -0700 Subject: [PATCH 142/145] Apply remarks --- .../libtensor/source/accumulators/cumulative_logsumexp.cpp | 1 - .../tensor/libtensor/source/accumulators/cumulative_prod.cpp | 1 - .../tensor/libtensor/source/accumulators/cumulative_sum.cpp | 1 - 3 files changed, 3 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp index f1ad170caa58..e24cf56ddd62 100644 --- a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp @@ -337,7 +337,6 @@ void init_cumulative_logsumexp(py::module_ m) auto cumlogsumexp_dtype_supported = [&](const py::dtype &input_dtype, const py::dtype &output_dtype) { - using dpctl::tensor::py_internal::py_accumulate_dtype_supported; return py_accumulate_dtype_supported( input_dtype, output_dtype, cumlogsumexp_strided_dispatch_table); }; diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp index 
9a9961441d35..65f3c311eda1 100644 --- a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp @@ -346,7 +346,6 @@ void init_cumulative_prod(py::module_ m) auto cumprod_dtype_supported = [&](const py::dtype &input_dtype, const py::dtype &output_dtype) { - using dpctl::tensor::py_internal::py_accumulate_dtype_supported; return py_accumulate_dtype_supported(input_dtype, output_dtype, cumprod_strided_dispatch_table); }; diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp index 3a0ed6cf3ab5..60b46946acc9 100644 --- a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp @@ -344,7 +344,6 @@ void init_cumulative_sum(py::module_ m) auto cumsum_dtype_supported = [&](const py::dtype &input_dtype, const py::dtype &output_dtype) { - using dpctl::tensor::py_internal::py_accumulate_dtype_supported; return py_accumulate_dtype_supported(input_dtype, output_dtype, cumsum_strided_dispatch_table); }; From d7d7a2b690ddefa43d66174fd9f01e4710603804 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 9 Mar 2026 03:02:12 -0700 Subject: [PATCH 143/145] Apply remarks --- .../kernels/sorting/search_sorted_detail.hpp | 9 ++------- .../include/kernels/sorting/searchsorted.hpp | 1 - .../include/kernels/sorting/sort_utils.hpp | 7 ++----- .../libtensor/include/kernels/sorting/topk.hpp | 2 ++ .../tensor/libtensor/source/sorting/isin.cpp | 7 ++++--- .../libtensor/source/sorting/merge_argsort.cpp | 16 +++++++--------- .../libtensor/source/sorting/merge_sort.cpp | 12 +++++------- .../source/sorting/py_argsort_common.hpp | 1 - .../libtensor/source/sorting/py_sort_common.hpp | 1 - .../libtensor/source/sorting/radix_argsort.cpp | 15 ++++++--------- .../libtensor/source/sorting/radix_sort.cpp | 16 ++++++---------- 
.../libtensor/source/sorting/searchsorted.cpp | 10 +++++----- .../tensor/libtensor/source/sorting/topk.cpp | 2 +- .../tensor/libtensor/source/tensor_sorting.cpp | 2 -- 14 files changed, 40 insertions(+), 61 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp index d184261aeabf..1f3576402511 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp @@ -36,10 +36,7 @@ #include -namespace dpctl::tensor::kernels -{ - -namespace search_sorted_detail +namespace dpctl::tensor::kernels::search_sorted_detail { template @@ -119,6 +116,4 @@ std::size_t upper_bound_indexed_impl(const Acc acc, acc_indexer); } -} // namespace search_sorted_detail - -} // namespace dpctl::tensor::kernels +} // namespace dpctl::tensor::kernels::search_sorted_detail diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp index b89ad922e102..bc400c9e569a 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp @@ -35,7 +35,6 @@ #pragma once #include -#include #include #include diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp index 4bbcacd56fcc..fd32905b808e 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp @@ -37,7 +37,6 @@ #include #include #include -#include #include #include @@ -78,10 +77,8 @@ sycl::event iota_impl(sycl::queue &exec_q, } if (offset + n_wi * max_sgSize < nelems) { - static constexpr auto group_ls_props = syclexp::properties{ - syclexp::data_placement_striped - 
// , syclexp::full_group - }; + static constexpr auto group_ls_props = + syclexp::properties{syclexp::data_placement_striped}; auto out_multi_ptr = sycl::address_space_cast< sycl::access::address_space::global_space, diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp index 90e06a1e9eb8..e66d82247434 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -34,8 +34,10 @@ #pragma once +#include #include #include +#include #include #include diff --git a/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp b/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp index 728a7ca3b733..f1ae5863bbb9 100644 --- a/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp +++ b/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp @@ -35,6 +35,7 @@ #include #include +#include #include #include @@ -46,6 +47,7 @@ #include "kernels/sorting/isin.hpp" #include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" #include "utils/output_validation.hpp" #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" @@ -254,7 +256,7 @@ std::pair simplified_dst_strides.push_back(0); } else { - dpctl::tensor::py_internal::simplify_iteration_space( + simplify_iteration_space( // modified by reference simplified_nd, // read-only inputs @@ -313,9 +315,8 @@ std::pair void init_isin_functions(py::module_ m) { - dpctl::tensor::py_internal::detail::init_isin_dispatch_vector(); + detail::init_isin_dispatch_vector(); - using dpctl::tensor::py_internal::py_isin; m.def("_isin", &py_isin, py::arg("needles"), py::arg("hay"), py::arg("dst"), py::arg("sycl_queue"), py::arg("invert"), py::arg("depends") = py::list()); diff --git a/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.cpp b/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.cpp index f76a74e1f8bf..2b6dcc8bf447 100644 --- 
a/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.cpp +++ b/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.cpp @@ -35,6 +35,8 @@ #include #include +#include +#include #include @@ -121,7 +123,7 @@ void init_merge_argsort_dispatch_tables(void) void init_merge_argsort_functions(py::module_ m) { - dpctl::tensor::py_internal::init_merge_argsort_dispatch_tables(); + init_merge_argsort_dispatch_tables(); auto py_argsort_ascending = [](const dpctl::tensor::usm_ndarray &src, const int trailing_dims_to_sort, @@ -129,10 +131,8 @@ void init_merge_argsort_functions(py::module_ m) sycl::queue &exec_q, const std::vector &depends) -> std::pair { - return dpctl::tensor::py_internal::py_argsort( - src, trailing_dims_to_sort, dst, exec_q, depends, - dpctl::tensor::py_internal:: - ascending_argsort_contig_dispatch_table); + return py_argsort(src, trailing_dims_to_sort, dst, exec_q, depends, + ascending_argsort_contig_dispatch_table); }; m.def("_argsort_ascending", py_argsort_ascending, py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"), @@ -144,10 +144,8 @@ void init_merge_argsort_functions(py::module_ m) sycl::queue &exec_q, const std::vector &depends) -> std::pair { - return dpctl::tensor::py_internal::py_argsort( - src, trailing_dims_to_sort, dst, exec_q, depends, - dpctl::tensor::py_internal:: - descending_argsort_contig_dispatch_table); + return py_argsort(src, trailing_dims_to_sort, dst, exec_q, depends, + descending_argsort_contig_dispatch_table); }; m.def("_argsort_descending", py_argsort_descending, py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"), diff --git a/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp b/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp index 7bb0e37c6aed..23d95fb27691 100644 --- a/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp +++ b/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp @@ -102,7 +102,7 @@ void init_merge_sort_dispatch_vectors(void) void 
init_merge_sort_functions(py::module_ m) { - dpctl::tensor::py_internal::init_merge_sort_dispatch_vectors(); + init_merge_sort_dispatch_vectors(); auto py_sort_ascending = [](const dpctl::tensor::usm_ndarray &src, const int trailing_dims_to_sort, @@ -110,9 +110,8 @@ void init_merge_sort_functions(py::module_ m) sycl::queue &exec_q, const std::vector &depends) -> std::pair { - return dpctl::tensor::py_internal::py_sort( - src, trailing_dims_to_sort, dst, exec_q, depends, - dpctl::tensor::py_internal::ascending_sort_contig_dispatch_vector); + return py_sort(src, trailing_dims_to_sort, dst, exec_q, depends, + ascending_sort_contig_dispatch_vector); }; m.def("_sort_ascending", py_sort_ascending, py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"), @@ -124,9 +123,8 @@ void init_merge_sort_functions(py::module_ m) sycl::queue &exec_q, const std::vector &depends) -> std::pair { - return dpctl::tensor::py_internal::py_sort( - src, trailing_dims_to_sort, dst, exec_q, depends, - dpctl::tensor::py_internal::descending_sort_contig_dispatch_vector); + return py_sort(src, trailing_dims_to_sort, dst, exec_q, depends, + descending_sort_contig_dispatch_vector); }; m.def("_sort_descending", py_sort_descending, py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"), diff --git a/dpctl_ext/tensor/libtensor/source/sorting/py_argsort_common.hpp b/dpctl_ext/tensor/libtensor/source/sorting/py_argsort_common.hpp index d204b492b2ff..6328b3339376 100644 --- a/dpctl_ext/tensor/libtensor/source/sorting/py_argsort_common.hpp +++ b/dpctl_ext/tensor/libtensor/source/sorting/py_argsort_common.hpp @@ -44,7 +44,6 @@ #include "dpnp4pybind11.hpp" #include -#include #include "utils/memory_overlap.hpp" #include "utils/output_validation.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/sorting/py_sort_common.hpp b/dpctl_ext/tensor/libtensor/source/sorting/py_sort_common.hpp index 0fdf7bec80fd..ee8777f35077 100644 --- a/dpctl_ext/tensor/libtensor/source/sorting/py_sort_common.hpp 
+++ b/dpctl_ext/tensor/libtensor/source/sorting/py_sort_common.hpp @@ -44,7 +44,6 @@ #include "dpnp4pybind11.hpp" #include -#include #include "utils/memory_overlap.hpp" #include "utils/output_validation.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.cpp b/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.cpp index ba9bbfd61b7c..e54b8f739a4b 100644 --- a/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.cpp +++ b/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.cpp @@ -62,6 +62,7 @@ namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace impl_ns = dpctl::tensor::kernels::radix_sort_details; +using dpctl::tensor::ssize_t; using dpctl::tensor::kernels::sort_contig_fn_ptr_t; static sort_contig_fn_ptr_t @@ -152,7 +153,7 @@ void init_radix_argsort_dispatch_tables(void) void init_radix_argsort_functions(py::module_ m) { - dpctl::tensor::py_internal::init_radix_argsort_dispatch_tables(); + init_radix_argsort_dispatch_tables(); auto py_radix_argsort_ascending = [](const dpctl::tensor::usm_ndarray &src, @@ -160,10 +161,8 @@ void init_radix_argsort_functions(py::module_ m) const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector &depends) -> std::pair { - return dpctl::tensor::py_internal::py_argsort( - src, trailing_dims_to_sort, dst, exec_q, depends, - dpctl::tensor::py_internal:: - ascending_radix_argsort_contig_dispatch_table); + return py_argsort(src, trailing_dims_to_sort, dst, exec_q, depends, + ascending_radix_argsort_contig_dispatch_table); }; m.def("_radix_argsort_ascending", py_radix_argsort_ascending, py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"), @@ -175,10 +174,8 @@ void init_radix_argsort_functions(py::module_ m) const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector &depends) -> std::pair { - return dpctl::tensor::py_internal::py_argsort( - src, trailing_dims_to_sort, dst, exec_q, depends, - dpctl::tensor::py_internal:: - 
descending_radix_argsort_contig_dispatch_table); + return py_argsort(src, trailing_dims_to_sort, dst, exec_q, depends, + descending_radix_argsort_contig_dispatch_table); }; m.def("_radix_argsort_descending", py_radix_argsort_descending, py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"), diff --git a/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.cpp b/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.cpp index fb3cab487df8..35c71a0eb7d3 100644 --- a/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.cpp +++ b/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.cpp @@ -34,7 +34,6 @@ //===----------------------------------------------------------------------===// #include -#include #include #include #include @@ -62,6 +61,7 @@ namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace impl_ns = dpctl::tensor::kernels::radix_sort_details; +using dpctl::tensor::ssize_t; using dpctl::tensor::kernels::sort_contig_fn_ptr_t; static sort_contig_fn_ptr_t ascending_radix_sort_contig_dispatch_vector[td_ns::num_types]; @@ -152,7 +152,7 @@ bool py_radix_sort_defined(int typenum) void init_radix_sort_functions(py::module_ m) { - dpctl::tensor::py_internal::init_radix_sort_dispatch_vectors(); + init_radix_sort_dispatch_vectors(); auto py_radix_sort_ascending = [](const dpctl::tensor::usm_ndarray &src, const int trailing_dims_to_sort, @@ -160,10 +160,8 @@ void init_radix_sort_functions(py::module_ m) sycl::queue &exec_q, const std::vector &depends) -> std::pair { - return dpctl::tensor::py_internal::py_sort( - src, trailing_dims_to_sort, dst, exec_q, depends, - dpctl::tensor::py_internal:: - ascending_radix_sort_contig_dispatch_vector); + return py_sort(src, trailing_dims_to_sort, dst, exec_q, depends, + ascending_radix_sort_contig_dispatch_vector); }; m.def("_radix_sort_ascending", py_radix_sort_ascending, py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"), @@ -175,10 +173,8 @@ void 
init_radix_sort_functions(py::module_ m) sycl::queue &exec_q, const std::vector &depends) -> std::pair { - return dpctl::tensor::py_internal::py_sort( - src, trailing_dims_to_sort, dst, exec_q, depends, - dpctl::tensor::py_internal:: - descending_radix_sort_contig_dispatch_vector); + return py_sort(src, trailing_dims_to_sort, dst, exec_q, depends, + descending_radix_sort_contig_dispatch_vector); }; m.def("_radix_sort_descending", py_radix_sort_descending, py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"), diff --git a/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp b/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp index 5cebb74be2d0..8b1ce04a97d6 100644 --- a/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp +++ b/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp @@ -35,6 +35,8 @@ #include #include +#include +#include #include #include #include @@ -47,6 +49,7 @@ #include "kernels/sorting/searchsorted.hpp" #include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" #include "utils/output_validation.hpp" #include "utils/rich_comparisons.hpp" #include "utils/sycl_alloc_utils.hpp" @@ -369,7 +372,7 @@ std::pair simplified_positions_strides.push_back(0); } else { - dpctl::tensor::py_internal::simplify_iteration_space( + simplify_iteration_space( // modified by reference simplified_nd, // read-only inputs @@ -462,10 +465,7 @@ std::pair void init_searchsorted_functions(py::module_ m) { - dpctl::tensor::py_internal::detail::init_searchsorted_dispatch_table(); - - using dpctl::tensor::py_internal::py_searchsorted_left; - using dpctl::tensor::py_internal::py_searchsorted_right; + detail::init_searchsorted_dispatch_table(); m.def("_searchsorted_left", &py_searchsorted_left, py::arg("hay"), py::arg("needles"), py::arg("positions"), py::arg("sycl_queue"), diff --git a/dpctl_ext/tensor/libtensor/source/sorting/topk.cpp b/dpctl_ext/tensor/libtensor/source/sorting/topk.cpp index 23a8d2a4328f..6b8344df12c8 100644 --- 
a/dpctl_ext/tensor/libtensor/source/sorting/topk.cpp +++ b/dpctl_ext/tensor/libtensor/source/sorting/topk.cpp @@ -293,7 +293,7 @@ void init_topk_dispatch_vectors(void) void init_topk_functions(py::module_ m) { - dpctl::tensor::py_internal::init_topk_dispatch_vectors(); + init_topk_dispatch_vectors(); m.def("_topk", &py_topk, py::arg("src"), py::arg("trailing_dims_to_search"), py::arg("k"), py::arg("largest"), py::arg("vals"), py::arg("inds"), diff --git a/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp index 98a2493a7c48..318c3559d77c 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp @@ -43,8 +43,6 @@ #include "sorting/searchsorted.hpp" #include "sorting/topk.hpp" -namespace py = pybind11; - PYBIND11_MODULE(_tensor_sorting_impl, m) { dpctl::tensor::py_internal::init_isin_functions(m); From 3c26859982f580b194ea26720913220ae7f70ce7 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 9 Mar 2026 03:14:22 -0700 Subject: [PATCH 144/145] Apply remaining remarks --- dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp | 1 - dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp index e66d82247434..a1e57dc9ef30 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -41,7 +41,6 @@ #include #include -#include #include #include "kernels/sorting/merge_sort.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp b/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp index 23d95fb27691..fbd60621b3bb 100644 --- a/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp +++ b/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp 
@@ -33,6 +33,9 @@ /// extension. //===----------------------------------------------------------------------===// +#include +#include + #include #include "dpnp4pybind11.hpp" From e0bba1d4238b96d35e7625c45779519192eefa90 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 9 Mar 2026 03:41:02 -0700 Subject: [PATCH 145/145] Apply remarks --- .../tensor/libtensor/include/kernels/reductions.hpp | 1 - .../tensor/libtensor/source/reductions/argmax.cpp | 1 - .../tensor/libtensor/source/reductions/argmin.cpp | 1 - .../libtensor/source/reductions/logsumexp.cpp | 2 -- .../tensor/libtensor/source/reductions/max.cpp | 1 - .../tensor/libtensor/source/reductions/min.cpp | 1 - .../tensor/libtensor/source/reductions/prod.cpp | 2 -- .../libtensor/source/reductions/reduce_hypot.cpp | 4 +--- .../source/reductions/reduction_atomic_support.hpp | 1 - .../source/reductions/reduction_over_axis.hpp | 13 +------------ .../tensor/libtensor/source/reductions/sum.cpp | 2 -- .../tensor/libtensor/source/tensor_reductions.cpp | 2 -- 12 files changed, 2 insertions(+), 29 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/reductions.hpp b/dpctl_ext/tensor/libtensor/include/kernels/reductions.hpp index d77d7df92609..ee6431dec637 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/reductions.hpp @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include diff --git a/dpctl_ext/tensor/libtensor/source/reductions/argmax.cpp b/dpctl_ext/tensor/libtensor/source/reductions/argmax.cpp index bf6592798b4f..10fc49759168 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/argmax.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/argmax.cpp @@ -264,7 +264,6 @@ void init_argmax(py::module_ m) auto argmax_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using 
dpctl::tensor::py_internal::py_search_over_axis; return py_search_over_axis( src, trailing_dims_to_reduce, dst, exec_q, depends, argmax_over_axis_strided_temps_dispatch_table, diff --git a/dpctl_ext/tensor/libtensor/source/reductions/argmin.cpp b/dpctl_ext/tensor/libtensor/source/reductions/argmin.cpp index acd685c59802..ec4637b62d49 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/argmin.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/argmin.cpp @@ -264,7 +264,6 @@ void init_argmin(py::module_ m) auto argmin_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_search_over_axis; return py_search_over_axis( src, trailing_dims_to_reduce, dst, exec_q, depends, argmin_over_axis_strided_temps_dispatch_table, diff --git a/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.cpp b/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.cpp index 8be813203598..08ed4f12dbda 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.cpp @@ -233,7 +233,6 @@ void init_logsumexp(py::module_ m) int trailing_dims_to_reduce, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_tree_reduction_over_axis; return py_tree_reduction_over_axis( src, trailing_dims_to_reduce, dst, exec_q, depends, logsumexp_over_axis_strided_temps_dispatch_table, @@ -246,7 +245,6 @@ void init_logsumexp(py::module_ m) auto logsumexp_dtype_supported = [&](const py::dtype &input_dtype, const py::dtype &output_dtype) { - using dpctl::tensor::py_internal::py_tree_reduction_dtype_supported; return py_tree_reduction_dtype_supported( input_dtype, output_dtype, logsumexp_over_axis_strided_temps_dispatch_table); diff --git a/dpctl_ext/tensor/libtensor/source/reductions/max.cpp b/dpctl_ext/tensor/libtensor/source/reductions/max.cpp index 
dfcdc6a7b2a5..d19ed226d3b4 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/max.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/max.cpp @@ -391,7 +391,6 @@ void init_max(py::module_ m) auto max_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_reduction_over_axis; return py_reduction_over_axis( src, trailing_dims_to_reduce, dst, exec_q, depends, max_over_axis_strided_atomic_dispatch_table, diff --git a/dpctl_ext/tensor/libtensor/source/reductions/min.cpp b/dpctl_ext/tensor/libtensor/source/reductions/min.cpp index b57c35736adc..97d3432b13ed 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/min.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/min.cpp @@ -393,7 +393,6 @@ void init_min(py::module_ m) auto min_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_reduction_over_axis; return py_reduction_over_axis( src, trailing_dims_to_reduce, dst, exec_q, depends, min_over_axis_strided_atomic_dispatch_table, diff --git a/dpctl_ext/tensor/libtensor/source/reductions/prod.cpp b/dpctl_ext/tensor/libtensor/source/reductions/prod.cpp index dc9112a7b39a..e16e0cf25e1d 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/prod.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/prod.cpp @@ -432,7 +432,6 @@ void init_prod(py::module_ m) auto prod_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_reduction_over_axis; return py_reduction_over_axis( src, trailing_dims_to_reduce, dst, exec_q, depends, prod_over_axis_strided_atomic_dispatch_table, @@ -450,7 +449,6 @@ void init_prod(py::module_ m) auto prod_dtype_supported = [&](const py::dtype &input_dtype, const py::dtype &output_dtype, 
const std::string &dst_usm_type, sycl::queue &q) { - using dpctl::tensor::py_internal::py_reduction_dtype_supported; return py_reduction_dtype_supported( input_dtype, output_dtype, dst_usm_type, q, prod_over_axis_strided_atomic_dispatch_table, diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.cpp b/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.cpp index d7903d12b0bc..3c343b702238 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.cpp @@ -44,9 +44,9 @@ #include #include "kernels/reductions.hpp" +#include "utils/sycl_utils.hpp" #include "utils/type_dispatch_building.hpp" -#include "reduction_atomic_support.hpp" #include "reduction_over_axis.hpp" namespace dpctl::tensor::py_internal @@ -229,7 +229,6 @@ void init_reduce_hypot(py::module_ m) auto hypot_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_tree_reduction_over_axis; return py_tree_reduction_over_axis( src, trailing_dims_to_reduce, dst, exec_q, depends, hypot_over_axis_strided_temps_dispatch_table, @@ -242,7 +241,6 @@ void init_reduce_hypot(py::module_ m) auto hypot_dtype_supported = [&](const py::dtype &input_dtype, const py::dtype &output_dtype) { - using dpctl::tensor::py_internal::py_tree_reduction_dtype_supported; return py_tree_reduction_dtype_supported( input_dtype, output_dtype, hypot_over_axis_strided_temps_dispatch_table); diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_atomic_support.hpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_atomic_support.hpp index b7aaf9313aa2..5f9cc32f1203 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/reduction_atomic_support.hpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_atomic_support.hpp @@ -34,7 +34,6 @@ 
//===---------------------------------------------------------------------===// #pragma once -#include #include #include diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp index e16a80f129fc..130e61eb8e7d 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp @@ -37,8 +37,8 @@ #include #include -#include #include +#include #include #include #include @@ -49,7 +49,6 @@ #include "dpnp4pybind11.hpp" #include #include -#include #include "kernels/reductions.hpp" #include "simplify_iteration_space.hpp" @@ -325,9 +324,6 @@ std::pair py_reduction_over_axis( } } - using dpctl::tensor::py_internal::simplify_iteration_space; - using dpctl::tensor::py_internal::simplify_iteration_space_1; - // TODO: not used anywhere auto const &src_shape_vecs = src.get_shape_vector(); auto const &src_strides_vecs = src.get_strides_vector(); @@ -637,9 +633,6 @@ std::pair py_tree_reduction_over_axis( } } - using dpctl::tensor::py_internal::simplify_iteration_space; - using dpctl::tensor::py_internal::simplify_iteration_space_1; - auto const &src_shape_vecs = src.get_shape_vector(); auto const &src_strides_vecs = src.get_strides_vector(); auto const &dst_strides_vecs = dst.get_strides_vector(); @@ -912,8 +905,6 @@ std::pair py_search_over_axis( } } - using dpctl::tensor::py_internal::simplify_iteration_space; - auto const &src_shape_vecs = src.get_shape_vector(); auto const &src_strides_vecs = src.get_strides_vector(); auto const &dst_strides_vecs = dst.get_strides_vector(); @@ -1205,7 +1196,6 @@ std::pair shT simplified_red_src_strides; py::ssize_t red_src_offset(0); - using dpctl::tensor::py_internal::simplify_iteration_space_1; simplify_iteration_space_1( simplified_red_nd, src_shape_ptr + dst_nd, red_src_strides, // output @@ -1231,7 +1221,6 @@ std::pair 
simplified_iter_dst_strides.push_back(0); } else { - using dpctl::tensor::py_internal::simplify_iteration_space; simplify_iteration_space( iter_nd, src_shape_ptr, iter_src_strides, iter_dst_strides, // output diff --git a/dpctl_ext/tensor/libtensor/source/reductions/sum.cpp b/dpctl_ext/tensor/libtensor/source/reductions/sum.cpp index 32e31e9707ed..294adfc93a26 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/sum.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/sum.cpp @@ -429,7 +429,6 @@ void init_sum(py::module_ m) auto sum_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_reduction_over_axis; return py_reduction_over_axis( src, trailing_dims_to_reduce, dst, exec_q, depends, sum_over_axis_strided_atomic_dispatch_table, @@ -447,7 +446,6 @@ void init_sum(py::module_ m) auto sum_dtype_supported = [&](const py::dtype &input_dtype, const py::dtype &output_dtype, const std::string &dst_usm_type, sycl::queue &q) { - using dpctl::tensor::py_internal::py_reduction_dtype_supported; return py_reduction_dtype_supported( input_dtype, output_dtype, dst_usm_type, q, sum_over_axis_strided_atomic_dispatch_table, diff --git a/dpctl_ext/tensor/libtensor/source/tensor_reductions.cpp b/dpctl_ext/tensor/libtensor/source/tensor_reductions.cpp index f0901bbf7ab3..6e6a24f7b934 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_reductions.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_reductions.cpp @@ -37,8 +37,6 @@ #include "reductions/reduction_common.hpp" -namespace py = pybind11; - PYBIND11_MODULE(_tensor_reductions_impl, m) { dpctl::tensor::py_internal::init_reduction_functions(m);